侧边栏壁纸
  • 累计撰写 208 篇文章
  • 累计创建 16 个标签
  • 累计收到 5 条评论

目录 CONTENT

文章目录

Prometheus rules.yaml 报警规则配置文件

Wake
2022-08-07 / 0 评论 / 0 点赞 / 439 阅读 / 550 字
# Prometheus alerting rules. Every entry under `groups` is one rule group.
groups:
- name: example  # rule group name
  rules:
  # Fires when a scrape target has reported up == 0 for one minute.
  # Targets whose instance label matches the regex 10.*:9100 are excluded.
  - alert: InstanceDown  # alert name
    # `!~` is the PromQL negative regex matcher; `!=~` is not a valid operator
    # and makes the rule file fail to load.
    expr: up{instance!~"10.*:9100"} == 0
    for: 1m  # the condition must hold for one minute before the alert fires
    labels:  # extra labels attached to the alert (level and subject)
      name: instance
      severity: Critical
    annotations:
      # Alert summary: the appname label of the alerting series.
      summary: " {{ $labels.appname }}"
      description: " 服务停止运行 "  # alert message text
      # Current value of the expression. `up` is a 0/1 gauge, not a
      # percentage, so no "%" suffix is appended.
      value: "{{ $value }}"

# Rule group "Host": alerts built on node_* metrics.
- name: Host
  rules:
  # Memory in use as a percentage of total, above 90 for 15 minutes.
  # "In use" is computed as total minus (free + buffers + cached).
  - alert: HostMemory Usage
    for: 15m
    expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 >  90
    labels:
      severity: Warning
      name: Memory
    annotations:
      # Percentage produced by the expression above.
      value: '{{ $value }}'
      summary: ' {{ $labels.appname }} '
      description: '主机内存使用率超过90%.'

  # Disabled rule: host CPU usage above 99% for 15 minutes.
  # NOTE(review): the expression below is not valid PromQL as written. A label
  # selector `{instance!=...}` cannot follow an aggregation, so the matcher
  # would have to move inside node_cpu_seconds_total{...} before this rule is
  # re-enabled. The comment indentation is also uneven and needs realigning
  # when uncommenting.
  #- alert: HostCPU Usage
   # expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance,appname){instance!="192.169.2.87:9100"} > 0.99
   # for: 15m
   # labels:
   #  name: CPU
   #  severity: Warning
   # annotations:
   #  summary: " {{ $labels.appname }} "
   #   description: "主机CPU使用率超过99%."
   #   value: "{{ $value }}"

  # 5-minute load average (node_load5) above 4 for five minutes.
  - alert: HostLoad
    for: 5m
    expr: node_load5 > 4
    labels:
      severity: Warning
      name: Load
    annotations:
      # Raw node_load5 reading that triggered the alert.
      value: '{{ $value }}'
      summary: '{{ $labels.appname }} '
      description: ' 主机负载5分钟超过4.'

  # Filesystem usage above 85% for one minute, evaluated per mountpoint.
  - alert: HostFilesystem Usage
    # Scaled to a percentage so that $value agrees with the "%" suffix in the
    # `value` annotation. The firing condition is the same as
    # 1 - free/size > 0.85.
    expr: (1 - node_filesystem_free_bytes / node_filesystem_size_bytes) * 100 > 85
    for: 1m
    labels:
      name: Disk
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      # The mountpoint label identifies the partition that is filling up.
      description: " 主机 [ {{ $labels.mountpoint }} ]分区使用超过85%."
      value: "{{ $value }}%"  # used space as a percentage

  # Completed disk writes above 10 per second, averaged over one minute,
  # sustained for one minute.
  - alert: HostDiskio
    # rate() rather than irate(): irate only looks at the last two samples, so
    # short dips reset the `for` timer, and it is not the 1-minute average the
    # description promises.
    expr: rate(node_disk_writes_completed_total{job=~"Host"}[1m]) > 10
    for: 1m
    labels:
      name: Diskio
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      # The device label identifies the disk.
      description: " 主机 [{{ $labels.device }}]磁盘1分钟平均写入IO负载较高."
      value: "{{ $value }}iops"  # write operations per second

  # Inbound traffic above 3 MiB/s (5-minute average) for three minutes.
  # Loopback, bond, bridge, veth and OVS devices are excluded by the matcher.
  - alert: Network_receive
    # rate() gives the 5-minute average the description refers to; irate() only
    # uses the last two samples and lets short dips reset the `for` timer.
    # Dividing bytes/s by 1048576 yields MiB/s, not Mbps.
    expr: rate(node_network_receive_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576 > 3
    for: 3m
    labels:
      name: Network_receive
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      # Unit corrected to match the expression (MiB/s).
      description: " 主机 [{{ $labels.device }}] 网卡5分钟平均接收流量超过3MiB/s."
      # Only the unit follows the number. The previous "3Mbps" suffix rendered
      # a value of 3.5 as "3.53Mbps".
      value: "{{ $value }}MiB/s"

  # Outbound traffic above 10 MiB/s (5-minute average) for three minutes.
  # Loopback, bond, bridge, veth and OVS devices are excluded by the matcher.
  - alert: Network_transmit
    # rate() gives the 5-minute average the description refers to; irate() only
    # uses the last two samples and lets short dips reset the `for` timer.
    # Dividing bytes/s by 1048576 yields MiB/s, not Mbps.
    expr: rate(node_network_transmit_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576 > 10
    for: 3m
    labels:
      name: Network_transmit
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      # Threshold and unit now match the expression (it previously said 3Mbps).
      description: " 主机 [{{ $labels.device }}] 网卡5分钟内平均发送流量超过10MiB/s."
      # Only the unit follows the number. The previous "10Mbps" suffix rendered
      # a value of 12.4 as "12.410Mbps".
      value: "{{ $value }}MiB/s"

# Rule group "Container": alerts built on container_* metrics.
- name: Container
  rules:
  # Per-container CPU usage, summed by (name, instance) and scaled by 100,
  # above 99 for 15 minutes. Series with an empty image label are skipped.
  - alert: ContainerCPU Usage
    for: 15m
    expr: (sum by(name,instance) (rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 99
    labels:
      severity: Warning
      name: CPU
    annotations:
      # Result of the expression above, shown with a "%" suffix.
      value: '{{ $value }}%'
      # The summary uses the name label kept by the aggregation.
      summary: '{{ $labels.name }} '
      description: ' 容器CPU使用超过99%.'
0

评论区