Prometheus Monitoring with WeChat Work (企业微信) Alerting

Prometheus monitoring with alerts delivered to WeChat Work, deployed quickly using nothing but docker-compose.

docker-compose.yaml

version: '3.2'
services:
  prometheus:
    image: prom/prometheus
    restart: "always"
    ports:
      - 9090:9090
    container_name: "prometheus"
    volumes:
      - "./data/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml"
      - "./data/rules:/etc/prometheus/rules"
      - "./data/prometheus/data:/prometheus"
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'          # config file path, matching the mount above
      - '--storage.tsdb.path=/prometheus'                       # data path, matching the mount above

# Alerting component
  alertmanager:
    image: prom/alertmanager:latest
    restart: "always"
    ports:
      - 9093:9093
    container_name: "alertmanager"
    volumes:
      - "./data/alert/alertmanager.yml:/etc/alertmanager/alertmanager.yml"

# Web UI
  grafana:
    image: grafana/grafana
    restart: "always"
    ports:
      - 3000:3000
    container_name: "grafana"
    volumes:
      - "./data/grafana/grafana.ini:/etc/grafana/grafana.ini"              #配置文件自行拷贝出来
      - "./data/grafana/grafana-storage:/var/lib/grafana"

# WeChat Work / DingTalk alert forwarding (webhook adapter)
  webhook-adapter:
    image: guyongquan/webhook-adapter
    restart: "always"
    ports:
      - 8060:80
    container_name: "webhook-adapter"
    command:
      - '--adapter=/app/prometheusalert/wx.js=/wx=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=*'
#      - '--adapter=/app/prometheusalert/dingtalk.js=/dingtalk=https://oapi.dingtalk.com/robot/send?access_token={token}#{secret}'
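
With the config files below in place, the whole stack comes up in one step. Note that the webhook-adapter publishes container port 80 on host port 8060, so the WeChat Work adapter registered under /wx is reached at http://<host>:8060/adapter/wx, which is the URL Alertmanager will call. A minimal bring-up and health-check sequence (the /-/ready endpoints are built into Prometheus and Alertmanager; ports assume the mappings above):

docker-compose up -d                        # start prometheus, alertmanager, grafana and webhook-adapter
docker-compose ps                           # all four containers should show "Up"
curl -s http://localhost:9090/-/ready       # Prometheus readiness probe
curl -s http://localhost:9093/-/ready       # Alertmanager readiness probe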

alertmanager.yml

global:
  resolve_timeout: 5m

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'webhook'

receivers:
- name: 'webhook'
  webhook_configs:
  - url: 'http://172.20.57.238:8060/adapter/wx'
    send_resolved: true
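
Before restarting the alertmanager container, the file can be syntax-checked with amtool, which ships inside the prom/alertmanager image. A quick sketch, assuming it is run from the project directory shown at the end of this post:

docker run --rm --entrypoint amtool \
  -v $(pwd)/data/alert/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  prom/alertmanager check-config /etc/alertmanager/alertmanager.yml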

prometheus.yml

global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - 172.20.57.238:9093
rule_files:
  - "rules/*.yml"
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: 'myself'
    static_configs:
      - targets: ['172.20.54.113:9114']
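
The Prometheus config can likewise be validated with promtool before (re)starting the container. Because rule_files uses a path relative to the config file, the rules directory has to be mounted in the same layout the compose file uses; a sketch under that assumption:

docker run --rm --entrypoint promtool \
  -v $(pwd)/data/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
  -v $(pwd)/data/rules:/etc/prometheus/rules \
  prom/prometheus check config /etc/prometheus/prometheus.yml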

test1_rules.yml

groups:
  - name: MemoryUsage
    rules:
    - alert: HighMemoryUsage
      expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 # less than 10% of memory still available
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Instance {{ $labels.instance }} MEM usgae high"
        description: "{{ $labels.instance }} MEM usage above 90% (current value: {{ $value }})"
  - name: Instance
    rules:
    - alert: InstanceDown
      expr: up == 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Instance {{ $labels.instance }} down"
        description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
  - name: RootPartitionSpace
    rules:
    - alert: RootPartitionSpaceLow
      expr: node_filesystem_avail_bytes{job="node-exporter",mountpoint="/"} / node_filesystem_size_bytes{job="node-exporter",mountpoint="/"} * 100 < 10   # note: the job="node-exporter" selector must match a job_name defined in prometheus.yml
      for: 3m
      labels:
        severity: warning
      annotations:
        description: '{{ $labels.instance }} device {{ $labels.device }}: root filesystem free space: {{ printf "%.2f" $value }}%'
        summary: 'Root partition free space below 10%'
  - name: MntApsDiskSpace
    rules:
    - alert: MntApsDiskSpaceLow
      expr: 100 * (1 - node_filesystem_avail_bytes{mountpoint="/mnt/aps"} / node_filesystem_size_bytes{mountpoint="/mnt/aps"}) > 90
      for: 3m
      labels:
        severity: critical
      annotations:
        summary: "High disk usage detected"
        description: "/mnt/aps所在磁盘空间剩余空间不足 10%"
  - name: HostCpuUsage
    rules:
    - alert: HostHighCpuUsage
      expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 90
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "CPU近5分钟使用率大于90%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
  - name: HostIowait
    rules:
    - alert: HostHighIowait
      expr: (sum(increase(node_cpu_seconds_total{mode='iowait'}[5m]))by(instance)) / (sum(increase(node_cpu_seconds_total[5m]))by(instance))  *100 >= 90
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "CPU ioWait近5分钟占比大于等于90%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
  - name: HostDiskRead
    rules:
    - alert: HostHighDiskRead
      expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 180 * 1024 * 1024
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "磁盘读过大, 实例: {{$labels.instance}},当前值: {{ $value | humanize1024 }}。"
  - name: HostDiskWrite
    rules:
    - alert: HostHighDiskWrite
      expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 180 * 1024 * 1024
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "磁盘写过大, 实例: {{$labels.instance}},当前值: {{ $value | humanize1024 }}。"
  - name: TcpConnections
    rules:
    - alert: TooManyTcpTimeWaitSockets
      expr: sum by(instance) (avg_over_time(node_sockstat_TCP_tw[5m])) >= 3800   # node_sockstat_TCP_tw is a gauge, so average it over the window instead of rate()-ing it
      for: 3m
      labels:
        severity: warning
      annotations:
        summary: "TCP TimeWait数量大于3800, 实例: {{$labels.instance}},当前值: {{ $value }}%"
  - name: PodMonitoring
    rules:
    - alert: PodRestartingTooOften
      expr: sum by (instance,namespace,pod) (increase(kube_pod_container_status_restarts_total[1h])) > 15
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: "Pod {{ $labels.pod }} 连续重启"
        description: "实例:{{$labels.job}}的{{$labels.instance}}命名空间{{$labels.namespace}}下的Pod {{ $labels.pod }} 在过去1小时内重启次数超过15次,请及时处理。"
    - alert: PodInAbnormalState
      expr: kube_pod_status_phase{phase="Pending",instance !~ "172.20.53.122:.*"} == 1 or kube_pod_status_phase{phase="Failed",instance !~ "172.20.53.122:.*",instance !~ "172.20.54.112:.*"} == 1
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: "Pod {{ $labels.pod }} 状态异常"
        description: "实例:{{$labels.job}}的{{$labels.instance}}命名空间{{$labels.namespace}}下的Pod {{ $labels.pod }} 当前状态异常,请及时处理。"

Directory structure

prometheus/
├── data
│   ├── alert
│   │   └── alertmanager.yml
│   ├── grafana
│   │   ├── grafana.ini
│   │   └── grafana-storage
│   │       ├── alerting
│   │       │   └── 1
│   │       │       └── __default__.tmpl
│   │       ├── csv
│   │       ├── grafana.db
│   │       ├── plugins
│   │       └── png
│   ├── prometheus
│   │   ├── data
│   │   │   ├── 01H78F28QM9WK74NRM1M8JHST8
│   │   │   │   ├── chunks
│   │   │   │   │   ├── 000001
│   │   │   │   │   └── 000002
│   │   │   │   ├── index
│   │   │   │   ├── meta.json
│   │   │   │   └── tombstones
│   │   │   ├── 01H793NE7B147SQC3G732RTMEV
│   │   │   │   ├── chunks
│   │   │   │   │   └── 000001
│   │   │   │   ├── index
│   │   │   │   ├── meta.json
│   │   │   │   └── tombstones
│   │   │   ├── 01H79HCJ0HGJM838GVTP49M435
│   │   │   │   ├── chunks
│   │   │   │   │   └── 000001
│   │   │   │   ├── index
│   │   │   │   ├── meta.json
│   │   │   │   └── tombstones
│   │   │   ├── 01H79HCXG5Z793VZQD75PD3XGR
│   │   │   │   ├── chunks
│   │   │   │   │   └── 000001
│   │   │   │   ├── index
│   │   │   │   ├── meta.json
│   │   │   │   └── tombstones
│   │   │   ├── chunks_head
│   │   │   │   ├── 000025
│   │   │   │   ├── 000026
│   │   │   │   └── 000027
│   │   │   ├── lock
│   │   │   ├── queries.active
│   │   │   └── wal
│   │   │       ├── 00000081
│   │   │       ├── 00000082
│   │   │       ├── 00000083
│   │   │       ├── 00000084
│   │   │       ├── 00000085
│   │   │       └── checkpoint.00000080
│   │   │           └── 00000000
│   │   └── prometheus.yml
│   └── rules
│       └── test1_rules.yml
├── docker-compose.yaml
└── test_alert.sh
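
The tree lists a test_alert.sh whose contents are not shown in this post. A minimal sketch of what such a script could look like: it pushes a synthetic alert straight to the Alertmanager v2 API, which exercises the Alertmanager → webhook-adapter → WeChat Work path end to end (the alert name and labels here are made up purely for the test):

#!/bin/bash
# Push a fake alert to Alertmanager; it should arrive in the WeChat Work group
# via the "webhook" receiver configured in alertmanager.yml.
ALERTMANAGER="http://172.20.57.238:9093"

curl -s -X POST "${ALERTMANAGER}/api/v2/alerts" \
  -H 'Content-Type: application/json' \
  -d '[{
        "labels": {
          "alertname": "TestAlert",
          "severity": "warning",
          "instance": "test-instance"
        },
        "annotations": {
          "summary": "Manual test alert",
          "description": "Sent by test_alert.sh to verify the WeChat Work webhook path."
        }
      }]'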

Reference links

https://blog.51cto.com/erdong/4755694

https://www.cszhi.com/2019/10/18/Prometheus告警大屏/

https://it.cha138.com/android/show-1101477.html