二进制部署Prometheus alertmanager
来源:原创
时间:2020-04-02
作者:脚本小站
分类:Linux
prometheus.service:
# Write the systemd unit for Prometheus.
# The heredoc delimiter is unquoted, so the shell joins the backslash-continued
# ExecStart lines into a single line before the file is written.
# --storage.tsdb.retention.time replaces the deprecated --storage.tsdb.retention.
# After=network.target matches the other exporter units in this guide.
cat > /usr/lib/systemd/system/prometheus.service <<EOF
[Unit]
Description=Prometheus
After=network.target

[Service]
ExecStart=/usr/local/prometheus/prometheus \
  --config.file=/usr/local/prometheus/prometheus.yml \
  --storage.tsdb.path=/usr/local/prometheus/data \
  --storage.tsdb.retention.time=30d \
  --web.enable-lifecycle
Restart=on-failure
KillMode=process
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF
alertmanager.service:
# Write the systemd unit for Alertmanager.
# The heredoc delimiter is unquoted, so the shell joins the backslash-continued
# ExecStart lines into a single line before the file is written.
# After=network.target matches the exporter units in this guide.
cat > /usr/lib/systemd/system/alertmanager.service <<EOF
[Unit]
Description=alertmanager
After=network.target

[Service]
ExecStart=/usr/local/alertmanager/alertmanager \
  --config.file=/usr/local/alertmanager/alertmanager.yml \
  --storage.path=/usr/local/alertmanager/data
Restart=on-failure
KillMode=process
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF
prometheusalert.service:
# Write the systemd unit for PrometheusAlert.
# WorkingDirectory is set to the install directory before the binary starts.
# After=network.target matches the exporter units in this guide.
cat > /usr/lib/systemd/system/prometheusalert.service <<EOF
[Unit]
Description=PrometheusAlert
After=network.target

[Service]
WorkingDirectory=/usr/local/PrometheusAlert
ExecStart=/usr/local/PrometheusAlert/PrometheusAlert
Restart=on-failure
KillMode=process
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF
# Default account and password: prometheusalert / prometheusalert
node_exporter.service:
# Write the systemd unit for node_exporter (started after the network target).
cat > /usr/lib/systemd/system/node_exporter.service <<EOF
[Unit]
Description=node_exporter
After=network.target

[Service]
ExecStart=/usr/local/node_exporter/node_exporter
Restart=on-failure
KillMode=process
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF
dingtalk.service:
# Write the systemd unit for prometheus-webhook-dingtalk.
# The heredoc delimiter is unquoted, so the shell joins the backslash-continued
# ExecStart lines into a single line before the file is written.
cat > /usr/lib/systemd/system/dingtalk.service <<EOF
[Unit]
Description=dingtalk
After=network.target

[Service]
ExecStart=/usr/local/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk \
  --config.file=/usr/local/prometheus-webhook-dingtalk/config.yml \
  --web.listen-address=0.0.0.0:8060 \
  --web.enable-ui \
  --web.enable-lifecycle
Restart=on-failure
KillMode=process
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF
blackbox_exporter.service:
# Write the systemd unit for blackbox_exporter; the service runs as root:root.
# The heredoc delimiter is unquoted, so the shell joins the backslash-continued
# ExecStart lines into a single line before the file is written.
cat > /usr/lib/systemd/system/blackbox_exporter.service <<EOF
[Unit]
Description=blackbox_exporter
After=network.target

[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/blackbox_exporter/blackbox_exporter \
  --config.file=/usr/local/blackbox_exporter/blackbox.yml \
  --web.listen-address ":9115"
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF
blackbox.yml:
# Probe modules for blackbox_exporter.
modules:
  # HTTP GET probe.
  http_2xx:
    prober: http
    timeout: 8s
    http:
      method: GET
      # Force IPv4; when this is omitted the default is IPv6.
      preferred_ip_protocol: "ip4"
      ip_protocol_fallback: false

  # HTTP POST probe.
  http_post_2xx:
    prober: http
    http:
      method: POST
      preferred_ip_protocol: "ip4"
      ip_protocol_fallback: false

  # Plain TCP connect probe.
  tcp_connect:
    prober: tcp

  # POP3 over TLS: expect the server greeting.
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false

  # SSH: expect the protocol banner.
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^SSH-2.0-"

  # IRC: register, answer the PING challenge, then expect numeric 001.
  irc_banner:
    prober: tcp
    tcp:
      query_response:
        - send: "NICK prober"
        - send: "USER prober prober prober :prober"
        - expect: "PING :([^ ]+)"
          send: "PONG ${1}"
        - expect: "^:[^ ]+ 001"

  # ICMP (ping) probe.
  icmp:
    prober: icmp
prometheus.yml
# Global settings
global:
  scrape_interval: 15s      # how often targets are scraped
  # How often alerting/recording rules are evaluated. How long a condition
  # must hold before an alert fires is set per rule with `for`.
  evaluation_interval: 15s

# Alerting: Alertmanager instances that receive alerts
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.0.19:9093

# Alerting rule files
rule_files:
  - "first_rules.yml"
  - "second_rules.yml"

# Statically configured scrape jobs
scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
        labels:
          region: local  # extra label attached to these targets

  - job_name: 'node_exporter'
    static_configs:
      - targets: ['192.168.0.19:9100']

  # File-based service discovery
  - job_name: 'nodes'
    file_sd_configs:
      - files:
          - targets/nodes.yml
        refresh_interval: 60m
targets/nodes.yml
# File-SD target groups: each entry pairs a label set with a list of targets.
- labels:
    region: local
    type: virtual
  targets:
    - 10.32.0.12:9100
    - 10.32.0.13:9100

- labels:
    region: dev
    type: physical
  targets:
    - 10.32.0.14:9100
    - 10.32.0.15:9100
重载配置:
# Ask the local Prometheus instance to reload its configuration.
curl --request POST http://localhost:9090/-/reload
alertmanager.yml
global:
  resolve_timeout: 5m
  # SMTP settings used by every email receiver below.
  smtp_smarthost: "smtp.163.com:465"
  smtp_from: "chuxiangyi_com@163.com"
  smtp_auth_username: "chuxiangyi_com@163.com"
  # NOTE(review): looks like a placeholder; do not commit a real password.
  smtp_auth_password: "123456"
  smtp_require_tls: false

route:
  # Alerts sharing these label values are grouped and sent as one notification
  # (one message may carry several alerts). Grouping does not affect routing;
  # it mainly works together with group_wait.
  group_by: ['alertname', 'instance']
  # Alerts arriving within this window are merged into the group's first
  # notification; used together with group_by.
  group_wait: 10s
  # Interval between notifications for the same group.
  group_interval: 10s
  # Interval for re-sending a notification while the alert stays unresolved.
  repeat_interval: 1h
  # Receiver of the default route.
  receiver: 'email'
  # Child routes: split alerts further and send them to different receivers.
  routes:
    # Label match; multiple labels are combined with logical AND.
    - match:
        # Custom label; it must be present on the alert (for example added
        # under a rule's `labels`).
        env: prod
      # Grouping used for alerts that take this route.
      group_by: ['severity']
      receiver: 'prd'
    # Regular-expression label match.
    - match_re:
        cluster: redis|mysql
      receiver: 'dev'

# Notification receivers
receivers:
  # Email receiver; requires the SMTP settings in `global`.
  - name: "email"
    email_configs:
      - to: "250994559@qq.com"
  # Webhook receivers.
  - name: 'dev'
    webhook_configs:
      - url: 'http://127.0.0.1:5001/'
  - name: 'prd'
    webhook_configs:
      - url: 'http://127.0.0.1:5001/'

# Alert inhibition
inhibit_rules:
  # An alert carrying both labels below acts as the inhibiting source.
  - source_match:
      alertname: node_exporter_26
      severity: critical
    # Alerts matching these labels are muted while a source alert is active,
    # provided the labels listed in `equal` have the same values on both.
    target_match:
      severity: critical
    equal:
      - cluster
重载配置:
# Ask the local Alertmanager instance to reload its configuration.
curl --request POST localhost:9093/-/reload
grafana:
常用的dashboard图:
https://github.com/starsliao/Prometheus
配置文件在:
/etc/grafana/grafana.ini
Node Exporter的Grafana模版:
英文版本:
https://grafana.com/grafana/dashboards/11074
中文版本:
https://grafana.com/grafana/dashboards/8919
报警规则:
这个规则保存为一个文件,配置到prometheus的配置文件中的rule_files字段下,以列表的形式。
# Rule files to load, given as a list.
rule_files:
  - "first_rules.yml"
  - "second_rules.yml"
规则:
# Fires when the node_exporter job has been down for 5 minutes.
groups:
  - name: node_exporter
    rules:
      - alert: node_exporter
        expr: up{job="node_exporter"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: node_exporter down
          description: node_exporter down
告警抑制:
inhibit_rules:
  - source_match:
      # alertname is the value defined by `rules.alert` in the rule file.
      alertname: node_exporter_26
      severity: critical
    target_match:
      severity: critical
    equal:
      - cluster
如下两个告警实例:
A告警:
# Alert A: the node_exporter-26 job has been down for 1 minute.
groups:
  - name: node_exporter_26 down
    rules:
      - alert: node_exporter_26
        expr: up{job="node_exporter-26"} == 0
        for: 1m
        labels:
          severity: critical
          cluster: gitlab
        annotations:
          summary: node_exporter_26 down
          description: node_exporter_26 down
B告警:
# Alert B: the haproxy_exporter job has been down for 1 minute.
groups:
  - name: haproxy down
    rules:
      - alert: haproxy
        expr: up{job="haproxy_exporter"} == 0
        for: 1m
        labels:
          severity: critical
          cluster: gitlab
        annotations:
          summary: haproxy down
当一个匹配 source_match 中所有标签的告警(源告警)处于触发状态时,其他匹配 target_match 中标签的告警(目标告警),只要它在 equal 中列出的标签的值与源告警相同,就会被抑制,不再发送通知。如上示例:A告警触发后,B告警的 severity 同为 critical 且 cluster 同为 gitlab,因此B告警会被抑制。
常用报警规则: 对节点、CPU、内存、磁盘进行监控和报警。
# Common alerting rules: instance availability, memory, CPU and disk usage.
groups:
  - name: example
    rules:
      # Any scrape target down for 5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} {{ $labels.region }} has been down for more than 5 minutes."

      # Memory usage above 80% for 10 minutes.
      - alert: MemUsageHigh
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "{{ $labels.instance }} {{ $labels.region }}"

      # Non-idle CPU share above 60% for 10 minutes, per instance.
      - alert: CPUUsageHigh
        expr: (1-(sum(increase(node_cpu_seconds_total{mode="idle"}[1m]))by(instance))/(sum(increase(node_cpu_seconds_total[1m]))by(instance)))*100 > 60
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "{{ $labels.instance }} {{ $labels.region }}"

      # Filesystem usage above 80% for 10 minutes on /dev* devices.
      - alert: DiskUsageHigh
        expr: (1 - (node_filesystem_free_bytes{device=~"/dev.*"} / node_filesystem_size_bytes{device=~"/dev.*"})) * 100 > 80
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "High Disk usage on {{ $labels.instance }}"
          description: "{{ $labels.instance }} {{ $labels.region }}"
几个时间参数:
prometheus.yml
global:
  scrape_interval: 15s      # how often targets are scraped
  # How often alerting rules are evaluated. How long a condition must hold
  # before an alert fires is set per rule with `for`.
  evaluation_interval: 15s
  scrape_timeout: 10s       # scrape timeout, default 10s
alertmanager.yml
# route: how alerts are dispatched to receivers
route:
  # Label(s) used as the grouping key.
  group_by: ['alertname']
  # How long to wait before sending the first notification for a new group.
  group_wait: 10s
  # How long to wait before notifying about new alerts added to a group that
  # has already been notified.
  group_interval: 10s
  # How long to wait before re-sending a notification. Default is 4h.
  repeat_interval: 1m