alertmanager

# alertmanager

## [下载安装](https://github.com/prometheus/alertmanager/releases/download/v0.22.2/alertmanager-0.22.2.linux-amd64.tar.gz)

```bash
wget https://github.com/prometheus/alertmanager/releases/download/v0.22.2/alertmanager-0.22.2.linux-amd64.tar.gz
cd /etc/alertmanager
tar zxvf alertmanager-0.22.2.linux-amd64.tar.gz
vim alertmanager.yml
nohup ./alertmanager --config.file=alertmanager.yml
```

## docker 运行alertmanager

编写/etc/alertmanager/alertmanager.yml文件

```yaml
global:
  # 在没有报警的情况下声明为已解决的时间
  resolve_timeout: 5m
  #配置发送邮箱,我这里用的qq
  smtp_from: '2864048202@qq.com'
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_auth_username: '2864048202@qq.com'
  # 注意这里需要配置QQ邮箱的授权码,不是登录密码,授权码在账户配置中查看
  smtp_auth_password: 'orjgfmltorubdgih'
  smtp_require_tls: false
route:
  group_by: ['alert_node']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'email'
receivers:
  - name: 'email'
    email_configs:
      # 请注意这里的收件箱请改为你自己的邮箱地址,多个用逗号隔开
      - to: '2864048202@qq.com,675045743@qq.com'
        send_resolved: true
inhibit_rules:
  - source_match:
```

启动

```bash
# 启动 alertmanager
docker run -d --restart=always --name=alertmanager -p 9093:9093 -v /etc/alertmanager:/etc/alertmanager prom/alertmanager:latest

# 启动node_exporter
docker run -d --name=node-exporter -p 9100:9100 \
  -v /proc:/host/proc:ro \
  -v /sys:/host/sys:ro \
  -v /:/rootfs:ro \
  -v /var/run:/var/host/run:rw \
  prom/node-exporter
```

## 配置告警

```yaml
# 修改 prometheus.yml
global:
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.111.134:9093
rule_files:
  - '/etc/prometheus/rules/*.yaml'
  - '/etc/prometheus/rules/*.yml'
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: "node_exporter"
    static_configs:
      - targets: ["localhost:9100"]
```

创建rules

```bash
mkdir /etc/prometheus/rules/
cd /etc/prometheus/rules/
vim /etc/prometheus/rules/node.yaml
groups:
  - name: node_rule
    rules:
      - alert: 磁盘discard操作耗时告警
        expr: (node_disk_discard_time_seconds_total ) == 0
        for: 30s
        labels:
          severity: error
        annotations:
          summary: "{{ $labels.instance }}: 系统中磁盘上执行丢弃(discard)操作所花费的总时间为0,已经超过30秒"
          value: "{{ $value }}"

# 检查配置是否正确
[root@node134 prometheus]# ./promtool check rules /etc/prometheus/rules/node.yaml
Checking /etc/prometheus/rules/node.yaml
  SUCCESS: 1 rules found
```

重启prometheus服务

`curl -X POST http://localhost:9090/-/reload`

`systemctl restart prometheus.service`

```
验证配置文件语法: promtool check config <config-file>
验证告警规则和记录规则: promtool check rules <rules-file>
验证规则文件中的表达式: promtool check rules <rules-file> --v=<level>
```

```
# HELP node_disk_discard_time_seconds_total This is the total number of seconds spent by all discards.
# TYPE node_disk_discard_time_seconds_total counter
node_disk_discard_time_seconds_total{device="sda"} 0
node_disk_discard_time_seconds_total{device="sdb"} 0
node_disk_discard_time_seconds_total{device="sr0"} 0
```

符合条件会报警

![image.png](https://cos.easydoc.net/97954506/files/ltbk3qs4.png)

# [Prometheus + Alertmanager 实现 钉钉监控告警](https://www.jianshu.com/p/eae67b770c3e)

```bash
# 安装包下载 https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
mv prometheus-webhook-dingtalk-2.1.0.linux-amd64 prometheus-webhook
cd prometheus-webhook/
cp config.example.yml config.yml

# cat /usr/lib/systemd/system/prometheus-webhook.service
[Unit]
Description=Prometheus Dingding Webhook

[Service]
ExecStart=/opt/prometheus-webhook/prometheus-webhook-dingtalk --config.file=/opt/prometheus-webhook/config.yml
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure

[Install]
WantedBy=multi-user.target

# 启动服务
systemctl start prometheus-webhook.service
```

修改prometheus-webhook配置文件绑定申请的机器人

```bash
cat config.yml
## Customizable templates path
templates:
##  - templates/alertmanager-dingtalk.tmpl
  - /opt/alertmanager/dingding3.tmpl # 配置告警模板的所在位置

#default_message:
#  title: '{{ template "legacy.title" . }}'
#  text: '{{ template "legacy.content" . }}'

## Targets, previously was known as "profiles"
targets:
  webhook1:
    url: https://oapi.dingtalk.com/robot/send?access_token=e110ec606d92fd65e39f6bdededd8529b1df2dfec9b757931909d234e5167884 # 配置机器人的webhook_url
    # secret for signature
    secret: SEC0286d5a674e155301b454aa5dd608094717145dd273794ae5736da64c41b27e0 # 配置加签(申请的时候那串数字)
    message:
      title: '{{ template "ops.title" . }}' # 给这个webhook应用上 模板标题 (ops.title是我们模板文件中的title 可在下面给出的模板文件中看到)
      text: '{{ template "ops.content" . }}' # 给这个webhook应用上 模板内容 (ops.content是我们模板文件中的content 可在下面给出的模板文件中看到)
```

告警模板文件

```bash
# cat /opt/prometheus-webhook/dingding3.tmpl
{{ define "__subject" }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
{{ end }}

{{ define "__alert_list" }}{{ range . }}
---

**告警类型**: {{ .Labels.alertname }}

**告警级别**: {{ .Labels.level }}

**故障主机**: {{ .Labels.instance }}

**告警信息**: {{ .Annotations.description }}

**触发时间**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}

{{ define "__resolved_list" }}{{ range . }}
---

**告警类型**: {{ .Labels.alertname }}

**告警级别**: {{ .Labels.level }}

**故障主机**: {{ .Labels.instance }}

**触发时间**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}

**恢复时间**: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}

{{ define "ops.title" }}
{{ template "__subject" . }}
{{ end }}

{{ define "ops.content" }}
{{ if gt (len .Alerts.Firing) 0 }}
**====侦测到{{ .Alerts.Firing | len }}个故障====**
{{ template "__alert_list" .Alerts.Firing }}
---
{{ end }}

{{ if gt (len .Alerts.Resolved) 0 }}
**====恢复{{ .Alerts.Resolved | len }}个故障====**
{{ template "__resolved_list" .Alerts.Resolved }}
{{ end }}
{{ end }}

{{ define "ops.link.title" }}{{ template "ops.title" . }}{{ end }}
{{ define "ops.link.content" }}{{ template "ops.content" . }}{{ end }}
{{ template "ops.title" . }}
{{ template "ops.content" . }}
```

修改alertmanager配置文件为如下内容

```bash
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: 'xxx@163.com'
  smtp_auth_username: 'xxxx@163.com'
  smtp_auth_password: '邮箱的授权码'
  smtp_require_tls: false
templates:
  - '/opt/prometheus-webhook/*.tmpl' #告警模板位置
route:
  group_by: ['servers_survival','servers_status'] # 根据告警规则组名进行分组
  group_wait: 30s # 分组内第一个告警等待时间,30s内如有第二个告警会合并一个告警
  group_interval: 5m # 发送新告警间隔时间
  repeat_interval: 30m #重复告警间隔发送时间,如果没处理过多久再次发送一次
  receiver: 'dingtalk_webhook' # 接收人
receivers:
  - name: 'ops'
    email_configs:
      - to: 'tianye@163.com'
        html: '{{ template "email.to.html" .}}'
        headers: { Subject: "[WARNING]Prometheus告警邮件" }
        send_resolved: true
  - name: 'dingtalk_webhook'
    webhook_configs:
      - url: 'http://192.168.111.134:8060/dingtalk/webhook1/send' # 填写prometheus-webhook的webhook1 url
        send_resolved: true # 在恢复后是否发送恢复消息给接收人
```

重启服务

```bash
systemctl restart prometheus-webhook.service
systemctl restart alertmanager.service
# 前面docker启动的重启方式
docker restart alertmanager
```

查看钉钉告警