alertmanager
# alertmanager
## [下载安装](https://github.com/prometheus/alertmanager/releases/download/v0.22.2/alertmanager-0.22.2.linux-amd64.tar.gz)
```bash
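mkdir -p /etc/alertmanager
cd /etc/alertmanager
wget https://github.com/prometheus/alertmanager/releases/download/v0.22.2/alertmanager-0.22.2.linux-amd64.tar.gz
# 解压时去掉顶层目录,使 alertmanager 可执行文件直接位于 /etc/alertmanager 下
tar zxvf alertmanager-0.22.2.linux-amd64.tar.gz --strip-components=1
vim alertmanager.yml
# 末尾加 & 放到后台运行,否则 nohup 仍会占用当前终端
nohup ./alertmanager --config.file=alertmanager.yml &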
```
## docker 运行alertmanager
编写/etc/alertmanager/alertmanager.yml文件
```yaml
global:
# 在没有报警的情况下声明为已解决的时间
resolve_timeout: 5m
#配置发送邮箱,我这里用的qq
smtp_from: '2864048202@qq.com'
smtp_smarthost: 'smtp.qq.com:465'
smtp_auth_username: '2864048202@qq.com'
# 注意这里需要配置QQ邮箱的授权码,不是登录密码,授权码在账户配置中查看
smtp_auth_password: '<QQ邮箱授权码>'
smtp_require_tls: false
route:
group_by: ['alert_node']
group_wait: 5s
group_interval: 5s
repeat_interval: 5m
receiver: 'email'
receivers:
- name: 'email'
email_configs:
# 请注意这里的收件箱请改为你自己的邮箱地址,多个用逗号隔开
- to: '2864048202@qq.com,675045743@qq.com'
send_resolved: true
inhibit_rules:
- source_match:
```
启动
```bash
# 启动 alertmanager
docker run -d --restart=always --name=alertmanager -p 9093:9093 -v /etc/alertmanager:/etc/alertmanager prom/alertmanager:latest
# 启动node_exporter
docker run -d --name=node-exporter -p 9100:9100 \
-v /proc:/host/proc:ro \
-v /sys:/host/sys:ro \
-v /:/rootfs:ro \
-v /var/run:/var/host/run:rw \
prom/node-exporter
```
## 配置告警
```yaml
# 修改 prometheus.yml
global:
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.111.134:9093
rule_files:
- '/etc/prometheus/rules/*.yaml'
- '/etc/prometheus/rules/*.yml'
scrape_configs:
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
- job_name: "node_exporter"
static_configs:
- targets: ["localhost:9100"]
```
创建rules
```bash
mkdir /etc/prometheus/rules/
cd /etc/prometheus/rules/
vim /etc/prometheus/rules/node.yaml
groups:
- name: node_rule
rules:
- alert: 磁盘discard操作耗时告警
expr: (node_disk_discard_time_seconds_total ) == 0
for: 30s
labels:
severity: error
annotations:
summary: "{{ $labels.instance }}: 系统中磁盘上执行丢弃(discard)操作所花费的总时间为0,已经超过30秒"
value: "{{ $value }}"
# 检查配置是否正确
[root@node134 prometheus]# ./promtool check rules /etc/prometheus/rules/node.yaml
Checking /etc/prometheus/rules/node.yaml
SUCCESS: 1 rules found
```
重启 prometheus 服务使配置生效(以下两种方式任选其一):
- 热加载(需要 prometheus 启动时带 `--web.enable-lifecycle` 参数):`curl -X POST http://localhost:9090/-/reload`
- 重启服务:`systemctl restart prometheus.service`

```
验证配置文件语法:
promtool check config <config-file>
验证告警规则和记录规则:
promtool check rules <rules-file>
验证规则文件中的表达式:
promtool check rules <rules-file> --v=<level>
```
```
# HELP node_disk_discard_time_seconds_total This is the total number of seconds spent by all discards.
# TYPE node_disk_discard_time_seconds_total counter
node_disk_discard_time_seconds_total{device="sda"} 0
node_disk_discard_time_seconds_total{device="sdb"} 0
node_disk_discard_time_seconds_total{device="sr0"} 0
```
符合条件会报警

# [Prometheus + Alertmanager 实现 钉钉监控告警](https://www.jianshu.com/p/eae67b770c3e)
```bash
# 安装包下载
cd /opt
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
tar zxvf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
mv prometheus-webhook-dingtalk-2.1.0.linux-amd64 prometheus-webhook
cd prometheus-webhook/
cp config.example.yml config.yml
# cat /usr/lib/systemd/system/prometheus-webhook.service
[Unit]
Description=Prometheus Dingding Webhook
[Service]
ExecStart=/opt/prometheus-webhook/prometheus-webhook-dingtalk --config.file=/opt/prometheus-webhook/config.yml
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
[Install]
WantedBy=multi-user.target
# 启动服务
systemctl start prometheus-webhook.service
```
修改 prometheus-webhook 配置文件,绑定申请的钉钉机器人
```bash
cat config.yml
## Customizable templates path
templates:
## - templates/alertmanager-dingtalk.tmpl
- /opt/prometheus-webhook/dingding3.tmpl # 配置告警模板的所在位置
#default_message:
# title: '{{ template "legacy.title" . }}'
# text: '{{ template "legacy.content" . }}'
## Targets, previously was known as "profiles"
targets:
webhook1:
url: https://oapi.dingtalk.com/robot/send?access_token=<你的access_token> # 配置机器人的webhook_url(不要把真实 token 写进文档)
# secret for signature
secret: <SEC开头的加签密钥> # 配置加签(申请机器人时生成的以 SEC 开头的密钥)
message:
title: '{{ template "ops.title" . }}' # 给这个webhook应用上 模板标题 (ops.title是我们模板文件中的title 可在下面给出的模板文件中看到)
text: '{{ template "ops.content" . }}' # 给这个webhook应用上 模板内容 (ops.content是我们模板文件中的content 可在下面给出的模板文件中看到)
```
告警模板文件
```bash
# cat /opt/prometheus-webhook/dingding3.tmpl
{{ define "__subject" }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
{{ end }}
{{ define "__alert_list" }}{{ range . }}
---
**告警类型**: {{ .Labels.alertname }}
**告警级别**: {{ .Labels.severity }}
**故障主机**: {{ .Labels.instance }}
**告警信息**: {{ .Annotations.summary }}
**触发时间**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}
{{ define "__resolved_list" }}{{ range . }}
---
**告警类型**: {{ .Labels.alertname }}
**告警级别**: {{ .Labels.severity }}
**故障主机**: {{ .Labels.instance }}
**触发时间**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
**恢复时间**: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}
{{ define "ops.title" }}
{{ template "__subject" . }}
{{ end }}
{{ define "ops.content" }}
{{ if gt (len .Alerts.Firing) 0 }}
**====侦测到{{ .Alerts.Firing | len }}个故障====**
{{ template "__alert_list" .Alerts.Firing }}
---
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
**====恢复{{ .Alerts.Resolved | len }}个故障====**
{{ template "__resolved_list" .Alerts.Resolved }}
{{ end }}
{{ end }}
{{ define "ops.link.title" }}{{ template "ops.title" . }}{{ end }}
{{ define "ops.link.content" }}{{ template "ops.content" . }}{{ end }}
{{ template "ops.title" . }}
{{ template "ops.content" . }}
```
修改alertmanager配置文件为如下内容
```yaml
global:
resolve_timeout: 5m
smtp_smarthost: 'smtp.163.com:25'
smtp_from: 'xxx@163.com'
smtp_auth_username: 'xxxx@163.com'
smtp_auth_password: '邮箱的授权码'
smtp_require_tls: false
templates:
- '/opt/prometheus-webhook/*.tmpl' #告警模板位置
route:
group_by: ['servers_survival','servers_status'] # 按这些标签名对告警分组(group_by 的取值是告警标签名,而不是规则组名)
group_wait: 30s # 分组内第一个告警等待时间,30s内如有第二个告警会合并一个告警
group_interval: 5m # 发送新告警间隔时间
repeat_interval: 30m #重复告警间隔发送时间,如果没处理过多久再次发送一次
receiver: 'dingtalk_webhook' # 接收人
receivers:
- name: 'ops'
email_configs:
- to: 'tianye@163.com'
html: '{{ template "email.to.html" .}}'
headers: { Subject: "[WARNING]Prometheus告警邮件" }
send_resolved: true
- name: 'dingtalk_webhook'
webhook_configs:
- url: 'http://192.168.111.134:8060/dingtalk/webhook1/send' # 填写prometheus-webhook的webhook1 url
send_resolved: true # 在恢复后是否发送恢复消息给接收人
```
重启服务
```bash
systemctl restart prometheus-webhook.service
systemctl restart alertmanager.service
# 前面docker启动的重启方式
docker restart alertmanager
```
查看钉钉告警