Alertmanager installation; it listens on port 9093 (web UI and API) and 9094 (cluster gossip).
Installation
Binary installation
tar -zxvf alertmanager-0.21.0.linux-amd64.tar.gz -C /usr/local
ln -sn /usr/local/alertmanager-0.21.0.linux-amd64 /usr/local/alertmanager
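Before wiring it into systemd, a quick foreground run (paths as extracted above; Ctrl-C to stop) confirms the binary and config are sane:
/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml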
- vim /etc/systemd/system/alertmanager.service
[Unit]
Description=alertmanager
After=network.target
[Service]
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml --web.external-url=http://100.80.0.128:9093
ExecStop=/bin/kill -KILL $MAINPID
ExecReload=/bin/kill -HUP $MAINPID
KillMode=control-group
Restart=on-failure
RestartSec=3s
[Install]
WantedBy=multi-user.target
systemctl daemon-reload
systemctl start alertmanager
systemctl status alertmanager
systemctl enable alertmanager
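Once the service is up, two quick sanity checks (assuming the default listen address; amtool ships in the same tarball):
curl -s http://127.0.0.1:9093/-/healthy
/usr/local/alertmanager/amtool check-config /usr/local/alertmanager/alertmanager.yml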
docker-compose
version: '3'
services:
  alertmanager:
    image: prom/alertmanager
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - "9093:9093"
docker-compose up -d
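To confirm the container came up and stayed up:
docker-compose ps
curl -s http://127.0.0.1:9093/-/ready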
Configuration
# global specifies defaults that are inherited by the other configuration sections
global:
  # Default SMTP settings, used when a receiver does not define its own
  [ smtp_from: <tmpl_string> ]
  [ smtp_smarthost: <string> ]
  [ smtp_hello: <string> | default = "localhost" ]
  [ smtp_auth_username: <string> ]
  [ smtp_auth_password: <secret> ]
  [ smtp_auth_identity: <string> ]
  [ smtp_auth_secret: <secret> ]
  [ smtp_require_tls: <bool> | default = true ]
  # WeChat Work (企业微信) alerting settings
  [ wechat_api_url: <string> | default = "https://qyapi.weixin.qq.com/cgi-bin/" ]
  [ wechat_api_secret: <secret> ]
  [ wechat_api_corp_id: <string> ]
  # Default HTTP client settings; usually best left unset. See the official docs: https://prometheus.io/docs/alerting/latest/clients/
  [ http_config: <http_config> ]
  # resolve_timeout is the default used when an alert does not include EndsAt: after this
  # time the alert can be declared resolved if it has not been updated. It has no effect on
  # alerts from Prometheus, which always include EndsAt.
  [ resolve_timeout: <duration> | default = 5m ]
# Notification template files; the last path component may use a wildcard, e.g. *.tmpl
templates:
  [ - <filepath> ... ]
# The root node of the routing tree
route: <route>
# Notification receivers
receivers:
  - <receiver> ...
# Alert inhibition rules
inhibit_rules:
  [ - <inhibit_rule> ... ]
global:
  resolve_timeout    // how long before an alert without updates is declared resolved: the resolved notification is not sent immediately, only after the alert stops firing for this period; default 5m
  smtp_from          // sender address for email notifications
  smtp_smarthost     // SMTP server of the mail provider (host:port)
  smtp_auth_username // account used to authenticate against the SMTP server
  smtp_auth_password // SMTP password or mail-provider authorization code
  smtp_require_tls   // whether TLS is required; default true
  wechat_api_url     // WeChat Work API URL
  wechat_api_secret  // API secret of the alerting application
  wechat_api_corp_id // corp ID of the WeChat Work application
route:
  group_by        // which label(s) to group alerts by
  group_wait      // how long to wait before sending a new group's first notification, so alerts arriving in that window are batched into one message
  group_interval  // how long to wait before notifying about new alerts added to an existing group
  repeat_interval // how long to wait before re-sending a still-firing notification; larger values reduce notification frequency
  receiver        // which receiver gets the notifications
  routes          // child routes
receivers:
  name            // receiver name; must match the receiver referenced in route
  email_configs
  - to            // recipient's email address
route rules
# The receiver for this route's alerts
[ receiver: <string> ]
# Labels to group alerts by; alerts with identical values for these labels are aggregated
# into a single notification. The special value '...' disables aggregation entirely.
[ group_by: '[' <labelname>, ... ']' ]
# Whether an alert should continue matching subsequent sibling nodes: if true, matching
# continues after a match; if false, matching stops at the first matching route
[ continue: <boolean> | default = false ]
# An alert must carry these label values to match this route; typically used to send
# different alerts to different contacts
match:
  [ <labelname>: <labelvalue>, ... ]
match_re:
  [ <labelname>: <regex>, ... ]
# How long to wait before sending the first notification for a new group, so that alerts
# of the same group can be batched together
[ group_wait: <duration> | default = 30s ]
# After a group's notification has been sent, how long to wait before notifying about new
# alerts added to that group; 5 minutes or longer is common
[ group_interval: <duration> | default = 5m ]
# For alerts that were already notified but are still firing, how long to wait before
# re-sending; three hours or more is usually recommended
[ repeat_interval: <duration> | default = 4h ]
# Child routes
routes:
  [ - <route> ... ]
route:
  group_by: ['alertname']   # group alerts by the alertname label
  group_wait: 10s           # wait 10s so alerts of the same group arriving within the window go out in a single email
  group_interval: 10s       # wait before notifying about new alerts added to an existing group
  repeat_interval: 10m      # if a firing alert is still unresolved, re-send the notification every 10 minutes
  continue: false           # with false, matching stops at the first matching branch; with true, subsequent siblings are also tried
  receiver: 'devops'        # default receiver
  routes:                   # child routes
    - receiver: 'dba'       # receiver is dba
      group_wait: 10s       # group wait for this branch
      match_re:             # regular-expression match
        service: mysql|db   # alerts whose service label matches mysql or db all go to the dba mailbox
      continue: false
    - receiver: 'devops'    # receiver is devops
      group_wait: 10s
      match_re:
        severity: error     # alerts whose severity label matches error go to the devops mailbox
      continue: false
receivers:                  # receiver mailboxes
  - name: 'devops'          # receiver name; must match the receiver referenced in the routes
    email_configs:
      - to: 'xx@qq.com'     # devops mailbox
  - name: 'dba'             # receiver name; must match the receiver referenced in the routes
    email_configs:
      - to: 'yy@qq.com'     # dba mailbox
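Routing can be checked without firing real alerts: amtool's "config routes test" subcommand prints the receiver a given label set would reach (the config path here is an assumption):
amtool config routes test --config.file=/usr/local/alertmanager/alertmanager.yml service=mysql
With the tree above, this should report the dba receiver.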
inhibit_rules
# matchers that an alert must satisfy to be muted (the target side)
target_match:
  [ <labelname>: <labelvalue>, ... ]
target_match_re:
  [ <labelname>: <regex>, ... ]
# matchers that an alert must satisfy to act as the muting source
source_match:
  [ <labelname>: <labelvalue>, ... ]
source_match_re:
  [ <labelname>: <regex>, ... ]
# labels that must have identical values in the source and target alerts for the inhibition to apply
[ equal: '[' <labelname>, ... ']' ]
inhibit_rules:
  - source_match:
      severity: critical
      alertname: ServiceDown
    target_match:
      severity: warning
    equal: ['service']
inhibit_rules:
  - source_match:
      severity: 'critical'  # while an alert matching severity=critical fires, alerts carrying the target labels are muted
    target_match:
      severity: 'warning'   # mute severity=warning alerts
    equal: ['alertname', 'dev', 'instance']  # only mute when all three of these label values are identical in the source and target alerts
Example 1
# alertmanager.yml
global:
  resolve_timeout: 5m
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://127.0.0.1:5001/'
        http_config:
          bearer_token: xxxx
          tls_config:
            insecure_skip_verify: true
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
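To exercise this configuration end to end, a test alert can be injected by hand, either with amtool or with a direct POST to the v2 API (label values here are made up):
amtool --alertmanager.url=http://127.0.0.1:9093 alert add alertname=TestAlert severity=critical
curl -XPOST http://127.0.0.1:9093/api/v2/alerts -H 'Content-Type: application/json' -d '[{"labels":{"alertname":"TestAlert","severity":"critical"}}]'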
Example 2
# /opt/apps/alertmanager/alertmanager.yml
global:
  smtp_from: xx@xx.com
  smtp_smarthost: smtp.163.com:25
  smtp_auth_username: xx@163.com
  smtp_auth_password: xxx
  smtp_require_tls: false
# templates:
#   - /opt/apps/alertmanager/templates/*.tmpl
route:
  group_by: ["alertname"]
  group_wait: 60s
  group_interval: 10m
  repeat_interval: 6h
  receiver: ops_email
  routes:
    - match_re:
        level: info|warn|error|fatal
      receiver: ops_email
      continue: true
    - match:
        level: fatal
      receiver: admin_email
      continue: true
receivers:
  - name: ops_email
    email_configs:
      - to: yy@xx.com
        send_resolved: true
  - name: admin_email
    email_configs:
      - to: xx@xx.com
        send_resolved: true
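For the template below to take effect, the commented-out templates entry above would have to be enabled, and the email receiver pointed at the named template via the html field of email_configs; a sketch of the extra receiver fields (the template name matches the define below):
  email_configs:
    - to: yy@xx.com
      send_resolved: true
      html: '{{ template "email.html" . }}'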
{{ define "email.html" }}
<style type="text/css">
table.gridtable {
font-family: verdana,arial,sans-serif;
font-size:11px;
color:#333333;
border-width: 1px;
border-color: #666666;
border-collapse: collapse;
}
table.gridtable th {
border-width: 1px;
padding: 8px;
border-style: solid;
border-color: #666666;
background-color: #dedede;
}
table.gridtable td {
border-width: 1px;
padding: 8px;
border-style: solid;
border-color: #666666;
background-color: #ffffff;
}
</style>
<table class="gridtable">
<tr>
<th>Alert</th>
<th>Instance</th>
<th>Value</th>
<th>Level</th>
<th>Start time</th>
<th>Summary</th>
</tr>
{{ range $i, $alert := .Alerts }}
<tr>
<td>{{ index $alert.Labels "alertname" }}</td>
<td>{{ index $alert.Labels "instance" }}</td>
<td>{{ index $alert.Annotations "value" }}</td>
<td>{{ index $alert.Labels "level" }}</td>
<td>{{ $alert.StartsAt.Format "2006-01-02 15:04:05 MST" }}</td>
<td>{{ index $alert.Annotations "summary" }}</td>
</tr>
{{ end }}
</table>
{{ end }}
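Silences can be inspected and managed from the command line with amtool: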
amtool --help
# list active silences
amtool --alertmanager.url=http://127.0.0.1:9093 silence query
# silence one alert, with a comment
amtool --alertmanager.url=http://127.0.0.1:9093 silence add alertname="InstanceDown" -c "ignore InstanceDown alerts"
# matchers may use regular expressions
amtool --alertmanager.url=http://127.0.0.1:9093 silence add alertname="InstanceDown" job=~".*CADvisor.*" -c "ignore cadvisor InstanceDown alerts"
# expire (remove) a silence by its ID
amtool --alertmanager.url=http://127.0.0.1:9093 silence expire <uuid>
Access
- Port: 9093
- http://<host>:9093/#/alerts
Alert example
Receiving end
package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	http.HandleFunc("/", func(writer http.ResponseWriter, request *http.Request) {
		// Print the raw webhook payload that Alertmanager POSTs.
		body, err := io.ReadAll(request.Body)
		if err != nil {
			http.Error(writer, err.Error(), http.StatusBadRequest)
			return
		}
		fmt.Printf("%s\n", body)
		writer.Write([]byte(`{"status": 200}`))
	})
	fmt.Println("listening on :5001")
	fmt.Println(http.ListenAndServe(":5001", nil))
}
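Run the receiver with go run main.go and point the webhook_configs url at http://127.0.0.1:5001/ (as in Example 1); every notification then arrives as a JSON document like the one below.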
Sample alert payload
{"receiver":"web\\.hook","status":"resolved","alerts":[{"status":"resolved","labels":{"alertname":"target is down","instance":"100.80.0.128:9100","job":"node","level":"warning"},"annotations":{"description":"节点故障","summary":"节点故障"},"startsAt":"2022-04-29T03:41:53.092589547Z","endsAt":"2022-04-29T03:49:23.092589547Z","generatorURL":"http://100.80.0.128:9090/graph?g0.expr=up+%30026g0.tab=1","fingerprint":"d863629cfb977c6d"}],"groupLabels":{"alertname":"target is down"},"commonLabels":{"alertname":"target is down","instance":"100.80.0.128:9100","job":"node","level":"warning"},"commonAnnotations":{"description":"节点故障","summary":"节点故障"},"externalURL":"http://golang-dev:9093","version":"4","groupKey":"{}:{alertname=\"target is down\"}","truncatedAlerts
webhook
https://github.com/xiexianbin/go-alertmanager-webhook