alertmanager installation


Installing Alertmanager. It listens on port 9093 (web UI and API) and 9094 (cluster gossip).

Installation

Binary installation

tar -zxvf alertmanager-0.21.0.linux-amd64.tar.gz -C /usr/local
ln -sn /usr/local/alertmanager-0.21.0.linux-amd64 /usr/local/alertmanager
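
A quick check that the binary is in place (standard --version flag):

/usr/local/alertmanager/alertmanager --version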
  • vim /etc/systemd/system/alertmanager.service
[Unit]
Description=alertmanager
After=network.target

[Service]
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml --web.external-url=http://100.80.0.128:9093
ExecStop=/bin/kill -KILL $MAINPID
ExecReload=/bin/kill -HUP $MAINPID
KillMode=control-group
Restart=on-failure
RestartSec=3s

[Install]
WantedBy=multi-user.target
  • Start and enable the service
systemctl daemon-reload
systemctl start alertmanager
systemctl status alertmanager
systemctl enable alertmanager

docker-compose

version: '3'

services:
  alertmanager:
    image: prom/alertmanager
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - "9093:9093"
  • Start the service
docker-compose up -d
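
Once the container is up, a quick way to confirm Alertmanager is serving, assuming the default 9093:9093 port mapping above (/-/healthy is Alertmanager's built-in health endpoint):

docker-compose ps
curl http://127.0.0.1:9093/-/healthy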

Configuration

# global specifies defaults that apply to all receivers
global:
  # default SMTP settings; used when a receiver does not override them
  [ smtp_from: <tmpl_string> ]
  [ smtp_smarthost: <string> ]
  [ smtp_hello: <string> | default = "localhost" ]
  [ smtp_auth_username: <string> ]
  [ smtp_auth_password: <secret> ]
  [ smtp_auth_identity: <string> ]
  [ smtp_auth_secret: <secret> ]
  [ smtp_require_tls: <bool> | default = true ]

  # WeChat Work (企业微信) alerting settings
  [ wechat_api_url: <string> | default = "https://qyapi.weixin.qq.com/cgi-bin/" ]
  [ wechat_api_secret: <secret> ]
  [ wechat_api_corp_id: <string> ]

  # default HTTP client settings; usually left unset. See the official docs: https://prometheus.io/docs/alerting/latest/clients/
  [ http_config: <http_config> ]

  # resolve_timeout is the default used when an alert does not include EndsAt: after this time,
  # if the alert has not been updated, it is declared resolved.
  # It has no effect on alerts from Prometheus, which always include EndsAt.
  [ resolve_timeout: <duration> | default = 5m ]

# notification template files; list entries may use shell-style wildcards, e.g. *.tmpl
templates:
  [ - <filepath> ... ]

# routing tree
route: <route>

# notification receivers
receivers:
  - <receiver> ...

# alert inhibition rules
inhibit_rules:
  [ - <inhibit_rule> ... ]

Quick field reference:

global:
  resolve_timeout      // how long to wait before declaring a firing alert resolved when no further update arrives; resolved notifications are therefore not sent immediately. Default 5m
  smtp_from            // sender email address
  smtp_smarthost       // SMTP server address of the mail provider
  smtp_auth_username   // SMTP account used for authentication (usually the same mailbox as smtp_from)
  smtp_auth_password   // mailbox authorization code / SMTP password
  smtp_require_tls     // whether TLS is required, defaults to true
  wechat_api_url       // WeChat Work API URL
  wechat_api_secret    // WeChat Work application secret
  wechat_api_corp_id   // corp ID of the WeChat Work bot application
route:
  group_by             // which labels to group alerts by
  group_wait           // how long to wait after the first alert of a group before sending, so other alerts of the same group can be batched into one notification
  group_interval       // how long to wait before notifying about new alerts added to an existing group
  repeat_interval      // how long to wait before re-sending a notification that is still firing; reduces notification frequency
  receiver             // which receiver to send to
  routes               // child routes

receivers:
  name                 // receiver name, must match the receiver referenced in route
  email_configs
  - to                 // recipient email address
route rules

# receiver for this route
[ receiver: <string> ]
# labels to group alerts by; alerts sharing the same label values are aggregated into a single
# notification. The special value '...' groups by all labels, effectively disabling aggregation
[ group_by: '[' <labelname>, ... ']' ]

# whether the alert should continue matching sibling routes; if true, matching continues,
# otherwise matching stops at the first matching route
[ continue: <boolean> | default = false ]

# an alert must carry these labels/values to enter this route; typically used to dispatch
# alerts to different contacts
match:
  [ <labelname>: <labelvalue>, ... ]
match_re:
  [ <labelname>: <regex>, ... ]

# how long to wait before sending the first notification for a new group,
# so that alerts of the same group can be aggregated
[ group_wait: <duration> | default = 30s ]

# after a group notification has been sent, how long to wait before notifying about new alerts
# added to that group; usually 5 minutes or more
[ group_interval: <duration> | default = 5m ]

# for an alert that has already been notified but is still firing, how long to wait before
# re-sending; 3 hours or more is recommended
[ repeat_interval: <duration> | default = 4h ]

# child routes
routes:
  [ - <route> ... ]
route:
  group_by: ['alertname']             // group alerts by these labels
  group_wait: 10s                     // wait 10s after the first alert so other alerts of the same group can go out in the same email
  group_interval: 10s                 // wait before notifying about new alerts added to an already-notified group
  repeat_interval: 10m                // if an alert keeps firing unhandled, re-send the notification after 10 minutes
  continue: false                     // if false, matching stops at the first matching route; if true, sibling routes are also evaluated
  receiver: 'devops'                  // default receiver
  routes:                             // child routes
  - receiver: 'dba'                   // receiver is dba
    group_wait: 10s                   // group wait time
    match_re:                         // regex matchers
      service: mysql|db               // alerts whose service label matches mysql or db go to the dba mailbox
    continue: false                   // stop here once matched
  - receiver: 'devops'                // receiver is devops
    group_wait: 10s                   // group wait time
    match_re:
      severity: error                 // alerts whose severity label matches error go to the devops mailbox
    continue: false                   // stop here once matched

receivers:                            // receiver mailboxes
- name: 'devops'                      // receiver name, must match the receiver referenced in route
  email_configs:
  - to: 'xx@qq.com'                   // devops mailbox
- name: 'dba'                         // receiver name, must match the receiver referenced in route
  email_configs:
  - to: 'yy@qq.com'                   // dba mailbox
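
To see which receiver a given label set would be routed to, amtool can evaluate the routing tree offline. A sketch, assuming the route and receivers above are saved as alertmanager.yml and a recent amtool version (flags may differ between releases):

amtool config routes test --config.file=alertmanager.yml service=mysql
# expected to print: dba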

inhibit_rules

target_match:
  [ <labelname>: <labelvalue>, ... ]
target_match_re:
  [ <labelname>: <regex>, ... ]
source_match:
  [ <labelname>: <labelvalue>, ... ]
source_match_re:
  [ <labelname>: <regex>, ... ]
[ equal: '[' <labelname>, ... ']' ]
  • Example: when a critical service is down, suppress non-critical alerts
inhibit_rules:
  - source_match:
      severity: critical
      alertname: ServiceDown
    target_match:
      severity: warning
    equal: ['service']
inhibit_rules:
  - source_match:
      severity: 'critical'                     // while an alert matching the source labels (severity=critical) is firing...
    target_match:
      severity: 'warning'                      // ...suppress alerts matching severity=warning
    equal: ['alertname', 'dev', 'instance']    // but only when all three of these labels have identical values in both alerts

Example 1

# alertmanager.yml
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
receivers:
- name: 'web.hook'
  webhook_configs:
  - url: 'http://127.0.0.1:5001/'
    http_config:
      bearer_token: xxxx
      tls_config:
        insecure_skip_verify: true
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

Example 2

# /opt/apps/alertmanager/alertmanager.yml
global:
  smtp_from: xx@xx.com
  smtp_smarthost: smtp.163.com:25
  smtp_auth_username: xx@163.com
  smtp_auth_password: xxx
  smtp_require_tls: false
# templates:
#   - /opt/apps/alertmanager/templates/*.tmpl

route:
  group_by: ["alertname"]
  group_wait: 60s
  group_interval: 10m
  repeat_interval: 6h
  receiver: ops_email
  routes:
    - match_re:
        level: info|warn|error|fatal
      receiver: ops_email
      continue: true
    - match:
        level: fatal
      receiver: admin_email
      continue: true

receivers:
  - name: ops_email
    email_configs:
      - to: yy@xx.com
        send_resolved: true

  - name: admin_email
    email_configs:
      - to: xx@xx.com
        send_resolved: true
  • templates/email.tmpl
{{ define "email.html" }}

<style type="text/css">
table.gridtable {
  font-family: verdana,arial,sans-serif;
  font-size:11px;
  color:#333333;
  border-width: 1px;
  border-color: #666666;
  border-collapse: collapse;
}
table.gridtable th {
  border-width: 1px;
  padding: 8px;
  border-style: solid;
  border-color: #666666;
  background-color: #dedede;
}
table.gridtable td {
  border-width: 1px;
  padding: 8px;
  border-style: solid;
  border-color: #666666;
  background-color: #ffffff;
}
</style>

<table class="gridtable">
  <tr>
    <th>Alert</th>
    <th>Instance</th>
    <th>Current Value</th>
    <th>Level</th>
    <th>Starts At</th>
    <th>Summary</th>
  </tr>
  {{ range $i, $alert := .Alerts }}
    <tr>
      <td>{{ index $alert.Labels "alertname" }}</td>
      <td>{{ index $alert.Labels "instance" }}</td>
      <td>{{ index $alert.Annotations "value" }}</td>
      <td>{{ index $alert.Labels "level" }}</td>
      <td>{{ $alert.StartsAt.Format "2006-01-02 15:04:05 MST" }}</td>
      <td>{{ index $alert.Annotations "summary" }}</td>
    </tr>
  {{ end }}
</table>
{{ end }}
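
To put this template into effect, load it via templates: and reference the defined "email.html" block from the receiver's html field, which selects the template used for the message body (a sketch reusing the paths from Example 2):

templates:
  - /opt/apps/alertmanager/templates/*.tmpl

receivers:
  - name: ops_email
    email_configs:
      - to: yy@xx.com
        send_resolved: true
        html: '{{ template "email.html" . }}'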

amtool: check configuration and manage silences

amtool --help

# list current silences
amtool --alertmanager.url=http://127.0.0.1:9093 silence query

# silence a specific alert
amtool --alertmanager.url=http://127.0.0.1:9093 silence add alertname="InstanceDown" -c "ignore InstanceDown alerts"

# silence with a regex matcher on the job label
amtool --alertmanager.url=http://127.0.0.1:9093 silence add alertname="InstanceDown" job=~".*CADvisor.*" -c "ignore cAdvisor InstanceDown alerts"

# expire (remove) a silence by its ID
amtool --alertmanager.url=http://127.0.0.1:9093 silence expire <uuid>
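
The configuration file itself can be checked before a reload; a quick sketch using the binary-install path from above:

amtool check-config /usr/local/alertmanager/alertmanager.yml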

Access

  • Port: 9093
  • http://<ip>:9093/#/alerts
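
Besides the web UI, current alerts can also be queried from the HTTP API (v2 endpoint, same host and port):

curl http://<ip>:9093/api/v2/alerts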

Alerting example

Receiving end

  • Go code
package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	http.HandleFunc("/", func(writer http.ResponseWriter, request *http.Request) {
		// print the raw webhook payload sent by Alertmanager
		body, err := io.ReadAll(request.Body)
		if err != nil {
			http.Error(writer, err.Error(), http.StatusBadRequest)
			return
		}
		fmt.Printf("%s\n", body)

		writer.Write([]byte(`{"status": 200}`))
	})

	fmt.Println("listening on :5001")
	fmt.Println(http.ListenAndServe(":5001", nil))
}

Sample alert payload

{"receiver":"web\\.hook","status":"resolved","alerts":[{"status":"resolved","labels":{"alertname":"target is down","instance":"100.80.0.128:9100","job":"node","level":"warning"},"annotations":{"description":"节点故障","summary":"节点故障"},"startsAt":"2022-04-29T03:41:53.092589547Z","endsAt":"2022-04-29T03:49:23.092589547Z","generatorURL":"http://100.80.0.128:9090/graph?g0.expr=up+%30026g0.tab=1","fingerprint":"d863629cfb977c6d"}],"groupLabels":{"alertname":"target is down"},"commonLabels":{"alertname":"target is down","instance":"100.80.0.128:9100","job":"node","level":"warning"},"commonAnnotations":{"description":"节点故障","summary":"节点故障"},"externalURL":"http://golang-dev:9093","version":"4","groupKey":"{}:{alertname=\"target is down\"}","truncatedAlerts

webhook

https://github.com/xiexianbin/go-alertmanager-webhook

References

  1. https://hub.docker.com/r/prom/alertmanager
  2. https://prometheus.io/docs/alerting/latest/configuration/
  3. https://github.com/prometheus/alertmanager/tree/main?tab=readme-ov-file#example
  4. https://system51.github.io/2021/07/12/Alertmanager/