# Download Alertmanager v0.24.0 into the Prometheus directory, unpack it and
# rename the unpacked directory to "alertmanager". The steps are chained with
# && so that nothing is downloaded or moved if an earlier step fails.
cd /usr/local/prometheus \
  && wget https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz \
  && tar -zxvf alertmanager-0.24.0.linux-amd64.tar.gz \
  && mv -f alertmanager-0.24.0.linux-amd64 alertmanager

# Start Alertmanager in the background; stdout and stderr go to alertmanager.log.
nohup /usr/local/prometheus/alertmanager/alertmanager --config.file="/usr/local/prometheus/alertmanager/alertmanager.yml" --log.level=info >/usr/local/prometheus/alertmanager/alertmanager.log 2>&1 &
# Alertmanager configuration (alertmanager.yml).
global:
  resolve_timeout: 5m
  # SMTP server, user name and password used to send the alert e-mails.
  smtp_from: 'wxhntmy@163.com'
  smtp_smarthost: 'smtp.163.com:465'
  smtp_auth_username: 'wxhntmy@163.com'
  # NOTE(review): left empty here; presumably the mailbox password or
  # authorization code has to be filled in before mail can be sent.
  smtp_auth_password: ''
  smtp_require_tls: false
  smtp_hello: '163.com'
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'email'
receivers:
  # Recipient of the alert e-mails.
  - name: 'email'
    email_configs:
      - to: '1527895421@qq.com'
        send_resolved: true
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
# Check whether firewalld is running.
systemctl status firewalld
# Permanently open the Alertmanager port (9093/tcp) in the public zone.
firewall-cmd --zone=public --add-port=9093/tcp --permanent
# Reload so that the permanent rule takes effect.
firewall-cmd --reload
# Verify that the port is now open.
firewall-cmd --zone=public --query-port=9093/tcp
浏览器访问
http://IP:9093/
把alertmanager注册到consul
注册服务(重启失效)
# Register the "alertmanager" service with the local Consul agent, including
# an HTTP check against Alertmanager's /metrics endpoint every 10 seconds.
curl -X PUT \
  -d '{"ID":"alertmanager","Name":"alertmanager","Tags":["primary","v1"],"address":"localhost","port":9093,"check":{"http":"http://localhost:9093/metrics","interval":"10s"}}' \
  http://localhost:8500/v1/agent/service/register
删除服务(url最后面为服务ID)
# Deregister the service from the local Consul agent; the last path segment
# of the URL is the service ID.
curl -X PUT \
  http://localhost:8500/v1/agent/service/deregister/alertmanager
添加告警规则:在 prometheus.yml 的 scrape_configs 中定义的每一个 job,都可以为其添加一个对应的 rules 文件
# Create the directory that holds the alert rule files (-p: no error if it
# already exists), then create one empty rule file per monitored job.
mkdir -p /usr/local/prometheus/rule_files
touch /usr/local/prometheus/rule_files/consul-up.rules
touch /usr/local/prometheus/rule_files/consul_exporter-up.rules
touch /usr/local/prometheus/rule_files/mysqld5_exporter-up.rules
touch /usr/local/prometheus/rule_files/node_exporter-up.rules
touch /usr/local/prometheus/rule_files/prometheus.rules
touch /usr/local/prometheus/rule_files/pushgateway.rules
示例配置文件
# Example alert rule file: fires when the consul_exporter target is down.
groups:
  - name: consul_exporter 实例存活规则
    rules:
      # Name of the alert rule (becomes the "alertname" label).
      - alert: consul_exporter 实例存活告警
        # "job" here is the job name defined in prometheus.yml.
        expr: up{job="consul_exporter"} == 0
        for: 15s
        labels:
          severity: Disaster
          team: node
        annotations:
          summary: "consul_exporter 服务节点失联告警"
          # NOTE(review): the text says 30 seconds while "for" is 15s;
          # confirm which value is intended.
          description: "consul_exporter 服务节点 {{ $labels.instance }} 断联已超过 30 秒!"
修改 prometheus 的 prometheus.yml 文件,然后重启 prometheus
# my global config
global:
  scrape_interval: 15s # Scrape every 15 seconds. The default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # When an Alertmanager is available, the line below must be
            # uncommented and set to its IP and port.
            - localhost:9093

# Load rules once and periodically evaluate them according to the global
# "evaluation_interval".
rule_files:
  - "/usr/local/prometheus/rule_files/*.rules"
  - "/usr/local/prometheus/rule_files/*.groups"
  - "/usr/local/prometheus/rule_files/*.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# here it is Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any time series
  # scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]

  # Further jobs can be added; each needs a job name and the target IP and port.
  - job_name: "consul_exporter"
    static_configs:
      - targets: ["localhost:9107"]

  # Further jobs can be added; each needs a job name and the target IP and port.
  - job_name: "node_exporter"
    static_configs:
      - targets: ["localhost:9100"]

  - job_name: "alertmanager"
    static_configs:
      - targets: ["localhost:9093"]
这里测试 Alertmanager 在发现服务停止后是否会发送告警邮件:kill 掉 consul_exporter 的进程
当把服务重新跑起来的时候,由于配置了 send_resolved: true,会再收到一封告警恢复的邮件
# Create the script file (if it does not exist yet) and, only when that
# succeeded, mark it as executable.
touch alertmanager.sh \
  && chmod +x alertmanager.sh
#!/bin/bash
# Installs, starts, stops and removes Alertmanager, and registers an
# Alertmanager address or an alert rule file in an existing Prometheus
# configuration (restarting Prometheus afterwards).
#
# Usage: alertmanager.sh [start|stop|status|restart|remove|add HOST:PORT|add_rules FILE]
# With no argument the action defaults to "start".
#
# NOTE(review): the script calls yum, groupadd/useradd, chown and su, so it
# presumably has to be run as root on a yum-based distribution.

# Alertmanager release to download.
ALERTMANAGER_VER=0.25.0
# Directory the release is unpacked into.
ALERTMANAGER_DIR=/app/alertmanager
# Directory where the downloaded tarball is kept.
INSTALL_PACKAGE=/app/install
# File that receives stdout/stderr of the daemons started by this script.
LOG_FILE=/tmp/alertmanager.log
# Prometheus installation directory.
PROMETHEUS_DIR=/app/prometheus
# Port Prometheus listens on.
PROMETHEUS_PORT=9090
# User (and group) the daemons are started as.
ALERTMANAGER_USER=prometheus
# Port Alertmanager listens on.
ALERTMANAGER_PORT=9093

#######################################
# Reports the outcome of the command that ran immediately before the call.
# Arguments: $1 - message printed on failure (the script then exits 1)
#            $2 - message printed on success
# Messages are printed with %b so that the \033[..m colour escapes used by
# the callers are rendered instead of being shown literally.
#######################################
check_err() {
  local status=$? # must stay the first statement: captures the caller's $?
  if [ "$status" -ne 0 ]; then
    printf '%b\n' "$1"
    exit 1
  fi
  printf '%b\n' "$2"
}

#######################################
# Returns 0 when a TCP socket is listening on port $1.
# The trailing whitespace in the pattern keeps 9093 from matching 90930.
#######################################
port_listening() {
  netstat -tln | grep -q -- ":$1[[:space:]]"
}

#######################################
# Polls every 2 seconds until a port reaches the wanted state.
# Arguments: $1 - port
#            $2 - "open" (wait until listening) or "closed"
#            $3 - maximum number of polls
#            $4 - progress message printed before each sleep
# Returns:   0 when the state was reached, 1 on timeout
#######################################
wait_for_port() {
  local port=$1 wanted=$2 attempts=$3 message=$4
  while :; do
    if port_listening "$port"; then
      [ "$wanted" = open ] && return 0
    else
      [ "$wanted" = closed ] && return 0
    fi
    [ "$attempts" -gt 0 ] || return 1
    echo "$message"
    sleep 2
    attempts=$((attempts - 1))
  done
}

#######################################
# Installs package $2 with yum when command $1 is not available.
# Exits the script (through check_err) when the installation fails.
#######################################
ensure_command() {
  local cmd=$1 pkg=$2
  command -v "$cmd" > /dev/null 2>&1 && return 0
  yum install "$pkg" -y
  check_err "\033[31myum安装依赖包 $pkg 失败,请检查\033[0m" "\033[36myum安装依赖包 $pkg 成功\033[0m"
}

#######################################
# Starts a daemon in the background as $ALERTMANAGER_USER.
# Arguments: $1 - path of the binary
#            $2 - path of its configuration file
# The here-document is unquoted on purpose: the variables are expanded by
# this shell, before the switch to the service user.
#######################################
launch_daemon() {
  local binary=$1 config=$2
  su - "$ALERTMANAGER_USER" << EOF
nohup "$binary" --log.level=info --config.file="$config" >> "$LOG_FILE" 2>&1 &
EOF
}

#######################################
# Stops every process whose command line matches a pattern and waits for
# its port to be released. SIGTERM is sent first; SIGKILL is only used
# when the port is still open after 30 polls.
# Arguments: $1 - extended regex matched against the full command line
#            $2 - port the daemon listens on
#            $3 - name used in the progress message
# Returns:   0 when the port was released, 1 otherwise
#######################################
stop_daemon() {
  local pattern=$1 port=$2 label=$3
  local pid pids
  pids=$(pgrep -f -- "$pattern")
  for pid in $pids; do
    echo "Kill the $label process [ $pid ]"
    kill "$pid"
  done
  wait_for_port "$port" closed 30 "等待监听端口关闭..." && return 0

  pids=$(pgrep -f -- "$pattern")
  for pid in $pids; do
    kill -9 "$pid"
  done
  wait_for_port "$port" closed 5 "等待监听端口关闭..."
}

#######################################
# Replaces a line of a file, or adds a new line after it.
# Arguments: $1 - file
#            $2 - line number
#            $3 - "replace" or "after"
#            $4 - text of the new line
# The text is handed to awk through the environment so that characters
# such as / & or \ in it are never interpreted.
# Returns:   0 on success, 1 when the file could not be rewritten
#######################################
edit_line() {
  local file=$1 line_no=$2 mode=$3 text=$4
  local tmp status=0
  tmp=$(mktemp) || return 1
  if NEW_LINE="$text" awk -v n="$line_no" -v mode="$mode" '
      NR == n && mode == "replace" { print ENVIRON["NEW_LINE"]; next }
      { print }
      NR == n && mode == "after" { print ENVIRON["NEW_LINE"] }
    ' "$file" > "$tmp"; then
    # cat (instead of mv) keeps owner and permissions of the edited file.
    cat "$tmp" > "$file" || status=1
  else
    status=1
  fi
  rm -f -- "$tmp"
  return "$status"
}

#######################################
# Downloads and unpacks Alertmanager if necessary, creates the service
# user, starts the daemon and opens its port in firewalld when active.
# Exits 1 when Alertmanager is already listening, when download or unpack
# fails, or when the port does not come up within 60 polls.
#######################################
start() {
  local tarball="$INSTALL_PACKAGE/alertmanager-${ALERTMANAGER_VER}.linux-amd64.tar.gz"
  local url="https://github.com/prometheus/alertmanager/releases/download/v${ALERTMANAGER_VER}/alertmanager-${ALERTMANAGER_VER}.linux-amd64.tar.gz"

  mkdir -p /app
  ensure_command wget wget
  ensure_command netstat net-tools

  if port_listening "$ALERTMANAGER_PORT"; then
    echo "ALERTMANAGER 已启动,请勿重复启动应用!"
    exit 1
  fi
  echo "正在启动 ALERTMANAGER....."

  mkdir -p "$INSTALL_PACKAGE" "$ALERTMANAGER_DIR"
  # Download to a .part file first: a failed download must not leave an
  # empty tarball behind that later runs would mistake for a valid one.
  if [ ! -s "$tarball" ]; then
    if ! wget -O "$tarball.part" "$url"; then
      rm -f -- "$tarball.part"
      echo "ERROR: 下载 alertmanager-${ALERTMANAGER_VER} 安装包失败,请检查!"
      exit 1
    fi
    mv -f -- "$tarball.part" "$tarball"
  fi

  if [ -n "$(ls -A "$ALERTMANAGER_DIR")" ]; then
    echo "目录 $ALERTMANAGER_DIR 不为空,不再重复解压压缩包"
  elif ! tar -xvzf "$tarball" --strip-components 1 -C "$ALERTMANAGER_DIR"; then
    echo "ERROR: 解压 $tarball 失败,请检查!"
    exit 1
  fi

  # getent looks the exact name up; a grep on /etc/group or /etc/passwd
  # would also match names that merely contain the user name.
  getent group "$ALERTMANAGER_USER" > /dev/null \
    || groupadd -g 10002 "$ALERTMANAGER_USER"
  getent passwd "$ALERTMANAGER_USER" > /dev/null \
    || useradd -u 10002 -g "$ALERTMANAGER_USER" -d "/home/$ALERTMANAGER_USER" -m "$ALERTMANAGER_USER"

  chown -R "$ALERTMANAGER_USER:$ALERTMANAGER_USER" "$ALERTMANAGER_DIR"
  [ -f "$LOG_FILE" ] || touch "$LOG_FILE"
  chown -R "$ALERTMANAGER_USER:$ALERTMANAGER_USER" "$LOG_FILE"

  launch_daemon "$ALERTMANAGER_DIR/alertmanager" "$ALERTMANAGER_DIR/alertmanager.yml"

  # Poll every 2 s, 60 times (2 minutes). The wait runs in this shell so
  # that a timeout really aborts the script.
  if ! wait_for_port "$ALERTMANAGER_PORT" open 60 "等待监听端口启动..."; then
    echo "ERROR: 启动${ALERTMANAGER_PORT}端口超时,请检查!"
    exit 1
  fi
  echo "ALERTMANAGER 启动成功!"

  # Open the port only when firewalld is actually running.
  if systemctl is-active --quiet firewalld; then
    firewall-cmd --add-port="${ALERTMANAGER_PORT}/tcp" --permanent
    firewall-cmd --reload
  fi
}

#######################################
# Stops Alertmanager. Exits 1 when it is not running or when its port is
# not released in time.
#######################################
stop() {
  port_listening "$ALERTMANAGER_PORT"
  check_err "ALERTMANAGER 未启动!" "ALERTMANAGER 正在运行,开始停止应用....."

  # "( |$)" keeps the pattern from matching alertmanager.yml, .log or .sh.
  if ! stop_daemon "${ALERTMANAGER_DIR}/alertmanager( |\$)" "$ALERTMANAGER_PORT" "ALERTMANAGER"; then
    echo "ERROR: 关闭${ALERTMANAGER_PORT}端口超时,请检查!"
    exit 1
  fi
  echo "关闭 ALERTMANAGER 成功!"
}

#######################################
# Restarts Alertmanager; when it is not running it is simply started.
#######################################
restart() {
  if port_listening "$ALERTMANAGER_PORT"; then
    stop
  fi
  start
}

#######################################
# Lists the running Alertmanager processes.
# Returns: 1 (after printing a notice) when none is running
#######################################
status() {
  local pids
  pids=$(pgrep -d ',' -f -- "${ALERTMANAGER_DIR}/alertmanager( |\$)")
  if [ -z "$pids" ]; then
    echo "ALERTMANAGER 未启动!"
    return 1
  fi
  ps -f -p "$pids"
}

#######################################
# Deletes the log file and the installation directory.
# ${VAR:?} aborts instead of running rm -rf on an empty path.
#######################################
remove() {
  rm -rf -- "${LOG_FILE:?}"
  rm -rf -- "${ALERTMANAGER_DIR:?}"
  #userdel -r $ALERTMANAGER_USER
}

#######################################
# Stops Prometheus when it is running and starts it again as
# $ALERTMANAGER_USER, appending its output to $LOG_FILE.
# Exits 1 when the port is not released or does not come up in time.
# NOTE(review): Prometheus is started from the service user's login shell
# without --storage.tsdb.path; confirm that matches how it was started
# originally.
#######################################
restart_prometheus() {
  if port_listening "$PROMETHEUS_PORT"; then
    echo "PROMETHEUS 正在运行,开始停止应用....."
    if ! stop_daemon "${PROMETHEUS_DIR}/prometheus( |\$)" "$PROMETHEUS_PORT" "PROMETHEUS"; then
      echo "ERROR: 关闭${PROMETHEUS_PORT}端口超时,请检查!"
      exit 1
    fi
    echo "关闭 PROMETHEUS 成功!"
  else
    echo "PROMETHEUS 未启动!"
  fi

  launch_daemon "$PROMETHEUS_DIR/prometheus" "$PROMETHEUS_DIR/prometheus.yml"

  # Poll every 2 s, 60 times (2 minutes).
  if ! wait_for_port "$PROMETHEUS_PORT" open 60 "等待监听端口启动..."; then
    echo "ERROR: 启动${PROMETHEUS_PORT}端口超时,请检查!"
    exit 1
  fi
  echo "PROMETHEUS 启动成功!"
}

#######################################
# Adds an Alertmanager address to the alerting section of prometheus.yml
# and restarts Prometheus.
# Arguments: $1 - HOST:PORT of the Alertmanager
# Expects the layout of the stock prometheus.yml: "alertmanagers:",
# "- static_configs:" and "- targets:" on three consecutive lines. A
# commented-out placeholder entry directly below "- targets:" is replaced;
# otherwise the new entry is added below "- targets:" so that existing
# entries are kept.
#######################################
add() {
  local target=$1
  local config="$PROMETHEUS_DIR/prometheus.yml"
  local targets_re='^([[:space:]]*)-[[:space:]]+targets:[[:space:]]*(#.*)?$'
  local placeholder_re='^[[:space:]]*#[[:space:]]*-[[:space:]]'
  local header_line targets_line targets_text slot_text indent listed

  if [ -z "$target" ]; then
    echo "alertmanagers 配置不能为空!"
    exit 1
  fi
  if [ ! -f "$config" ]; then
    echo "$config 文件不存在!"
    exit 1
  fi

  header_line=$(grep -n -m 1 -E '^[[:space:]]*alertmanagers:' "$config" | cut -d: -f1)
  if [ -z "$header_line" ]; then
    echo "ERROR: $config 中未找到 alertmanagers 配置,请检查!"
    exit 1
  fi

  targets_line=$((header_line + 2))
  targets_text=$(sed -n "${targets_line}p" "$config")
  if ! [[ "$targets_text" =~ $targets_re ]]; then
    echo "ERROR: $config 中 alertmanagers 的 targets 配置格式不符合预期,请手动修改!"
    exit 1
  fi
  # List entries are indented two columns deeper than the "- targets:" dash.
  indent="${BASH_REMATCH[1]}  "

  # Look for the address only inside the alerting section (up to the next
  # top-level key) and ignore comments, so that a scrape target with the
  # same address is not mistaken for an existing entry.
  listed=$(TARGET="$target" awk -v first="$((targets_line + 1))" '
    NR < first { next }
    /^[^[:space:]#]/ { exit }
    /^[[:space:]]*#/ { next }
    index($0, ENVIRON["TARGET"]) { print "yes"; exit }
  ' "$config")
  if [ -n "$listed" ]; then
    echo "alertmanagers 配置已添加,请勿重复添加!"
    exit 1
  fi

  slot_text=$(sed -n "$((targets_line + 1))p" "$config")
  if [[ "$slot_text" =~ $placeholder_re ]]; then
    edit_line "$config" "$((targets_line + 1))" replace "${indent}- ${target}"
  else
    edit_line "$config" "$targets_line" after "${indent}- ${target}"
  fi
  check_err "ERROR: 修改 $config 失败,请检查!" "alertmanagers 配置 $target 已写入 $config"

  restart_prometheus
}

#######################################
# Adds a rule file to the rule_files list of prometheus.yml and restarts
# Prometheus.
# Arguments: $1 - path of an existing alert rule file
#######################################
add_rules() {
  local rule_file=$1
  local config="$PROMETHEUS_DIR/prometheus.yml"
  local header_line

  if [[ -z "$rule_file" || ! -f "$rule_file" ]]; then
    echo "$rule_file 文件不存在!"
    exit 1
  fi
  if [ ! -f "$config" ]; then
    echo "$config 文件不存在!"
    exit 1
  fi

  # Anchor on the top-level key: a plain search for "rule_files" would also
  # hit entries whose path contains that word.
  header_line=$(grep -n -m 1 -E '^rule_files:' "$config" | cut -d: -f1)
  if [ -z "$header_line" ]; then
    echo "ERROR: $config 中未找到 rule_files 配置,请检查!"
    exit 1
  fi

  if grep -v -E '^[[:space:]]*#' "$config" | grep -q -F -- "\"$rule_file\""; then
    echo "告警规则 $rule_file 已添加,请勿重复添加!"
    exit 1
  fi

  # Add the entry directly below "rule_files:".
  edit_line "$config" "$header_line" after "  - \"$rule_file\""
  check_err "ERROR: 修改 $config 失败,请检查!" "告警规则 $rule_file 已写入 $config"

  restart_prometheus
}

#######################################
# Prints the list of supported actions.
#######################################
usage() {
  echo " "
  cat << 'EOF'
请输入 start 启动 ALERTMANAGER。
请输入 stop 停止 ALERTMANAGER。
请输入 status 获取 ALERTMANAGER 状态。
请输入 restart 重启 ALERTMANAGER。
请输入 remove 删除 ALERTMANAGER。
请输入 add 把 ALERTMANAGER 添加到 Prometheus。(示例:./alertmanager add "localhost:9093")。add 后面第一个参数是 alertmanager 服务的 IP 和端口。
请输入 add_rules 添加告警规则到 Prometheus。(示例:./alertmanager add_rules "/app/alertmanager/rules/node_exporter.rules")。add_rules 后面第一个参数是告警规则文件的路径。
EOF
  echo " "
}

# Dispatch on the first argument; "start" is the default action.
INPUT_ACTIVE=$1
ACTIVE=${INPUT_ACTIVE:=start}
case "${ACTIVE}" in
  start)
    start
    ;;
  stop)
    stop
    ;;
  status)
    status
    ;;
  restart)
    restart
    ;;
  remove)
    remove
    ;;
  add)
    add "$2"
    ;;
  add_rules)
    add_rules "$2"
    ;;
  *)
    usage
    ;;
esac