# Download the Alertmanager release into the Prometheus install directory.
cd /usr/local/prometheus
wget https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz
# Unpack the archive and give the directory a version-independent name.
tar -zxvf alertmanager-0.24.0.linux-amd64.tar.gz
mv -f alertmanager-0.24.0.linux-amd64 alertmanager
# Start Alertmanager in the background; stdout and stderr go to alertmanager.log.
nohup /usr/local/prometheus/alertmanager/alertmanager --config.file="/usr/local/prometheus/alertmanager/alertmanager.yml" --log.level=info >/usr/local/prometheus/alertmanager/alertmanager.log 2>&1 &
global:
  resolve_timeout: 5m
  # Mail server, user name and password used to send alert e-mails
  smtp_from: 'wxhntmy@163.com'
  smtp_smarthost: 'smtp.163.com:465'
  smtp_auth_username: 'wxhntmy@163.com'
  smtp_auth_password: ''
  smtp_require_tls: false
  smtp_hello: '163.com'
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'email'
receivers:
  # Recipient of the alert e-mails
  - name: 'email'
    email_configs:
      - to: '1527895421@qq.com'
        send_resolved: true
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
# Check whether firewalld is running.
systemctl status firewalld
# Permanently open the Alertmanager port, then reload so the rule takes effect.
firewall-cmd --zone=public --add-port=9093/tcp --permanent
firewall-cmd --reload
# Confirm that the port is now open.
firewall-cmd --zone=public --query-port=9093/tcp
浏览器访问
http://IP:9093/

把alertmanager注册到consul
注册服务(重启失效)
# Register Alertmanager as a Consul service with an HTTP health check on /metrics.
curl -X PUT \
  -d '{"ID":"alertmanager","Name":"alertmanager","Tags":["primary","v1"],"address":"localhost","port":9093,"check":{"http":"http://localhost:9093/metrics","interval":"10s"}}' \
  http://localhost:8500/v1/agent/service/register
删除服务(url最后面为服务ID)
# Deregister the service; the last path segment is the service ID.
curl -X PUT http://localhost:8500/v1/agent/service/deregister/alertmanager

添加告警规则,每一个在target里创建的job都可以添加一个rules
# Create the rule directory (-p keeps the command re-runnable) and one empty
# rule file per monitored job.
mkdir -p /usr/local/prometheus/rule_files
touch /usr/local/prometheus/rule_files/consul-up.rules
touch /usr/local/prometheus/rule_files/consul_exporter-up.rules
touch /usr/local/prometheus/rule_files/mysqld5_exporter-up.rules
touch /usr/local/prometheus/rule_files/node_exporter-up.rules
touch /usr/local/prometheus/rule_files/prometheus.rules
touch /usr/local/prometheus/rule_files/pushgateway.rules
示例配置文件
groups:
  - name: consul_exporter 实例存活规则
    rules:
      - alert: consul_exporter 实例存活告警 # Name of the alerting rule (alertname)
        # The job here is the job name defined in prometheus.yml
        expr: up{job="consul_exporter"} == 0
        for: 15s
        labels:
          severity: Disaster
          team: node
        annotations:
          summary: "consul_exporter 服务节点失联告警"
          description: "consul_exporter 服务节点 {{ $labels.instance }} 断联已超过 30 秒!"
修改 prometheus 的 prometheus.yml 文件,然后重启 prometheus
# my global config
global:
  scrape_interval: 15s # Scrape targets every 15 seconds. The default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10 seconds).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # If an Alertmanager is available, uncomment the line below and set its IP and port
          - localhost:9093

# Load rules once and periodically evaluate them according to the global "evaluation_interval".
rule_files:
  - "/usr/local/prometheus/rule_files/*.rules"
  - "/usr/local/prometheus/rule_files/*.groups"
  - "/usr/local/prometheus/rule_files/*.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# here it is Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any time series scraped from this config.
  - job_name: "prometheus"

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ["localhost:9090"]

  # Further jobs can be added; each needs a job name and the target IP and port
  - job_name: "consul_exporter"
    static_configs:
      - targets: ["localhost:9107"]

  # Further jobs can be added; each needs a job name and the target IP and port
  - job_name: "node_exporter"
    static_configs:
      - targets: ["localhost:9100"]

  - job_name: "alertmanager"
    static_configs:
      - targets: ["localhost:9093"]



测试告警:kill 掉 consul_exporter 的进程后,alertmanager 会发现服务已停止并发送告警邮件




当把服务重新跑起来的时候(已配置 send_resolved: true,会收到告警恢复邮件)
# Create the management script file and mark it executable.
touch alertmanager.sh && chmod +x alertmanager.sh
#!/bin/bash
# Installs and manages Alertmanager.

# ---- Alertmanager settings ----
# Release version.
ALERTMANAGER_VER="0.25.0"
# Installation directory.
ALERTMANAGER_DIR="/app/alertmanager"
# User name for Alertmanager.
ALERTMANAGER_USER="prometheus"
# Port Alertmanager listens on.
ALERTMANAGER_PORT="9093"
# Directory where downloaded install packages are kept.
INSTALL_PACKAGE="/app/install"
# Log file.
LOG_FILE="/tmp/alertmanager.log"

# ---- Prometheus settings ----
# Prometheus installation directory.
PROMETHEUS_DIR="/app/prometheus"
# Port Prometheus listens on.
PROMETHEUS_PORT="9090"
#######################################
# Reports the outcome of the command that ran immediately before this call.
# Must be invoked directly after the command being checked, because it reads
# that command's exit status from $?.
# Arguments:
#   $1 - message printed when the previous command failed
#   $2 - message printed when the previous command succeeded
# Outputs:
#   Writes the selected message to stdout. Backslash escapes in the message
#   (callers in this file pass \033[..m colour codes) are expanded; plain
#   echo printed them literally.
# Returns:
#   0 after printing the success message; exits the script with status 1
#   after printing the failure message.
#######################################
check_err() {
  # Capture $? before anything else can overwrite it.
  local status=$?
  if [ "$status" -ne 0 ]; then
    printf '%b\n' "$1"
    exit 1
  fi
  printf '%b\n' "$2"
}
#######################################
# Installs Alertmanager if needed and starts it as $ALERTMANAGER_USER.
# Steps: install wget/net-tools via yum when missing, refuse to start when
# the port is already listening, download and unpack the release archive,
# create the service group/user, launch the binary with nohup, wait for the
# port to come up, and open the port in firewalld when firewalld is active.
# Globals (read):
#   ALERTMANAGER_VER, ALERTMANAGER_DIR, ALERTMANAGER_USER, ALERTMANAGER_PORT,
#   INSTALL_PACKAGE, LOG_FILE
# Arguments:
#   None
# Returns:
#   Exits 1 when a dependency install, the download, the extraction or the
#   start-up wait fails, or when the port is already in use.
#######################################
start() {
  local archive="$INSTALL_PACKAGE/alertmanager-${ALERTMANAGER_VER}.linux-amd64.tar.gz"
  local url="https://github.com/prometheus/alertmanager/releases/download/v${ALERTMANAGER_VER}/alertmanager-${ALERTMANAGER_VER}.linux-amd64.tar.gz"

  [ -d /app ] || mkdir -p /app

  # Install wget when the command is not available.
  if ! type wget &> /dev/null; then
    yum install wget -y
    check_err "\033[31myum安装依赖包 wget 失败,请检查\033[0m" "\033[36myum安装依赖包 wget 成功\033[0m"
  fi

  # Install net-tools when netstat is not available.
  if ! type netstat &> /dev/null; then
    yum install net-tools -y
    check_err "\033[31myum安装依赖包 net-tools 失败,请检查\033[0m" "\033[36myum安装依赖包 net-tools 成功\033[0m"
  fi

  # The trailing [[:space:]] keeps e.g. port 9093 from matching :90930.
  if netstat -tln | grep ":${ALERTMANAGER_PORT}[[:space:]]"; then
    echo "ALERTMANAGER 已启动,请勿重复启动应用!"
    exit 1
  fi
  echo "正在启动 ALERTMANAGER....."

  [ -d "$INSTALL_PACKAGE" ] || mkdir -p "$INSTALL_PACKAGE"
  if [ ! -f "$archive" ]; then
    # wget -O creates the file even when the download fails; remove the
    # partial file so the next run downloads again instead of reusing it.
    if ! wget -O "$archive" "$url"; then
      rm -f -- "$archive"
      echo "下载 $url 失败,请检查"
      exit 1
    fi
  fi

  [ -d "$ALERTMANAGER_DIR" ] || mkdir -p "$ALERTMANAGER_DIR"
  # Unpack only into an empty directory (hidden entries are not counted).
  if compgen -G "$ALERTMANAGER_DIR/*" > /dev/null; then
    echo "目录 $ALERTMANAGER_DIR 不为空,不再重复解压压缩包"
  elif ! tar -xvzf "$archive" --strip-components 1 -C "$ALERTMANAGER_DIR"; then
    echo "解压 $archive 失败,请检查"
    exit 1
  fi

  # getent matches the exact name; grepping /etc/group or /etc/passwd would
  # also match substrings and member lists.
  if ! getent group "$ALERTMANAGER_USER" > /dev/null; then
    groupadd -g 10002 "$ALERTMANAGER_USER"
  fi
  if ! getent passwd "$ALERTMANAGER_USER" > /dev/null; then
    useradd -u 10002 -g "$ALERTMANAGER_USER" -d "/home/$ALERTMANAGER_USER" -m "$ALERTMANAGER_USER"
  fi

  chown -R "$ALERTMANAGER_USER:$ALERTMANAGER_USER" "$ALERTMANAGER_DIR"
  [ -f "$LOG_FILE" ] || touch "$LOG_FILE"
  chown -R "$ALERTMANAGER_USER:$ALERTMANAGER_USER" "$LOG_FILE"

  # Run the launch and the wait loop as the service user. The heredoc is
  # unquoted on purpose: plain $VAR is expanded by this (calling) shell,
  # while \$VAR is left for the shell started by su.
  su - "$ALERTMANAGER_USER" << EOF
nohup $ALERTMANAGER_DIR/alertmanager --log.level=info --config.file="$ALERTMANAGER_DIR/alertmanager.yml" >> $LOG_FILE 2>&1 &
# Poll the listening port every 2s, at most 60 times (2 minutes), then give up.
CHECK_RESULT=1
CHECK_COUNT=60
while [ \$CHECK_RESULT -ne 0 ]; do
  if [ \$CHECK_COUNT -gt 0 ]; then
    echo "等待监听端口启动..."
    sleep 2
    netstat -tln | grep ":${ALERTMANAGER_PORT}[[:space:]]"
    CHECK_RESULT=\$?
    CHECK_COUNT=\$((\$CHECK_COUNT-1))
  else
    echo "ERROR: 启动${ALERTMANAGER_PORT}端口超时,请检查!"
    exit 1
  fi
done
echo "ALERTMANAGER 启动成功!"
EOF
  # The "exit 1" above only leaves the su shell; propagate it so a failed
  # start is not reported as success.
  if [ $? -ne 0 ]; then
    exit 1
  fi

  # Open the port when firewalld is running.
  if systemctl status firewalld > /dev/null; then
    firewall-cmd --add-port="$ALERTMANAGER_PORT/tcp" --permanent
    firewall-cmd --reload
  fi
}
#######################################
# Stops the running Alertmanager and waits for its port to close.
# Sends SIGTERM to every process of $ALERTMANAGER_USER whose command line
# starts with the Alertmanager binary path, and escalates to SIGKILL when
# the port is still listening after a grace period.
# Globals (read):
#   ALERTMANAGER_DIR, ALERTMANAGER_USER, ALERTMANAGER_PORT
# Arguments:
#   None
# Returns:
#   Exits 1 when Alertmanager is not running or the port does not close
#   within 30 checks (2s apart).
#######################################
stop() {
  # Anchored so that only the binary itself matches, not e.g. a script
  # whose path merely contains the same prefix.
  local pattern="^${ALERTMANAGER_DIR}/alertmanager( |\$)"
  local grace_checks=5
  local checks_left=30
  local -a pids=()
  local pid

  netstat -tln | grep ":${ALERTMANAGER_PORT}[[:space:]]"
  # check_err must directly follow the pipeline: it reads its exit status.
  check_err "ALERTMANAGER 未启动!" "ALERTMANAGER 正在运行,开始停止应用....."

  # Collect the PIDs into an array. The previous heredoc-based version let
  # the outer shell paste a multi-line PID list into an assignment, so only
  # the first PID was ever handled.
  mapfile -t pids < <(pgrep -u "$ALERTMANAGER_USER" -f "$pattern")
  for pid in "${pids[@]}"; do
    echo "Kill the ALERTMANAGER process [ $pid ]"
    kill -TERM "$pid"
  done

  while netstat -tln | grep ":${ALERTMANAGER_PORT}[[:space:]]"; do
    if [ "$checks_left" -le 0 ]; then
      echo "ERROR: 关闭${ALERTMANAGER_PORT}端口超时,请检查!"
      exit 1
    fi
    # Grace period over: force-kill whatever is still alive.
    if [ "$grace_checks" -eq 0 ]; then
      for pid in "${pids[@]}"; do
        kill -KILL "$pid" 2> /dev/null
      done
    fi
    grace_checks=$((grace_checks - 1))
    echo "等待监听端口关闭..."
    sleep 2
    checks_left=$((checks_left - 1))
  done
  echo "关闭 ALERTMANAGER 成功!"
}
#######################################
# Restarts Alertmanager.
# stop exits the script when the service is not running, which made
# "restart" unusable on a stopped service; stop is therefore only called
# when the port is actually listening.
# Globals (read):
#   ALERTMANAGER_PORT
# Arguments:
#   None
#######################################
restart() {
  if netstat -tln | grep -q ":${ALERTMANAGER_PORT}[[:space:]]"; then
    stop
  else
    echo "ALERTMANAGER 未启动,直接启动....."
  fi
  start
}
#######################################
# Lists the Alertmanager processes as seen by $ALERTMANAGER_USER.
# Runs a ps/grep pipeline through su and filters out the grep and su
# helper processes themselves.
# Globals (read):
#   ALERTMANAGER_DIR, ALERTMANAGER_USER
# Arguments:
#   None
# Outputs:
#   Matching "ps aux" lines on stdout.
#######################################
status() {
  local binary="$ALERTMANAGER_DIR/alertmanager"
  local listing="ps aux | grep $binary"
  listing+=" | grep -v \"grep\""
  listing+=" | grep -v \"su $ALERTMANAGER_USER\""
  su $ALERTMANAGER_USER -c "$listing"
}
#######################################
# Deletes the Alertmanager log file and installation directory.
# The service user is kept; the userdel call below is left disabled.
# Globals (read):
#   LOG_FILE, ALERTMANAGER_DIR
# Arguments:
#   None
#######################################
remove() {
  # Quoted so paths with spaces or glob characters are removed as given;
  # ":?" aborts instead of running rm -rf with an empty path.
  rm -rf -- "${LOG_FILE:?}"
  rm -rf -- "${ALERTMANAGER_DIR:?}"
  #userdel -r $ALERTMANAGER_USER
}
#######################################
# Restarts Prometheus so that configuration changes take effect.
# When the Prometheus port is listening, sends SIGTERM to the processes of
# $ALERTMANAGER_USER whose command line starts with the Prometheus binary
# path (escalating to SIGKILL after a grace period) and waits for the port
# to close. Then starts Prometheus as $ALERTMANAGER_USER and waits for the
# port to come up.
# Globals (read):
#   PROMETHEUS_DIR, PROMETHEUS_PORT, ALERTMANAGER_USER, LOG_FILE
# Arguments:
#   None
# Returns:
#   Exits 1 when the port does not close in time; otherwise returns the
#   status of the su start-up shell (1 when the port never came up).
#######################################
restart_prometheus() {
  # Anchored so only the binary matches, not other paths sharing the prefix.
  local pattern="^${PROMETHEUS_DIR}/prometheus( |\$)"
  local grace_checks=5
  local checks_left=30
  local -a pids=()
  local pid

  if ! netstat -tln | grep ":${PROMETHEUS_PORT}[[:space:]]"; then
    echo "PROMETHEUS 未启动!"
  else
    echo "PROMETHEUS 正在运行,开始停止应用....."
    # Collect PIDs into an array; the previous heredoc-based version broke
    # when more than one PID matched.
    mapfile -t pids < <(pgrep -u "$ALERTMANAGER_USER" -f "$pattern")
    for pid in "${pids[@]}"; do
      echo "Kill the PROMETHEUS process [ $pid ]"
      kill -TERM "$pid"
    done

    while netstat -tln | grep ":${PROMETHEUS_PORT}[[:space:]]"; do
      if [ "$checks_left" -le 0 ]; then
        echo "ERROR: 关闭${PROMETHEUS_PORT}端口超时,请检查!"
        exit 1
      fi
      # Grace period over: force-kill whatever is still alive.
      if [ "$grace_checks" -eq 0 ]; then
        for pid in "${pids[@]}"; do
          kill -KILL "$pid" 2> /dev/null
        done
      fi
      grace_checks=$((grace_checks - 1))
      echo "等待监听端口关闭..."
      sleep 2
      checks_left=$((checks_left - 1))
    done
    echo "关闭 PROMETHEUS 成功!"
  fi

  # Start as the service user. The heredoc is unquoted on purpose: plain
  # $VAR is expanded by this (calling) shell, \$VAR by the shell su starts.
  # NOTE(review): Prometheus output is appended to $LOG_FILE, the same file
  # the Alertmanager log uses — confirm this is intended.
  su - "$ALERTMANAGER_USER" << EOF
nohup $PROMETHEUS_DIR/prometheus --log.level=info --config.file="$PROMETHEUS_DIR/prometheus.yml" >> $LOG_FILE 2>&1 &
# Poll the listening port every 2s, at most 60 times (2 minutes), then give up.
CHECK_RESULT=1
CHECK_COUNT=60
while [ \$CHECK_RESULT -ne 0 ]; do
  if [ \$CHECK_COUNT -gt 0 ]; then
    echo "等待监听端口启动..."
    sleep 2
    netstat -tln | grep ":${PROMETHEUS_PORT}[[:space:]]"
    CHECK_RESULT=\$?
    CHECK_COUNT=\$((\$CHECK_COUNT-1))
  else
    echo "ERROR: 启动${PROMETHEUS_PORT}端口超时,请检查!"
    exit 1
  fi
done
echo "PROMETHEUS 启动成功!"
EOF
}
#######################################
# Registers an Alertmanager target in prometheus.yml and restarts Prometheus.
# Overwrites the line three rows below the "alertmanagers:" key. This
# assumes the stock layout (alertmanagers / static_configs / targets /
# placeholder target line) — NOTE(review): confirm for customised configs.
# Globals (read):
#   PROMETHEUS_DIR
# Arguments:
#   $1 - Alertmanager address as host:port, e.g. localhost:9093
# Returns:
#   Exits 1 when $1 is empty, the "alertmanagers:" key is missing, or the
#   target is already listed in the alerting section.
#######################################
add() {
  local target=$1
  local config="$PROMETHEUS_DIR/prometheus.yml"
  local anchor_row target_row escaped

  if [[ -z "$target" ]]; then
    echo "alertmanagers 配置不能为空!"
    exit 1
  fi

  # Match the key itself and take only the first hit, so the row number is
  # always a single integer.
  anchor_row=$(grep -n -m 1 '^[[:space:]]*alertmanagers:' "$config" | cut -d: -f1)
  if [[ -z "$anchor_row" ]]; then
    echo "$config 中未找到 alertmanagers 配置!"
    exit 1
  fi
  target_row=$((anchor_row + 3))

  # Look for the target only inside the alerting section (up to the next
  # top-level key), skipping comments. Scanning to the end of the file would
  # also hit a scrape job that uses the same address.
  if TARGET="$target" awk -v start="$anchor_row" '
      NR <= start { next }
      /^[^ \t#]/ { exit }
      /^[ \t]*#/ { next }
      index($0, ENVIRON["TARGET"]) { found = 1; exit }
      END { exit !found }
    ' "$config"; then
    echo "alertmanagers 配置已添加,请勿重复添加!"
    exit 1
  fi

  # Escape characters that are special in a sed replacement, and keep the
  # line's existing indentation so the YAML nesting stays valid.
  escaped=$(printf '%s' "$target" | sed 's|[\\/&]|\\&|g')
  sed -i "${target_row}s/^\([[:space:]]*\).*/\1- ${escaped}/" "$config"
  restart_prometheus
}
#######################################
# Adds a rule file to the rule_files list of prometheus.yml and restarts
# Prometheus. The entry is inserted directly below the "rule_files:" key,
# using the indentation of the entry that already follows it (two spaces
# when there is none).
# Globals (read):
#   PROMETHEUS_DIR
# Arguments:
#   $1 - path of an existing alert rule file
# Returns:
#   Exits 1 when the file does not exist, the "rule_files:" key is missing,
#   the file is already listed, or the config cannot be rewritten.
#######################################
add_rules() {
  local rule_file=$1
  local config="$PROMETHEUS_DIR/prometheus.yml"
  local entry_re='^([[:space:]]*)(#[[:space:]]*)?-[[:space:]]'
  local row next_line indent tmp

  if [[ -z "$rule_file" || ! -f "$rule_file" ]]; then
    echo "$rule_file 文件不存在!"
    exit 1
  fi

  # Anchor on the top-level key: a plain "rule_files" search also matches
  # rule paths that contain that directory name and yields several rows.
  row=$(grep -n -m 1 '^rule_files:' "$config" | cut -d: -f1)
  if [[ -z "$row" ]]; then
    echo "$config 中未找到 rule_files 配置!"
    exit 1
  fi

  if grep -qF -- "\"${rule_file}\"" "$config"; then
    echo "告警规则 $rule_file 已添加,请勿重复添加!"
    exit 1
  fi

  # Reuse the indentation of the (possibly commented-out) entry below the
  # key so the new item lines up with its siblings.
  next_line=$(sed -n "$((row + 1))p" "$config")
  if [[ "$next_line" =~ $entry_re ]]; then
    indent=${BASH_REMATCH[1]}
  else
    indent='  '
  fi

  # Insert via awk with the text passed through the environment, so no
  # character of the path is interpreted as sed/awk syntax. Writing back
  # with cat keeps the config file's owner and permissions.
  tmp=$(mktemp) || exit 1
  if ! NEW_ENTRY="${indent}- \"${rule_file}\"" \
      awk -v row="$row" '{ print } NR == row { print ENVIRON["NEW_ENTRY"] }' \
      "$config" > "$tmp" || ! cat "$tmp" > "$config"; then
    rm -f -- "$tmp"
    echo "更新 $config 失败,请检查"
    exit 1
  fi
  rm -f -- "$tmp"
  restart_prometheus
}
#######################################
# Prints the list of supported sub-commands.
# The examples use the name this script was invoked with ($0); the previous
# text showed "./alertmanager", which is not the script's file name.
# Arguments:
#   None
# Outputs:
#   Help text on stdout.
#######################################
usage() {
  local script=$0
  printf '%s\n' \
    " " \
    "请输入 start 启动 ALERTMANAGER。" \
    "请输入 stop 停止 ALERTMANAGER。" \
    "请输入 status 获取 ALERTMANAGER 状态。" \
    "请输入 restart 重启 ALERTMANAGER。" \
    "请输入 remove 删除 ALERTMANAGER。" \
    "请输入 add 把 ALERTMANAGER 添加到 Prometheus。(示例:${script} add \"localhost:9093\")。add 后面第一个参数是 alertmanager 服务的 IP 和端口。" \
    "请输入 add_rules 添加告警规则到 Prometheus。(示例:${script} add_rules \"/app/alertmanager/rules/node_exporter.rules\")。add_rules 后面第一个参数是告警规则文件的路径。" \
    " "
}
#######################################
# Entry point: dispatches the requested action.
# Arguments:
#   $1 - action: start | stop | status | restart | remove | add | add_rules
#        (defaults to "start" when omitted; anything else prints the usage)
#   $2 - argument for add (host:port) or add_rules (rule file path)
#######################################
main() {
  INPUT_ACTIVE=$1
  ACTIVE=${INPUT_ACTIVE:=start}
  case "${ACTIVE}" in
    start)
      start
      ;;
    stop)
      stop
      ;;
    status)
      status
      ;;
    restart)
      restart
      ;;
    remove)
      remove
      ;;
    add)
      # Quoted so the argument reaches the function as one word.
      add "$2"
      ;;
    add_rules)
      # Quoted so rule paths containing spaces are not split.
      add_rules "$2"
      ;;
    *)
      usage
      ;;
  esac
}

main "$@"