1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env bash
#
# Kubernetes resource watchdog.
#
# Reads `kubectl top nodes` and `kubectl top pod -n NAMESPACE`, collects every
# node/pod whose CPU or memory reading reaches its threshold, and posts one
# markdown message per category (nodes, pods) to a WeCom group-robot webhook.
#
# Usage:        <script> NAMESPACE
# Environment:  WEBHOOK_URL  optional; overrides the built-in webhook address.
# Exit status:  1 if NAMESPACE is missing or a webhook request failed, else 0.
# Requires:     kubectl, awk, tr, curl.

# 0. Webhook URL.
# NOTE(review): the default embeds the robot key in the script; prefer passing
# WEBHOOK_URL from the environment and rotating this key.
webhookurl="${WEBHOOK_URL:-https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=4b7128c5-0e5a-46f5-b5ef-77dff4eb5c99}"

# 1. The namespace argument is mandatory.
if [[ -z "${1:-}" ]]; then
  printf 'usage: %s NAMESPACE\n' "${0##*/}" >&2
  exit 1
fi
nameSpace=$1

# Node CPU threshold (%).
cpuVPT=85
# Node memory threshold (%).
memVPT=85
# Pod CPU threshold (millicores, "m").
podCpuVPT=500
# Pod memory threshold (Mi).
podMemVPT=500
# Upper bound, in bytes, for one markdown message. The original script cut the
# pod message at 4000 characters because the webhook limits content length
# (WeCom documents a 4096-byte UTF-8 cap); 4000 bytes keeps some headroom.
maxContentBytes=4000

# Accumulated alert texts. They hold the two-character sequence backslash-n
# (not real newlines) because they are pasted verbatim into a JSON string.
nodeViewMsg=""
podViewMsg=""

# reaches VALUE LIMIT
# Succeeds only when VALUE is a non-negative integer that is >= LIMIT.
# Non-numeric readings (e.g. "<unknown>") never count as over the limit.
reaches() {
  # 10# forces base 10 so a reading with a leading zero is not parsed as octal.
  [[ $1 =~ ^[0-9]+$ ]] && ((10#$1 >= $2))
}

# fits_in_message CURRENT ADDITION
# Succeeds when CURRENT followed by ADDITION stays within maxContentBytes.
# The body is a subshell so that switching to the C locale (which makes ${#x}
# count bytes instead of characters) cannot leak into the rest of the script.
fits_in_message() (
  LC_ALL=C
  ((${#1} + ${#2} <= maxContentBytes))
)

# append_alert VAR ENTRY
# Appends ENTRY, preceded by a "\n\n" separator, to the message stored in the
# variable named VAR. Returns 1 and leaves VAR unchanged when the result would
# exceed maxContentBytes. Whole entries are kept or dropped, so the message is
# never cut in the middle of an escape sequence, a tag or a multi-byte
# character.
append_alert() {
  local target=$1
  local entry="\n\n$2"
  fits_in_message "${!target}" "${entry}" || return 1
  printf -v "${target}" '%s' "${!target}${entry}"
}

# send_markdown CONTENT
# Posts CONTENT as a markdown message to the webhook and returns curl's status.
# CONTENT is inserted into the JSON string as-is, so it must already be
# JSON-safe. The templates below only add node/pod names, readings and the
# namespace, which are assumed not to contain '"' or '\'.
send_markdown() {
  local payload
  printf -v payload '{"msgtype":"markdown","markdown":{"content":"%s"}}' "$1"
  curl --location --request POST "${webhookurl}" \
    --header 'Content-Type: application/json' \
    --data "${payload}"
}

# 2. Nodes: skip the header row and keep NAME, CPU% and MEMORY% of each node.
# Sample rows after awk:
#   k3s-node1 2% 38%
#   k3s-node2 6% 70%
nodesDropped=0
while read -r nodeName nodeCpu nodeMem; do
  cpuPct=${nodeCpu%\%}
  memPct=${nodeMem%\%}
  # Both readings must be plain integers; otherwise the row is ignored.
  [[ ${cpuPct} =~ ^[0-9]+$ && ${memPct} =~ ^[0-9]+$ ]] || continue
  if reaches "${cpuPct}" "${cpuVPT}" || reaches "${memPct}" "${memVPT}"; then
    nodeEntry="### <font color='warning'>机器资源警告: </font>\n > node: <font color='info'>${nodeName}</font>\n > cpu: <font color='comment'>${nodeCpu}</font>\n > 内存: <font color='comment'>${nodeMem}</font>"
    append_alert nodeViewMsg "${nodeEntry}" || nodesDropped=$((nodesDropped + 1))
  fi
done < <(kubectl top nodes | awk 'NR > 1 { print $1, $3, $5 }' | tr -d '\r')

# 3. Pods of the namespace: NAME, CPU(cores) such as "12m", MEMORY(bytes) such
# as "34Mi". Only the unit suffix of each reading is stripped before comparing,
# so the pod name is left intact.
podsDropped=0
while read -r podName podCpu podMem _; do
  [[ -n ${podName} ]] || continue
  if reaches "${podCpu%m}" "${podCpuVPT}" || reaches "${podMem%Mi}" "${podMemVPT}"; then
    # Log text kept as in the original script; it is printed for memory
    # overruns as well, and the threshold it shows is in millicores.
    printf '%s\n' "当前pod cpu已超过阈值${podCpuVPT}% -- ${podName}"
    printf '%s\n' "${podName},${podCpu},${podMem}"
    podEntry="### <font color='warning'>pod资源警告: </font>\n > namespace: <font color='info'>${nameSpace}</font>\n > pod: <font color='info'>${podName}</font>\n > cpu: <font color='comment'>${podCpu}</font>\n > 内存: <font color='comment'>${podMem}</font>"
    append_alert podViewMsg "${podEntry}" || podsDropped=$((podsDropped + 1))
  fi
done < <(kubectl top pod -n "${nameSpace}" | awk 'NR > 1 { print $1, $2, $3 }' | tr -d '\r')

if ((nodesDropped > 0)); then
  printf 'warning: %d node alert(s) omitted to stay within %d bytes\n' \
    "${nodesDropped}" "${maxContentBytes}" >&2
fi
if ((podsDropped > 0)); then
  printf 'warning: %d pod alert(s) omitted to stay within %d bytes\n' \
    "${podsDropped}" "${maxContentBytes}" >&2
fi

# 4. Send the webhooks. Both are attempted even if the first one fails.
sendStatus=0
if [[ -n ${nodeViewMsg} ]]; then
  send_markdown "${nodeViewMsg}" || sendStatus=1
fi
if [[ -n ${podViewMsg} ]]; then
  send_markdown "${podViewMsg}" || sendStatus=1
fi
exit "${sendStatus}"