mrj-crom committed on
Commit
cb979bc
·
verified ·
1 Parent(s): 289258c

sync: scripts/health_guard.sh

Browse files
Files changed (1) hide show
  1. scripts/health_guard.sh +44 -0
scripts/health_guard.sh ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# CROM-IA OOM/Swap Watchdog SRE Daemon
#
# Polls the machine every few seconds and takes two emergency actions:
#   * RSS guard:  SIGKILLs the first process whose command line matches
#                 "server.py" when its resident memory exceeds RSS_LIMIT_MB.
#   * Swap guard: when swap usage exceeds SWAP_LIMIT_PCT, unmounts the FUSE
#                 mount, SIGKILLs that same process and exits with status 1.
#
# All paths are relative to the current working directory, so the script is
# expected to be started from inside the scripts/ directory.
#
# `set -e` is deliberately NOT enabled: pgrep/ps legitimately return non-zero
# when the server is not running, and the watchdog must survive that.

readonly LOG_DIR="../logs"
readonly LOG_FILE="../logs/health_guard.log"
readonly MOUNT_POINT="../mnt_crom"
readonly SERVER_PATTERN="server.py"   # pgrep -f treats this as a regex
readonly RSS_LIMIT_MB=2500            # ~2.5 GB
readonly SWAP_LIMIT_PCT=96
readonly POLL_SECONDS=5

#######################################
# Write a timestamped line to stdout and append it to the log file.
# Arguments: $* - message text
#######################################
log() {
  printf '[%s] %s\n' "$(date)" "$*" | tee -a "$LOG_FILE"
}

#######################################
# Print the percentage of swap currently in use (integer, 0-100).
# Prints 0 when there is no swap or when `free` output cannot be parsed.
# LC_ALL=C pins the "Swap:" row label regardless of the system locale.
#######################################
swap_used_pct() {
  local total="" used=""
  read -r total used < <(LC_ALL=C free -m | awk '/^Swap:/ {print $2, $3}')
  if [[ "$total" =~ ^[0-9]+$ && "$used" =~ ^[0-9]+$ ]] && (( total > 0 )); then
    printf '%s\n' "$(( 100 * used / total ))"
  else
    printf '0\n'
  fi
}

#######################################
# Print the resident set size of a process in whole megabytes.
# Arguments: $1 - PID
# Returns:   1 (printing nothing) if the process no longer exists or ps
#            produced no numeric value, e.g. the server exited between the
#            pgrep lookup and this call.
#######################################
process_rss_mb() {
  local pid=$1
  local rss_kb
  rss_kb=$(ps -o rss= -p "$pid" 2>/dev/null)
  rss_kb=${rss_kb//[[:space:]]/}   # ps pads the column with spaces
  [[ "$rss_kb" =~ ^[0-9]+$ ]] || return 1
  printf '%s\n' "$(( rss_kb / 1024 ))"
}

#######################################
# Watchdog main loop; only returns via `exit 1` from the swap guard.
#######################################
main() {
  local swap_pct server_pid rss_mb

  mkdir -p "$LOG_DIR" \
    || printf 'health_guard: cannot create %s; logging to stdout only\n' "$LOG_DIR" >&2

  log "SRE Health Guard Ativado."

  while true; do
    swap_pct=$(swap_used_pct)

    # Only the first matching process is monitored.
    server_pid=$(pgrep -f "$SERVER_PATTERN" | head -n 1)

    # Guard: if RSS leaks above the limit, kill the engine to save the kernel.
    # A failed RSS read (process already gone) simply skips the check.
    if [[ -n "$server_pid" ]] && rss_mb=$(process_rss_mb "$server_pid"); then
      if (( rss_mb > RSS_LIMIT_MB )); then
        log "[FATAL] Memory Leak Critico detectado no python (RSS: ${rss_mb}MB)! Disparando SIGKILL. 🚀💥"
        kill -9 "$server_pid"
      fi
    fi

    # Guard: swap above the limit means the device is about to lock up.
    if (( swap_pct > SWAP_LIMIT_PCT )); then
      log "[FATAL] Swap critical mass (96%). Destruindo montagens para resgatar Edge Device! 💥"
      # Best effort: the mount may not exist and the server may already be dead.
      fusermount -u "$MOUNT_POINT" 2>/dev/null || true
      if [[ -n "$server_pid" ]]; then
        kill -9 "$server_pid" 2>/dev/null || true
      fi
      exit 1
    fi

    sleep "$POLL_SECONDS"
  done
}

main "$@"