frankenstallm / source /scripts /clickhouse-watchdog.sh
pathcosmos's picture
Upload folder using huggingface_hub (#17)
48ecd01
raw
history blame
6.04 kB
#!/usr/bin/env bash
#
# clickhouse-watchdog.sh — ClickHouse health check + automatic restart
# Register this script in crontab so it runs every minute
#
# Usage:
# */1 * * * * /PROJECT/0325120031_A/ghong/taketimes/llm-bang/scripts/clickhouse-watchdog.sh
#
set -euo pipefail

# ── Settings ──────────────────────────────────────────
# ClickHouse binary and the config file passed to it via --config-file.
CH_BIN='/PROJECT/0325120031_A/ghong/taketimes/clickhouse-bin'
CH_CONFIG='/PROJECT/0325120031_A/ghong/taketimes/llm-bang/configs/clickhouse-config.xml'

# Endpoints probed by the health checks.
HOST='127.0.0.1'
TCP_PORT=9000
HTTP_PORT=8123

# Watchdog log file and the size (in bytes) above which it is rotated.
LOG_DIR='/tmp/clickhouse'
LOG_FILE="${LOG_DIR}/watchdog.log"
MAX_LOG_SIZE=$((10 << 20)) # 10 MB rotation threshold

# Restart throttling: timestamp file plus the minimum gap between restarts.
LAST_RESTART_FILE='/tmp/clickhouse-last-restart'
RESTART_COOLDOWN=180 # seconds — no new restart attempt within this window

# Health-check timeout, in seconds.
HEALTH_CHECK_TIMEOUT=5

# ── Functions ─────────────────────────────────────────
# The log directory must exist before the first log() call appends to it.
mkdir -p "$LOG_DIR"
#######################################
# Append one timestamped line to the watchdog log.
# Globals:   LOG_FILE (read) - file the line is appended to
# Arguments: message words; joined into a single string via "$*"
# Outputs:   "[YYYY-MM-DD HH:MM:SS] [clickhouse-watchdog] <message>" in LOG_FILE
#######################################
log() {
  printf '[%s] [clickhouse-watchdog] %s\n' \
    "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}
#######################################
# Rotate a log file once it grows beyond MAX_LOG_SIZE bytes.
# The file is renamed to "<file>.old" (replacing any previous .old) and the
# rotation is recorded through log().
# Globals:   MAX_LOG_SIZE (read)
# Arguments: $1 - path of the log file to inspect
# Returns:   0 when nothing needed rotating; otherwise the status of log()
#######################################
rotate_log() {
  local file="$1"
  local size

  # Nothing to do when the file does not exist yet.
  [[ -f "$file" ]] || return 0

  # If stat fails for any reason, treat the size as 0 (no rotation).
  size=$(stat -c%s "$file" 2>/dev/null || echo 0)
  if (( size > MAX_LOG_SIZE )); then
    mv "$file" "${file}.old"
    log "Log rotated: $file"
  fi
}
#######################################
# Check whether something is listening on the native TCP port.
# Uses `ss` when it is available (listening sockets filtered by source port);
# otherwise falls back to opening a connection through bash's /dev/tcp.
# Globals:   HOST, TCP_PORT (read)
# Returns:   0 if the port looks open, non-zero otherwise
#######################################
is_tcp_port_open() {
  if ! command -v ss &>/dev/null; then
    # No ss: a successful redirect to /dev/tcp means the connect worked.
    (echo > /dev/tcp/"$HOST"/"$TCP_PORT") 2>/dev/null
    return
  fi

  ss -tlnH "sport = :${TCP_PORT}" 2>/dev/null | grep -q "$TCP_PORT"
}
#######################################
# Check the HTTP interface by requesting /ping and expecting the body "Ok.".
# (Command substitution drops the trailing newline of the response.)
# Without curl, only a TCP connect to the HTTP port is attempted.
# Globals:   HOST, HTTP_PORT, HEALTH_CHECK_TIMEOUT (read)
# Returns:   0 if the endpoint answered as expected, non-zero otherwise
#######################################
is_http_responding() {
  if ! command -v curl &>/dev/null; then
    # No curl available: fall back to checking that the port accepts a connect.
    (echo > /dev/tcp/"$HOST"/"$HTTP_PORT") 2>/dev/null
    return
  fi

  local reply
  # A curl failure is not fatal here; it simply yields an empty reply.
  reply=$(curl -s --max-time "$HEALTH_CHECK_TIMEOUT" "http://${HOST}:${HTTP_PORT}/ping" 2>/dev/null || true)
  [[ "$reply" == "Ok." ]]
}
#######################################
# Check whether a ClickHouse server process started with --daemon exists.
# Per the original author's note, ClickHouse's own internal watchdog process
# is named "clickhouse-watchdog"; matching on the --daemon flag keeps this
# check from confusing the server with this script (clickhouse-watchdog.sh).
# Returns:   pgrep's exit status (0 when at least one process matches)
#######################################
is_process_alive() {
  local -r daemon_pattern="clickhouse.*server.*--daemon"
  pgrep -f "$daemon_pattern" &>/dev/null
}
#######################################
# Verify the server actually answers by running "SELECT 1" through the client.
# The call is bounded by HEALTH_CHECK_TIMEOUT (via coreutils `timeout`, when
# installed) so that a hung server cannot block the watchdog itself; without
# a bound, every cron run would pile up behind a stuck client.
# Globals:   CH_BIN, TCP_PORT (read); HEALTH_CHECK_TIMEOUT (read, default 5)
# Returns:   0 if the client printed exactly "1", non-zero otherwise
#######################################
can_execute_query() {
  local -a probe=("$CH_BIN" client --port "$TCP_PORT" --query "SELECT 1")
  local result

  # Without `timeout`, fall back to the unbounded call rather than failing
  # the health check outright.
  if command -v timeout &>/dev/null; then
    probe=(timeout "${HEALTH_CHECK_TIMEOUT:-5}" "${probe[@]}")
  fi

  # Client errors and timeouts are expected failure modes: they yield an
  # empty result, which the comparison below reports as "not healthy".
  result=$("${probe[@]}" 2>/dev/null || true)
  [[ "$result" == "1" ]]
}
#######################################
# Tell whether the last restart happened less than RESTART_COOLDOWN seconds ago.
# The timestamp file lives at a predictable path, so its contents are treated
# as untrusted: only a plain run of digits is accepted. Anything else (empty,
# truncated, or crafted content) is ignored instead of being handed to shell
# arithmetic, where it could raise an error or even trigger command
# substitution.
# Globals:   LAST_RESTART_FILE, RESTART_COOLDOWN (read)
# Returns:   0 if the cooldown is active, 1 otherwise (including when the
#            file is missing or does not hold a valid epoch timestamp)
#######################################
cooldown_active() {
  [[ -f "$LAST_RESTART_FILE" ]] || return 1

  local last_restart now elapsed
  last_restart=$(cat "$LAST_RESTART_FILE" 2>/dev/null || true)

  # Reject anything that is not purely numeric before doing arithmetic on it.
  if [[ ! "$last_restart" =~ ^[0-9]+$ ]]; then
    return 1
  fi

  now=$(date +%s)
  # 10# forces base 10 so a value with leading zeros is not parsed as octal.
  elapsed=$(( now - 10#$last_restart ))
  if (( elapsed < RESTART_COOLDOWN )); then
    return 0 # still cooling down
  fi
  return 1 # cooldown expired
}
#######################################
# Stop any ClickHouse server processes that were started with --daemon.
# Sends TERM first and waits 3 seconds; whatever still matches afterwards is
# killed with signal 9, followed by a 2 second pause. This script's own PID
# is filtered out of the candidate list.
# Returns:   0 (kill failures are deliberately ignored)
#######################################
stop_existing() {
  local -r server_pattern="clickhouse.*server.*--daemon"
  local -r self_pid=$$
  local victims

  log "Stopping existing ClickHouse processes..."

  # Graceful shutdown attempt for the server processes.
  victims=$(pgrep -f "$server_pattern" 2>/dev/null | grep -v "^${self_pid}$" || true)
  if [[ -z "$victims" ]]; then
    return 0
  fi
  log "Sending TERM to PIDs: $victims"
  printf '%s\n' "$victims" | xargs kill -TERM 2>/dev/null || true
  sleep 3

  # Anything that survived the TERM gets force-killed.
  victims=$(pgrep -f "$server_pattern" 2>/dev/null | grep -v "^${self_pid}$" || true)
  if [[ -z "$victims" ]]; then
    return 0
  fi
  log "Force killing PIDs: $victims"
  printf '%s\n' "$victims" | xargs kill -9 2>/dev/null || true
  sleep 2
}
#######################################
# (Re)start the ClickHouse server in daemon mode and wait for it to respond.
# Existing server processes are stopped first. After launching, the server is
# polled once per second, up to 15 times, until the TCP port is open and a
# test query succeeds. The restart timestamp is written on both outcomes so
# the cooldown applies to failed attempts as well.
# Globals:   CH_BIN, CH_CONFIG, LAST_RESTART_FILE (read; the file is written)
# Returns:   0 once the server answers, 1 if it never did within the window
#######################################
start_server() {
  log "Starting ClickHouse server (daemon mode)..."

  # Clean up any existing server processes.
  stop_existing

  # Create the directories this setup needs.
  mkdir -p /tmp/clickhouse/logs
  mkdir -p /tmp/clickhouse-tmp

  # Launch in daemon mode. The exit status is captured explicitly so a failed
  # launch is recorded in the log and cannot abort the script under `set -e`
  # before the outcome is logged. Polling still runs afterwards, so the final
  # verdict always comes from the health checks.
  local launch_rc=0
  "$CH_BIN" server --config-file="$CH_CONFIG" --daemon || launch_rc=$?
  if [[ $launch_rc -ne 0 ]]; then
    log "WARN: ClickHouse launch command exited with status ${launch_rc}"
  fi

  # Wait for startup and verify (up to 15 seconds).
  local attempts=0
  local max_attempts=15
  while [[ $attempts -lt $max_attempts ]]; do
    sleep 1
    attempts=$((attempts + 1))
    if is_tcp_port_open && can_execute_query; then
      date +%s > "$LAST_RESTART_FILE"
      log "ClickHouse started successfully (took ${attempts}s)"
      return 0
    fi
  done

  date +%s > "$LAST_RESTART_FILE"
  log "ERROR: ClickHouse did not respond within ${max_attempts}s after start"
  return 1
}
# ── Main logic ────────────────────────────────────────
rotate_log "$LOG_FILE"

# 1) Make sure the ClickHouse binary is present and executable.
if ! [[ -x "$CH_BIN" ]]; then
  log "FATAL: ClickHouse binary not found or not executable: $CH_BIN"
  exit 1
fi

# 2) Probe the process, the TCP port, and query execution.
process_ok=false
port_ok=false
query_ok=false
if is_process_alive; then process_ok=true; fi
if is_tcp_port_open; then port_ok=true; fi
# The query probe is only attempted when the port check passed.
if [[ "$port_ok" == true ]] && can_execute_query; then
  query_ok=true
fi

# 3) Decide: when every probe passed there is nothing to do.
if [[ "${process_ok}:${port_ok}:${query_ok}" == "true:true:true" ]]; then
  exit 0
fi

# Also probe HTTP (only used for the diagnostic log line below).
http_ok=false
if is_http_responding; then http_ok=true; fi

# Log which kind of unhealthy state was observed.
case "${process_ok}:${port_ok}:${query_ok}" in
  false:*)
    log "WARN: ClickHouse is completely down (no process found)."
    ;;
  true:false:*)
    log "WARN: Process alive but TCP port $TCP_PORT not listening."
    ;;
  true:true:false)
    log "WARN: Process alive, port open, but query failed. Possible hung state."
    ;;
esac
log "Status: process=$process_ok port=$port_ok query=$query_ok http=$http_ok"

# 4) Respect the restart cooldown.
if cooldown_active; then
  log "Cooldown active (last restart < ${RESTART_COOLDOWN}s ago). Skipping."
  exit 0
fi

# 5) Restart the server.
log "Attempting ClickHouse restart..."
if ! start_server; then
  log "ClickHouse restart FAILED"
  exit 1
fi
log "ClickHouse restart SUCCESS"