#!/usr/bin/env bash
# hf-entrypoint.sh - container entrypoint for HF Spaces
set -o errexit -o nounset -o pipefail

echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting..."

# ============================================
# 0. Persist the environment under /etc/profile.d so that later bash
#    sessions can use it
# ============================================
if [[ ! -x /usr/local/bin/save-env.sh ]]; then
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: warning: save-env.sh not found, skipping env export"
else
  /usr/local/bin/save-env.sh
fi

# Load the previously saved environment, when the file is present.
if [[ -f /etc/profile.d/openclaw-env.sh ]]; then
  # shellcheck source=/dev/null
  . /etc/profile.d/openclaw-env.sh
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: loaded environment from /etc/profile.d/openclaw-env.sh"
fi
# ============================================
# 1. Start supervisord (manages cron + openclaw-gateway)
# ============================================
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting supervisord..."
mkdir -p /var/run /var/log/supervisor /var/log/hf-entrypoint
/usr/bin/supervisord -c /etc/supervisor/supervisord.conf \
  >> /var/log/hf-entrypoint/supervisord-stdout.log \
  2>> /var/log/hf-entrypoint/supervisord-stderr.log &
SUPERVISORD_PID=$!
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: supervisord started (pid=$SUPERVISORD_PID)"

# Wait for supervisord to write its pid file, but never block forever: if the
# file does not show up (e.g. supervisord exited on a bad config) the
# entrypoint logs a warning and carries on instead of hanging.
# SUPERVISORD_WAIT_TIMEOUT (seconds, default 60) bounds the wait.
SUPERVISORD_WAIT_TIMEOUT="${SUPERVISORD_WAIT_TIMEOUT:-60}"
if [[ ! "$SUPERVISORD_WAIT_TIMEOUT" =~ ^[0-9]+$ ]]; then
  SUPERVISORD_WAIT_TIMEOUT=60
fi
# The pid file is polled every 0.5s, hence two polls per second.
_supervisord_polls_left=$((SUPERVISORD_WAIT_TIMEOUT * 2))
while [[ ! -f /var/run/supervisord.pid ]] && ((_supervisord_polls_left > 0)); do
  sleep 0.5
  _supervisord_polls_left=$((_supervisord_polls_left - 1))
done

if [[ -f /var/run/supervisord.pid ]]; then
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: supervisord is ready"
else
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: /var/run/supervisord.pid not found after ${SUPERVISORD_WAIT_TIMEOUT}s, continuing (see /var/log/hf-entrypoint/supervisord-stderr.log)"
fi
# 1.0 Start the SSH service and its watchdog (keeps SSH continuously available)
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting SSH service and watchdog..."

# Set the root password when the ROOT_PASSWORD environment variable is provided.
if [[ -n "${ROOT_PASSWORD:-}" ]]; then
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: setting root password..."
  # The pipeline is tested directly by `if`: as a standalone statement a
  # failing chpasswd would abort the whole script under `set -e -o pipefail`
  # before its status could be inspected, so the warning below would never run.
  if printf 'root:%s\n' "$ROOT_PASSWORD" | chpasswd 2>/dev/null; then
    # Make sure the root account is not locked.
    passwd -u root 2>/dev/null || true
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: root password set successfully"
  else
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: failed to set root password"
  fi
fi
# 0.1 Create the sshd privilege-separation directories and clear leftover
#     PID files / sockets
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: preparing SSH runtime directories..."
mkdir -p /run/sshd /var/run/sshd 2>/dev/null || true
chmod 755 /run/sshd /var/run/sshd 2>/dev/null || true
rm -f /var/run/sshd.pid /var/run/sshd.init.pid 2>/dev/null || true
# OpenSSH keeps agent sockets inside /tmp/ssh-XXXXXX directories; a plain
# `rm -f` cannot remove directories, so the cleanup needs -r. Best effort.
rm -rf -- /tmp/ssh-* 2>/dev/null || true

# 0.2 Generate SSH host keys (when the RSA host key is missing)
if [[ ! -f "/etc/ssh/ssh_host_rsa_key" ]]; then
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: generating SSH host keys..."
  # Still best effort, but a failure is now reported instead of being hidden.
  if ! ssh-keygen -A 2>/dev/null; then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: ssh-keygen -A failed, sshd may be unable to start"
  fi
fi
# 1. Make sure the SSH service is running
if ! pgrep -x "sshd" > /dev/null 2>&1; then
  # Locate the sshd executable in its two usual locations.
  _sshd_bin=""
  if [[ -x "/usr/sbin/sshd" ]]; then
    _sshd_bin="/usr/sbin/sshd"
  elif [[ -x "/usr/bin/sshd" ]]; then
    _sshd_bin="/usr/bin/sshd"
  fi

  if [[ -n "$_sshd_bin" ]]; then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting sshd from $_sshd_bin..."
    # A non-zero exit must not abort the entrypoint under `set -e`; whether
    # sshd actually came up is decided by the pgrep check below.
    "$_sshd_bin" || true
    sleep 2
    if pgrep -x "sshd" > /dev/null 2>&1; then
      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: sshd started successfully"

      # Make sshd a less likely OOM-killer victim by lowering its
      # oom_score_adj. The write is wrapped in a group so that a failing
      # redirection is silenced too, and the outcome is logged truthfully.
      _sshd_oom_ok=1
      for _pid in $(pgrep -x "sshd" 2>/dev/null); do
        { echo -500 > "/proc/$_pid/oom_score_adj"; } 2>/dev/null || _sshd_oom_ok=0
      done
      if ((_sshd_oom_ok)); then
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: sshd OOM protection applied (oom_score_adj=-500)"
      else
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: could not apply sshd OOM protection (oom_score_adj not writable)"
      fi

      # Adjust the kernel overcommit policy to reduce the chance of the OOM
      # killer hitting critical services. /proc/sys may not be writable from
      # inside the container, so only report success when the write worked.
      if { echo "2" > /proc/sys/vm/overcommit_memory; } 2>/dev/null; then
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: vm.overcommit_memory=2 set (never overcommit)"
      else
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: could not set vm.overcommit_memory (not writable)"
      fi
    else
      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: sshd failed to start, will be handled by watchdog"
    fi
  else
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: sshd executable not found"
  fi
fi
# Start the SSH watchdog (keeps the SSH service available over time).
# SSH_WATCHDOG_DOCKER_LOG controls whether the watchdog output is redirected
# to the container's standard streams (Docker logs). Default: false.
: "${SSH_WATCHDOG_DOCKER_LOG:=false}"

if [[ ! -x "/usr/local/bin/ssh_service_watchdog.sh" ]]; then
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: ssh_service_watchdog.sh not found"
else
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting SSH watchdog (docker_log=$SSH_WATCHDOG_DOCKER_LOG)..."
  case "$SSH_WATCHDOG_DOCKER_LOG" in
    true)
      # Send the watchdog output to PID 1's stdout/stderr so it stays visible
      # through `docker logs` even when no terminal can be attached.
      nohup /usr/local/bin/ssh_service_watchdog.sh >> /proc/1/fd/1 2>> /proc/1/fd/2 &
      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: SSH watchdog logs are available via: docker logs <container>"
      ;;
    *)
      # No redirection to Docker logs; per the original note the watchdog
      # writes only to its local file /var/log/ssh_watchdog.log.
      nohup /usr/local/bin/ssh_service_watchdog.sh > /dev/null 2>&1 &
      ;;
  esac
  # $! still refers to the watchdog: nothing else was backgrounded since.
  SSH_WATCHDOG_PID=$!
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: SSH watchdog started (pid=$SSH_WATCHDOG_PID)"
fi
# 1.1 Start BT Panel (launched alongside the restore to save time)
# ============================================
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting BT Panel..."
if [[ ! -f "/www/server/panel/default.pl" ]]; then
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: BT Panel not installed, skipping"
else
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: BT Panel is installed, starting..."
  # Both commands are best effort: their errors are discarded and never fatal.
  bt start 2>/dev/null || true
  bt default 2>/dev/null || true
fi
# 1.2 Wait for openclaw-gateway to finish its restore
# ============================================
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: waiting for openclaw-gateway to complete restore..."

# Files involved in the wait.
RESTORE_COMPLETED_FILE="/tmp/openclaw-restore-completed"
RESTORE_LOG_FILE="/var/log/openclaw/restore.log"
# Not referenced anywhere else in the visible part of this script.
OPENCLAW_LOG_FILE="/var/log/hf-entrypoint/openclaw-gateway-stdout.log"

# Tunables (all in seconds).
PROGRESS_CHECK_INTERVAL=20
SLOW_WARN_THRESHOLD=900 # warn once the wait exceeds 15 minutes
MAX_WAIT_TIMEOUT=3600   # give up waiting after 1 hour and continue startup
IDLE_WARN_THRESHOLD=120 # intended alert level for 120s without new log output

# Mutable counters shared by the wait loop and show_restore_progress.
WAITED=0
TIME_NO_NEW_LOG=0
LAST_RESTORE_LINE=0
LAST_LOG_SIZE=0

# Make sure the directory holding the restore log exists.
mkdir -p "${RESTORE_LOG_FILE%/*}"
#######################################
# Relay restore-log lines that appeared since the previous call.
# Globals:
#   RESTORE_LOG_FILE   (read)       path of the restore log
#   LAST_RESTORE_LINE  (read/write) number of complete lines already relayed
#   LAST_LOG_SIZE      (write)      size of the log in bytes as last observed
#   TIME_NO_NEW_LOG    (write)      reset to 0 whenever new lines are relayed
# Arguments: none
# Outputs:   a header plus each new line, timestamp-prefixed, on stdout
# Returns:   0
#######################################
show_restore_progress() {
  [[ -f "$RESTORE_LOG_FILE" ]] || return 0

  local current_lines
  current_lines=$(wc -l < "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
  # Keep digits only so the arithmetic below never sees padding or garbage.
  current_lines=${current_lines//[!0-9]/}
  current_lines=${current_lines:-0}

  # The log shrank (truncated or replaced): start again from its first line
  # instead of staying silent until it outgrows the old line count.
  if ((current_lines < LAST_RESTORE_LINE)); then
    LAST_RESTORE_LINE=0
  fi

  if ((current_lines > LAST_RESTORE_LINE)); then
    local first_new=$((LAST_RESTORE_LINE + 1))
    local new_lines=$((current_lines - LAST_RESTORE_LINE))
    local stamp
    stamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)
    echo "[$stamp] hf-entrypoint: --- Restore log ($new_lines new lines) ---"
    # Select the exact line range that was counted. `tail -n` counts from the
    # end of the file, so lines appended after the `wc` call (or an unfinished
    # last line) would shift the window and drop or repeat output.
    # `|| true`: a log that vanishes mid-read must not kill the entrypoint.
    sed -n "${first_new},${current_lines}p;${current_lines}q" "$RESTORE_LOG_FILE" 2>/dev/null \
      | while IFS= read -r line; do
        echo "[$stamp] $line"
      done || true
    LAST_RESTORE_LINE="$current_lines"
    TIME_NO_NEW_LOG=0
  fi

  # Track the log size in bytes as an activity indicator for the caller.
  local current_size
  current_size=$(stat -c%s "$RESTORE_LOG_FILE" 2>/dev/null \
    || wc -c < "$RESTORE_LOG_FILE" 2>/dev/null \
    || echo "0")
  current_size=${current_size//[!0-9]/}
  LAST_LOG_SIZE="${current_size:-0}"
}
# Poll every 2 seconds until the restore-completed flag file appears or
# MAX_WAIT_TIMEOUT seconds have been spent waiting.
while true; do
  # Print diagnostics once, on the first pass through the loop.
  if ((WAITED == 0)); then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: waiting for restore completion..."
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: RESTORE_LOG_FILE=$RESTORE_LOG_FILE"
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: RESTORE_COMPLETED_FILE=$RESTORE_COMPLETED_FILE"
  fi

  # Relay any new restore-log output.
  show_restore_progress

  # The completion flag file is the only success signal. It is checked before
  # the timeout so that a restore finishing on the last poll is reported as
  # completed rather than as timed out.
  if [[ -f "$RESTORE_COMPLETED_FILE" ]]; then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ✓ Restore completed"
    # Show the tail of the restore log.
    if [[ -f "$RESTORE_LOG_FILE" ]]; then
      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: --- Final restore log (last 10 lines) ---"
      tail -n 10 "$RESTORE_LOG_FILE" | while IFS= read -r line; do
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] $line"
      done
    fi
    break
  fi

  # Safety net: stop waiting after MAX_WAIT_TIMEOUT seconds and continue the
  # startup sequence regardless.
  if ((WAITED >= MAX_WAIT_TIMEOUT)); then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ WARNING: Restore timed out after ${WAITED}s ($((WAITED / 60))min), forcing proceed"
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ The restore process may still be running in the background"
    break
  fi

  sleep 2
  WAITED=$((WAITED + 2))

  # Accumulate how long the log has gone without growing.
  if [[ -f "$RESTORE_LOG_FILE" ]]; then
    current_size=$(stat -c%s "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
    if [[ "$current_size" -eq "$LAST_LOG_SIZE" ]]; then
      TIME_NO_NEW_LOG=$((TIME_NO_NEW_LOG + 2))
    else
      TIME_NO_NEW_LOG=0
    fi
  fi

  # Emit an aggregated status line every PROGRESS_CHECK_INTERVAL seconds.
  if ((WAITED % PROGRESS_CHECK_INTERVAL == 0)); then
    elapsed_min=$((WAITED / 60))
    log_size_str=""
    if [[ -f "$RESTORE_LOG_FILE" ]]; then
      # Human-readable size with one decimal digit (integer arithmetic only).
      file_size=$(stat -c%s "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
      if [[ $file_size -ge 1048576 ]]; then
        log_size_str="$((file_size / 1048576)).$(((file_size % 1048576) * 10 / 1048576))MB"
      elif [[ $file_size -ge 1024 ]]; then
        log_size_str="$((file_size / 1024)).$(((file_size % 1024) * 10 / 1024))KB"
      else
        log_size_str="${file_size}B"
      fi
    fi
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: still waiting for restore... (${WAITED}s / ${elapsed_min}min, log: ${log_size_str:-N/A})"

    # Past the slow threshold, repeat a warning once per minute.
    if ((WAITED >= SLOW_WARN_THRESHOLD && WAITED % 60 == 0)); then
      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ WARNING: Restore is taking longer than expected (>${elapsed_min}min). Large backup (>10GB) may require more time."
    fi

    # Idle alert: the idle time was already being accumulated above, but the
    # threshold was never compared against it. Repeated at most once a minute.
    if ((TIME_NO_NEW_LOG >= ${IDLE_WARN_THRESHOLD:-120} && WAITED % 60 == 0)); then
      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ WARNING: restore log has had no new output for ${TIME_NO_NEW_LOG}s"
    fi

    show_restore_progress
  fi
done
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ✓ Restore wait completed (${WAITED}s / $((WAITED / 60))min), proceeding with PM2 startup"
# 1.2 Make sure the cron daemon is running
if pgrep -x cron > /dev/null 2>&1; then
  : # cron is already running; nothing to do
else
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting cron daemon..."
  /usr/sbin/cron
fi
# ============================================
# 2. Start PM2 for additional node processes (only when some are defined)
# ============================================
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting PM2 for others..."
mkdir -p /root/.pm2 /var/log/hf-entrypoint

# Stays empty unless pm2-runtime is launched below.
PM2_PID=""
# A "name": entry in the ecosystem file is taken as the sign that at least
# one application is defined there.
if ! grep -qE '"name"\s*:' /app/pm2/ecosystem.config.js 2>/dev/null; then
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: PM2: no applications defined in ecosystem.config.js, skipping..."
else
  /usr/bin/pm2-runtime /app/pm2/ecosystem.config.js \
    >> /var/log/hf-entrypoint/pm2-stdout.log \
    2>> /var/log/hf-entrypoint/pm2-stderr.log &
  PM2_PID=$!
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: PM2 started (pid=$PM2_PID)"
fi
# ============================================
# 3. Signal forwarding (so that a SIGTERM sent to PID 1 reaches supervisord)
# ============================================
#######################################
# Ask every background service started by this script to terminate.
# Installed as the handler for TERM, INT and QUIT.
# Globals (read): SSH_WATCHDOG_PID, SUPERVISORD_PID, PM2_PID - each one is
#                 skipped when unset or empty.
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   0; the handler does not exit the shell itself.
#######################################
signal_handler() {
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: received SIGTERM, forwarding to all processes..."

  # Stop the SSH watchdog.
  if [[ -n "${SSH_WATCHDOG_PID:-}" ]]; then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: stopping SSH watchdog (pid=$SSH_WATCHDOG_PID)..."
    kill -TERM "$SSH_WATCHDOG_PID" 2>/dev/null || true
  fi

  # Stop supervisord. The guard keeps the handler safe under `set -u` when
  # the variable was never assigned.
  if [[ -n "${SUPERVISORD_PID:-}" ]]; then
    kill -TERM "$SUPERVISORD_PID" 2>/dev/null || true
  fi

  # Stop PM2. PM2_PID is empty when no applications were defined, and
  # `kill -TERM ""` is an invalid invocation, so it is skipped in that case.
  if [[ -n "${PM2_PID:-}" ]]; then
    kill -TERM "$PM2_PID" 2>/dev/null || true
  fi

  # Stop the SSH service.
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: stopping SSH service..."
  if pgrep -x "sshd" > /dev/null 2>&1; then
    killall sshd 2>/dev/null || true
  fi

  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: all services stopped"
}
# Run signal_handler on TERM, INT and QUIT.
trap signal_handler TERM INT QUIT
# ============================================
# 5. Start node hf-server.js as PID 1
# ============================================
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting node server.js..."
cd /app
# NOTE(review): `exec` replaces this shell with node, so the trap installed
# above only covers the short window before this line; afterwards signals are
# delivered to node itself. Confirm hf-server.js handles shutdown as intended.
# The stray trailing `|` that followed this command was a syntax error and
# has been removed.
exec node hf-server.js