somratpro Claude Sonnet 4.6 committed on
Commit
b6e5beb
·
1 Parent(s): 1ca1144

fix: 12-point production audit — bugs, security, perf, reliability

Browse files

start.sh:
- BACKUP_DATASET_NAME/SYNC_INTERVAL are now defaulted before export — previously
an empty string was passed to deerflow-sync.py, causing an invalid repo name (bug)
- Export FRONTEND_PORT and BACKEND_PORT so health-server.js and child
processes receive correct values from environment
- openssl JWT generation now has a Python fallback — the openssl CLI is not
guaranteed to be present in the python:3.12-slim-bookworm image
- Remove dead os.environ["CORS_ORIGINS"] in Python heredoc — Python
subprocess env never propagates back to parent bash (dead code)
- Graceful shutdown uses explicit PID list instead of $(jobs -p) which
is unreliable in bash subshells
- Truncate log files on startup to prevent unbounded disk growth

health-server.js:
- sync "error" status now renders as "off" (red) not "neutral" (grey)
- Dashboard response includes Cache-Control: no-store header

nginx.conf:
- gzip compression enabled for JSON/HTML/JS — large research reports
can be 60-80% smaller over the wire
- server_tokens off — stop leaking nginx version in error responses

Dockerfile:
- Replace jq (unused) with openssl (explicit, needed for JWT generation)
- HEALTHCHECK start-period 60s → 120s — restore + backend + frontend
can legitimately take up to 2 min; 60s caused premature unhealthy marks

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (4) hide show
  1. Dockerfile +3 -3
  2. health-server.js +2 -1
  3. nginx.conf +11 -0
  4. start.sh +18 -16
Dockerfile CHANGED
@@ -47,7 +47,7 @@ ARG NODE_MAJOR=22
47
 
48
  # Layer 1: nginx + base tools (rarely changes — stays cached)
49
  RUN apt-get update && apt-get install -y --no-install-recommends \
50
- curl ca-certificates gnupg nginx jq \
51
  && rm -rf /var/lib/apt/lists/*
52
 
53
  # Layer 2: Node.js (separate layer — apt network stall doesn't force pip re-run)
@@ -114,8 +114,8 @@ WORKDIR /app
114
 
115
  EXPOSE 7860
116
 
117
- # 60s start period — no compilation, just config generation + service startup
118
- HEALTHCHECK --interval=30s --timeout=10s --start-period=60s \
119
  CMD curl -fsS http://localhost:7860/health || exit 1
120
 
121
  CMD ["/app/start.sh"]
 
47
 
48
  # Layer 1: nginx + base tools (rarely changes — stays cached)
49
  RUN apt-get update && apt-get install -y --no-install-recommends \
50
+ curl ca-certificates gnupg nginx openssl \
51
  && rm -rf /var/lib/apt/lists/*
52
 
53
  # Layer 2: Node.js (separate layer — apt network stall doesn't force pip re-run)
 
114
 
115
  EXPOSE 7860
116
 
117
+ # 120s start period — restore + backend + frontend startup can take up to 2 min
118
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=120s \
119
  CMD curl -fsS http://localhost:7860/health || exit 1
120
 
121
  CMD ["/app/start.sh"]
health-server.js CHANGED
@@ -95,6 +95,7 @@ function renderDashboard({ backendUp, frontendUp, uptimeHuman, sync, keepalive }
95
  const appOnline = backendUp && frontendUp;
96
  const syncStatus = String(sync?.status || "unknown");
97
  const syncTone = ["success","restored","synced","configured"].includes(syncStatus) ? "ok"
 
98
  : syncStatus === "disabled" ? "warn" : "neutral";
99
  const kaOk = keepalive?.configured === true;
100
  const kaTone = kaOk ? "ok" : process.env.CLOUDFLARE_WORKERS_TOKEN ? "warn" : "neutral";
@@ -255,7 +256,7 @@ const server = http.createServer(async (req, res) => {
255
  probe(NGINX_HOST, NGINX_PORT, "/health"),
256
  tcpProbe(NGINX_HOST, FRONTEND_PORT),
257
  ]);
258
- res.writeHead(200, { "Content-Type": "text/html; charset=utf-8" });
259
  return res.end(renderDashboard({
260
  backendUp, frontendUp,
261
  uptimeHuman: formatUptime(Date.now() - startTime),
 
95
  const appOnline = backendUp && frontendUp;
96
  const syncStatus = String(sync?.status || "unknown");
97
  const syncTone = ["success","restored","synced","configured"].includes(syncStatus) ? "ok"
98
+ : syncStatus === "error" ? "off"
99
  : syncStatus === "disabled" ? "warn" : "neutral";
100
  const kaOk = keepalive?.configured === true;
101
  const kaTone = kaOk ? "ok" : process.env.CLOUDFLARE_WORKERS_TOKEN ? "warn" : "neutral";
 
256
  probe(NGINX_HOST, NGINX_PORT, "/health"),
257
  tcpProbe(NGINX_HOST, FRONTEND_PORT),
258
  ]);
259
+ res.writeHead(200, { "Content-Type": "text/html; charset=utf-8", "Cache-Control": "no-store" });
260
  return res.end(renderDashboard({
261
  backendUp, frontendUp,
262
  uptimeHuman: formatUptime(Date.now() - startTime),
nginx.conf CHANGED
@@ -21,6 +21,17 @@ http {
21
  tcp_nopush on;
22
  tcp_nodelay on;
23
  keepalive_timeout 65;
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # ── DeerFlow on HF Spaces ─────────────────────────────────────
26
  server {
 
21
  tcp_nopush on;
22
  tcp_nodelay on;
23
  keepalive_timeout 65;
24
+ server_tokens off;
25
+
26
+ # Gzip — compresses API JSON, HTML, JS (big win for research reports)
27
+ gzip on;
28
+ gzip_vary on;
29
+ gzip_proxied any;
30
+ gzip_comp_level 5;
31
+ gzip_min_length 1024;
32
+ gzip_types text/plain text/css text/xml text/javascript
33
+ application/json application/javascript application/xml
34
+ application/x-javascript image/svg+xml;
35
 
36
  # ── DeerFlow on HF Spaces ─────────────────────────────────────
37
  server {
start.sh CHANGED
@@ -17,12 +17,16 @@ SYNC_INTERVAL="${SYNC_INTERVAL:-600}"
17
  BACKEND_READY_TIMEOUT="${BACKEND_READY_TIMEOUT:-120}"
18
  FRONTEND_READY_TIMEOUT="${FRONTEND_READY_TIMEOUT:-120}"
19
 
 
 
 
 
20
  # Export shell vars so inline Python scripts can read them via os.environ
21
- export DATA_DIR CONFIG_PATH BACKUP_DATASET_NAME SYNC_INTERVAL
22
  export DEER_FLOW_HOME="$DATA_DIR"
23
  export DEER_FLOW_CONFIG_PATH="$CONFIG_PATH"
24
  export DEER_FLOW_SKILLS_PATH="/app/skills"
25
- export NGINX_PORT PUBLIC_PORT
26
 
27
  echo ""
28
  echo " ╔══════════════════════════════════════════╗"
@@ -69,7 +73,8 @@ if [ -z "${AUTH_JWT_SECRET:-}" ]; then
69
  AUTH_JWT_SECRET=$(cat "$AUTH_JWT_SECRET_FILE")
70
  echo "AUTH_JWT_SECRET loaded from disk."
71
  else
72
- AUTH_JWT_SECRET=$(openssl rand -base64 48 | tr -d '\n')
 
73
  printf '%s' "$AUTH_JWT_SECRET" > "$AUTH_JWT_SECRET_FILE"
74
  chmod 600 "$AUTH_JWT_SECRET_FILE"
75
  echo "AUTH_JWT_SECRET generated and saved to disk."
@@ -298,15 +303,6 @@ base["skills"]["path"] = "/app/skills"
298
  base.setdefault("agents_api", {})
299
  base["agents_api"]["enabled"] = True
300
 
301
- # CORS: allow HF Space URL + localhost
302
- space_host = os.environ.get("SPACE_HOST", "")
303
- cors_origins = ["http://localhost:3000", "http://localhost:7860"]
304
- if space_host:
305
- cors_origins.append(f"https://{space_host}")
306
-
307
- # Set via env (picked up by gateway config loader)
308
- os.environ["CORS_ORIGINS"] = ",".join(cors_origins)
309
-
310
  config_path.parent.mkdir(parents=True, exist_ok=True)
311
  config_path.write_text(yaml.safe_dump(base, sort_keys=False, allow_unicode=True))
312
  config_path.chmod(0o600)
@@ -353,15 +349,21 @@ graceful_shutdown() {
353
  echo "Saving state to HF Dataset..."
354
  python3 "$APP_DIR/deerflow-sync.py" sync-once || echo "Warning: shutdown sync failed."
355
  fi
356
- # Stop nginx daemon (nginx -s quit = graceful drain)
357
  nginx -s quit 2>/dev/null || true
358
- # Stop background shell jobs (health-server, backend, frontend, sync loop)
359
- kill $(jobs -p) 2>/dev/null || true
360
- sleep 2
 
 
361
  exit 0
362
  }
363
  trap graceful_shutdown SIGTERM SIGINT
364
 
 
 
 
 
 
365
  # ── Start health-server (public port 7860) ────────────────────────
366
  echo "Starting health-server on port $PUBLIC_PORT..."
367
  node "$APP_DIR/health-server.js" 2>&1 | tee -a "$DATA_DIR/logs/health-server.log" &
 
17
  BACKEND_READY_TIMEOUT="${BACKEND_READY_TIMEOUT:-120}"
18
  FRONTEND_READY_TIMEOUT="${FRONTEND_READY_TIMEOUT:-120}"
19
 
20
+ # Apply defaults before exporting so downstream tools never see empty strings
21
+ export BACKUP_DATASET_NAME="${BACKUP_DATASET_NAME:-huggingflow-backup}"
22
+ export SYNC_INTERVAL="${SYNC_INTERVAL:-600}"
23
+
24
  # Export shell vars so inline Python scripts can read them via os.environ
25
+ export DATA_DIR CONFIG_PATH
26
  export DEER_FLOW_HOME="$DATA_DIR"
27
  export DEER_FLOW_CONFIG_PATH="$CONFIG_PATH"
28
  export DEER_FLOW_SKILLS_PATH="/app/skills"
29
+ export NGINX_PORT PUBLIC_PORT FRONTEND_PORT BACKEND_PORT
30
 
31
  echo ""
32
  echo " ╔══════════════════════════════════════════╗"
 
73
  AUTH_JWT_SECRET=$(cat "$AUTH_JWT_SECRET_FILE")
74
  echo "AUTH_JWT_SECRET loaded from disk."
75
  else
76
+ # Try the openssl CLI first. A pipeline's exit status is that of its last
+ # command (tr), so a missing openssl would not trigger an `||` fallback unless
+ # pipefail is set; test for an empty result instead and fall back to Python.
+ AUTH_JWT_SECRET=$(openssl rand -base64 48 2>/dev/null | tr -d '\n' || true)
77
+ [ -n "$AUTH_JWT_SECRET" ] || AUTH_JWT_SECRET=$(python3 -c "import secrets; print(secrets.token_urlsafe(64))")
78
  printf '%s' "$AUTH_JWT_SECRET" > "$AUTH_JWT_SECRET_FILE"
79
  chmod 600 "$AUTH_JWT_SECRET_FILE"
80
  echo "AUTH_JWT_SECRET generated and saved to disk."
 
303
  base.setdefault("agents_api", {})
304
  base["agents_api"]["enabled"] = True
305
 
 
 
 
 
 
 
 
 
 
306
  config_path.parent.mkdir(parents=True, exist_ok=True)
307
  config_path.write_text(yaml.safe_dump(base, sort_keys=False, allow_unicode=True))
308
  config_path.chmod(0o600)
 
349
  echo "Saving state to HF Dataset..."
350
  python3 "$APP_DIR/deerflow-sync.py" sync-once || echo "Warning: shutdown sync failed."
351
  fi
 
352
  nginx -s quit 2>/dev/null || true
353
+ # Kill tracked PIDs explicitly — more reliable than $(jobs -p) in bash
354
+ for pid in "${FRONTEND_PID:-}" "${HEALTH_PID:-}" "${BACKEND_PID:-}"; do
355
+ [ -n "$pid" ] && kill "$pid" 2>/dev/null || true
356
+ done
357
+ sleep 3
358
  exit 0
359
  }
360
  trap graceful_shutdown SIGTERM SIGINT
361
 
362
+ # ── Truncate logs on startup (prevent unbounded growth) ──────────
363
+ for _log in health-server backend frontend; do
364
+ : > "$DATA_DIR/logs/$_log.log" 2>/dev/null || true
365
+ done
366
+
367
  # ── Start health-server (public port 7860) ────────────────────────
368
  echo "Starting health-server on port $PUBLIC_PORT..."
369
  node "$APP_DIR/health-server.js" 2>&1 | tee -a "$DATA_DIR/logs/health-server.log" &