Rick committed on
Commit
2c7b558
·
1 Parent(s): b937983

Fix HF submit gating and add runtime debug log panel in Settings

Browse files
Files changed (3)
  1. app.py +393 -11
  2. static/js/settings.js +95 -0
  3. templates/index.html +21 -1
app.py CHANGED
@@ -50,6 +50,8 @@ import re
50
  import mimetypes
51
  import io
52
  import logging
 
 
53
 
54
  import medgemma4
55
  import medgemma27b
@@ -232,6 +234,12 @@ HF_REMOTE_TOKEN = (
232
  )
233
  # Default remote text model; when MedGemma 4B/27B is selected we pass that through instead.
234
  REMOTE_MODEL = os.environ.get("REMOTE_MODEL") or "google/medgemma-1.5-4b-it"
 
 
 
 
 
 
235
 
236
  _dbg(
237
  "env flags: "
@@ -322,6 +330,74 @@ if LEGACY_CACHE.exists() and not (CACHE_DIR / ".migrated").exists() and CACHE_DI
322
 
323
  BACKUP_ROOT = BASE_STORE / "backups"
324
  BACKUP_ROOT.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  # Prefer keeping the DB alongside app.py; migrate from legacy data/ path if present
326
  DB_PATH = APP_HOME / "app.db"
327
  LEGACY_DB = APP_HOME / "data" / "app.db"
@@ -495,6 +571,15 @@ async def _log_db_path():
495
  """Print the fully-resolved database path at startup for operational visibility."""
496
  try:
497
  print(f"[startup] Database path: {DB_PATH.resolve()}", flush=True)
 
 
 
 
 
 
 
 
 
498
  except Exception as exc:
499
  print(f"[startup] Database path unavailable: {exc}", flush=True)
500
 
@@ -2423,6 +2508,39 @@ def _local_model_availability_payload() -> dict:
2423
  Local installs are always tracked; runnable models may include remote-mode
2424
  execution when local inference is disabled and a remote token is configured.
2425
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2426
  models = []
2427
  available_models = []
2428
  missing_models = []
@@ -2492,6 +2610,49 @@ async def chat_queue_status(request: Request, _=Depends(require_auth)):
2492
  return JSONResponse({"error": str(e)}, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
2493
 
2494
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2495
  @app.get("/api/triage/options")
2496
  async def triage_options(_=Depends(require_auth)):
2497
  """Expose dropdown options for triage form (table-backed)."""
@@ -3283,7 +3444,7 @@ def _cap_new_tokens(max_new_tokens, input_len, model_max_len):
3283
  return max_new_tokens
3284
 
3285
 
3286
- def _generate_response_local(model_choice: str, force_cpu_slow: bool, prompt: str, cfg: dict):
3287
  """
3288
  Generate Response Local helper.
3289
  Detailed inline notes are included to support safe maintenance and future edits.
@@ -3305,6 +3466,15 @@ def _generate_response_local(model_choice: str, force_cpu_slow: bool, prompt: st
3305
  raise RuntimeError(f"CUDA_NOT_AVAILABLE{detail}")
3306
  if is_large_model and runtime_device != "cuda" and not force_cpu_slow:
3307
  raise RuntimeError("SLOW_28B_CPU")
 
 
 
 
 
 
 
 
 
3308
  try:
3309
  if is_large_model:
3310
  # Ensure only one model family occupies VRAM at a time.
@@ -3347,6 +3517,16 @@ def _generate_response_local(model_choice: str, force_cpu_slow: bool, prompt: st
3347
  )
3348
  )
3349
  if not cuda_runtime_failure:
 
 
 
 
 
 
 
 
 
 
3350
  raise
3351
 
3352
  _dbg(f"generate_response: cuda runtime failure detected: {err_txt}")
@@ -3388,11 +3568,26 @@ def _generate_response_local(model_choice: str, force_cpu_slow: bool, prompt: st
3388
  )
3389
  models["active_name"] = model_name
3390
  models["is_text"] = True
 
 
 
 
 
 
 
 
3391
  _dbg(f"generate_response: local response_len={len(res)}")
3392
  return res
3393
 
3394
 
3395
- def _generate_response(model_choice: str, force_cpu_slow: bool, prompt: str, cfg: dict, prelocked: bool = False):
 
 
 
 
 
 
 
3396
  """
3397
  Generate Response helper.
3398
  Detailed inline notes are included to support safe maintenance and future edits.
@@ -3407,38 +3602,80 @@ def _generate_response(model_choice: str, force_cpu_slow: bool, prompt: str, cfg
3407
  if DISABLE_LOCAL_INFERENCE:
3408
  if not HF_REMOTE_TOKEN:
3409
  _dbg("generate_response: remote path selected but HF_REMOTE_TOKEN missing")
 
 
 
 
 
 
3410
  raise RuntimeError("REMOTE_TOKEN_MISSING")
3411
- client = InferenceClient(token=HF_REMOTE_TOKEN)
3412
  # Use requested model when provided (e.g., MedGemma) else default
3413
  model_name = model_choice or REMOTE_MODEL
3414
  _dbg(f"generate_response: remote inference model={model_name}")
3415
- resp = client.text_generation(
3416
- prompt,
 
 
3417
  model=model_name,
3418
- max_new_tokens=cfg["tk"],
3419
- temperature=cfg["t"],
3420
- top_p=cfg["p"],
 
3421
  top_k=cfg.get("k"),
 
3422
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3423
  out = resp.strip()
 
 
 
 
 
 
 
 
3424
  _dbg(f"generate_response: remote response_len={len(out)}")
3425
  return out
3426
 
3427
  if prelocked:
3428
- return _generate_response_local(model_choice, force_cpu_slow, prompt, cfg)
3429
  with MODEL_MUTEX:
3430
- return _generate_response_local(model_choice, force_cpu_slow, prompt, cfg)
3431
 
3432
 
3433
  @app.post("/api/chat")
3434
  async def chat(request: Request, _=Depends(require_auth)):
3435
  """Session-based chat endpoint (triage + inquiry)."""
 
3436
  try:
3437
  store = request.state.store
3438
  start_time = datetime.now()
 
3439
  form = await request.form()
3440
  msg = (form.get("message") or "").strip()
3441
  if not msg:
 
3442
  return JSONResponse({"error": "Message is required."}, status_code=status.HTTP_400_BAD_REQUEST)
3443
  user_msg_raw = msg
3444
  p_name = form.get("patient") or ""
@@ -3459,10 +3696,32 @@ async def chat(request: Request, _=Depends(require_auth)):
3459
  is_start = session_action == "start" or not session_id
3460
  if not session_id:
3461
  session_id = f"session-{uuid.uuid4().hex}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3462
 
3463
  if not DISABLE_LOCAL_INFERENCE:
3464
  model_availability = _local_model_availability_payload()
3465
  if not model_availability.get("has_any_local_model"):
 
 
 
 
 
 
 
3466
  return JSONResponse(
3467
  {
3468
  "error": (
@@ -3480,6 +3739,14 @@ async def chat(request: Request, _=Depends(require_auth)):
3480
  if not chosen_model:
3481
  model_choice = available_models[0]
3482
  elif chosen_model not in available_models:
 
 
 
 
 
 
 
 
3483
  return JSONResponse(
3484
  {
3485
  "error": (
@@ -3534,11 +3801,33 @@ async def chat(request: Request, _=Depends(require_auth)):
3534
  if override_prompt.strip():
3535
  prompt = override_prompt.strip()
3536
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3537
  # Persist the exact prompt submitted for debug visibility in Settings.
3538
  try:
3539
  set_settings_meta(last_prompt_verbatim=prompt)
3540
  except Exception:
3541
  logger.exception("Unable to persist last_prompt_verbatim")
 
 
 
 
 
 
3542
 
3543
  model_lock_acquired = False
3544
  busy_meta_set = False
@@ -3547,6 +3836,15 @@ async def chat(request: Request, _=Depends(require_auth)):
3547
  queue_snapshot = {}
3548
  queue_ticket, queue_snapshot = _chat_queue_enqueue(model_choice or "", mode, session_id, p_name)
3549
  if queue_ticket is None:
 
 
 
 
 
 
 
 
 
3550
  return JSONResponse(
3551
  {
3552
  "error": (
@@ -3559,10 +3857,26 @@ async def chat(request: Request, _=Depends(require_auth)):
3559
  status_code=status.HTTP_429_TOO_MANY_REQUESTS,
3560
  )
3561
  queue_position = safe_int(queue_snapshot.get("position"), 1)
 
 
 
 
 
 
 
 
 
3562
  if queue_position > 1 and not queue_wait_requested:
3563
  _chat_queue_release(queue_ticket)
3564
  active = queue_snapshot.get("active") or {}
3565
  active_model = (active.get("model_choice") or "").strip() or "another consultation"
 
 
 
 
 
 
 
3566
  return JSONResponse(
3567
  {
3568
  "error": (
@@ -3579,10 +3893,25 @@ async def chat(request: Request, _=Depends(require_auth)):
3579
  status_code=status.HTTP_429_TOO_MANY_REQUESTS,
3580
  )
3581
  queue_wait_seconds = await asyncio.to_thread(_chat_queue_wait_turn, queue_ticket)
 
 
 
 
 
 
 
3582
  _set_model_busy_meta(model_choice or "", mode, session_id, p_name)
3583
  busy_meta_set = True
3584
 
3585
  try:
 
 
 
 
 
 
 
 
3586
  if not DISABLE_LOCAL_INFERENCE:
3587
  MODEL_MUTEX.acquire()
3588
  model_lock_acquired = True
@@ -3593,11 +3922,34 @@ async def chat(request: Request, _=Depends(require_auth)):
3593
  prompt,
3594
  cfg,
3595
  True,
 
3596
  )
3597
  else:
3598
- res = await asyncio.to_thread(_generate_response, model_choice, force_cpu_slow, prompt, cfg)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3599
  except RuntimeError as e:
3600
  _dbg(f"chat runtime error: {e}")
 
 
 
 
 
 
 
3601
  if str(e) == "CHAT_QUEUE_TICKET_MISSING":
3602
  return JSONResponse(
3603
  {"error": "Queue ticket was lost. Please submit again."},
@@ -3624,6 +3976,18 @@ async def chat(request: Request, _=Depends(require_auth)):
3624
  },
3625
  status_code=status.HTTP_400_BAD_REQUEST,
3626
  )
 
 
 
 
 
 
 
 
 
 
 
 
3627
  if "Missing model cache" in str(e) or str(e) in {"REMOTE_TOKEN_MISSING", "LOCAL_INFERENCE_DISABLED"}:
3628
  return JSONResponse(
3629
  {"error": str(e), "offline_missing": True},
@@ -3712,6 +4076,16 @@ async def chat(request: Request, _=Depends(require_auth)):
3712
  upsert_history_entry(entry)
3713
 
3714
  metrics = _update_chat_metrics(store, models["active_name"])
 
 
 
 
 
 
 
 
 
 
3715
 
3716
  return JSONResponse(
3717
  {
@@ -3738,6 +4112,14 @@ async def chat(request: Request, _=Depends(require_auth)):
3738
  }
3739
  )
3740
  except Exception as e:
 
 
 
 
 
 
 
 
3741
  return JSONResponse({"error": str(e)}, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
3742
 
3743
 
 
50
  import mimetypes
51
  import io
52
  import logging
53
+ import traceback
54
+ from logging.handlers import RotatingFileHandler
55
 
56
  import medgemma4
57
  import medgemma27b
 
234
  )
235
  # Default remote text model; when MedGemma 4B/27B is selected we pass that through instead.
236
  REMOTE_MODEL = os.environ.get("REMOTE_MODEL") or "google/medgemma-1.5-4b-it"
237
+ try:
238
+ HF_REMOTE_TIMEOUT_SECONDS = float(os.environ.get("HF_REMOTE_TIMEOUT_SECONDS", "300").strip())
239
+ except Exception:
240
+ HF_REMOTE_TIMEOUT_SECONDS = 300.0
241
+ if HF_REMOTE_TIMEOUT_SECONDS < 10:
242
+ HF_REMOTE_TIMEOUT_SECONDS = 10.0
243
 
244
  _dbg(
245
  "env flags: "
 
330
 
331
  BACKUP_ROOT = BASE_STORE / "backups"
332
  BACKUP_ROOT.mkdir(parents=True, exist_ok=True)
333
+
334
+ # Runtime log file (used heavily for HF debugging and support snapshots).
335
+ LOG_ROOT = BASE_STORE / "logs"
336
+ LOG_ROOT.mkdir(parents=True, exist_ok=True)
337
+ RUNTIME_LOG_PATH = LOG_ROOT / "runtime.log"
338
+ try:
339
+ RUNTIME_LOG_MAX_BYTES = int((os.environ.get("RUNTIME_LOG_MAX_BYTES") or "5242880").strip())
340
+ except Exception:
341
+ RUNTIME_LOG_MAX_BYTES = 5242880
342
+ if RUNTIME_LOG_MAX_BYTES < 65536:
343
+ RUNTIME_LOG_MAX_BYTES = 65536
344
+ try:
345
+ RUNTIME_LOG_BACKUPS = int((os.environ.get("RUNTIME_LOG_BACKUPS") or "3").strip())
346
+ except Exception:
347
+ RUNTIME_LOG_BACKUPS = 3
348
+ if RUNTIME_LOG_BACKUPS < 1:
349
+ RUNTIME_LOG_BACKUPS = 1
350
+ RUNTIME_LOG_ENABLED = (os.environ.get("RUNTIME_LOG_ENABLED", "1").strip().lower() not in {"0", "false", "no", "off"})
351
+
352
+ runtime_logger = logging.getLogger("sma.runtime")
353
+ runtime_logger.setLevel(logging.INFO)
354
+ runtime_logger.propagate = False
355
+ if RUNTIME_LOG_ENABLED:
356
+ has_runtime_handler = any(getattr(h, "_sma_runtime_handler", False) for h in runtime_logger.handlers)
357
+ if not has_runtime_handler:
358
+ runtime_handler = RotatingFileHandler(
359
+ str(RUNTIME_LOG_PATH),
360
+ maxBytes=RUNTIME_LOG_MAX_BYTES,
361
+ backupCount=RUNTIME_LOG_BACKUPS,
362
+ encoding="utf-8",
363
+ )
364
+ runtime_handler._sma_runtime_handler = True # type: ignore[attr-defined]
365
+ runtime_handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
366
+ runtime_logger.addHandler(runtime_handler)
367
+
368
+
369
+ def _trim_log_value(value, limit: int = 900) -> str:
370
+ """Convert values to compact single-line strings for runtime logs."""
371
+ text = str(value)
372
+ if len(text) > limit:
373
+ text = text[: limit - 3] + "..."
374
+ return text.replace("\n", "\\n")
375
+
376
+
377
+ def _runtime_log(event: str, level: int = logging.INFO, **fields):
378
+ """Write structured runtime log entries to rotating file."""
379
+ if not RUNTIME_LOG_ENABLED:
380
+ return
381
+ parts = [f"event={_trim_log_value(event, 120)}"]
382
+ for key, val in fields.items():
383
+ parts.append(f"{key}={_trim_log_value(val)}")
384
+ runtime_logger.log(level, " | ".join(parts))
385
+
386
+
387
+ def _runtime_log_tail(lines: int = 500) -> str:
388
+ """Read the tail of the runtime log file as UTF-8 text."""
389
+ max_lines = max(50, min(int(lines or 500), 5000))
390
+ if not RUNTIME_LOG_PATH.exists():
391
+ return ""
392
+ try:
393
+ text = RUNTIME_LOG_PATH.read_text(encoding="utf-8", errors="replace")
394
+ except Exception:
395
+ return ""
396
+ log_lines = text.splitlines()
397
+ if len(log_lines) <= max_lines:
398
+ return text
399
+ return "\n".join(log_lines[-max_lines:])
400
+
401
  # Prefer keeping the DB alongside app.py; migrate from legacy data/ path if present
402
  DB_PATH = APP_HOME / "app.db"
403
  LEGACY_DB = APP_HOME / "data" / "app.db"
 
571
  """Print the fully-resolved database path at startup for operational visibility."""
572
  try:
573
  print(f"[startup] Database path: {DB_PATH.resolve()}", flush=True)
574
+ _runtime_log(
575
+ "startup.ready",
576
+ db_path=DB_PATH.resolve(),
577
+ runtime_log_path=RUNTIME_LOG_PATH.resolve(),
578
+ is_hf_space=IS_HF_SPACE,
579
+ disable_local_inference=DISABLE_LOCAL_INFERENCE,
580
+ remote_timeout_s=HF_REMOTE_TIMEOUT_SECONDS,
581
+ remote_token_set=bool(HF_REMOTE_TOKEN),
582
+ )
583
  except Exception as exc:
584
  print(f"[startup] Database path unavailable: {exc}", flush=True)
585
 
 
2508
  Local installs are always tracked; runnable models may include remote-mode
2509
  execution when local inference is disabled and a remote token is configured.
2510
  """
2511
+ if DISABLE_LOCAL_INFERENCE:
2512
+ remote_models_env = (os.environ.get("REMOTE_AVAILABLE_MODELS") or "").strip()
2513
+ if remote_models_env:
2514
+ remote_models = [
2515
+ item.strip()
2516
+ for item in remote_models_env.split(",")
2517
+ if item.strip()
2518
+ ]
2519
+ else:
2520
+ remote_models = [REMOTE_MODEL] if (REMOTE_MODEL or "").strip() else list(REQUIRED_MODELS)
2521
+ has_remote = bool(HF_REMOTE_TOKEN)
2522
+ models = [
2523
+ {
2524
+ "model": model_name,
2525
+ "installed": has_remote,
2526
+ "error": "" if has_remote else "remote inference token missing",
2527
+ }
2528
+ for model_name in remote_models
2529
+ ]
2530
+ message = "" if has_remote else (
2531
+ "Remote MedGemma is not configured. Set HF_REMOTE_TOKEN in Hugging Face Space secrets."
2532
+ )
2533
+ return {
2534
+ "mode": "remote",
2535
+ "models": models,
2536
+ "required_models": remote_models,
2537
+ "available_models": remote_models if has_remote else [],
2538
+ "missing_models": [] if has_remote else remote_models,
2539
+ "has_any_local_model": has_remote,
2540
+ "disable_submit": not has_remote,
2541
+ "message": message,
2542
+ }
2543
+
2544
  models = []
2545
  available_models = []
2546
  missing_models = []
 
2610
  return JSONResponse({"error": str(e)}, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
2611
 
2612
 
2613
+ @app.get("/api/runtime-log")
2614
+ async def runtime_log_view(request: Request, lines: int = 500, _=Depends(require_auth)):
2615
+ """Return tail lines from runtime log for in-app debugging."""
2616
+ try:
2617
+ text = _runtime_log_tail(lines)
2618
+ exists = RUNTIME_LOG_PATH.exists()
2619
+ size = RUNTIME_LOG_PATH.stat().st_size if exists else 0
2620
+ return {
2621
+ "path": str(RUNTIME_LOG_PATH),
2622
+ "exists": bool(exists),
2623
+ "size": size,
2624
+ "lines_requested": lines,
2625
+ "text": text,
2626
+ }
2627
+ except Exception as e:
2628
+ return JSONResponse({"error": str(e)}, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
2629
+
2630
+
2631
+ @app.get("/api/runtime-log/download")
2632
+ async def runtime_log_download(request: Request, _=Depends(require_auth)):
2633
+ """Download complete runtime log file as text/plain."""
2634
+ try:
2635
+ if not RUNTIME_LOG_PATH.exists():
2636
+ return Response(content="", media_type="text/plain; charset=utf-8")
2637
+ content = RUNTIME_LOG_PATH.read_text(encoding="utf-8", errors="replace")
2638
+ headers = {"Content-Disposition": 'attachment; filename="runtime.log"'}
2639
+ return Response(content=content, media_type="text/plain; charset=utf-8", headers=headers)
2640
+ except Exception as e:
2641
+ return JSONResponse({"error": str(e)}, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
2642
+
2643
+
2644
+ @app.post("/api/runtime-log/clear")
2645
+ async def runtime_log_clear(request: Request, _=Depends(require_auth)):
2646
+ """Clear runtime log content while preserving file and handler."""
2647
+ try:
2648
+ RUNTIME_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
2649
+ RUNTIME_LOG_PATH.write_text("", encoding="utf-8")
2650
+ _runtime_log("runtime.log.cleared", actor=request.session.get("user") or "unknown")
2651
+ return {"status": "cleared", "path": str(RUNTIME_LOG_PATH)}
2652
+ except Exception as e:
2653
+ return JSONResponse({"error": str(e)}, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
2654
+
2655
+
2656
  @app.get("/api/triage/options")
2657
  async def triage_options(_=Depends(require_auth)):
2658
  """Expose dropdown options for triage form (table-backed)."""
 
3444
  return max_new_tokens
3445
 
3446
 
3447
+ def _generate_response_local(model_choice: str, force_cpu_slow: bool, prompt: str, cfg: dict, trace_id: str = ""):
3448
  """
3449
  Generate Response Local helper.
3450
  Detailed inline notes are included to support safe maintenance and future edits.
 
3466
  raise RuntimeError(f"CUDA_NOT_AVAILABLE{detail}")
3467
  if is_large_model and runtime_device != "cuda" and not force_cpu_slow:
3468
  raise RuntimeError("SLOW_28B_CPU")
3469
+ local_started = time.perf_counter()
3470
+ _runtime_log(
3471
+ "inference.local.start",
3472
+ trace_id=trace_id,
3473
+ model=model_name,
3474
+ runtime_device=runtime_device,
3475
+ is_large_model=is_large_model,
3476
+ force_cpu_slow=force_cpu_slow,
3477
+ )
3478
  try:
3479
  if is_large_model:
3480
  # Ensure only one model family occupies VRAM at a time.
 
3517
  )
3518
  )
3519
  if not cuda_runtime_failure:
3520
+ _runtime_log(
3521
+ "inference.local.error",
3522
+ level=logging.ERROR,
3523
+ trace_id=trace_id,
3524
+ model=model_name,
3525
+ runtime_device=runtime_device,
3526
+ error_type=type(exc).__name__,
3527
+ error=err_txt,
3528
+ elapsed_ms=int((time.perf_counter() - local_started) * 1000),
3529
+ )
3530
  raise
3531
 
3532
  _dbg(f"generate_response: cuda runtime failure detected: {err_txt}")
 
3568
  )
3569
  models["active_name"] = model_name
3570
  models["is_text"] = True
3571
+ _runtime_log(
3572
+ "inference.local.success",
3573
+ trace_id=trace_id,
3574
+ model=model_name,
3575
+ runtime_device=runtime_device,
3576
+ response_len=len(res),
3577
+ elapsed_ms=int((time.perf_counter() - local_started) * 1000),
3578
+ )
3579
  _dbg(f"generate_response: local response_len={len(res)}")
3580
  return res
3581
 
3582
 
3583
+ def _generate_response(
3584
+ model_choice: str,
3585
+ force_cpu_slow: bool,
3586
+ prompt: str,
3587
+ cfg: dict,
3588
+ prelocked: bool = False,
3589
+ trace_id: str = "",
3590
+ ):
3591
  """
3592
  Generate Response helper.
3593
  Detailed inline notes are included to support safe maintenance and future edits.
 
3602
  if DISABLE_LOCAL_INFERENCE:
3603
  if not HF_REMOTE_TOKEN:
3604
  _dbg("generate_response: remote path selected but HF_REMOTE_TOKEN missing")
3605
+ _runtime_log(
3606
+ "inference.remote.token_missing",
3607
+ level=logging.ERROR,
3608
+ trace_id=trace_id,
3609
+ model_choice=model_choice,
3610
+ )
3611
  raise RuntimeError("REMOTE_TOKEN_MISSING")
3612
+ client = InferenceClient(token=HF_REMOTE_TOKEN, timeout=HF_REMOTE_TIMEOUT_SECONDS)
3613
  # Use requested model when provided (e.g., MedGemma) else default
3614
  model_name = model_choice or REMOTE_MODEL
3615
  _dbg(f"generate_response: remote inference model={model_name}")
3616
+ call_started = time.perf_counter()
3617
+ _runtime_log(
3618
+ "inference.remote.start",
3619
+ trace_id=trace_id,
3620
  model=model_name,
3621
+ prompt_len=len(prompt or ""),
3622
+ max_new_tokens=cfg.get("tk"),
3623
+ temperature=cfg.get("t"),
3624
+ top_p=cfg.get("p"),
3625
  top_k=cfg.get("k"),
3626
+ timeout_s=HF_REMOTE_TIMEOUT_SECONDS,
3627
  )
3628
+ try:
3629
+ resp = client.text_generation(
3630
+ prompt,
3631
+ model=model_name,
3632
+ max_new_tokens=cfg["tk"],
3633
+ temperature=cfg["t"],
3634
+ top_p=cfg["p"],
3635
+ top_k=cfg.get("k"),
3636
+ )
3637
+ except Exception as exc:
3638
+ elapsed_ms = int((time.perf_counter() - call_started) * 1000)
3639
+ _runtime_log(
3640
+ "inference.remote.error",
3641
+ level=logging.ERROR,
3642
+ trace_id=trace_id,
3643
+ model=model_name,
3644
+ elapsed_ms=elapsed_ms,
3645
+ error_type=type(exc).__name__,
3646
+ error=str(exc),
3647
+ )
3648
+ raise RuntimeError(f"REMOTE_INFERENCE_FAILED: {exc}") from exc
3649
  out = resp.strip()
3650
+ elapsed_ms = int((time.perf_counter() - call_started) * 1000)
3651
+ _runtime_log(
3652
+ "inference.remote.success",
3653
+ trace_id=trace_id,
3654
+ model=model_name,
3655
+ elapsed_ms=elapsed_ms,
3656
+ response_len=len(out),
3657
+ )
3658
  _dbg(f"generate_response: remote response_len={len(out)}")
3659
  return out
3660
 
3661
  if prelocked:
3662
+ return _generate_response_local(model_choice, force_cpu_slow, prompt, cfg, trace_id=trace_id)
3663
  with MODEL_MUTEX:
3664
+ return _generate_response_local(model_choice, force_cpu_slow, prompt, cfg, trace_id=trace_id)
3665
 
3666
 
3667
  @app.post("/api/chat")
3668
  async def chat(request: Request, _=Depends(require_auth)):
3669
  """Session-based chat endpoint (triage + inquiry)."""
3670
+ trace_id = "unassigned"
3671
  try:
3672
  store = request.state.store
3673
  start_time = datetime.now()
3674
+ trace_id = uuid.uuid4().hex[:12]
3675
  form = await request.form()
3676
  msg = (form.get("message") or "").strip()
3677
  if not msg:
3678
+ _runtime_log("chat.request.invalid", level=logging.WARNING, trace_id=trace_id, reason="empty_message")
3679
  return JSONResponse({"error": "Message is required."}, status_code=status.HTTP_400_BAD_REQUEST)
3680
  user_msg_raw = msg
3681
  p_name = form.get("patient") or ""
 
3696
  is_start = session_action == "start" or not session_id
3697
  if not session_id:
3698
  session_id = f"session-{uuid.uuid4().hex}"
3699
+ _runtime_log(
3700
+ "chat.request.received",
3701
+ trace_id=trace_id,
3702
+ mode=mode,
3703
+ model_choice=model_choice or "",
3704
+ force_cpu_slow=force_cpu_slow,
3705
+ queue_wait_requested=queue_wait_requested,
3706
+ private=is_priv,
3707
+ session_action=session_action or ("start" if is_start else "message"),
3708
+ session_id=session_id,
3709
+ patient=p_name or "",
3710
+ message_len=len(msg),
3711
+ disable_local_inference=DISABLE_LOCAL_INFERENCE,
3712
+ is_hf_space=IS_HF_SPACE,
3713
+ )
3714
 
3715
  if not DISABLE_LOCAL_INFERENCE:
3716
  model_availability = _local_model_availability_payload()
3717
  if not model_availability.get("has_any_local_model"):
3718
+ _runtime_log(
3719
+ "chat.request.rejected",
3720
+ level=logging.WARNING,
3721
+ trace_id=trace_id,
3722
+ reason="no_local_models",
3723
+ available_models=(model_availability.get("available_models") or []),
3724
+ )
3725
  return JSONResponse(
3726
  {
3727
  "error": (
 
3739
  if not chosen_model:
3740
  model_choice = available_models[0]
3741
  elif chosen_model not in available_models:
3742
+ _runtime_log(
3743
+ "chat.request.rejected",
3744
+ level=logging.WARNING,
3745
+ trace_id=trace_id,
3746
+ reason="model_not_installed",
3747
+ chosen_model=chosen_model,
3748
+ available_models=available_models,
3749
+ )
3750
  return JSONResponse(
3751
  {
3752
  "error": (
 
3801
  if override_prompt.strip():
3802
  prompt = override_prompt.strip()
3803
 
3804
+ _runtime_log(
3805
+ "chat.prompt.ready",
3806
+ trace_id=trace_id,
3807
+ mode=mode,
3808
+ model_choice=model_choice or "",
3809
+ prompt_len=len(prompt or ""),
3810
+ override_prompt=bool(override_prompt.strip()),
3811
+ cfg_tokens=cfg.get("tk"),
3812
+ cfg_temp=cfg.get("t"),
3813
+ cfg_top_p=cfg.get("p"),
3814
+ cfg_top_k=cfg.get("k"),
3815
+ triage_pathway_status=prompt_meta.get("triage_pathway_status") or "",
3816
+ triage_pathway_reason=prompt_meta.get("triage_pathway_reason") or "",
3817
+ triage_pathway_supplemented=bool(prompt_meta.get("triage_pathway_supplemented")),
3818
+ )
3819
+
3820
  # Persist the exact prompt submitted for debug visibility in Settings.
3821
  try:
3822
  set_settings_meta(last_prompt_verbatim=prompt)
3823
  except Exception:
3824
  logger.exception("Unable to persist last_prompt_verbatim")
3825
+ _runtime_log(
3826
+ "chat.prompt.persist_failed",
3827
+ level=logging.WARNING,
3828
+ trace_id=trace_id,
3829
+ error="Unable to persist last_prompt_verbatim",
3830
+ )
3831
 
3832
  model_lock_acquired = False
3833
  busy_meta_set = False
 
3836
  queue_snapshot = {}
3837
  queue_ticket, queue_snapshot = _chat_queue_enqueue(model_choice or "", mode, session_id, p_name)
3838
  if queue_ticket is None:
3839
+ _runtime_log(
3840
+ "chat.queue.rejected",
3841
+ level=logging.WARNING,
3842
+ trace_id=trace_id,
3843
+ reason="queue_full",
3844
+ mode=mode,
3845
+ model_choice=model_choice or "",
3846
+ queue_size=(queue_snapshot or {}).get("size"),
3847
+ )
3848
  return JSONResponse(
3849
  {
3850
  "error": (
 
3857
  status_code=status.HTTP_429_TOO_MANY_REQUESTS,
3858
  )
3859
  queue_position = safe_int(queue_snapshot.get("position"), 1)
3860
+ _runtime_log(
3861
+ "chat.queue.enqueued",
3862
+ trace_id=trace_id,
3863
+ mode=mode,
3864
+ model_choice=model_choice or "",
3865
+ queue_ticket=queue_ticket,
3866
+ queue_position=queue_position,
3867
+ queue_snapshot_size=(queue_snapshot or {}).get("size"),
3868
+ )
3869
  if queue_position > 1 and not queue_wait_requested:
3870
  _chat_queue_release(queue_ticket)
3871
  active = queue_snapshot.get("active") or {}
3872
  active_model = (active.get("model_choice") or "").strip() or "another consultation"
3873
+ _runtime_log(
3874
+ "chat.queue.wait_required",
3875
+ level=logging.WARNING,
3876
+ trace_id=trace_id,
3877
+ queue_position=queue_position,
3878
+ active_model=active_model,
3879
+ )
3880
  return JSONResponse(
3881
  {
3882
  "error": (
 
3893
  status_code=status.HTTP_429_TOO_MANY_REQUESTS,
3894
  )
3895
  queue_wait_seconds = await asyncio.to_thread(_chat_queue_wait_turn, queue_ticket)
3896
+ _runtime_log(
3897
+ "chat.queue.turn_acquired",
3898
+ trace_id=trace_id,
3899
+ queue_ticket=queue_ticket,
3900
+ wait_seconds=queue_wait_seconds,
3901
+ queue_wait_requested=queue_wait_requested,
3902
+ )
3903
  _set_model_busy_meta(model_choice or "", mode, session_id, p_name)
3904
  busy_meta_set = True
3905
 
3906
  try:
3907
+ _runtime_log(
3908
+ "chat.inference.start",
3909
+ trace_id=trace_id,
3910
+ mode=mode,
3911
+ model_choice=model_choice or "",
3912
+ disable_local_inference=DISABLE_LOCAL_INFERENCE,
3913
+ force_cpu_slow=force_cpu_slow,
3914
+ )
3915
  if not DISABLE_LOCAL_INFERENCE:
3916
  MODEL_MUTEX.acquire()
3917
  model_lock_acquired = True
 
3922
  prompt,
3923
  cfg,
3924
  True,
3925
+ trace_id,
3926
  )
3927
  else:
3928
+ res = await asyncio.to_thread(
3929
+ _generate_response,
3930
+ model_choice,
3931
+ force_cpu_slow,
3932
+ prompt,
3933
+ cfg,
3934
+ False,
3935
+ trace_id,
3936
+ )
3937
+ _runtime_log(
3938
+ "chat.inference.success",
3939
+ trace_id=trace_id,
3940
+ mode=mode,
3941
+ model_choice=model_choice or "",
3942
+ response_len=len(res or ""),
3943
+ )
3944
  except RuntimeError as e:
3945
  _dbg(f"chat runtime error: {e}")
3946
+ _runtime_log(
3947
+ "chat.inference.runtime_error",
3948
+ level=logging.ERROR,
3949
+ trace_id=trace_id,
3950
+ error=str(e),
3951
+ error_type=type(e).__name__,
3952
+ )
3953
  if str(e) == "CHAT_QUEUE_TICKET_MISSING":
3954
  return JSONResponse(
3955
  {"error": "Queue ticket was lost. Please submit again."},
 
3976
  },
3977
  status_code=status.HTTP_400_BAD_REQUEST,
3978
  )
3979
+ if str(e).startswith("REMOTE_INFERENCE_FAILED"):
3980
+ return JSONResponse(
3981
+ {
3982
+ "error": (
3983
+ "Remote MedGemma inference failed before a response was returned. "
3984
+ "Open Settings -> Runtime Debug Log (HF) for full details."
3985
+ ),
3986
+ "remote_inference_failed": True,
3987
+ "details": str(e),
3988
+ },
3989
+ status_code=status.HTTP_502_BAD_GATEWAY,
3990
+ )
3991
  if "Missing model cache" in str(e) or str(e) in {"REMOTE_TOKEN_MISSING", "LOCAL_INFERENCE_DISABLED"}:
3992
  return JSONResponse(
3993
  {"error": str(e), "offline_missing": True},
 
4076
  upsert_history_entry(entry)
4077
 
4078
  metrics = _update_chat_metrics(store, models["active_name"])
4079
+ _runtime_log(
4080
+ "chat.response.ready",
4081
+ trace_id=trace_id,
4082
+ mode=mode,
4083
+ model=models["active_name"],
4084
+ duration_ms=elapsed_ms,
4085
+ queue_wait_seconds=queue_wait_seconds,
4086
+ session_id=session_id,
4087
+ private=is_priv,
4088
+ )
4089
 
4090
  return JSONResponse(
4091
  {
 
4112
  }
4113
  )
4114
  except Exception as e:
4115
+ _runtime_log(
4116
+ "chat.request.exception",
4117
+ level=logging.ERROR,
4118
+ trace_id=trace_id,
4119
+ error_type=type(e).__name__,
4120
+ error=str(e),
4121
+ traceback=traceback.format_exc(),
4122
+ )
4123
  return JSONResponse({"error": str(e)}, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
4124
 
4125
 
static/js/settings.js CHANGED
@@ -1382,6 +1382,96 @@ async function resetTriagePromptTreeToDefault() {
1382
  }
1383
  }
1384
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1385
  if (document.readyState !== 'loading') {
1386
  applyStoredUserMode();
1387
  loadSettingsUI().catch(() => {});
@@ -1486,6 +1576,7 @@ async function loadSettingsUI() {
1486
  applySettingsToUI(s);
1487
  loadPromptTemplates().catch(() => {});
1488
  loadTriagePromptTree().catch(() => {});
 
1489
  try { localStorage.setItem('user_mode', s.user_mode || 'user'); } catch (err) { /* ignore */ }
1490
  if (!offlineInitialized) {
1491
  offlineInitialized = true;
@@ -1502,6 +1593,7 @@ async function loadSettingsUI() {
1502
  applySettingsToUI({ ...DEFAULT_SETTINGS, user_mode: localMode });
1503
  loadPromptTemplates().catch(() => {});
1504
  loadTriagePromptTree().catch(() => {});
 
1505
  }
1506
  }
1507
 
@@ -2758,6 +2850,9 @@ window.moveEquipmentCategory = moveEquipmentCategory;
2758
  window.addConsumableCategory = addConsumableCategory;
2759
  window.removeConsumableCategory = removeConsumableCategory;
2760
  window.moveConsumableCategory = moveConsumableCategory;
 
 
 
2761
 
2762
 
2763
  //
 
1382
  }
1383
  }
1384
 
1385
+ function setRuntimeLogStatus(message, isError = false) {
1386
+ const el = document.getElementById('runtime-log-status');
1387
+ if (!el) return;
1388
+ el.textContent = message || '';
1389
+ el.style.color = isError ? 'var(--red)' : '#2c3e50';
1390
+ }
1391
+
1392
+ function updateRuntimeLogMeta(meta = {}) {
1393
+ const el = document.getElementById('runtime-log-meta');
1394
+ if (!el) return;
1395
+ const exists = !!meta.exists;
1396
+ const size = Number(meta.size || 0);
1397
+ if (!exists) {
1398
+ el.textContent = 'No log file yet';
1399
+ return;
1400
+ }
1401
+ const kb = (size / 1024).toFixed(1);
1402
+ el.textContent = `${kb} KB`;
1403
+ }
1404
+
1405
+ function getRuntimeLogLinesRequested() {
1406
+ const el = document.getElementById('runtime-log-lines');
1407
+ const parsed = Number(el?.value || 600);
1408
+ if (Number.isFinite(parsed) && parsed >= 50 && parsed <= 5000) {
1409
+ return Math.floor(parsed);
1410
+ }
1411
+ return 600;
1412
+ }
1413
+
1414
+ async function loadRuntimeLog(lines = null) {
1415
+ const textarea = document.getElementById('runtime-log-text');
1416
+ const lineCount = lines == null ? getRuntimeLogLinesRequested() : Number(lines);
1417
+ if (textarea) {
1418
+ textarea.value = '';
1419
+ }
1420
+ setRuntimeLogStatus('Loading runtime log...');
1421
+ try {
1422
+ const res = await fetch(`/api/runtime-log?lines=${encodeURIComponent(lineCount)}`, {
1423
+ credentials: 'same-origin',
1424
+ });
1425
+ const payload = await res.json();
1426
+ if (!res.ok || payload?.error) {
1427
+ throw new Error(payload?.error || `Status ${res.status}`);
1428
+ }
1429
+ const text = String(payload?.text || '');
1430
+ if (textarea) {
1431
+ textarea.value = text || '[Runtime log is currently empty]';
1432
+ textarea.scrollTop = textarea.scrollHeight;
1433
+ }
1434
+ updateRuntimeLogMeta(payload || {});
1435
+ const loadedLines = text ? text.split('\n').length : 0;
1436
+ const pathLabel = payload?.path ? ` (${payload.path})` : '';
1437
+ setRuntimeLogStatus(`Loaded ${loadedLines} line${loadedLines === 1 ? '' : 's'}${pathLabel}.`);
1438
+ } catch (err) {
1439
+ setRuntimeLogStatus(`Runtime log load failed: ${err.message}`, true);
1440
+ }
1441
+ }
1442
+
1443
+ function downloadRuntimeLog() {
1444
+ setRuntimeLogStatus('Downloading runtime log...');
1445
+ const anchor = document.createElement('a');
1446
+ anchor.href = '/api/runtime-log/download';
1447
+ anchor.download = 'runtime.log';
1448
+ document.body.appendChild(anchor);
1449
+ anchor.click();
1450
+ anchor.remove();
1451
+ setTimeout(() => {
1452
+ setRuntimeLogStatus('Runtime log download requested.');
1453
+ }, 100);
1454
+ }
1455
+
1456
+ async function clearRuntimeLog() {
1457
+ if (!confirm('Clear the runtime log file?')) return;
1458
+ setRuntimeLogStatus('Clearing runtime log...');
1459
+ try {
1460
+ const res = await fetch('/api/runtime-log/clear', {
1461
+ method: 'POST',
1462
+ credentials: 'same-origin',
1463
+ });
1464
+ const payload = await res.json();
1465
+ if (!res.ok || payload?.error) {
1466
+ throw new Error(payload?.error || `Status ${res.status}`);
1467
+ }
1468
+ await loadRuntimeLog(getRuntimeLogLinesRequested());
1469
+ setRuntimeLogStatus('Runtime log cleared.');
1470
+ } catch (err) {
1471
+ setRuntimeLogStatus(`Runtime log clear failed: ${err.message}`, true);
1472
+ }
1473
+ }
1474
+
1475
  if (document.readyState !== 'loading') {
1476
  applyStoredUserMode();
1477
  loadSettingsUI().catch(() => {});
 
1576
  applySettingsToUI(s);
1577
  loadPromptTemplates().catch(() => {});
1578
  loadTriagePromptTree().catch(() => {});
1579
+ loadRuntimeLog().catch(() => {});
1580
  try { localStorage.setItem('user_mode', s.user_mode || 'user'); } catch (err) { /* ignore */ }
1581
  if (!offlineInitialized) {
1582
  offlineInitialized = true;
 
1593
  applySettingsToUI({ ...DEFAULT_SETTINGS, user_mode: localMode });
1594
  loadPromptTemplates().catch(() => {});
1595
  loadTriagePromptTree().catch(() => {});
1596
+ loadRuntimeLog().catch(() => {});
1597
  }
1598
  }
1599
 
 
2850
  window.addConsumableCategory = addConsumableCategory;
2851
  window.removeConsumableCategory = removeConsumableCategory;
2852
  window.moveConsumableCategory = moveConsumableCategory;
2853
+ window.loadRuntimeLog = loadRuntimeLog;
2854
+ window.downloadRuntimeLog = downloadRuntimeLog;
2855
+ window.clearRuntimeLog = clearRuntimeLog;
2856
 
2857
 
2858
  //
templates/index.html CHANGED
@@ -1217,6 +1217,26 @@
1217
  <div style="font-size:12px; color:#555;">Recommended sequence: Run readiness check → Download missing models → Create backup snapshot → Enable offline mode.</div>
1218
  </div>
1219
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1220
  <div class="collapsible" style="margin-bottom:12px;">
1221
  <div class="col-header crew-med-header" onclick="toggleSection(this)" style="background:#fff; justify-content:flex-start;">
1222
  <span class="dev-tag">dev:settings-last-prompt</span><span class="detail-icon history-arrow" style="font-size:18px; margin-right:8px;">▸</span><span style="font-weight:700;">Last Prompt Verbatim</span>
@@ -1488,7 +1508,7 @@
1488
  <script src="/static/js/crew.js?v=20260223"></script>
1489
  <script src="/static/js/pharmacy.js?v=20260223"></script>
1490
  <script src="/static/js/equipment.js?v=20260223"></script>
1491
- <script src="/static/js/settings.js?v=20260307"></script>
1492
  <script src="/static/js/main.js?v=20260223"></script>
1493
  <script src="/static/js/recovery.js?v=20260304"></script>
1494
  <script>
 
1217
  <div style="font-size:12px; color:#555;">Recommended sequence: Run readiness check → Download missing models → Create backup snapshot → Enable offline mode.</div>
1218
  </div>
1219
  </div>
1220
+ <div class="collapsible" style="margin-bottom:12px;">
1221
+ <div class="col-header crew-med-header" onclick="toggleSection(this)" style="background:#fff; justify-content:flex-start;">
1222
+ <span class="dev-tag">dev:settings-runtime-log</span><span class="detail-icon history-arrow" style="font-size:18px; margin-right:8px;">▸</span><span style="font-weight:700;">Runtime Debug Log (HF)</span>
1223
+ <span id="runtime-log-meta" style="margin-left:auto; font-size:12px; color:#4a5568;"></span>
1224
+ </div>
1225
+ <div class="col-body" style="padding:12px; background:#f8f9fa; display:none;">
1226
+ <div style="font-size:12px; color:#475569; margin-bottom:8px;">
1227
+ Use this log to troubleshoot stalled or failed MedGemma requests on Hugging Face.
1228
+ </div>
1229
+ <div style="display:flex; flex-wrap:wrap; gap:8px; align-items:center; margin-bottom:8px;">
1230
+ <label for="runtime-log-lines" style="font-size:12px; color:#333;">Lines</label>
1231
+ <input id="runtime-log-lines" data-settings-no-autosave="1" type="number" min="50" max="5000" step="50" value="600" style="width:90px; padding:6px;">
1232
+ <button class="btn btn-sm" style="background:var(--inquiry);" onclick="loadRuntimeLog()">Refresh</button>
1233
+ <button class="btn btn-sm" style="background:#0b8457;" onclick="downloadRuntimeLog()">Download</button>
1234
+ <button class="btn btn-sm" style="background:#8b0000;" onclick="clearRuntimeLog()">Clear</button>
1235
+ </div>
1236
+ <div id="runtime-log-status" style="font-size:12px; color:#2c3e50; margin-bottom:8px;">Not loaded yet.</div>
1237
+ <textarea id="runtime-log-text" data-settings-no-autosave="1" readonly style="width:100%; min-height:240px; font-family:Menlo, Consolas, monospace; font-size:12px; line-height:1.35; background:#fff;"></textarea>
1238
+ </div>
1239
+ </div>
1240
  <div class="collapsible" style="margin-bottom:12px;">
1241
  <div class="col-header crew-med-header" onclick="toggleSection(this)" style="background:#fff; justify-content:flex-start;">
1242
  <span class="dev-tag">dev:settings-last-prompt</span><span class="detail-icon history-arrow" style="font-size:18px; margin-right:8px;">▸</span><span style="font-weight:700;">Last Prompt Verbatim</span>
 
1508
  <script src="/static/js/crew.js?v=20260223"></script>
1509
  <script src="/static/js/pharmacy.js?v=20260223"></script>
1510
  <script src="/static/js/equipment.js?v=20260223"></script>
1511
+ <script src="/static/js/settings.js?v=20260308"></script>
1512
  <script src="/static/js/main.js?v=20260223"></script>
1513
  <script src="/static/js/recovery.js?v=20260304"></script>
1514
  <script>