Marintosti committed on
Commit
ff010d3
·
verified ·
1 Parent(s): 47dc5da

ci: deploy afad831

Browse files
Files changed (3) hide show
  1. Dockerfile +1 -1
  2. requirements-deploy.txt +5 -5
  3. src/deployment/serve.py +58 -11
Dockerfile CHANGED
@@ -8,7 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive \
8
  PIP_NO_CACHE_DIR=1 \
9
  HF_HOME=/home/user/.cache/huggingface \
10
  PORT=7860 \
11
- USE_VLLM=0
12
 
13
  RUN apt-get update && apt-get install -y --no-install-recommends \
14
  python3.11 python3.11-venv python3-pip git curl \
 
8
  PIP_NO_CACHE_DIR=1 \
9
  HF_HOME=/home/user/.cache/huggingface \
10
  PORT=7860 \
11
+ USE_VLLM=1
12
 
13
  RUN apt-get update && apt-get install -y --no-install-recommends \
14
  python3.11 python3.11-venv python3-pip git curl \
requirements-deploy.txt CHANGED
@@ -6,13 +6,13 @@ uvicorn[standard]==0.34.0
6
  pydantic==2.10.4
7
  pyyaml==6.0.2
8
 
9
- # Backend d'inference transformers (avec support Qwen3 >= 4.51).
10
- # vLLM retire du POC : incompatible avec transformers 4.51+ et non requis en mode transformers.
11
- # Pour ajouter vLLM en prod : bumper aussi vllm vers >= 0.7 (support Qwen3).
12
- transformers>=4.51,<5
13
  accelerate>=1.2
14
  peft>=0.14
15
  huggingface_hub>=0.27
16
- torch>=2.4
17
  safetensors>=0.4.5
18
  sentencepiece>=0.2
 
6
  pydantic==2.10.4
7
  pyyaml==6.0.2
8
 
9
+ # Backend d'inference vLLM (Qwen3 supporte depuis vllm 0.7).
10
+ # vllm 0.7.x impose torch 2.5 et transformers 4.51-4.52.
11
+ vllm>=0.7,<0.8
12
+ transformers>=4.51,<4.53
13
  accelerate>=1.2
14
  peft>=0.14
15
  huggingface_hub>=0.27
16
+ torch>=2.5,<2.6
17
  safetensors>=0.4.5
18
  sentencepiece>=0.2
src/deployment/serve.py CHANGED
@@ -15,13 +15,17 @@ Variables d'environnement:
15
  - USE_VLLM : "1" pour charger vLLM, "0" pour transformers/mock (default: "1")
16
  - API_KEY : si defini, exige le header X-API-Key
17
  - PORT : port d'ecoute (default: 7860 - requis par HF Spaces)
 
 
18
  """
19
 
20
  from __future__ import annotations
21
 
 
22
  import logging
23
  import os
24
  import re
 
25
  import time
26
  import uuid
27
  from contextlib import asynccontextmanager
@@ -47,10 +51,52 @@ MODEL_ID = os.environ.get("MODEL_ID", "").strip()
47
  ADAPTER_ID = os.environ.get("ADAPTER_ID", "").strip()
48
  USE_VLLM = os.environ.get("USE_VLLM", "1") == "1"
49
  API_KEY = os.environ.get("API_KEY", "").strip()
 
50
 
51
  # Etat global du backend
52
  _backend: dict[str, Any] = {"kind": "mock", "engine": None, "tokenizer": None}
53
  audit_log: list[dict] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
 
56
  def _load_vllm(model_id: str) -> None:
@@ -101,6 +147,7 @@ def _load_transformers(model_id: str, adapter_id: str = "") -> None:
101
 
102
  @asynccontextmanager
103
  async def lifespan(app: FastAPI):
 
104
  if not MODEL_ID:
105
  logger.warning("MODEL_ID non defini - mode mock (echo) active")
106
  else:
@@ -363,17 +410,17 @@ async def triage(req: TriageRequest):
363
  backend=_backend["kind"],
364
  )
365
 
366
- audit_log.append(
367
- {
368
- "interaction_id": interaction_id,
369
- "timestamp": timestamp,
370
- "patient_id": req.patient_id,
371
- "symptoms": req.symptoms,
372
- "priority_level": priority,
373
- "latency_ms": response.latency_ms,
374
- "backend": _backend["kind"],
375
- }
376
- )
377
  logger.info("Triage %s -> %s (%.0f ms)", interaction_id, priority, latency_ms)
378
  return response
379
 
 
15
  - USE_VLLM : "1" pour charger vLLM, "0" pour transformers/mock (default: "1")
16
  - API_KEY : si defini, exige le header X-API-Key
17
  - PORT : port d'ecoute (default: 7860 - requis par HF Spaces)
18
+ - AUDIT_LOG_PATH : chemin du fichier JSONL append-only pour la tracabilite RGPD
19
+ (default: audit/audit.jsonl). Mettre vide pour desactiver.
20
  """
21
 
22
  from __future__ import annotations
23
 
24
+ import json
25
  import logging
26
  import os
27
  import re
28
+ import threading
29
  import time
30
  import uuid
31
  from contextlib import asynccontextmanager
 
51
  ADAPTER_ID = os.environ.get("ADAPTER_ID", "").strip()
52
  USE_VLLM = os.environ.get("USE_VLLM", "1") == "1"
53
  API_KEY = os.environ.get("API_KEY", "").strip()
54
+ AUDIT_LOG_PATH = os.environ.get("AUDIT_LOG_PATH", str(PROJECT_ROOT / "audit" / "audit.jsonl")).strip()
55
 
56
  # Etat global du backend
57
  _backend: dict[str, Any] = {"kind": "mock", "engine": None, "tokenizer": None}
58
  audit_log: list[dict] = []
59
+ _audit_lock = threading.Lock()
60
+
61
+
62
def _persist_audit_entry(entry: dict) -> None:
    """Append one audit entry to the JSONL audit file (best effort).

    The entry is serialised as a single JSON line and appended to the file
    designated by ``AUDIT_LOG_PATH`` so that the audit trail can be re-read
    after a restart. Without persistence the in-memory audit log is lost on
    redeployment, which is not acceptable for a medical audit.

    Persistence is deliberately best effort: any failure is logged and
    swallowed so that it never breaks the request being served.

    Args:
        entry: The audit record to persist. It is expected to be
            JSON-serialisable; if it is not, the entry is skipped and a
            warning is logged.
    """
    # An empty path means persistence has been disabled by configuration.
    if not AUDIT_LOG_PATH:
        return
    try:
        # Serialise before touching the filesystem so a bad entry never
        # leaves a partially written line behind.
        line = json.dumps(entry, ensure_ascii=False)
    except (TypeError, ValueError) as exc:
        # json.dumps raises TypeError for unsupported types and ValueError
        # for circular references; neither may fail the caller.
        logger.warning("Entree d'audit non serialisable: %s", exc)
        return
    try:
        path = Path(AUDIT_LOG_PATH)
        path.parent.mkdir(parents=True, exist_ok=True)
        # The lock serialises writers within this process so that lines
        # from concurrent calls are not interleaved.
        with _audit_lock, open(path, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError as exc:
        logger.warning("Echec persistance audit (%s): %s", AUDIT_LOG_PATH, exc)
78
+
79
+
80
def _load_audit_history() -> None:
    """Reload persisted audit entries into the in-memory ``audit_log`` list.

    Reads the JSONL file designated by ``AUDIT_LOG_PATH`` line by line and
    appends every valid JSON object to ``audit_log``. Blank lines are
    ignored. Lines that are not valid JSON, or that decode to something
    other than a JSON object, are skipped and reported in a single warning
    so that a damaged audit file does not go unnoticed.

    Loading is best effort: a missing file is not an error, and a read or
    decoding failure is logged without being raised. Entries appended
    before such a failure remain in ``audit_log``.
    """
    # An empty path means persistence has been disabled by configuration.
    if not AUDIT_LOG_PATH:
        return
    path = Path(AUDIT_LOG_PATH)
    skipped = 0
    try:
        with open(path, encoding="utf-8") as f:
            for raw_line in f:
                text = raw_line.strip()
                if not text:
                    continue
                try:
                    record = json.loads(text)
                except json.JSONDecodeError:
                    skipped += 1
                    continue
                # audit_log is a list of dicts; reject scalars and arrays.
                if not isinstance(record, dict):
                    skipped += 1
                    continue
                audit_log.append(record)
    except FileNotFoundError:
        # Nothing persisted yet (first start): nothing to reload.
        return
    except (OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError is not an OSError; without catching it a file
        # containing invalid UTF-8 would abort startup.
        logger.warning("Echec lecture audit (%s): %s", path, exc)
        return
    if skipped:
        logger.warning(
            "Audit log: %d ligne(s) invalide(s) ignoree(s) dans %s", skipped, path
        )
    logger.info("Audit log charge: %d entrees depuis %s", len(audit_log), path)
100
 
101
 
102
  def _load_vllm(model_id: str) -> None:
 
147
 
148
  @asynccontextmanager
149
  async def lifespan(app: FastAPI):
150
+ _load_audit_history()
151
  if not MODEL_ID:
152
  logger.warning("MODEL_ID non defini - mode mock (echo) active")
153
  else:
 
410
  backend=_backend["kind"],
411
  )
412
 
413
+ entry = {
414
+ "interaction_id": interaction_id,
415
+ "timestamp": timestamp,
416
+ "patient_id": req.patient_id,
417
+ "symptoms": req.symptoms,
418
+ "priority_level": priority,
419
+ "latency_ms": response.latency_ms,
420
+ "backend": _backend["kind"],
421
+ }
422
+ audit_log.append(entry)
423
+ _persist_audit_entry(entry)
424
  logger.info("Triage %s -> %s (%.0f ms)", interaction_id, priority, latency_ms)
425
  return response
426