Spaces:
Sleeping
feat: refonte interface benchmark — composition visuelle des concurrents OCR/LLM
Browse filesBackend
-------
- GET /api/models/{provider} : liste des modèles en temps réel pour tesseract
(langues installées), mistral_ocr/openai/anthropic/mistral (appels API live),
ollama (tags locaux), google_vision/azure_doc_intel (statiques), prompts (.txt)
- POST /api/benchmark/run : accepte une liste de CompetitorConfig composés
(moteur OCR + modèle + LLM optionnel + mode pipeline + prompt)
- GET /api/engines : ajout de mistral_ocr, google_vision, azure_doc_intel
comme moteurs OCR cloud ; champ langs pour Tesseract ; label Mistral LLM corrigé
- _engine_from_competitor() : instancie TesseractEngine, MistralOCREngine,
GoogleVisionEngine, AzureDocIntelEngine, ou OCRLLMPipeline selon config
- _run_benchmark_thread_v2() : thread de benchmark pour les concurrents composés
MistralOCREngine
----------------
- Support de mistral-ocr-latest via l'endpoint dédié POST /v1/ocr (native API)
- Détection automatique selon le nom du modèle : "mistral-ocr" → native,
sinon → vision/chat API (pixtral-12b, pixtral-large…)
Frontend
--------
- Section 1 (OCR) : liste les moteurs avec statut et modèles disponibles (spinner)
- Section 2 (LLM) : liste les providers avec statut et modèles (spinner)
- Section 3 (Composer) : toggle OCR seul / Pipeline OCR+LLM, sélection moteur +
modèle, provider LLM + modèle + mode + prompt, bouton Ajouter → carte concurrent
- Cartes concurrents supprimables avec badge OCR / Pipeline
- startBenchmark() poste sur /api/benchmark/run avec la liste _competitors
- Barres de progression créées dynamiquement par engine name reçu en SSE
- Auto-refresh toutes les 10 s si une nouvelle clé API est détectée
Tests
-----
- TestFastAPIModels (12 tests) : GET /api/models/{provider}
- TestFastAPIBenchmarkRun (8 tests) : POST /api/benchmark/run
- TestFastAPIEnginesExtended (6 tests) : champs ajoutés dans api_engines()
- TestMistralOCRNativeAPI (6 tests) : routing mistral-ocr vs pixtral
https://claude.ai/code/session_017gXea9mxBQqDTAsSQd7aAq
- picarones/engines/mistral_ocr.py +36 -7
- picarones/web/app.py +744 -68
- tests/test_sprint6_web_interface.py +224 -0
|
@@ -56,14 +56,7 @@ class MistralOCREngine(BaseOCREngine):
|
|
| 56 |
raise RuntimeError(
|
| 57 |
"Clé API Mistral manquante — définissez la variable d'environnement MISTRAL_API_KEY"
|
| 58 |
)
|
| 59 |
-
try:
|
| 60 |
-
from mistralai import Mistral
|
| 61 |
-
except ImportError as exc:
|
| 62 |
-
raise RuntimeError(
|
| 63 |
-
"Le package 'mistralai' n'est pas installé. Lancez : pip install mistralai"
|
| 64 |
-
) from exc
|
| 65 |
|
| 66 |
-
# Encoder l'image en base64 avec media type correct
|
| 67 |
suffix = image_path.suffix.lower()
|
| 68 |
media_type = {
|
| 69 |
".jpg": "image/jpeg", ".jpeg": "image/jpeg",
|
|
@@ -74,6 +67,42 @@ class MistralOCREngine(BaseOCREngine):
|
|
| 74 |
image_b64 = base64.b64encode(image_path.read_bytes()).decode("ascii")
|
| 75 |
image_url = f"data:{media_type};base64,{image_b64}"
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
client = Mistral(api_key=self._api_key)
|
| 78 |
response = client.chat.complete(
|
| 79 |
model=self._model,
|
|
|
|
| 56 |
raise RuntimeError(
|
| 57 |
"Clé API Mistral manquante — définissez la variable d'environnement MISTRAL_API_KEY"
|
| 58 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
|
|
|
| 60 |
suffix = image_path.suffix.lower()
|
| 61 |
media_type = {
|
| 62 |
".jpg": "image/jpeg", ".jpeg": "image/jpeg",
|
|
|
|
| 67 |
image_b64 = base64.b64encode(image_path.read_bytes()).decode("ascii")
|
| 68 |
image_url = f"data:{media_type};base64,{image_b64}"
|
| 69 |
|
| 70 |
+
if "mistral-ocr" in self._model.lower():
|
| 71 |
+
return self._run_ocr_native_api(image_url)
|
| 72 |
+
return self._run_ocr_vision_api(image_url)
|
| 73 |
+
|
| 74 |
+
def _run_ocr_native_api(self, image_url: str) -> str:
|
| 75 |
+
"""Endpoint dédié /v1/ocr (pour mistral-ocr-latest et variantes)."""
|
| 76 |
+
import json
|
| 77 |
+
import urllib.request
|
| 78 |
+
|
| 79 |
+
payload = json.dumps({
|
| 80 |
+
"model": self._model,
|
| 81 |
+
"document": {"type": "image_url", "image_url": image_url},
|
| 82 |
+
}).encode("utf-8")
|
| 83 |
+
req = urllib.request.Request(
|
| 84 |
+
"https://api.mistral.ai/v1/ocr",
|
| 85 |
+
data=payload,
|
| 86 |
+
headers={
|
| 87 |
+
"Authorization": f"Bearer {self._api_key}",
|
| 88 |
+
"Content-Type": "application/json",
|
| 89 |
+
},
|
| 90 |
+
method="POST",
|
| 91 |
+
)
|
| 92 |
+
with urllib.request.urlopen(req, timeout=60) as resp:
|
| 93 |
+
data = json.loads(resp.read().decode())
|
| 94 |
+
pages = data.get("pages", [])
|
| 95 |
+
return "\n\n".join(p.get("markdown", "") for p in pages).strip()
|
| 96 |
+
|
| 97 |
+
def _run_ocr_vision_api(self, image_url: str) -> str:
|
| 98 |
+
"""API vision/chat Mistral (pour pixtral-12b, pixtral-large, etc.)."""
|
| 99 |
+
try:
|
| 100 |
+
from mistralai import Mistral
|
| 101 |
+
except ImportError as exc:
|
| 102 |
+
raise RuntimeError(
|
| 103 |
+
"Le package 'mistralai' n'est pas installé. Lancez : pip install mistralai"
|
| 104 |
+
) from exc
|
| 105 |
+
|
| 106 |
client = Mistral(api_key=self._api_key)
|
| 107 |
response = client.chat.complete(
|
| 108 |
model=self._model,
|
|
@@ -136,6 +136,25 @@ class HuggingFaceImportRequest(BaseModel):
|
|
| 136 |
max_samples: int = 100
|
| 137 |
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
# ---------------------------------------------------------------------------
|
| 140 |
# API — status
|
| 141 |
# ---------------------------------------------------------------------------
|
|
@@ -198,6 +217,7 @@ async def api_engines() -> dict:
|
|
| 198 |
|
| 199 |
# Tesseract
|
| 200 |
tess = _check_engine("tesseract", "pytesseract")
|
|
|
|
| 201 |
engines.append(tess)
|
| 202 |
|
| 203 |
# Pero OCR
|
|
@@ -212,6 +232,42 @@ async def api_engines() -> dict:
|
|
| 212 |
calamari = _check_engine("calamari", "calamari_ocr", label="Calamari")
|
| 213 |
engines.append(calamari)
|
| 214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
llms = []
|
| 216 |
|
| 217 |
# OpenAI
|
|
@@ -234,10 +290,10 @@ async def api_engines() -> dict:
|
|
| 234 |
"status": "configured" if os.environ.get("ANTHROPIC_API_KEY") else "missing_key",
|
| 235 |
})
|
| 236 |
|
| 237 |
-
# Mistral
|
| 238 |
llms.append({
|
| 239 |
"id": "mistral",
|
| 240 |
-
"label": "Mistral (Mistral
|
| 241 |
"type": "llm",
|
| 242 |
"available": bool(os.environ.get("MISTRAL_API_KEY")),
|
| 243 |
"key_env": "MISTRAL_API_KEY",
|
|
@@ -312,6 +368,134 @@ def _list_ollama_models() -> list[str]:
|
|
| 312 |
return []
|
| 313 |
|
| 314 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
# ---------------------------------------------------------------------------
|
| 316 |
# API — corpus browse
|
| 317 |
# ---------------------------------------------------------------------------
|
|
@@ -587,6 +771,192 @@ def _sse_format(event_type: str, data: Any) -> str:
|
|
| 587 |
return f"event: {event_type}\ndata: {payload}\n\n"
|
| 588 |
|
| 589 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 590 |
def _run_benchmark_thread(job: BenchmarkJob, req: BenchmarkRequest) -> None:
|
| 591 |
"""Exécute le benchmark dans un thread et envoie des événements SSE."""
|
| 592 |
import time
|
|
@@ -840,6 +1210,26 @@ tr:hover td { background: #f0ede6; }
|
|
| 840 |
/* Spinner */
|
| 841 |
.spinner { display: inline-block; width: 14px; height: 14px; border: 2px solid #ccc; border-top-color: var(--accent); border-radius: 50%; animation: spin 0.7s linear infinite; }
|
| 842 |
@keyframes spin { to { transform: rotate(360deg); } }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 843 |
</style>
|
| 844 |
</head>
|
| 845 |
<body>
|
|
@@ -876,15 +1266,92 @@ tr:hover td { background: #f0ede6; }
|
|
| 876 |
<div id="corpus-info" style="margin-top:8px; font-size:12px; color: var(--text-muted);"></div>
|
| 877 |
</div>
|
| 878 |
|
|
|
|
| 879 |
<div class="card">
|
| 880 |
-
<h2 data-i18n="
|
| 881 |
-
<div id="
|
| 882 |
-
<div style="color: var(--text-muted); font-size: 12px;"
|
| 883 |
</div>
|
| 884 |
</div>
|
| 885 |
|
|
|
|
| 886 |
<div class="card">
|
| 887 |
-
<h2 data-i18n="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 888 |
<div class="form-row">
|
| 889 |
<div class="form-group">
|
| 890 |
<label data-i18n="bench_norm_label">Profil de normalisation</label>
|
|
@@ -892,10 +1359,6 @@ tr:hover td { background: #f0ede6; }
|
|
| 892 |
<option value="nfc">NFC (standard)</option>
|
| 893 |
</select>
|
| 894 |
</div>
|
| 895 |
-
<div class="form-group">
|
| 896 |
-
<label data-i18n="bench_lang_label">Langue (Tesseract)</label>
|
| 897 |
-
<input type="text" id="bench-lang" value="fra" placeholder="fra" />
|
| 898 |
-
</div>
|
| 899 |
<div class="form-group">
|
| 900 |
<label data-i18n="bench_output_label">Dossier de sortie</label>
|
| 901 |
<input type="text" id="output-dir" value="./rapports/" />
|
|
@@ -1072,7 +1535,23 @@ const T = {
|
|
| 1072 |
bench_corpus_label: "Chemin vers le dossier corpus (paires image / .gt.txt)",
|
| 1073 |
bench_browse: "Parcourir",
|
| 1074 |
bench_engines_title: "2. Moteurs et pipelines",
|
| 1075 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1076 |
bench_norm_label: "Profil de normalisation",
|
| 1077 |
bench_lang_label: "Langue (Tesseract)",
|
| 1078 |
bench_output_label: "Dossier de sortie",
|
|
@@ -1124,7 +1603,23 @@ const T = {
|
|
| 1124 |
bench_corpus_label: "Path to corpus directory (image / .gt.txt pairs)",
|
| 1125 |
bench_browse: "Browse",
|
| 1126 |
bench_engines_title: "2. Engines & pipelines",
|
| 1127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1128 |
bench_norm_label: "Normalization profile",
|
| 1129 |
bench_lang_label: "Language (Tesseract)",
|
| 1130 |
bench_output_label: "Output directory",
|
|
@@ -1198,32 +1693,221 @@ async function loadStatus() {
|
|
| 1198 |
} catch(e) {}
|
| 1199 |
}
|
| 1200 |
|
| 1201 |
-
// ───
|
| 1202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1203 |
try {
|
| 1204 |
const r = await fetch("/api/engines");
|
| 1205 |
const d = await r.json();
|
| 1206 |
-
|
| 1207 |
-
|
| 1208 |
-
|
| 1209 |
-
|
| 1210 |
-
|
| 1211 |
-
|
| 1212 |
-
|
| 1213 |
-
|
| 1214 |
-
|
| 1215 |
-
|
| 1216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1217 |
});
|
| 1218 |
-
|
| 1219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1220 |
|
| 1221 |
-
|
| 1222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1223 |
} catch(e) {
|
| 1224 |
-
document.getElementById("
|
| 1225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1226 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1227 |
}
|
| 1228 |
|
| 1229 |
// ─── Normalization profiles ──────────────────────────────────────────────────
|
|
@@ -1301,19 +1985,17 @@ async function startBenchmark() {
|
|
| 1301 |
alert(lang === "fr" ? "Veuillez sélectionner un dossier corpus." : "Please select a corpus directory.");
|
| 1302 |
return;
|
| 1303 |
}
|
| 1304 |
-
|
| 1305 |
-
|
| 1306 |
-
alert(lang === "fr" ? "Veuillez sélectionner au moins un moteur." : "Please select at least one engine.");
|
| 1307 |
return;
|
| 1308 |
}
|
| 1309 |
|
| 1310 |
const payload = {
|
| 1311 |
corpus_path: corpusPath,
|
| 1312 |
-
|
| 1313 |
normalization_profile: document.getElementById("norm-profile").value,
|
| 1314 |
output_dir: document.getElementById("output-dir").value,
|
| 1315 |
report_name: document.getElementById("report-name").value,
|
| 1316 |
-
lang: document.getElementById("bench-lang").value,
|
| 1317 |
};
|
| 1318 |
|
| 1319 |
document.getElementById("start-btn").disabled = true;
|
|
@@ -1325,7 +2007,7 @@ async function startBenchmark() {
|
|
| 1325 |
document.getElementById("bench-status-text").textContent = lang === "fr" ? "Démarrage…" : "Starting…";
|
| 1326 |
|
| 1327 |
try {
|
| 1328 |
-
const r = await fetch("/api/benchmark/
|
| 1329 |
method: "POST",
|
| 1330 |
headers: {"Content-Type": "application/json"},
|
| 1331 |
body: JSON.stringify(payload),
|
|
@@ -1336,7 +2018,7 @@ async function startBenchmark() {
|
|
| 1336 |
}
|
| 1337 |
const d = await r.json();
|
| 1338 |
_currentJobId = d.job_id;
|
| 1339 |
-
_startSSE(_currentJobId
|
| 1340 |
} catch(e) {
|
| 1341 |
appendLog(`Erreur : ${e.message}`, "error");
|
| 1342 |
document.getElementById("start-btn").disabled = false;
|
|
@@ -1345,20 +2027,11 @@ async function startBenchmark() {
|
|
| 1345 |
}
|
| 1346 |
}
|
| 1347 |
|
| 1348 |
-
function _startSSE(jobId
|
| 1349 |
if (_eventSource) _eventSource.close();
|
| 1350 |
-
// Init engine progress bars
|
| 1351 |
const pl = document.getElementById("engine-progress-list");
|
| 1352 |
pl.innerHTML = "";
|
| 1353 |
-
|
| 1354 |
-
const div = document.createElement("div");
|
| 1355 |
-
div.id = `eng-progress-${eng}`;
|
| 1356 |
-
div.style = "margin-bottom: 8px;";
|
| 1357 |
-
div.innerHTML = `<div style="display:flex; justify-content:space-between; font-size:12px; margin-bottom:3px;">
|
| 1358 |
-
<span>${eng}</span><span id="eng-pct-${eng}">0%</span></div>
|
| 1359 |
-
<div class="progress-bar-outer"><div class="progress-bar-inner" id="eng-bar-${eng}" style="width:0%"></div></div>`;
|
| 1360 |
-
pl.appendChild(div);
|
| 1361 |
-
});
|
| 1362 |
|
| 1363 |
_eventSource = new EventSource(`/api/benchmark/${jobId}/stream`);
|
| 1364 |
|
|
@@ -1381,16 +2054,22 @@ function _startSSE(jobId, engines) {
|
|
| 1381 |
_eventSource.addEventListener("progress", e => {
|
| 1382 |
const d = JSON.parse(e.data);
|
| 1383 |
const pct = Math.round(d.progress * 100);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1384 |
document.getElementById("bench-status-text").textContent =
|
| 1385 |
`${pct}% — ${d.engine} (${d.processed}/${d.total})`;
|
| 1386 |
-
engines.forEach(eng => {
|
| 1387 |
-
const bar = document.getElementById(`eng-bar-${eng}`);
|
| 1388 |
-
const pctEl = document.getElementById(`eng-pct-${eng}`);
|
| 1389 |
-
if (d.engine === eng && bar && pctEl) {
|
| 1390 |
-
bar.style.width = pct + "%";
|
| 1391 |
-
pctEl.textContent = pct + "%";
|
| 1392 |
-
}
|
| 1393 |
-
});
|
| 1394 |
});
|
| 1395 |
|
| 1396 |
_eventSource.addEventListener("complete", e => {
|
|
@@ -1411,15 +2090,8 @@ function _startSSE(jobId, engines) {
|
|
| 1411 |
_finishBenchmark();
|
| 1412 |
});
|
| 1413 |
|
| 1414 |
-
_eventSource.addEventListener("done", e => {
|
| 1415 |
-
|
| 1416 |
-
});
|
| 1417 |
-
|
| 1418 |
-
_eventSource.onerror = () => {
|
| 1419 |
-
if (_currentJobId) {
|
| 1420 |
-
_finishBenchmark();
|
| 1421 |
-
}
|
| 1422 |
-
};
|
| 1423 |
}
|
| 1424 |
|
| 1425 |
function _showResults(data) {
|
|
@@ -1656,11 +2328,15 @@ async function confirmImport() {
|
|
| 1656 |
}
|
| 1657 |
|
| 1658 |
// ─── Init ────────────────────────────────────────────────────────────────────
|
| 1659 |
-
document.addEventListener("DOMContentLoaded", () => {
|
| 1660 |
loadStatus();
|
| 1661 |
-
loadEngineCheckboxes();
|
| 1662 |
loadNormProfiles();
|
| 1663 |
initHTRFilters();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1664 |
// Close modal on backdrop click
|
| 1665 |
document.getElementById("import-modal").addEventListener("click", e => {
|
| 1666 |
if (e.target === document.getElementById("import-modal")) closeImportModal();
|
|
|
|
| 136 |
max_samples: int = 100
|
| 137 |
|
| 138 |
|
| 139 |
+
class CompetitorConfig(BaseModel):
|
| 140 |
+
name: str = ""
|
| 141 |
+
ocr_engine: str
|
| 142 |
+
ocr_model: str = ""
|
| 143 |
+
llm_provider: str = ""
|
| 144 |
+
llm_model: str = ""
|
| 145 |
+
pipeline_mode: str = ""
|
| 146 |
+
prompt_file: str = ""
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
class BenchmarkRunRequest(BaseModel):
|
| 150 |
+
corpus_path: str
|
| 151 |
+
competitors: list[CompetitorConfig]
|
| 152 |
+
normalization_profile: str = "nfc"
|
| 153 |
+
output_dir: str = "./rapports/"
|
| 154 |
+
report_name: str = ""
|
| 155 |
+
report_lang: str = "fr"
|
| 156 |
+
|
| 157 |
+
|
| 158 |
# ---------------------------------------------------------------------------
|
| 159 |
# API — status
|
| 160 |
# ---------------------------------------------------------------------------
|
|
|
|
| 217 |
|
| 218 |
# Tesseract
|
| 219 |
tess = _check_engine("tesseract", "pytesseract")
|
| 220 |
+
tess["langs"] = _get_tesseract_langs()
|
| 221 |
engines.append(tess)
|
| 222 |
|
| 223 |
# Pero OCR
|
|
|
|
| 232 |
calamari = _check_engine("calamari", "calamari_ocr", label="Calamari")
|
| 233 |
engines.append(calamari)
|
| 234 |
|
| 235 |
+
# Mistral OCR (API cloud)
|
| 236 |
+
mistral_key = os.environ.get("MISTRAL_API_KEY")
|
| 237 |
+
engines.append({
|
| 238 |
+
"id": "mistral_ocr",
|
| 239 |
+
"label": "Mistral OCR (Pixtral / mistral-ocr-latest)",
|
| 240 |
+
"type": "ocr_cloud",
|
| 241 |
+
"available": bool(mistral_key),
|
| 242 |
+
"key_env": "MISTRAL_API_KEY",
|
| 243 |
+
"status": "configured" if mistral_key else "missing_key",
|
| 244 |
+
"version": "",
|
| 245 |
+
})
|
| 246 |
+
|
| 247 |
+
# Google Vision (API cloud)
|
| 248 |
+
gv_key = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") or os.environ.get("GOOGLE_API_KEY")
|
| 249 |
+
engines.append({
|
| 250 |
+
"id": "google_vision",
|
| 251 |
+
"label": "Google Vision API",
|
| 252 |
+
"type": "ocr_cloud",
|
| 253 |
+
"available": bool(gv_key),
|
| 254 |
+
"key_env": "GOOGLE_APPLICATION_CREDENTIALS",
|
| 255 |
+
"status": "configured" if gv_key else "missing_key",
|
| 256 |
+
"version": "",
|
| 257 |
+
})
|
| 258 |
+
|
| 259 |
+
# Azure Document Intelligence (API cloud)
|
| 260 |
+
az_key = os.environ.get("AZURE_DOC_INTEL_KEY")
|
| 261 |
+
engines.append({
|
| 262 |
+
"id": "azure_doc_intel",
|
| 263 |
+
"label": "Azure Document Intelligence",
|
| 264 |
+
"type": "ocr_cloud",
|
| 265 |
+
"available": bool(az_key),
|
| 266 |
+
"key_env": "AZURE_DOC_INTEL_KEY",
|
| 267 |
+
"status": "configured" if az_key else "missing_key",
|
| 268 |
+
"version": "",
|
| 269 |
+
})
|
| 270 |
+
|
| 271 |
llms = []
|
| 272 |
|
| 273 |
# OpenAI
|
|
|
|
| 290 |
"status": "configured" if os.environ.get("ANTHROPIC_API_KEY") else "missing_key",
|
| 291 |
})
|
| 292 |
|
| 293 |
+
# Mistral LLM
|
| 294 |
llms.append({
|
| 295 |
"id": "mistral",
|
| 296 |
+
"label": "Mistral LLM (Mistral Large, Small…)",
|
| 297 |
"type": "llm",
|
| 298 |
"available": bool(os.environ.get("MISTRAL_API_KEY")),
|
| 299 |
"key_env": "MISTRAL_API_KEY",
|
|
|
|
| 368 |
return []
|
| 369 |
|
| 370 |
|
| 371 |
+
def _get_tesseract_langs() -> list[str]:
|
| 372 |
+
try:
|
| 373 |
+
import pytesseract
|
| 374 |
+
langs = pytesseract.get_languages(config="")
|
| 375 |
+
return sorted(l for l in langs if l != "osd")
|
| 376 |
+
except Exception:
|
| 377 |
+
return ["fra", "lat", "eng", "deu", "ita", "spa"]
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
# ---------------------------------------------------------------------------
|
| 381 |
+
# API — models (dynamic per provider)
|
| 382 |
+
# ---------------------------------------------------------------------------
|
| 383 |
+
|
| 384 |
+
@app.get("/api/models/{provider}")
|
| 385 |
+
async def api_models(provider: str) -> dict:
|
| 386 |
+
"""Retourne la liste des modèles disponibles pour un provider, en temps réel."""
|
| 387 |
+
import urllib.error
|
| 388 |
+
import urllib.request as _urlreq
|
| 389 |
+
|
| 390 |
+
def _fetch_json(url: str, headers: dict) -> dict:
|
| 391 |
+
req = _urlreq.Request(url, headers=headers)
|
| 392 |
+
with _urlreq.urlopen(req, timeout=10) as resp:
|
| 393 |
+
return json.loads(resp.read().decode())
|
| 394 |
+
|
| 395 |
+
if provider == "tesseract":
|
| 396 |
+
return {"provider": provider, "models": _get_tesseract_langs()}
|
| 397 |
+
|
| 398 |
+
if provider == "mistral_ocr":
|
| 399 |
+
api_key = os.environ.get("MISTRAL_API_KEY")
|
| 400 |
+
if not api_key:
|
| 401 |
+
return {"provider": provider, "models": [], "error": "MISTRAL_API_KEY non définie"}
|
| 402 |
+
try:
|
| 403 |
+
data = _fetch_json(
|
| 404 |
+
"https://api.mistral.ai/v1/models",
|
| 405 |
+
{"Authorization": f"Bearer {api_key}"},
|
| 406 |
+
)
|
| 407 |
+
models = sorted(
|
| 408 |
+
m["id"] for m in data.get("data", [])
|
| 409 |
+
if "pixtral" in m["id"].lower() or "mistral-ocr" in m["id"].lower()
|
| 410 |
+
)
|
| 411 |
+
return {"provider": provider, "models": models}
|
| 412 |
+
except Exception as exc:
|
| 413 |
+
return {
|
| 414 |
+
"provider": provider,
|
| 415 |
+
"models": ["pixtral-12b-2409", "pixtral-large-latest", "mistral-ocr-latest"],
|
| 416 |
+
"error": str(exc),
|
| 417 |
+
}
|
| 418 |
+
|
| 419 |
+
if provider == "openai":
|
| 420 |
+
api_key = os.environ.get("OPENAI_API_KEY")
|
| 421 |
+
if not api_key:
|
| 422 |
+
return {"provider": provider, "models": [], "error": "OPENAI_API_KEY non définie"}
|
| 423 |
+
try:
|
| 424 |
+
data = _fetch_json(
|
| 425 |
+
"https://api.openai.com/v1/models",
|
| 426 |
+
{"Authorization": f"Bearer {api_key}"},
|
| 427 |
+
)
|
| 428 |
+
models = sorted(
|
| 429 |
+
(m["id"] for m in data.get("data", []) if "gpt-4" in m["id"].lower()),
|
| 430 |
+
reverse=True,
|
| 431 |
+
)
|
| 432 |
+
return {"provider": provider, "models": models}
|
| 433 |
+
except Exception as exc:
|
| 434 |
+
return {
|
| 435 |
+
"provider": provider,
|
| 436 |
+
"models": ["gpt-4o", "gpt-4o-mini", "gpt-4-turbo"],
|
| 437 |
+
"error": str(exc),
|
| 438 |
+
}
|
| 439 |
+
|
| 440 |
+
if provider == "anthropic":
|
| 441 |
+
api_key = os.environ.get("ANTHROPIC_API_KEY")
|
| 442 |
+
if not api_key:
|
| 443 |
+
return {"provider": provider, "models": [], "error": "ANTHROPIC_API_KEY non définie"}
|
| 444 |
+
try:
|
| 445 |
+
data = _fetch_json(
|
| 446 |
+
"https://api.anthropic.com/v1/models",
|
| 447 |
+
{"x-api-key": api_key, "anthropic-version": "2023-06-01"},
|
| 448 |
+
)
|
| 449 |
+
models = [m["id"] for m in data.get("data", [])]
|
| 450 |
+
return {"provider": provider, "models": models}
|
| 451 |
+
except Exception as exc:
|
| 452 |
+
return {
|
| 453 |
+
"provider": provider,
|
| 454 |
+
"models": ["claude-sonnet-4-6", "claude-haiku-4-5-20251001", "claude-opus-4-6"],
|
| 455 |
+
"error": str(exc),
|
| 456 |
+
}
|
| 457 |
+
|
| 458 |
+
if provider == "mistral":
|
| 459 |
+
api_key = os.environ.get("MISTRAL_API_KEY")
|
| 460 |
+
if not api_key:
|
| 461 |
+
return {"provider": provider, "models": [], "error": "MISTRAL_API_KEY non définie"}
|
| 462 |
+
try:
|
| 463 |
+
data = _fetch_json(
|
| 464 |
+
"https://api.mistral.ai/v1/models",
|
| 465 |
+
{"Authorization": f"Bearer {api_key}"},
|
| 466 |
+
)
|
| 467 |
+
models = sorted(
|
| 468 |
+
m["id"] for m in data.get("data", [])
|
| 469 |
+
if "pixtral" not in m["id"].lower() and "mistral-ocr" not in m["id"].lower()
|
| 470 |
+
)
|
| 471 |
+
return {"provider": provider, "models": models}
|
| 472 |
+
except Exception as exc:
|
| 473 |
+
return {
|
| 474 |
+
"provider": provider,
|
| 475 |
+
"models": ["mistral-large-latest", "mistral-small-latest"],
|
| 476 |
+
"error": str(exc),
|
| 477 |
+
}
|
| 478 |
+
|
| 479 |
+
if provider == "ollama":
|
| 480 |
+
return {"provider": provider, "models": _list_ollama_models()}
|
| 481 |
+
|
| 482 |
+
if provider == "google_vision":
|
| 483 |
+
return {"provider": provider, "models": ["document_text_detection", "text_detection"]}
|
| 484 |
+
|
| 485 |
+
if provider == "azure_doc_intel":
|
| 486 |
+
return {"provider": provider, "models": ["prebuilt-document", "prebuilt-read"]}
|
| 487 |
+
|
| 488 |
+
if provider == "prompts":
|
| 489 |
+
prompts_dir = Path(__file__).parent.parent / "prompts"
|
| 490 |
+
if prompts_dir.exists():
|
| 491 |
+
prompts = sorted(f.name for f in prompts_dir.glob("*.txt"))
|
| 492 |
+
else:
|
| 493 |
+
prompts = []
|
| 494 |
+
return {"provider": provider, "models": prompts}
|
| 495 |
+
|
| 496 |
+
raise HTTPException(status_code=404, detail=f"Provider inconnu : {provider}")
|
| 497 |
+
|
| 498 |
+
|
| 499 |
# ---------------------------------------------------------------------------
|
| 500 |
# API — corpus browse
|
| 501 |
# ---------------------------------------------------------------------------
|
|
|
|
| 771 |
return f"event: {event_type}\ndata: {payload}\n\n"
|
| 772 |
|
| 773 |
|
| 774 |
+
# ---------------------------------------------------------------------------
|
| 775 |
+
# API — benchmark/run (concurrents composés)
|
| 776 |
+
# ---------------------------------------------------------------------------
|
| 777 |
+
|
| 778 |
+
@app.post("/api/benchmark/run")
|
| 779 |
+
async def api_benchmark_run(req: BenchmarkRunRequest) -> dict:
|
| 780 |
+
corpus_path = Path(req.corpus_path)
|
| 781 |
+
if not corpus_path.exists() or not corpus_path.is_dir():
|
| 782 |
+
raise HTTPException(status_code=400, detail=f"Corpus non trouvé : {req.corpus_path}")
|
| 783 |
+
if not req.competitors:
|
| 784 |
+
raise HTTPException(status_code=400, detail="Aucun concurrent défini.")
|
| 785 |
+
|
| 786 |
+
job_id = str(uuid.uuid4())
|
| 787 |
+
job = BenchmarkJob(job_id=job_id)
|
| 788 |
+
_JOBS[job_id] = job
|
| 789 |
+
|
| 790 |
+
thread = threading.Thread(
|
| 791 |
+
target=_run_benchmark_thread_v2,
|
| 792 |
+
args=(job, req),
|
| 793 |
+
daemon=True,
|
| 794 |
+
)
|
| 795 |
+
thread.start()
|
| 796 |
+
return {"job_id": job_id, "status": "pending"}
|
| 797 |
+
|
| 798 |
+
|
| 799 |
+
def _engine_from_competitor(comp: CompetitorConfig) -> Any:
|
| 800 |
+
"""Instancie un moteur OCR (ou pipeline OCR+LLM) depuis une CompetitorConfig."""
|
| 801 |
+
from picarones.engines.tesseract import TesseractEngine
|
| 802 |
+
from picarones.engines.mistral_ocr import MistralOCREngine
|
| 803 |
+
|
| 804 |
+
engine_id = comp.ocr_engine
|
| 805 |
+
|
| 806 |
+
if engine_id == "tesseract":
|
| 807 |
+
ocr = TesseractEngine(config={"lang": comp.ocr_model or "fra", "psm": 6})
|
| 808 |
+
elif engine_id == "mistral_ocr":
|
| 809 |
+
ocr = MistralOCREngine(config={"model": comp.ocr_model or "pixtral-12b-2409"})
|
| 810 |
+
elif engine_id == "google_vision":
|
| 811 |
+
try:
|
| 812 |
+
from picarones.engines.google_vision import GoogleVisionEngine
|
| 813 |
+
ocr = GoogleVisionEngine(config={"detection_type": comp.ocr_model or "document_text_detection"})
|
| 814 |
+
except ImportError as exc:
|
| 815 |
+
raise RuntimeError("Google Vision non disponible (google-cloud-vision non installé).") from exc
|
| 816 |
+
elif engine_id == "azure_doc_intel":
|
| 817 |
+
try:
|
| 818 |
+
from picarones.engines.azure_doc_intel import AzureDocIntelEngine
|
| 819 |
+
ocr = AzureDocIntelEngine(config={"model": comp.ocr_model or "prebuilt-document"})
|
| 820 |
+
except ImportError as exc:
|
| 821 |
+
raise RuntimeError("Azure Document Intelligence non disponible.") from exc
|
| 822 |
+
else:
|
| 823 |
+
raise ValueError(f"Moteur OCR inconnu : {engine_id}")
|
| 824 |
+
|
| 825 |
+
if not comp.llm_provider:
|
| 826 |
+
return ocr
|
| 827 |
+
|
| 828 |
+
# Pipeline OCR+LLM
|
| 829 |
+
_mode_map = {
|
| 830 |
+
"text_only": "text_only",
|
| 831 |
+
"post_correction_text": "text_only",
|
| 832 |
+
"text_and_image": "text_and_image",
|
| 833 |
+
"post_correction_image": "text_and_image",
|
| 834 |
+
"zero_shot": "zero_shot",
|
| 835 |
+
}
|
| 836 |
+
mode = _mode_map.get(comp.pipeline_mode, "text_only")
|
| 837 |
+
|
| 838 |
+
if comp.llm_provider == "openai":
|
| 839 |
+
from picarones.llm.openai_adapter import OpenAIAdapter
|
| 840 |
+
llm = OpenAIAdapter(model=comp.llm_model or None)
|
| 841 |
+
elif comp.llm_provider == "anthropic":
|
| 842 |
+
from picarones.llm.anthropic_adapter import AnthropicAdapter
|
| 843 |
+
llm = AnthropicAdapter(model=comp.llm_model or None)
|
| 844 |
+
elif comp.llm_provider == "mistral":
|
| 845 |
+
from picarones.llm.mistral_adapter import MistralAdapter
|
| 846 |
+
llm = MistralAdapter(model=comp.llm_model or None)
|
| 847 |
+
elif comp.llm_provider == "ollama":
|
| 848 |
+
from picarones.llm.ollama_adapter import OllamaAdapter
|
| 849 |
+
llm = OllamaAdapter(model=comp.llm_model or None)
|
| 850 |
+
else:
|
| 851 |
+
raise ValueError(f"Provider LLM inconnu : {comp.llm_provider}")
|
| 852 |
+
|
| 853 |
+
from picarones.pipelines.base import OCRLLMPipeline
|
| 854 |
+
prompt = comp.prompt_file or "correction_medieval_french.txt"
|
| 855 |
+
pipeline_name = comp.name or f"{engine_id}→{comp.llm_model or comp.llm_provider}"
|
| 856 |
+
return OCRLLMPipeline(
|
| 857 |
+
ocr_engine=ocr,
|
| 858 |
+
llm_adapter=llm,
|
| 859 |
+
mode=mode,
|
| 860 |
+
prompt=prompt,
|
| 861 |
+
pipeline_name=pipeline_name,
|
| 862 |
+
)
|
| 863 |
+
|
| 864 |
+
|
| 865 |
+
def _run_benchmark_thread_v2(job: BenchmarkJob, req: BenchmarkRunRequest) -> None:
    """Run a benchmark built from a list of composed ``CompetitorConfig`` entries.

    Designed to run in a background thread: all progress is reported through
    ``job`` (status fields + SSE events via ``job.add_event``), never raised to
    the caller. Terminal states are ``complete``, ``error``, or an early return
    when ``job.status`` was set to ``"cancelled"`` by another thread.

    Args:
        job: Mutable job record updated in place (status, progress, events).
        req: Request carrying corpus path, competitors list, output options.
    """
    job.status = "running"
    job.started_at = _iso_now()
    job.add_event("start", {"message": "Démarrage du benchmark…", "corpus": req.corpus_path})

    try:
        # Imported lazily so the web app starts even if heavy deps are missing.
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import run_benchmark

        corpus = load_corpus_from_directory(req.corpus_path)
        job.total_docs = len(corpus)
        job.add_event("log", {"message": f"{job.total_docs} documents chargés."})

        # Cooperative cancellation check before the expensive part.
        if job.status == "cancelled":
            return

        # Build engine instances; a competitor that fails to instantiate
        # (missing key, missing package…) is skipped with a warning instead
        # of aborting the whole run.
        engines = []
        for comp in req.competitors:
            try:
                eng = _engine_from_competitor(comp)
                engines.append(eng)
                job.add_event("log", {"message": f"Concurrent : {eng.name}"})
            except Exception as exc:
                job.add_event("warning", {
                    "message": f"Concurrent ignoré '{comp.name or comp.ocr_engine}' : {exc}"
                })

        if not engines:
            raise ValueError("Aucun concurrent valide disponible.")

        output_dir = Path(req.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        report_name = req.report_name or f"rapport_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        output_json = str(output_dir / f"{report_name}.json")
        output_html = str(output_dir / f"{report_name}.html")

        n_engines = len(engines)
        total_steps = job.total_docs * n_engines
        # One-element list so the nested callback can mutate the counter
        # without a ``nonlocal`` declaration.
        step_counter = [0]

        def _progress_callback(engine_name: str, doc_idx: int, doc_id: str) -> None:
            # Called by the runner after each (engine, document) step.
            if job.status == "cancelled":
                return
            step_counter[0] += 1
            job.current_engine = engine_name
            job.processed_docs = doc_idx
            # max(..., 1) guards against division by zero on an empty corpus.
            job.progress = step_counter[0] / max(total_steps, 1)
            job.add_event("progress", {
                "engine": engine_name,
                "doc_idx": doc_idx,
                "doc_id": doc_id,
                "progress": job.progress,
                "processed": step_counter[0],
                "total": total_steps,
            })

        result = run_benchmark(
            corpus=corpus,
            engines=engines,
            output_json=output_json,
            show_progress=False,
            progress_callback=_progress_callback,
        )

        if job.status == "cancelled":
            return

        job.add_event("log", {"message": "Génération du rapport HTML…"})
        from picarones.report.generator import ReportGenerator
        gen = ReportGenerator(result, lang=req.report_lang)
        gen.generate(output_html)

        job.output_path = output_html
        job.progress = 1.0
        job.status = "complete"
        job.finished_at = _iso_now()

        ranking = result.ranking()
        job.add_event("complete", {
            "message": "Benchmark terminé.",
            "output_html": output_html,
            "output_json": output_json,
            "ranking": ranking,
        })

    except Exception as exc:
        # Top-level boundary for the worker thread: record the failure on the
        # job so the frontend sees it; never let the exception escape.
        job.status = "error"
        job.error = str(exc)
        job.finished_at = _iso_now()
        job.add_event("error", {"message": f"Erreur : {exc}"})
|
| 958 |
+
|
| 959 |
+
|
| 960 |
def _run_benchmark_thread(job: BenchmarkJob, req: BenchmarkRequest) -> None:
|
| 961 |
"""Exécute le benchmark dans un thread et envoie des événements SSE."""
|
| 962 |
import time
|
|
|
|
| 1210 |
/* Spinner */
|
| 1211 |
.spinner { display: inline-block; width: 14px; height: 14px; border: 2px solid #ccc; border-top-color: var(--accent); border-radius: 50%; animation: spin 0.7s linear infinite; }
|
| 1212 |
@keyframes spin { to { transform: rotate(360deg); } }
|
| 1213 |
+
|
| 1214 |
+
/* Provider rows (OCR/LLM status sections) */
|
| 1215 |
+
.provider-row { display: flex; align-items: center; gap: 10px; padding: 7px 10px; border: 1px solid var(--border); border-radius: var(--radius); margin-bottom: 6px; background: #fff; }
|
| 1216 |
+
.provider-label { min-width: 200px; display: flex; align-items: center; gap: 8px; font-size: 13px; font-weight: 500; }
|
| 1217 |
+
.provider-status { font-size: 11px; color: var(--text-muted); min-width: 80px; }
|
| 1218 |
+
.provider-model-select { flex: 1; font-size: 12px; color: var(--text-muted); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
|
| 1219 |
+
|
| 1220 |
+
/* Competitor composer */
|
| 1221 |
+
.mode-toggle { display: flex; gap: 20px; padding: 10px 14px; background: #f4f2ed; border-radius: var(--radius); margin-bottom: 12px; }
|
| 1222 |
+
.mode-toggle label { display: flex; align-items: center; gap: 7px; cursor: pointer; font-size: 13px; font-weight: 500; }
|
| 1223 |
+
.composer-row { display: flex; gap: 10px; flex-wrap: wrap; align-items: flex-end; margin-bottom: 10px; }
|
| 1224 |
+
.composer-row .form-group { min-width: 150px; }
|
| 1225 |
+
|
| 1226 |
+
/* Competitor cards */
|
| 1227 |
+
.competitor-card { display: flex; align-items: center; justify-content: space-between; padding: 9px 14px; border: 1px solid var(--border); border-radius: var(--radius); margin-bottom: 7px; background: #fff; gap: 10px; }
|
| 1228 |
+
.competitor-card:hover { border-color: var(--accent); background: #f8f7ff; }
|
| 1229 |
+
.competitor-info { display: flex; align-items: center; gap: 10px; flex: 1; min-width: 0; }
|
| 1230 |
+
.competitor-badge { font-size: 11px; background: #eef2fc; color: var(--accent); padding: 2px 8px; border-radius: 10px; white-space: nowrap; flex-shrink: 0; }
|
| 1231 |
+
.competitor-name { font-size: 13px; font-weight: 500; }
|
| 1232 |
+
.competitor-detail { font-size: 11px; color: var(--text-muted); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
|
| 1233 |
</style>
|
| 1234 |
</head>
|
| 1235 |
<body>
|
|
|
|
| 1266 |
<div id="corpus-info" style="margin-top:8px; font-size:12px; color: var(--text-muted);"></div>
|
| 1267 |
</div>
|
| 1268 |
|
| 1269 |
+
<!-- ── Section 1 : Moteurs OCR ─────────────────────────────────── -->
|
| 1270 |
<div class="card">
|
| 1271 |
+
<h2 data-i18n="bench_ocr_title">2. Moteurs OCR</h2>
|
| 1272 |
+
<div id="ocr-engines-status-list">
|
| 1273 |
+
<div style="color: var(--text-muted); font-size: 12px;"><span class="spinner"></span> Chargement…</div>
|
| 1274 |
</div>
|
| 1275 |
</div>
|
| 1276 |
|
| 1277 |
+
<!-- ── Section 2 : Modèles LLM ──────────────────────────────────── -->
|
| 1278 |
<div class="card">
|
| 1279 |
+
<h2 data-i18n="bench_llm_title">3. Modèles LLM</h2>
|
| 1280 |
+
<div id="llm-status-list">
|
| 1281 |
+
<div style="color: var(--text-muted); font-size: 12px;"><span class="spinner"></span> Chargement…</div>
|
| 1282 |
+
</div>
|
| 1283 |
+
</div>
|
| 1284 |
+
|
| 1285 |
+
<!-- ── Section 3 : Composition des concurrents ──────────────────── -->
|
| 1286 |
+
<div class="card">
|
| 1287 |
+
<h2 data-i18n="bench_compose_title">4. Concurrents à benchmarker</h2>
|
| 1288 |
+
|
| 1289 |
+
<div class="mode-toggle">
|
| 1290 |
+
<label><input type="radio" name="compose-mode" value="ocr" checked onchange="onComposeModeChange()"> 🔍 <span data-i18n="compose_ocr_only">OCR seul</span></label>
|
| 1291 |
+
<label><input type="radio" name="compose-mode" value="pipeline" onchange="onComposeModeChange()"> ⛓ <span data-i18n="compose_pipeline">Pipeline OCR+LLM</span></label>
|
| 1292 |
+
</div>
|
| 1293 |
+
|
| 1294 |
+
<div class="composer-row">
|
| 1295 |
+
<div class="form-group">
|
| 1296 |
+
<label data-i18n="compose_ocr_engine">Moteur OCR</label>
|
| 1297 |
+
<select id="compose-ocr-engine" onchange="onComposeOCRChange()">
|
| 1298 |
+
<option value="tesseract">Tesseract</option>
|
| 1299 |
+
<option value="mistral_ocr">Mistral OCR</option>
|
| 1300 |
+
<option value="google_vision">Google Vision</option>
|
| 1301 |
+
<option value="azure_doc_intel">Azure Doc Intel</option>
|
| 1302 |
+
</select>
|
| 1303 |
+
</div>
|
| 1304 |
+
<div class="form-group" style="flex:1;">
|
| 1305 |
+
<label data-i18n="compose_ocr_model">Modèle / Langue <span class="spinner" id="sp-ocr-model" style="display:none"></span></label>
|
| 1306 |
+
<select id="compose-ocr-model"></select>
|
| 1307 |
+
</div>
|
| 1308 |
+
</div>
|
| 1309 |
+
|
| 1310 |
+
<div id="compose-pipeline-section" style="display:none;">
|
| 1311 |
+
<div class="composer-row">
|
| 1312 |
+
<div class="form-group">
|
| 1313 |
+
<label data-i18n="compose_llm_provider">Provider LLM</label>
|
| 1314 |
+
<select id="compose-llm-provider" onchange="onComposeLLMChange()">
|
| 1315 |
+
<option value="openai">OpenAI</option>
|
| 1316 |
+
<option value="anthropic">Anthropic</option>
|
| 1317 |
+
<option value="mistral">Mistral LLM</option>
|
| 1318 |
+
<option value="ollama">Ollama</option>
|
| 1319 |
+
</select>
|
| 1320 |
+
</div>
|
| 1321 |
+
<div class="form-group" style="flex:1;">
|
| 1322 |
+
<label data-i18n="compose_llm_model">Modèle LLM <span class="spinner" id="sp-llm-model" style="display:none"></span></label>
|
| 1323 |
+
<select id="compose-llm-model"></select>
|
| 1324 |
+
</div>
|
| 1325 |
+
</div>
|
| 1326 |
+
<div class="composer-row">
|
| 1327 |
+
<div class="form-group">
|
| 1328 |
+
<label data-i18n="compose_mode">Mode pipeline</label>
|
| 1329 |
+
<select id="compose-pipeline-mode">
|
| 1330 |
+
<option value="text_only" data-i18n="mode_text_only">Post-correction texte</option>
|
| 1331 |
+
<option value="text_and_image" data-i18n="mode_text_image">Post-correction image+texte</option>
|
| 1332 |
+
<option value="zero_shot" data-i18n="mode_zero_shot">Zero-shot</option>
|
| 1333 |
+
</select>
|
| 1334 |
+
</div>
|
| 1335 |
+
<div class="form-group" style="flex:1;">
|
| 1336 |
+
<label data-i18n="compose_prompt">Prompt <span class="spinner" id="sp-prompt" style="display:none"></span></label>
|
| 1337 |
+
<select id="compose-prompt"></select>
|
| 1338 |
+
</div>
|
| 1339 |
+
</div>
|
| 1340 |
+
</div>
|
| 1341 |
+
|
| 1342 |
+
<div style="display:flex; gap:10px; align-items:center; margin-top:10px;">
|
| 1343 |
+
<button class="btn btn-primary btn-sm" onclick="addCompetitor()" data-i18n="compose_add">+ Ajouter</button>
|
| 1344 |
+
<span id="compose-error" style="color: var(--danger); font-size:12px;"></span>
|
| 1345 |
+
</div>
|
| 1346 |
+
|
| 1347 |
+
<div id="competitors-list" style="margin-top:14px;">
|
| 1348 |
+
<div style="color: var(--text-muted); font-size:12px;" data-i18n="compose_empty">Aucun concurrent ajouté.</div>
|
| 1349 |
+
</div>
|
| 1350 |
+
</div>
|
| 1351 |
+
|
| 1352 |
+
<!-- ── 5. Options ─────────────────────────────────────────────────── -->
|
| 1353 |
+
<div class="card">
|
| 1354 |
+
<h2 data-i18n="bench_options_title">5. Options</h2>
|
| 1355 |
<div class="form-row">
|
| 1356 |
<div class="form-group">
|
| 1357 |
<label data-i18n="bench_norm_label">Profil de normalisation</label>
|
|
|
|
| 1359 |
<option value="nfc">NFC (standard)</option>
|
| 1360 |
</select>
|
| 1361 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1362 |
<div class="form-group">
|
| 1363 |
<label data-i18n="bench_output_label">Dossier de sortie</label>
|
| 1364 |
<input type="text" id="output-dir" value="./rapports/" />
|
|
|
|
| 1535 |
bench_corpus_label: "Chemin vers le dossier corpus (paires image / .gt.txt)",
|
| 1536 |
bench_browse: "Parcourir",
|
| 1537 |
bench_engines_title: "2. Moteurs et pipelines",
|
| 1538 |
+
bench_ocr_title: "2. Moteurs OCR",
|
| 1539 |
+
bench_llm_title: "3. Modèles LLM",
|
| 1540 |
+
bench_compose_title: "4. Concurrents à benchmarker",
|
| 1541 |
+
bench_options_title: "5. Options",
|
| 1542 |
+
compose_ocr_only: "OCR seul",
|
| 1543 |
+
compose_pipeline: "Pipeline OCR+LLM",
|
| 1544 |
+
compose_ocr_engine: "Moteur OCR",
|
| 1545 |
+
compose_ocr_model: "Modèle / Langue",
|
| 1546 |
+
compose_llm_provider: "Provider LLM",
|
| 1547 |
+
compose_llm_model: "Modèle LLM",
|
| 1548 |
+
compose_mode: "Mode pipeline",
|
| 1549 |
+
compose_prompt: "Prompt",
|
| 1550 |
+
compose_add: "+ Ajouter",
|
| 1551 |
+
compose_empty: "Aucun concurrent ajouté.",
|
| 1552 |
+
mode_text_only: "Post-correction texte",
|
| 1553 |
+
mode_text_image: "Post-correction image+texte",
|
| 1554 |
+
mode_zero_shot: "Zero-shot",
|
| 1555 |
bench_norm_label: "Profil de normalisation",
|
| 1556 |
bench_lang_label: "Langue (Tesseract)",
|
| 1557 |
bench_output_label: "Dossier de sortie",
|
|
|
|
| 1603 |
bench_corpus_label: "Path to corpus directory (image / .gt.txt pairs)",
|
| 1604 |
bench_browse: "Browse",
|
| 1605 |
bench_engines_title: "2. Engines & pipelines",
|
| 1606 |
+
bench_ocr_title: "2. OCR Engines",
|
| 1607 |
+
bench_llm_title: "3. LLM Models",
|
| 1608 |
+
bench_compose_title: "4. Competitors",
|
| 1609 |
+
bench_options_title: "5. Options",
|
| 1610 |
+
compose_ocr_only: "OCR only",
|
| 1611 |
+
compose_pipeline: "OCR+LLM Pipeline",
|
| 1612 |
+
compose_ocr_engine: "OCR Engine",
|
| 1613 |
+
compose_ocr_model: "Model / Language",
|
| 1614 |
+
compose_llm_provider: "LLM Provider",
|
| 1615 |
+
compose_llm_model: "LLM Model",
|
| 1616 |
+
compose_mode: "Pipeline mode",
|
| 1617 |
+
compose_prompt: "Prompt",
|
| 1618 |
+
compose_add: "+ Add",
|
| 1619 |
+
compose_empty: "No competitors added.",
|
| 1620 |
+
mode_text_only: "Text post-correction",
|
| 1621 |
+
mode_text_image: "Image+text post-correction",
|
| 1622 |
+
mode_zero_shot: "Zero-shot",
|
| 1623 |
bench_norm_label: "Normalization profile",
|
| 1624 |
bench_lang_label: "Language (Tesseract)",
|
| 1625 |
bench_output_label: "Output directory",
|
|
|
|
| 1693 |
} catch(e) {}
|
| 1694 |
}
|
| 1695 |
|
| 1696 |
+
// ─── Models cache & fetching ─────────────────────────────────────────────────
|
| 1697 |
+
let _modelsCache = {};
|
| 1698 |
+
let _enginesData = null;
|
| 1699 |
+
let _competitors = [];
|
| 1700 |
+
let _refreshIntervalId = null;
|
| 1701 |
+
|
| 1702 |
+
// Resolve the model list for a provider, hitting /api/models/{provider}
// only on a cache miss; subsequent calls reuse the in-memory cache.
async function fetchModels(provider) {
  const cached = _modelsCache[provider];
  if (cached) return cached;
  const resp = await fetch(`/api/models/${provider}`);
  const payload = await resp.json();
  const list = payload.models || [];
  _modelsCache[provider] = list;
  return list;
}
|
| 1710 |
+
|
| 1711 |
+
// Fill a <select> with one <option> per model name and hide the optional
// loading spinner. Model names come back from the API and are HTML-escaped
// before interpolation: a name containing `"`, `<` or `&` would otherwise
// break the option markup (and is an injection vector via innerHTML).
function populateSelect(selectId, models, spinnerId) {
  const sel = document.getElementById(selectId);
  if (spinnerId) { const sp = document.getElementById(spinnerId); if (sp) sp.style.display = "none"; }
  if (!sel) return;
  const esc = s => String(s)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
  sel.innerHTML = models.length === 0
    ? '<option value="">— aucun modèle —</option>'
    : models.map(m => `<option value="${esc(m)}">${esc(m)}</option>`).join("");
}
|
| 1719 |
+
|
| 1720 |
+
// ─── Benchmark sections (OCR + LLM status + composer init) ───────────────────
|
| 1721 |
+
// Fetch engine/LLM availability from the backend and render sections 1 & 2.
// On failure, the error is surfaced in the OCR section placeholder.
async function loadBenchmarkSections() {
  try {
    const resp = await fetch("/api/engines");
    const data = await resp.json();
    _enginesData = data;
    renderOCREnginesSection(data.engines);
    renderLLMSection(data.llms);
  } catch(e) {
    document.getElementById("ocr-engines-status-list").innerHTML =
      `<div style="color:var(--danger);font-size:12px;">Erreur : ${e.message}</div>`;
  }
}
|
| 1733 |
+
|
| 1734 |
+
// Build one status row (dot + label + status + model slot) for an OCR engine
// or LLM provider. `msId` is the id of the slot later filled with the model
// list; a spinner is shown there while models load for available providers.
function _makeProviderRow(eng, msId) {
  let dotCls;
  if (eng.available) {
    dotCls = "status-ok";
  } else {
    dotCls = eng.status === "not_running" ? "status-warn" : "status-err";
  }

  // Human-readable status, localized via the page-level `lang`.
  let statusLabel;
  if (eng.available) {
    statusLabel = eng.version ? eng.version : (lang === "fr" ? "disponible" : "available");
  } else if (eng.status === "missing_key") {
    statusLabel = eng.key_env
      ? `<code style="font-size:11px;color:var(--warning)">${eng.key_env}</code>`
      : (lang === "fr" ? "clé manquante" : "key missing");
  } else if (eng.status === "not_running") {
    statusLabel = lang === "fr" ? "inactif" : "not running";
  } else {
    statusLabel = lang === "fr" ? "non installé" : "not installed";
  }

  const row = document.createElement("div");
  row.className = "provider-row";
  row.innerHTML = `
    <div class="provider-label"><span class="engine-status ${dotCls}"></span><strong>${eng.label}</strong></div>
    <div class="provider-status">${statusLabel}</div>
    <div class="provider-model-select" id="${msId}">${eng.available ? '<span class="spinner"></span>' : ""}</div>`;
  return row;
}
|
| 1750 |
+
|
| 1751 |
+
// Render section 1: one status row per OCR engine. Model lists are fetched
// fire-and-forget (not awaited) so all providers load in parallel; each
// result replaces the spinner in its own slot when it arrives.
async function renderOCREnginesSection(engines) {
  const list = document.getElementById("ocr-engines-status-list");
  list.innerHTML = "";
  engines.forEach(eng => {
    const slotId = `ms-ocr-${eng.id}`;
    list.appendChild(_makeProviderRow(eng, slotId));
    if (!eng.available) return;
    fetchModels(eng.id)
      .then(models => {
        const slot = document.getElementById(slotId);
        if (!slot) return;
        slot.innerHTML = models.length === 0
          ? `<span style="color:var(--text-muted);font-size:11px;">—</span>`
          : `<span style="font-size:12px;">${models.slice(0,5).join(", ")}${models.length > 5 ? ` +${models.length-5}` : ""}</span>`;
      })
      .catch(() => {
        const slot = document.getElementById(slotId);
        if (slot) slot.innerHTML = `<span style="color:var(--danger);font-size:11px;">Erreur API</span>`;
      });
  });
}
|
| 1771 |
+
|
| 1772 |
+
// Render section 2: one status row per LLM provider. Mirrors
// renderOCREnginesSection but previews at most 3 model names per row.
async function renderLLMSection(llms) {
  const list = document.getElementById("llm-status-list");
  list.innerHTML = "";
  llms.forEach(llm => {
    const slotId = `ms-llm-${llm.id}`;
    list.appendChild(_makeProviderRow(llm, slotId));
    if (!llm.available) return;
    fetchModels(llm.id)
      .then(models => {
        const slot = document.getElementById(slotId);
        if (!slot) return;
        slot.innerHTML = models.length === 0
          ? `<span style="color:var(--text-muted);font-size:11px;">—</span>`
          : `<span style="font-size:12px;">${models.slice(0,3).join(", ")}${models.length > 3 ? ` +${models.length-3}` : ""}</span>`;
      })
      .catch(() => {
        const slot = document.getElementById(slotId);
        if (slot) slot.innerHTML = `<span style="color:var(--danger);font-size:11px;">Erreur API</span>`;
      });
  });
}
|
| 1792 |
+
|
| 1793 |
+
// Poll /api/engines every 10 s; when the payload differs from the last one
// (e.g. a new API key became valid), drop the model cache and re-render both
// status sections. Network errors are silently ignored until the next tick.
function startAutoRefresh() {
  if (_refreshIntervalId) clearInterval(_refreshIntervalId);
  _refreshIntervalId = setInterval(async () => {
    try {
      const resp = await fetch("/api/engines");
      const data = await resp.json();
      const unchanged = _enginesData && JSON.stringify(data) === JSON.stringify(_enginesData);
      if (unchanged) return;
      _modelsCache = {};
      _enginesData = data;
      renderOCREnginesSection(data.engines);
      renderLLMSection(data.llms);
    } catch(e) {}
  }, 10000);
}
|
| 1808 |
+
|
| 1809 |
+
// ─── Competitor composer ──────────────────────────────────────────────────────
|
| 1810 |
+
// Reload the OCR model/language <select> when the chosen engine changes.
// A spinner is shown while fetching; populateSelect hides it on success.
async function onComposeOCRChange() {
  const engineId = document.getElementById("compose-ocr-engine").value;
  const spinner = document.getElementById("sp-ocr-model");
  spinner.style.display = "inline-block";
  try {
    populateSelect("compose-ocr-model", await fetchModels(engineId), "sp-ocr-model");
  } catch(e) {
    spinner.style.display = "none";
    document.getElementById("compose-ocr-model").innerHTML = '<option value="">Erreur</option>';
  }
}
|
| 1822 |
+
|
| 1823 |
+
// Reload the LLM model <select> when the chosen provider changes.
// Same spinner/error handling as onComposeOCRChange.
async function onComposeLLMChange() {
  const providerId = document.getElementById("compose-llm-provider").value;
  const spinner = document.getElementById("sp-llm-model");
  spinner.style.display = "inline-block";
  try {
    populateSelect("compose-llm-model", await fetchModels(providerId), "sp-llm-model");
  } catch(e) {
    spinner.style.display = "none";
    document.getElementById("compose-llm-model").innerHTML = '<option value="">Erreur</option>';
  }
}
|
| 1835 |
|
| 1836 |
+
// Show the LLM/pipeline options only when the "pipeline" radio is selected.
function onComposeModeChange() {
  const selected = document.querySelector("input[name=compose-mode]:checked").value;
  const pipelineSection = document.getElementById("compose-pipeline-section");
  pipelineSection.style.display = selected === "pipeline" ? "block" : "none";
}
|
| 1841 |
+
|
| 1842 |
+
// Pre-load the prompt-file <select> from /api/models/prompts.
// On failure the spinner is hidden and the select is simply left empty.
async function loadComposePrompts() {
  document.getElementById("sp-prompt").style.display = "inline-block";
  try {
    const promptFiles = await fetchModels("prompts");
    populateSelect("compose-prompt", promptFiles, "sp-prompt");
  } catch(e) {
    document.getElementById("sp-prompt").style.display = "none";
  }
}
|
| 1851 |
+
|
| 1852 |
+
// Read the composer form, validate it, and append a competitor object to
// `_competitors`. In pipeline mode the LLM provider/model/mode/prompt are
// captured too; a display name is derived from the selections.
function addCompetitor() {
  const engineSel = document.getElementById("compose-ocr-engine").value;
  const modelSel = document.getElementById("compose-ocr-model").value;
  const composeMode = document.querySelector("input[name=compose-mode]:checked").value;
  const errorSpan = document.getElementById("compose-error");

  if (!engineSel) {
    errorSpan.textContent = lang === "fr" ? "Sélectionnez un moteur OCR." : "Select an OCR engine.";
    return;
  }

  const entry = {
    name: "",
    ocr_engine: engineSel,
    ocr_model: modelSel,
    llm_provider: "",
    llm_model: "",
    pipeline_mode: "",
    prompt_file: "",
  };

  if (composeMode !== "pipeline") {
    // OCR-only competitor: name is "engine (model)".
    entry.name = `${engineSel}${modelSel ? " ("+modelSel+")" : ""}`;
  } else {
    entry.llm_provider = document.getElementById("compose-llm-provider").value;
    entry.llm_model = document.getElementById("compose-llm-model").value;
    entry.pipeline_mode = document.getElementById("compose-pipeline-mode").value;
    entry.prompt_file = document.getElementById("compose-prompt").value;
    if (!entry.llm_provider) {
      errorSpan.textContent = lang === "fr" ? "Sélectionnez un provider LLM." : "Select an LLM provider.";
      return;
    }
    // Pipeline competitor: name is "engine:model → provider:model".
    entry.name = `${engineSel}${modelSel ? ":"+modelSel : ""} → ${entry.llm_provider}${entry.llm_model ? ":"+entry.llm_model : ""}`;
  }

  errorSpan.textContent = "";
  _competitors.push(entry);
  renderCompetitors();
}
|
| 1884 |
+
|
| 1885 |
+
// Remove the competitor at position `idx` (wired to each card's ✕ button)
// and redraw the card list.
function removeCompetitor(idx) {
  _competitors.splice(idx, 1);
  renderCompetitors();
}
|
| 1889 |
+
|
| 1890 |
+
// Redraw the competitor card list from `_competitors`. Each card shows an
// OCR/Pipeline badge, the derived name, a compact detail string, and a
// delete button bound to its index.
function renderCompetitors() {
  const listEl = document.getElementById("competitors-list");
  if (_competitors.length === 0) {
    listEl.innerHTML = `<div style="color:var(--text-muted);font-size:12px;">${t("compose_empty")}</div>`;
    return;
  }
  const cards = [];
  for (let i = 0; i < _competitors.length; i++) {
    const c = _competitors[i];
    // A non-empty llm_provider marks a composed OCR+LLM pipeline.
    const pipeline = !!c.llm_provider;
    const badge = pipeline ? "⛓ Pipeline" : "🔍 OCR";
    let detail;
    if (pipeline) {
      detail = `${c.ocr_engine}:${c.ocr_model} → ${c.llm_provider}:${c.llm_model} [${c.pipeline_mode}]`;
    } else {
      detail = `${c.ocr_engine}:${c.ocr_model}`;
    }
    cards.push(`<div class="competitor-card">
      <div class="competitor-info">
        <span class="competitor-badge">${badge}</span>
        <span class="competitor-name">${c.name}</span>
        <span class="competitor-detail">${detail}</span>
      </div>
      <button class="btn btn-danger btn-sm" onclick="removeCompetitor(${i})">✕</button>
    </div>`);
  }
  listEl.innerHTML = cards.join("");
}
|
| 1912 |
|
| 1913 |
// ─── Normalization profiles ──────────────────────────────────────────────────
|
|
|
|
| 1985 |
alert(lang === "fr" ? "Veuillez sélectionner un dossier corpus." : "Please select a corpus directory.");
|
| 1986 |
return;
|
| 1987 |
}
|
| 1988 |
+
if (_competitors.length === 0) {
|
| 1989 |
+
alert(lang === "fr" ? "Ajoutez au moins un concurrent (Section 4)." : "Add at least one competitor (Section 4).");
|
|
|
|
| 1990 |
return;
|
| 1991 |
}
|
| 1992 |
|
| 1993 |
const payload = {
|
| 1994 |
corpus_path: corpusPath,
|
| 1995 |
+
competitors: _competitors,
|
| 1996 |
normalization_profile: document.getElementById("norm-profile").value,
|
| 1997 |
output_dir: document.getElementById("output-dir").value,
|
| 1998 |
report_name: document.getElementById("report-name").value,
|
|
|
|
| 1999 |
};
|
| 2000 |
|
| 2001 |
document.getElementById("start-btn").disabled = true;
|
|
|
|
| 2007 |
document.getElementById("bench-status-text").textContent = lang === "fr" ? "Démarrage…" : "Starting…";
|
| 2008 |
|
| 2009 |
try {
|
| 2010 |
+
const r = await fetch("/api/benchmark/run", {
|
| 2011 |
method: "POST",
|
| 2012 |
headers: {"Content-Type": "application/json"},
|
| 2013 |
body: JSON.stringify(payload),
|
|
|
|
| 2018 |
}
|
| 2019 |
const d = await r.json();
|
| 2020 |
_currentJobId = d.job_id;
|
| 2021 |
+
_startSSE(_currentJobId);
|
| 2022 |
} catch(e) {
|
| 2023 |
appendLog(`Erreur : ${e.message}`, "error");
|
| 2024 |
document.getElementById("start-btn").disabled = false;
|
|
|
|
| 2027 |
}
|
| 2028 |
}
|
| 2029 |
|
| 2030 |
+
function _startSSE(jobId) {
|
| 2031 |
if (_eventSource) _eventSource.close();
|
|
|
|
| 2032 |
const pl = document.getElementById("engine-progress-list");
|
| 2033 |
pl.innerHTML = "";
|
| 2034 |
+
const seenEngines = {};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2035 |
|
| 2036 |
_eventSource = new EventSource(`/api/benchmark/${jobId}/stream`);
|
| 2037 |
|
|
|
|
| 2054 |
_eventSource.addEventListener("progress", e => {
|
| 2055 |
const d = JSON.parse(e.data);
|
| 2056 |
const pct = Math.round(d.progress * 100);
|
| 2057 |
+
const engId = d.engine.replace(/[^a-z0-9_-]/gi, "_");
|
| 2058 |
+
if (!seenEngines[engId]) {
|
| 2059 |
+
seenEngines[engId] = true;
|
| 2060 |
+
const div = document.createElement("div");
|
| 2061 |
+
div.style = "margin-bottom: 8px;";
|
| 2062 |
+
div.innerHTML = `<div style="display:flex;justify-content:space-between;font-size:12px;margin-bottom:3px;">
|
| 2063 |
+
<span>${d.engine}</span><span id="eng-pct-${engId}">0%</span></div>
|
| 2064 |
+
<div class="progress-bar-outer"><div class="progress-bar-inner" id="eng-bar-${engId}" style="width:0%"></div></div>`;
|
| 2065 |
+
pl.appendChild(div);
|
| 2066 |
+
}
|
| 2067 |
+
const bar = document.getElementById(`eng-bar-${engId}`);
|
| 2068 |
+
const pctEl = document.getElementById(`eng-pct-${engId}`);
|
| 2069 |
+
if (bar) bar.style.width = pct + "%";
|
| 2070 |
+
if (pctEl) pctEl.textContent = pct + "%";
|
| 2071 |
document.getElementById("bench-status-text").textContent =
|
| 2072 |
`${pct}% — ${d.engine} (${d.processed}/${d.total})`;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2073 |
});
|
| 2074 |
|
| 2075 |
_eventSource.addEventListener("complete", e => {
|
|
|
|
| 2090 |
_finishBenchmark();
|
| 2091 |
});
|
| 2092 |
|
| 2093 |
+
_eventSource.addEventListener("done", e => { _finishBenchmark(); });
|
| 2094 |
+
_eventSource.onerror = () => { if (_currentJobId) _finishBenchmark(); };
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2095 |
}
|
| 2096 |
|
| 2097 |
function _showResults(data) {
|
|
|
|
| 2328 |
}
|
| 2329 |
|
| 2330 |
// ─── Init ────────────────────────────────────────────────────────────────────
|
| 2331 |
+
document.addEventListener("DOMContentLoaded", async () => {
|
| 2332 |
loadStatus();
|
|
|
|
| 2333 |
loadNormProfiles();
|
| 2334 |
initHTRFilters();
|
| 2335 |
+
// Load OCR engines, LLM models, initialize composer
|
| 2336 |
+
await loadBenchmarkSections();
|
| 2337 |
+
onComposeOCRChange(); // Pre-populate Tesseract languages
|
| 2338 |
+
loadComposePrompts(); // Pre-load prompt files
|
| 2339 |
+
startAutoRefresh(); // Auto-detect new API keys every 10 s
|
| 2340 |
// Close modal on backdrop click
|
| 2341 |
document.getElementById("import-modal").addEventListener("click", e => {
|
| 2342 |
if (e.target === document.getElementById("import-modal")) closeImportModal();
|
|
@@ -978,3 +978,227 @@ class TestRunnerProgressCallback:
|
|
| 978 |
# Ne doit pas lever d'exception
|
| 979 |
result = run_benchmark(corpus, [MockEngine()], progress_callback=bad_callback)
|
| 980 |
assert result is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 978 |
# Ne doit pas lever d'exception
|
| 979 |
result = run_benchmark(corpus, [MockEngine()], progress_callback=bad_callback)
|
| 980 |
assert result is not None
|
| 981 |
+
|
| 982 |
+
|
| 983 |
+
# ===========================================================================
# TestFastAPIModels — GET /api/models/{provider}
# ===========================================================================


class TestFastAPIModels:
    """Tests for the GET /api/models/{provider} endpoint."""

    def test_models_tesseract_200(self, client):
        resp = client.get("/api/models/tesseract")
        assert resp.status_code == 200

    def test_models_tesseract_has_models_list(self, client):
        body = client.get("/api/models/tesseract").json()
        assert "models" in body
        assert isinstance(body["models"], list)

    def test_models_tesseract_has_provider_field(self, client):
        resp = client.get("/api/models/tesseract")
        assert resp.json()["provider"] == "tesseract"

    def test_models_tesseract_has_languages(self, client):
        langs = client.get("/api/models/tesseract").json()["models"]
        # Tesseract is installed in CI, so at least fra or eng should be listed.
        assert len(langs) > 0

    def test_models_google_vision_200(self, client):
        resp = client.get("/api/models/google_vision")
        assert resp.status_code == 200
        assert "document_text_detection" in resp.json()["models"]

    def test_models_azure_doc_intel_200(self, client):
        resp = client.get("/api/models/azure_doc_intel")
        assert resp.status_code == 200
        assert "prebuilt-document" in resp.json()["models"]

    def test_models_ollama_200(self, client):
        resp = client.get("/api/models/ollama")
        assert resp.status_code == 200
        assert isinstance(resp.json()["models"], list)

    def test_models_prompts_200(self, client):
        resp = client.get("/api/models/prompts")
        assert resp.status_code == 200
        body = resp.json()
        assert isinstance(body["models"], list)
        assert len(body["models"]) >= 5  # 8 built-in prompts ship with the app

    def test_models_prompts_are_txt_files(self, client):
        for name in client.get("/api/models/prompts").json()["models"]:
            assert name.endswith(".txt")

    def test_models_openai_no_key_returns_empty(self, client):
        # Without an API key the endpoint must reply with an empty list or an error field.
        env_without_key = {k: v for k, v in os.environ.items() if k != "OPENAI_API_KEY"}
        with patch.dict(os.environ, env_without_key, clear=True):
            resp = client.get("/api/models/openai")
            assert resp.status_code == 200
            body = resp.json()
            assert body["models"] == [] or "error" in body

    def test_models_anthropic_no_key_returns_empty(self, client):
        env_without_key = {k: v for k, v in os.environ.items() if k != "ANTHROPIC_API_KEY"}
        with patch.dict(os.environ, env_without_key, clear=True):
            resp = client.get("/api/models/anthropic")
            assert resp.status_code == 200
            body = resp.json()
            assert body["models"] == [] or "error" in body

    def test_models_unknown_provider_404(self, client):
        resp = client.get("/api/models/provider_xyz_unknown")
        assert resp.status_code == 404
# ===========================================================================
# TestFastAPIBenchmarkRun — POST /api/benchmark/run
# ===========================================================================


class TestFastAPIBenchmarkRun:
    """Tests for the POST /api/benchmark/run endpoint."""

    def test_run_400_missing_corpus(self, client):
        payload = {
            "corpus_path": "/nonexistent/path/xyz",
            "competitors": [{"ocr_engine": "tesseract", "ocr_model": "fra"}],
        }
        resp = client.post("/api/benchmark/run", json=payload)
        assert resp.status_code == 400

    def test_run_400_no_competitors(self, client, tmp_corpus):
        payload = {"corpus_path": str(tmp_corpus), "competitors": []}
        resp = client.post("/api/benchmark/run", json=payload)
        assert resp.status_code == 400

    def test_run_422_missing_ocr_engine(self, client, tmp_corpus):
        # "ocr_engine" is required on each competitor; omitting it must fail validation.
        payload = {
            "corpus_path": str(tmp_corpus),
            "competitors": [{"ocr_model": "fra"}],
        }
        resp = client.post("/api/benchmark/run", json=payload)
        assert resp.status_code == 422

    def test_run_returns_job_id(self, client, tmp_corpus):
        payload = {
            "corpus_path": str(tmp_corpus),
            "competitors": [{"ocr_engine": "tesseract", "ocr_model": "fra"}],
        }
        resp = client.post("/api/benchmark/run", json=payload)
        assert resp.status_code == 200
        body = resp.json()
        assert "job_id" in body
        assert "status" in body

    def test_run_job_status_reachable(self, client, tmp_corpus):
        payload = {
            "corpus_path": str(tmp_corpus),
            "competitors": [{"ocr_engine": "tesseract", "ocr_model": "fra"}],
        }
        job_id = client.post("/api/benchmark/run", json=payload).json()["job_id"]
        status_resp = client.get(f"/api/benchmark/{job_id}/status")
        assert status_resp.status_code == 200
        assert status_resp.json()["job_id"] == job_id

    def test_run_with_named_competitor(self, client, tmp_corpus):
        payload = {
            "corpus_path": str(tmp_corpus),
            "competitors": [{"name": "Mon Tesseract", "ocr_engine": "tesseract", "ocr_model": "fra"}],
        }
        resp = client.post("/api/benchmark/run", json=payload)
        assert resp.status_code == 200

    def test_run_multiple_competitors(self, client, tmp_corpus):
        payload = {
            "corpus_path": str(tmp_corpus),
            "competitors": [
                {"ocr_engine": "tesseract", "ocr_model": "fra"},
                {"ocr_engine": "tesseract", "ocr_model": "eng"},
            ],
        }
        resp = client.post("/api/benchmark/run", json=payload)
        assert resp.status_code == 200

    def test_run_with_output_options(self, client, tmp_corpus, tmp_path):
        payload = {
            "corpus_path": str(tmp_corpus),
            "competitors": [{"ocr_engine": "tesseract", "ocr_model": "fra"}],
            "output_dir": str(tmp_path),
            "report_name": "test_run_report",
        }
        resp = client.post("/api/benchmark/run", json=payload)
        assert resp.status_code == 200
# ===========================================================================
# TestFastAPIEnginesExtended — fields added to api_engines()
# ===========================================================================


class TestFastAPIEnginesExtended:
    """Tests for the extra fields exposed by GET /api/engines."""

    def test_tesseract_has_langs_field(self, client):
        engines = client.get("/api/engines").json()["engines"]
        tesseract = next(e for e in engines if e["id"] == "tesseract")
        assert "langs" in tesseract
        assert isinstance(tesseract["langs"], list)

    def test_mistral_ocr_in_engines(self, client):
        engine_ids = [e["id"] for e in client.get("/api/engines").json()["engines"]]
        assert "mistral_ocr" in engine_ids

    def test_google_vision_in_engines(self, client):
        engine_ids = [e["id"] for e in client.get("/api/engines").json()["engines"]]
        assert "google_vision" in engine_ids

    def test_azure_doc_intel_in_engines(self, client):
        engine_ids = [e["id"] for e in client.get("/api/engines").json()["engines"]]
        assert "azure_doc_intel" in engine_ids

    def test_cloud_engines_have_key_env(self, client):
        # Every cloud OCR engine must advertise which env var holds its API key.
        for engine in client.get("/api/engines").json()["engines"]:
            if engine.get("type") == "ocr_cloud":
                assert "key_env" in engine

    def test_mistral_llm_label_updated(self, client):
        llms = client.get("/api/engines").json()["llms"]
        mistral = next(e for e in llms if e["id"] == "mistral")
        assert "LLM" in mistral["label"]
# ===========================================================================
# TestMistralOCRNativeAPI — mistral-ocr-latest routing
# ===========================================================================


class TestMistralOCRNativeAPI:
    """Tests for MistralOCREngine routing: native OCR API vs vision/chat API."""

    @staticmethod
    def _engine(model):
        # Imported lazily (as the original tests did) so test collection
        # does not require the engine's dependencies.
        from picarones.engines.mistral_ocr import MistralOCREngine

        return MistralOCREngine(config={"model": model})

    def test_engine_has_native_api_method(self):
        assert hasattr(self._engine("mistral-ocr-latest"), "_run_ocr_native_api")

    def test_engine_has_vision_api_method(self):
        assert hasattr(self._engine("pixtral-12b-2409"), "_run_ocr_vision_api")

    def test_model_name_stored(self):
        assert self._engine("mistral-ocr-latest")._model == "mistral-ocr-latest"

    def test_pixtral_model_stored(self):
        assert "pixtral" in self._engine("pixtral-large-latest")._model.lower()

    def test_engine_name_unchanged(self):
        assert self._engine("mistral-ocr-latest").name == "mistral_ocr"

    def test_version_returns_model_name(self):
        assert self._engine("mistral-ocr-latest").version() == "mistral-ocr-latest"