Spaces:
Running
fix(llm): nettoyage debug + robustesse réponse Mistral SDK
Browse files
Supprime tous les logs temporaires de diagnostic (ENTRY TRACE, DEBUG
verbose) ajoutés lors des sprints de debug précédents.
mistral_adapter.py :
- Suppression de 60 lignes de logs DEBUG/INFO temporaires
- Ajout d'une garde pour un `content` de type liste (certaines versions du SDK
mistralai retournent une liste de ContentChunk au lieu d'une chaîne `str`)
- Conservation des logs utiles en production : appel, réponse, erreurs
pipelines/base.py :
- Suppression des ENTRY TRACE
- L'exception dans run() est maintenant loguée en WARNING (pas INFO)
engines/base.py :
- Suppression du ENTRY TRACE et de l'import logging inutile
runner.py :
- Suppression du ENTRY TRACE dans _io_doc_worker
36 tests OK (test_sprint15 + test_engines).
https://claude.ai/code/session_017gXea9mxBQqDTAsSQd7aAq
- picarones/core/runner.py +0 -5
- picarones/engines/base.py +0 -8
- picarones/llm/mistral_adapter.py +24 -62
- picarones/pipelines/base.py +3 -14
|
@@ -71,11 +71,6 @@ def _io_doc_worker(
|
|
| 71 |
moteur est partagée entre les threads — les adaptateurs HTTP sont
|
| 72 |
généralement sans état mutable entre les appels.
|
| 73 |
"""
|
| 74 |
-
# ENTRY TRACE — confirme que _io_doc_worker est bien exécuté et quelle classe est appelée
|
| 75 |
-
logger.info(
|
| 76 |
-
"[runner-ENTRY] _io_doc_worker — classe=%s, doc=%s",
|
| 77 |
-
engine.__class__.__name__, getattr(doc, "doc_id", "?"),
|
| 78 |
-
)
|
| 79 |
ocr_result = engine.run(doc.image_path) # type: ignore[attr-defined]
|
| 80 |
return _compute_document_result(
|
| 81 |
doc_id=doc.doc_id, # type: ignore[attr-defined]
|
|
|
|
| 71 |
moteur est partagée entre les threads — les adaptateurs HTTP sont
|
| 72 |
généralement sans état mutable entre les appels.
|
| 73 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
ocr_result = engine.run(doc.image_path) # type: ignore[attr-defined]
|
| 75 |
return _compute_document_result(
|
| 76 |
doc_id=doc.doc_id, # type: ignore[attr-defined]
|
|
@@ -3,15 +3,12 @@
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import hashlib
|
| 6 |
-
import logging
|
| 7 |
import time
|
| 8 |
from abc import ABC, abstractmethod
|
| 9 |
from dataclasses import dataclass, field
|
| 10 |
from pathlib import Path
|
| 11 |
from typing import Optional
|
| 12 |
|
| 13 |
-
_base_logger = logging.getLogger(__name__)
|
| 14 |
-
|
| 15 |
|
| 16 |
@dataclass
|
| 17 |
class EngineResult:
|
|
@@ -71,11 +68,6 @@ class BaseOCREngine(ABC):
|
|
| 71 |
def run(self, image_path: str | Path) -> EngineResult:
|
| 72 |
"""Point d'entrée public : exécute l'OCR et mesure le temps d'exécution."""
|
| 73 |
image_path = Path(image_path)
|
| 74 |
-
# ENTRY TRACE — confirme quel moteur/classe est réellement exécuté
|
| 75 |
-
_base_logger.info(
|
| 76 |
-
"[BaseOCREngine-ENTRY] run() — classe=%s, doc=%s",
|
| 77 |
-
self.__class__.__name__, image_path.name,
|
| 78 |
-
)
|
| 79 |
start = time.perf_counter()
|
| 80 |
try:
|
| 81 |
text = self._run_ocr(image_path)
|
|
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import hashlib
|
|
|
|
| 6 |
import time
|
| 7 |
from abc import ABC, abstractmethod
|
| 8 |
from dataclasses import dataclass, field
|
| 9 |
from pathlib import Path
|
| 10 |
from typing import Optional
|
| 11 |
|
|
|
|
|
|
|
| 12 |
|
| 13 |
@dataclass
|
| 14 |
class EngineResult:
|
|
|
|
| 68 |
def run(self, image_path: str | Path) -> EngineResult:
|
| 69 |
"""Point d'entrée public : exécute l'OCR et mesure le temps d'exécution."""
|
| 70 |
image_path = Path(image_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
start = time.perf_counter()
|
| 72 |
try:
|
| 73 |
text = self._run_ocr(image_path)
|
|
@@ -93,15 +93,9 @@ class MistralAdapter(BaseLLMAdapter):
|
|
| 93 |
else:
|
| 94 |
content = prompt
|
| 95 |
|
| 96 |
-
# INFO — longueur du texte OCR reçu (visible niveau INFO)
|
| 97 |
logger.info(
|
| 98 |
-
"[MistralAdapter]
|
| 99 |
-
len(prompt),
|
| 100 |
-
)
|
| 101 |
-
# DEBUG — prompt complet tronqué à 200 chars
|
| 102 |
-
logger.debug(
|
| 103 |
-
"[MistralAdapter] DEBUG prompt (200 premiers chars) : %r",
|
| 104 |
-
prompt[:200],
|
| 105 |
)
|
| 106 |
|
| 107 |
try:
|
|
@@ -113,11 +107,6 @@ class MistralAdapter(BaseLLMAdapter):
|
|
| 113 |
)
|
| 114 |
except Exception as exc:
|
| 115 |
status_code = getattr(exc, "status_code", None) or getattr(exc, "http_status", None)
|
| 116 |
-
# DEBUG — statut HTTP en cas d'erreur
|
| 117 |
-
logger.debug(
|
| 118 |
-
"[MistralAdapter] DEBUG exception type=%s status_code=%s message=%s",
|
| 119 |
-
type(exc).__name__, status_code, exc,
|
| 120 |
-
)
|
| 121 |
if status_code == 401:
|
| 122 |
logger.warning(
|
| 123 |
"[MistralAdapter] erreur HTTP 401 — clé API invalide ou expirée "
|
|
@@ -143,24 +132,9 @@ class MistralAdapter(BaseLLMAdapter):
|
|
| 143 |
)
|
| 144 |
raise
|
| 145 |
|
| 146 |
-
# DEBUG — choices complètes (visible niveau DEBUG uniquement)
|
| 147 |
-
try:
|
| 148 |
-
choices_debug = [
|
| 149 |
-
{
|
| 150 |
-
"index": c.index,
|
| 151 |
-
"finish_reason": c.finish_reason,
|
| 152 |
-
"content_type": type(c.message.content).__name__ if c.message else None,
|
| 153 |
-
"content_len": len(c.message.content) if c.message and c.message.content else 0,
|
| 154 |
-
}
|
| 155 |
-
for c in (response.choices or [])
|
| 156 |
-
]
|
| 157 |
-
except Exception as _exc: # noqa: BLE001
|
| 158 |
-
choices_debug = f"<erreur sérialisation choices : {_exc}>"
|
| 159 |
-
logger.debug("[MistralAdapter] DEBUG response.choices : %s", choices_debug)
|
| 160 |
-
|
| 161 |
if not response.choices:
|
| 162 |
logger.warning(
|
| 163 |
-
"[MistralAdapter]
|
| 164 |
self.model,
|
| 165 |
)
|
| 166 |
return ""
|
|
@@ -168,44 +142,32 @@ class MistralAdapter(BaseLLMAdapter):
|
|
| 168 |
_choice = response.choices[0]
|
| 169 |
raw = _choice.message.content
|
| 170 |
_finish_reason = _choice.finish_reason
|
| 171 |
-
_content_len = len(raw) if raw else 0
|
| 172 |
|
| 173 |
-
#
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
logger.debug(
|
| 181 |
-
"[MistralAdapter] DEBUG choices[0].message.content type=%s valeur=%r",
|
| 182 |
-
type(raw).__name__,
|
| 183 |
-
raw[:200] if isinstance(raw, str) else raw,
|
| 184 |
-
)
|
| 185 |
|
| 186 |
text = raw or ""
|
| 187 |
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
logger.warning(
|
| 199 |
-
"[MistralAdapter] réponse vide
|
| 200 |
-
"(
|
| 201 |
-
"Vérifier
|
| 202 |
-
|
| 203 |
-
self.model, len(raw) if raw is not None else "None",
|
| 204 |
-
)
|
| 205 |
-
else:
|
| 206 |
-
logger.debug(
|
| 207 |
-
"[MistralAdapter] réponse reçue — %d caractères, extrait : %r",
|
| 208 |
-
len(text), text[:120],
|
| 209 |
)
|
| 210 |
|
| 211 |
return text
|
|
|
|
| 93 |
else:
|
| 94 |
content = prompt
|
| 95 |
|
|
|
|
| 96 |
logger.info(
|
| 97 |
+
"[MistralAdapter] appel %s — prompt=%d chars, image=%s",
|
| 98 |
+
self.model, len(prompt), "oui" if image_b64 else "non",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
)
|
| 100 |
|
| 101 |
try:
|
|
|
|
| 107 |
)
|
| 108 |
except Exception as exc:
|
| 109 |
status_code = getattr(exc, "status_code", None) or getattr(exc, "http_status", None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
if status_code == 401:
|
| 111 |
logger.warning(
|
| 112 |
"[MistralAdapter] erreur HTTP 401 — clé API invalide ou expirée "
|
|
|
|
| 132 |
)
|
| 133 |
raise
|
| 134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
if not response.choices:
|
| 136 |
logger.warning(
|
| 137 |
+
"[MistralAdapter] response.choices vide (modèle=%s).",
|
| 138 |
self.model,
|
| 139 |
)
|
| 140 |
return ""
|
|
|
|
| 142 |
_choice = response.choices[0]
|
| 143 |
raw = _choice.message.content
|
| 144 |
_finish_reason = _choice.finish_reason
|
|
|
|
| 145 |
|
| 146 |
+
# Le SDK mistralai peut retourner une liste de ContentChunk au lieu
|
| 147 |
+
# d'une chaîne pour certains modèles/versions. Normaliser en str.
|
| 148 |
+
if isinstance(raw, list):
|
| 149 |
+
raw = "".join(
|
| 150 |
+
chunk.text if hasattr(chunk, "text") else str(chunk)
|
| 151 |
+
for chunk in raw
|
| 152 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
text = raw or ""
|
| 155 |
|
| 156 |
+
_completion_tokens = None
|
| 157 |
+
if hasattr(response, "usage") and response.usage:
|
| 158 |
+
_completion_tokens = getattr(response.usage, "completion_tokens", None)
|
| 159 |
+
|
| 160 |
+
logger.info(
|
| 161 |
+
"[MistralAdapter] réponse %s — finish_reason=%s, len=%d, tokens=%s",
|
| 162 |
+
self.model, _finish_reason, len(text), _completion_tokens,
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
if not text.strip():
|
| 166 |
logger.warning(
|
| 167 |
+
"[MistralAdapter] réponse vide du modèle '%s' "
|
| 168 |
+
"(finish_reason=%s, completion_tokens=%s). "
|
| 169 |
+
"Vérifier le prompt et la compatibilité du modèle.",
|
| 170 |
+
self.model, _finish_reason, _completion_tokens,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
)
|
| 172 |
|
| 173 |
return text
|
|
@@ -143,11 +143,6 @@ class OCRLLMPipeline(BaseOCREngine):
|
|
| 143 |
|
| 144 |
def _run_ocr(self, image_path: Path) -> str:
|
| 145 |
"""Logique interne du pipeline — appelée par ``run()``."""
|
| 146 |
-
# ENTRY TRACE — confirme que _run_ocr() est atteint après run()
|
| 147 |
-
logger.info(
|
| 148 |
-
"[Pipeline-ENTRY] _run_ocr() appelé — doc=%s, mode=%s",
|
| 149 |
-
image_path.name, self.mode.value,
|
| 150 |
-
)
|
| 151 |
self._last_ocr_text = None
|
| 152 |
ocr_text = ""
|
| 153 |
|
|
@@ -237,11 +232,6 @@ class OCRLLMPipeline(BaseOCREngine):
|
|
| 237 |
def run(self, image_path: str | Path) -> EngineResult:
|
| 238 |
"""Exécute le pipeline et retourne un EngineResult enrichi de métadonnées."""
|
| 239 |
image_path = Path(image_path)
|
| 240 |
-
# ENTRY TRACE — confirme que OCRLLMPipeline.run() est bien la méthode exécutée
|
| 241 |
-
logger.info(
|
| 242 |
-
"[Pipeline-ENTRY] OCRLLMPipeline.run() appelé — doc=%s, mode=%s, llm=%s",
|
| 243 |
-
image_path.name, self.mode.value, self.llm_adapter.model,
|
| 244 |
-
)
|
| 245 |
self._last_ocr_text = None
|
| 246 |
start = time.perf_counter()
|
| 247 |
|
|
@@ -251,10 +241,9 @@ class OCRLLMPipeline(BaseOCREngine):
|
|
| 251 |
except Exception as exc: # noqa: BLE001
|
| 252 |
text = ""
|
| 253 |
error = str(exc)
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
image_path.name, type(exc).__name__, exc,
|
| 258 |
)
|
| 259 |
|
| 260 |
duration = time.perf_counter() - start
|
|
|
|
| 143 |
|
| 144 |
def _run_ocr(self, image_path: Path) -> str:
|
| 145 |
"""Logique interne du pipeline — appelée par ``run()``."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
self._last_ocr_text = None
|
| 147 |
ocr_text = ""
|
| 148 |
|
|
|
|
| 232 |
def run(self, image_path: str | Path) -> EngineResult:
|
| 233 |
"""Exécute le pipeline et retourne un EngineResult enrichi de métadonnées."""
|
| 234 |
image_path = Path(image_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
self._last_ocr_text = None
|
| 236 |
start = time.perf_counter()
|
| 237 |
|
|
|
|
| 241 |
except Exception as exc: # noqa: BLE001
|
| 242 |
text = ""
|
| 243 |
error = str(exc)
|
| 244 |
+
logger.warning(
|
| 245 |
+
"[%s] erreur pipeline pour '%s' : %s",
|
| 246 |
+
self._name, image_path.name, exc,
|
|
|
|
| 247 |
)
|
| 248 |
|
| 249 |
duration = time.perf_counter() - start
|