Spaces:
Running
Sprint 10 — Distribution des erreurs par ligne et détection des hallucinations VLM
Browse files
Nouveaux modules :
- picarones/core/line_metrics.py : CER par ligne, percentiles (p50–p99),
coefficient de Gini, taux de lignes catastrophiques, carte thermique de position,
agrégation sur corpus
- picarones/core/hallucination.py : taux d'insertion nette, ratio longueur sortie/GT,
score d'ancrage trigrammes, détection de blocs hallucinés, badge hallucination,
agrégation sur corpus
Modèles de données (results.py) :
- DocumentResult : nouveaux champs line_metrics + hallucination_metrics
- EngineReport : nouveaux champs aggregated_line_metrics + aggregated_hallucination
Données de démonstration (fixtures.py) :
- Nouveau moteur fictif VLM (gpt-4o-vision zero-shot) avec hallucinations simulées
- Toutes les phrases intercalées absentes du GT, modernisation systématique
- Calcul des métriques Sprint 10 pour les 5 moteurs (3 OCR + 2 pipelines LLM/VLM)
Rapport HTML (report/generator.py) :
- Colonnes Gini et Ancrage dans le tableau de classement (triables)
- Badge VLM 👁 pour les moteurs zero-shot
- Vue Document : carte thermique des erreurs par position, percentiles CER par ligne,
panneau hallucinations avec blocs détectés
- Vue Analyses : scatter plot Gini vs CER moyen, scatter plot ratio longueur vs ancrage
avec zones de danger (ancrage < 0.5, ratio > 1.2) dessinées par plugin Chart.js
- Export CSV enrichi (gini, anchor_score, length_ratio, is_hallucinating)
Tests (tests/test_sprint10_error_distribution.py) :
- 40 tests couvrant les 5 nouvelles classes : TestLineMetrics (12),
TestHallucinationMetrics (12), TestLineMetricsInResults (4),
TestFixturesVLM (6), TestReportSprint10 (6)
https://claude.ai/code/session_017gXea9mxBQqDTAsSQd7aAq
- picarones/core/hallucination.py +332 -0
- picarones/core/line_metrics.py +286 -0
- picarones/core/results.py +18 -0
- picarones/fixtures.py +94 -0
- picarones/report/generator.py +396 -1
- tests/test_sprint10_error_distribution.py +426 -0
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Détection des hallucinations VLM/LLM — Sprint 10.
|
| 2 |
+
|
| 3 |
+
Métriques calculées
|
| 4 |
+
-------------------
|
| 5 |
+
- Taux d'insertion net : mots/caractères ajoutés absents du GT, distinct du WIL existant
|
| 6 |
+
- Ratio de longueur : len(hyp) / len(gt) — ratio > 1.2 → hallucination potentielle
|
| 7 |
+
- Score d'ancrage : proportion des n-grammes (trigrammes) de la sortie présents dans le GT
|
| 8 |
+
- Blocs hallucinés : segments continus de la sortie sans correspondance GT au-delà d'un seuil
|
| 9 |
+
- Badge hallucination : True si ancrage faible ou ratio de longueur anormal
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import re
|
| 15 |
+
from dataclasses import dataclass, field
|
| 16 |
+
from typing import Optional
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
# Helpers texte
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
|
| 23 |
+
def _tokenize(text: str) -> list[str]:
|
| 24 |
+
"""Découpe en mots (minuscules, sans ponctuation)."""
|
| 25 |
+
return re.findall(r"[^\s]+", text.lower())
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _ngrams(tokens: list[str], n: int) -> list[tuple[str, ...]]:
|
| 29 |
+
"""Génère les n-grammes d'une liste de tokens."""
|
| 30 |
+
if len(tokens) < n:
|
| 31 |
+
return [tuple(tokens)] if tokens else []
|
| 32 |
+
return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ---------------------------------------------------------------------------
|
| 36 |
+
# Blocs hallucinés (segments continus sans ancrage)
|
| 37 |
+
# ---------------------------------------------------------------------------
|
| 38 |
+
|
| 39 |
+
@dataclass
|
| 40 |
+
class HallucinatedBlock:
|
| 41 |
+
"""Segment continu de la sortie sans correspondance dans le GT."""
|
| 42 |
+
start_token: int
|
| 43 |
+
end_token: int
|
| 44 |
+
text: str
|
| 45 |
+
length: int # nombre de tokens
|
| 46 |
+
|
| 47 |
+
def as_dict(self) -> dict:
|
| 48 |
+
return {
|
| 49 |
+
"start_token": self.start_token,
|
| 50 |
+
"end_token": self.end_token,
|
| 51 |
+
"text": self.text,
|
| 52 |
+
"length": self.length,
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _detect_hallucinated_blocks(
|
| 57 |
+
hyp_tokens: list[str],
|
| 58 |
+
gt_token_set: set[str],
|
| 59 |
+
tolerance: int = 3,
|
| 60 |
+
min_block_length: int = 4,
|
| 61 |
+
) -> list[HallucinatedBlock]:
|
| 62 |
+
"""Détecte les blocs de tokens hypothèse sans correspondance dans le GT.
|
| 63 |
+
|
| 64 |
+
Un bloc est un segment contigu de tokens hypothèse dont aucun n'est présent
|
| 65 |
+
dans le vocabulaire GT. Une tolérance de ``tolerance`` tokens connus interrompus
|
| 66 |
+
est acceptée avant de clore un bloc.
|
| 67 |
+
|
| 68 |
+
Parameters
|
| 69 |
+
----------
|
| 70 |
+
hyp_tokens:
|
| 71 |
+
Tokens de la sortie OCR/VLM.
|
| 72 |
+
gt_token_set:
|
| 73 |
+
Ensemble des tokens du GT (pour recherche O(1)).
|
| 74 |
+
tolerance:
|
| 75 |
+
Nombre de tokens connus consécutifs interrompant un bloc avant de le clore.
|
| 76 |
+
min_block_length:
|
| 77 |
+
Longueur minimale (tokens) pour qu'un bloc soit signalé.
|
| 78 |
+
|
| 79 |
+
Returns
|
| 80 |
+
-------
|
| 81 |
+
list[HallucinatedBlock]
|
| 82 |
+
"""
|
| 83 |
+
blocks: list[HallucinatedBlock] = []
|
| 84 |
+
if not hyp_tokens:
|
| 85 |
+
return blocks
|
| 86 |
+
|
| 87 |
+
in_block = False
|
| 88 |
+
block_start = 0
|
| 89 |
+
consecutive_known = 0
|
| 90 |
+
|
| 91 |
+
for i, tok in enumerate(hyp_tokens):
|
| 92 |
+
is_unknown = tok not in gt_token_set
|
| 93 |
+
if is_unknown:
|
| 94 |
+
if not in_block:
|
| 95 |
+
in_block = True
|
| 96 |
+
block_start = i
|
| 97 |
+
consecutive_known = 0
|
| 98 |
+
else:
|
| 99 |
+
consecutive_known = 0
|
| 100 |
+
else:
|
| 101 |
+
if in_block:
|
| 102 |
+
consecutive_known += 1
|
| 103 |
+
if consecutive_known >= tolerance:
|
| 104 |
+
# Clore le bloc
|
| 105 |
+
end = i - consecutive_known
|
| 106 |
+
length = end - block_start + 1
|
| 107 |
+
if length >= min_block_length:
|
| 108 |
+
text = " ".join(hyp_tokens[block_start:end + 1])
|
| 109 |
+
blocks.append(HallucinatedBlock(
|
| 110 |
+
start_token=block_start,
|
| 111 |
+
end_token=end,
|
| 112 |
+
text=text,
|
| 113 |
+
length=length,
|
| 114 |
+
))
|
| 115 |
+
in_block = False
|
| 116 |
+
consecutive_known = 0
|
| 117 |
+
|
| 118 |
+
# Bloc non terminé
|
| 119 |
+
if in_block:
|
| 120 |
+
end = len(hyp_tokens) - 1
|
| 121 |
+
length = end - block_start + 1
|
| 122 |
+
if length >= min_block_length:
|
| 123 |
+
text = " ".join(hyp_tokens[block_start:end + 1])
|
| 124 |
+
blocks.append(HallucinatedBlock(
|
| 125 |
+
start_token=block_start,
|
| 126 |
+
end_token=end,
|
| 127 |
+
text=text,
|
| 128 |
+
length=length,
|
| 129 |
+
))
|
| 130 |
+
|
| 131 |
+
return blocks
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
# ---------------------------------------------------------------------------
|
| 135 |
+
# Résultat structuré
|
| 136 |
+
# ---------------------------------------------------------------------------
|
| 137 |
+
|
| 138 |
+
@dataclass
class HallucinationMetrics:
    """Hallucination-detection metrics for one (ground truth, hypothesis) pair."""

    # Net insertion rate: hypothesis tokens absent from the GT / total hypothesis tokens.
    net_insertion_rate: float
    # Length ratio: len(hyp) / len(gt) in characters; > 1.2 is a hallucination signal.
    length_ratio: float
    # Anchor score: share of hypothesis trigrams found among the GT trigrams.
    # High -> output anchors well in the GT; low -> hallucinations are likely.
    anchor_score: float
    # Contiguous output segments with no GT match (beyond the tolerance threshold).
    hallucinated_blocks: list[HallucinatedBlock]
    # True when anchor_score < anchor_threshold OR length_ratio > length_ratio_threshold.
    is_hallucinating: bool

    # Extra details.
    gt_word_count: int = 0
    hyp_word_count: int = 0
    net_inserted_words: int = 0
    anchor_threshold_used: float = 0.5
    length_ratio_threshold_used: float = 1.2
    ngram_size_used: int = 3

    def as_dict(self) -> dict:
        """Serialize to a JSON-friendly dict (float metrics rounded to 6 decimals)."""
        payload = {
            "net_insertion_rate": round(self.net_insertion_rate, 6),
            "length_ratio": round(self.length_ratio, 6),
            "anchor_score": round(self.anchor_score, 6),
        }
        payload["hallucinated_blocks"] = [block.as_dict() for block in self.hallucinated_blocks]
        payload["is_hallucinating"] = self.is_hallucinating
        for attr in (
            "gt_word_count",
            "hyp_word_count",
            "net_inserted_words",
            "anchor_threshold_used",
            "length_ratio_threshold_used",
            "ngram_size_used",
        ):
            payload[attr] = getattr(self, attr)
        return payload

    @classmethod
    def from_dict(cls, d: dict) -> "HallucinationMetrics":
        """Rebuild an instance from ``as_dict`` output; missing keys get neutral defaults."""
        rebuilt_blocks = [HallucinatedBlock(**raw) for raw in d.get("hallucinated_blocks", [])]
        return cls(
            net_insertion_rate=d.get("net_insertion_rate", 0.0),
            length_ratio=d.get("length_ratio", 1.0),
            anchor_score=d.get("anchor_score", 1.0),
            hallucinated_blocks=rebuilt_blocks,
            is_hallucinating=d.get("is_hallucinating", False),
            gt_word_count=d.get("gt_word_count", 0),
            hyp_word_count=d.get("hyp_word_count", 0),
            net_inserted_words=d.get("net_inserted_words", 0),
            anchor_threshold_used=d.get("anchor_threshold_used", 0.5),
            length_ratio_threshold_used=d.get("length_ratio_threshold_used", 1.2),
            ngram_size_used=d.get("ngram_size_used", 3),
        )
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
# ---------------------------------------------------------------------------
|
| 202 |
+
# Calcul principal
|
| 203 |
+
# ---------------------------------------------------------------------------
|
| 204 |
+
|
| 205 |
+
def compute_hallucination_metrics(
    reference: str,
    hypothesis: str,
    n: int = 3,
    length_ratio_threshold: float = 1.2,
    anchor_threshold: float = 0.5,
    block_tolerance: int = 3,
    min_block_length: int = 4,
) -> HallucinationMetrics:
    """Compute VLM/LLM hallucination-detection metrics.

    Parameters
    ----------
    reference:
        Ground-truth (GT) text.
    hypothesis:
        Text produced by the model.
    n:
        N-gram size for the anchor score (default: trigrams).
    length_ratio_threshold:
        Length-ratio threshold above which a potential hallucination is flagged.
    anchor_threshold:
        Anchor-score threshold below which a potential hallucination is flagged.
    block_tolerance:
        Number of consecutive known tokens tolerated inside a hallucinated block.
    min_block_length:
        Minimum length (tokens) for a hallucinated block to be reported.

    Returns
    -------
    HallucinationMetrics
    """
    gt_tokens = _tokenize(reference)
    hyp_tokens = _tokenize(hypothesis)

    gt_len_chars = len(reference.strip())
    hyp_len_chars = len(hypothesis.strip())

    # ── Length ratio ─────────────────────────────────────────────────────
    if gt_len_chars == 0:
        # Empty GT: empty hypothesis is neutral (1.0); any output is infinite growth.
        length_ratio = 1.0 if hyp_len_chars == 0 else float("inf")
    else:
        length_ratio = hyp_len_chars / gt_len_chars

    # ── Net insertion rate ───────────────────────────────────────────────
    gt_token_set = set(gt_tokens)
    hyp_token_count = len(hyp_tokens)

    if hyp_token_count == 0:
        net_insertion_rate = 0.0
        net_inserted_words = 0
    else:
        net_inserted = [t for t in hyp_tokens if t not in gt_token_set]
        net_inserted_words = len(net_inserted)
        net_insertion_rate = net_inserted_words / hyp_token_count

    # ── Anchor score (n-grams) ───────────────────────────────────────────
    gt_ngrams = set(_ngrams(gt_tokens, n))
    hyp_ngrams = _ngrams(hyp_tokens, n)

    if not hyp_ngrams:
        # No hypothesis n-grams. _ngrams yields a single short tuple for any
        # non-empty token list, so this branch means the hypothesis is empty:
        # perfect anchor only if the GT is empty too, otherwise 0.0.
        anchor_score = 1.0 if not gt_ngrams else 0.0
    elif not gt_ngrams:
        anchor_score = 0.0
    else:
        anchored = sum(1 for ng in hyp_ngrams if ng in gt_ngrams)
        anchor_score = anchored / len(hyp_ngrams)

    # ── Hallucinated blocks ──────────────────────────────────────────────
    blocks = _detect_hallucinated_blocks(
        hyp_tokens=hyp_tokens,
        gt_token_set=gt_token_set,
        tolerance=block_tolerance,
        min_block_length=min_block_length,
    )

    # ── Hallucination badge ──────────────────────────────────────────────
    # NOTE(review): an infinite ratio (empty GT, non-empty output) is excluded
    # from the length test here — presumably because that case is already
    # caught by the anchor-score test above (anchor_score is 0.0 when the GT
    # has no n-grams and the hypothesis does); confirm with the sprint spec.
    is_hallucinating = (
        anchor_score < anchor_threshold
        or (length_ratio > length_ratio_threshold and length_ratio != float("inf"))
    )

    return HallucinationMetrics(
        net_insertion_rate=net_insertion_rate,
        length_ratio=min(length_ratio, 9.99),  # cap so serialization stays finite
        anchor_score=anchor_score,
        hallucinated_blocks=blocks,
        is_hallucinating=is_hallucinating,
        gt_word_count=len(gt_tokens),
        hyp_word_count=hyp_token_count,
        net_inserted_words=net_inserted_words,
        anchor_threshold_used=anchor_threshold,
        length_ratio_threshold_used=length_ratio_threshold,
        ngram_size_used=n,
    )
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
# ---------------------------------------------------------------------------
|
| 304 |
+
# Agrégation sur un corpus
|
| 305 |
+
# ---------------------------------------------------------------------------
|
| 306 |
+
|
| 307 |
+
def aggregate_hallucination_metrics(results: list[HallucinationMetrics]) -> dict:
|
| 308 |
+
"""Agrège les métriques d'hallucination sur un corpus.
|
| 309 |
+
|
| 310 |
+
Returns
|
| 311 |
+
-------
|
| 312 |
+
dict
|
| 313 |
+
Statistiques agrégées : anchor_score moyen, taux de documents hallucinés…
|
| 314 |
+
"""
|
| 315 |
+
if not results:
|
| 316 |
+
return {}
|
| 317 |
+
|
| 318 |
+
n = len(results)
|
| 319 |
+
anchor_values = [r.anchor_score for r in results]
|
| 320 |
+
ratio_values = [r.length_ratio for r in results]
|
| 321 |
+
insertion_values = [r.net_insertion_rate for r in results]
|
| 322 |
+
hallucinating_count = sum(1 for r in results if r.is_hallucinating)
|
| 323 |
+
|
| 324 |
+
return {
|
| 325 |
+
"anchor_score_mean": round(sum(anchor_values) / n, 6),
|
| 326 |
+
"anchor_score_min": round(min(anchor_values), 6),
|
| 327 |
+
"length_ratio_mean": round(sum(ratio_values) / n, 6),
|
| 328 |
+
"net_insertion_rate_mean": round(sum(insertion_values) / n, 6),
|
| 329 |
+
"hallucinating_doc_count": hallucinating_count,
|
| 330 |
+
"hallucinating_doc_rate": round(hallucinating_count / n, 6),
|
| 331 |
+
"document_count": n,
|
| 332 |
+
}
|
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Distribution des erreurs CER par ligne — Sprint 10.
|
| 2 |
+
|
| 3 |
+
Métriques calculées
|
| 4 |
+
-------------------
|
| 5 |
+
- CER par ligne : distance d'édition caractère/longueur GT sur chaque paire de lignes
|
| 6 |
+
- Percentiles : p50, p75, p90, p95, p99 sur la distribution des CER ligne
|
| 7 |
+
- Taux catastrophiques : % de lignes dépassant des seuils configurables (30 %, 50 %, 100 %)
|
| 8 |
+
- Coefficient de Gini : concentration des erreurs (0 = uniformes, 1 = toutes concentrées)
|
| 9 |
+
- Carte thermique : CER moyen par tranche de position dans le document
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import unicodedata
|
| 15 |
+
from dataclasses import dataclass, field
|
| 16 |
+
from typing import Optional
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
# CER d'une paire de lignes (distance d'édition Levenshtein normalisée)
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
|
| 23 |
+
def _edit_distance(a: str, b: str) -> int:
|
| 24 |
+
"""Distance de Levenshtein entre deux chaînes."""
|
| 25 |
+
if not a:
|
| 26 |
+
return len(b)
|
| 27 |
+
if not b:
|
| 28 |
+
return len(a)
|
| 29 |
+
prev = list(range(len(b) + 1))
|
| 30 |
+
for i, ca in enumerate(a, 1):
|
| 31 |
+
curr = [i]
|
| 32 |
+
for j, cb in enumerate(b, 1):
|
| 33 |
+
cost = 0 if ca == cb else 1
|
| 34 |
+
curr.append(min(curr[j - 1] + 1, prev[j] + 1, prev[j - 1] + cost))
|
| 35 |
+
prev = curr
|
| 36 |
+
return prev[-1]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _line_cer(ref_line: str, hyp_line: str) -> float:
|
| 40 |
+
"""CER pour une paire de lignes. Retourne 1.0 si le GT est vide et que l'hyp ne l'est pas."""
|
| 41 |
+
ref = unicodedata.normalize("NFC", ref_line.strip())
|
| 42 |
+
hyp = unicodedata.normalize("NFC", hyp_line.strip())
|
| 43 |
+
if not ref:
|
| 44 |
+
return 0.0 if not hyp else 1.0
|
| 45 |
+
dist = _edit_distance(ref, hyp)
|
| 46 |
+
return dist / len(ref)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ---------------------------------------------------------------------------
|
| 50 |
+
# Percentiles (implémentation pur-Python, sans numpy)
|
| 51 |
+
# ---------------------------------------------------------------------------
|
| 52 |
+
|
| 53 |
+
def _percentile(sorted_values: list[float], p: float) -> float:
|
| 54 |
+
"""Retourne le p-ième percentile (0 ≤ p ≤ 100) d'une liste triée."""
|
| 55 |
+
if not sorted_values:
|
| 56 |
+
return 0.0
|
| 57 |
+
n = len(sorted_values)
|
| 58 |
+
index = p / 100 * (n - 1)
|
| 59 |
+
lo = int(index)
|
| 60 |
+
hi = min(lo + 1, n - 1)
|
| 61 |
+
frac = index - lo
|
| 62 |
+
return sorted_values[lo] + frac * (sorted_values[hi] - sorted_values[lo])
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
# Coefficient de Gini
|
| 67 |
+
# ---------------------------------------------------------------------------
|
| 68 |
+
|
| 69 |
+
def _gini(values: list[float]) -> float:
|
| 70 |
+
"""Coefficient de Gini des erreurs (0 = uniformes, 1 = toutes concentrées).
|
| 71 |
+
|
| 72 |
+
Formule : G = (2 * Σ i*x_i) / (n * Σ x_i) - (n+1)/n
|
| 73 |
+
sur les valeurs triées par ordre croissant.
|
| 74 |
+
"""
|
| 75 |
+
if not values:
|
| 76 |
+
return 0.0
|
| 77 |
+
xs = sorted(max(v, 0.0) for v in values)
|
| 78 |
+
n = len(xs)
|
| 79 |
+
total = sum(xs)
|
| 80 |
+
if total == 0.0:
|
| 81 |
+
return 0.0
|
| 82 |
+
weighted_sum = sum((i + 1) * x for i, x in enumerate(xs))
|
| 83 |
+
return (2.0 * weighted_sum) / (n * total) - (n + 1) / n
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ---------------------------------------------------------------------------
|
| 87 |
+
# Résultat structuré
|
| 88 |
+
# ---------------------------------------------------------------------------
|
| 89 |
+
|
| 90 |
+
@dataclass
class LineMetrics:
    """Per-line CER error distribution for one (ground truth, hypothesis) pair."""

    # CER of each line (length == number of GT lines).
    cer_per_line: list[float]
    # Percentiles of the per-line CER distribution: p50, p75, p90, p95, p99.
    percentiles: dict[str, float]
    # Catastrophic-line rate per threshold, keyed by the FLOAT threshold
    # (e.g. {0.3: 0.12, 0.5: 0.07, 1.0: 0.02}); annotation fixed to match
    # compute_line_metrics and from_dict, which use float keys.
    catastrophic_rate: dict[float, float]
    # Gini coefficient of the errors (0 -> uniform, 1 -> concentrated).
    gini: float
    # Mean CER per position bucket in the document (length == heatmap_bins).
    heatmap: list[float]
    # Number of GT lines processed.
    line_count: int
    # Mean CER over all lines.
    mean_cer: float

    def as_dict(self) -> dict:
        """Serialize to a JSON-friendly dict.

        Floats are rounded to 6 decimals; catastrophic-rate thresholds are
        stringified so they survive as JSON object keys.
        """
        return {
            "cer_per_line": [round(v, 6) for v in self.cer_per_line],
            "percentiles": {k: round(v, 6) for k, v in self.percentiles.items()},
            "catastrophic_rate": {str(k): round(v, 6) for k, v in self.catastrophic_rate.items()},
            "gini": round(self.gini, 6),
            "heatmap": [round(v, 6) for v in self.heatmap],
            "line_count": self.line_count,
            "mean_cer": round(self.mean_cer, 6),
        }

    @classmethod
    def from_dict(cls, d: dict) -> "LineMetrics":
        """Rebuild an instance from ``as_dict`` output.

        Stringified threshold keys are converted back to floats; missing keys
        get neutral defaults.
        """
        return cls(
            cer_per_line=d.get("cer_per_line", []),
            percentiles=d.get("percentiles", {}),
            catastrophic_rate={float(k): v for k, v in d.get("catastrophic_rate", {}).items()},
            gini=d.get("gini", 0.0),
            heatmap=d.get("heatmap", []),
            line_count=d.get("line_count", 0),
            mean_cer=d.get("mean_cer", 0.0),
        )
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
# ---------------------------------------------------------------------------
|
| 140 |
+
# Calcul principal
|
| 141 |
+
# ---------------------------------------------------------------------------
|
| 142 |
+
|
| 143 |
+
def compute_line_metrics(
    reference: str,
    hypothesis: str,
    thresholds: Optional[list[float]] = None,
    heatmap_bins: int = 10,
) -> LineMetrics:
    """Compute the per-line CER error distribution.

    Parameters
    ----------
    reference:
        Ground-truth (GT) text with line breaks.
    hypothesis:
        Text produced by the OCR engine.
    thresholds:
        CER thresholds for the catastrophic-line rate. Default: [0.30, 0.50, 1.00].
    heatmap_bins:
        Number of position buckets for the heatmap.

    Returns
    -------
    LineMetrics
    """
    if thresholds is None:
        thresholds = [0.30, 0.50, 1.00]

    ref_lines = reference.splitlines()
    hyp_lines = hypothesis.splitlines()

    # Align GT / hypothesis lines — the GT line count drives the iteration
    n = len(ref_lines)
    if n == 0:
        # No lines: return neutral metrics
        return LineMetrics(
            cer_per_line=[],
            percentiles={f"p{p}": 0.0 for p in (50, 75, 90, 95, 99)},
            catastrophic_rate={t: 0.0 for t in thresholds},
            gini=0.0,
            heatmap=[0.0] * heatmap_bins,
            line_count=0,
            mean_cer=0.0,
        )

    # Align positionally, ignoring extra hypothesis lines.
    # If the hypothesis has fewer lines, missing lines count as deleted (CER = 1.0)
    cer_per_line: list[float] = []
    for i, ref_line in enumerate(ref_lines):
        hyp_line = hyp_lines[i] if i < len(hyp_lines) else ""
        # Clamp per-line CER to 1.0 so one very long wrong line cannot dominate
        cer_per_line.append(min(_line_cer(ref_line, hyp_line), 1.0))

    sorted_cer = sorted(cer_per_line)

    # Percentiles of the per-line distribution
    percentiles = {
        f"p{p}": _percentile(sorted_cer, p)
        for p in (50, 75, 90, 95, 99)
    }

    # Catastrophic rates: share of lines strictly above each threshold
    catastrophic_rate: dict[float, float] = {}
    for t in thresholds:
        count = sum(1 for v in cer_per_line if v > t)
        catastrophic_rate[t] = count / n

    # Gini coefficient of error concentration
    gini = _gini(cer_per_line)

    # Position heatmap: mean CER per bucket of consecutive lines
    bins = heatmap_bins
    heatmap: list[float] = []
    for b in range(bins):
        start = int(b * n / bins)
        end = int((b + 1) * n / bins)
        slice_ = cer_per_line[start:end]
        heatmap.append(sum(slice_) / len(slice_) if slice_ else 0.0)

    mean_cer = sum(cer_per_line) / n

    return LineMetrics(
        cer_per_line=cer_per_line,
        percentiles=percentiles,
        catastrophic_rate=catastrophic_rate,
        gini=gini,
        heatmap=heatmap,
        line_count=n,
        mean_cer=mean_cer,
    )
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
# ---------------------------------------------------------------------------
|
| 233 |
+
# Agrégation sur un corpus
|
| 234 |
+
# ---------------------------------------------------------------------------
|
| 235 |
+
|
| 236 |
+
def aggregate_line_metrics(results: list[LineMetrics]) -> dict:
|
| 237 |
+
"""Agrège les métriques de distribution par ligne sur un corpus.
|
| 238 |
+
|
| 239 |
+
Returns
|
| 240 |
+
-------
|
| 241 |
+
dict
|
| 242 |
+
Statistiques agrégées : Gini moyen, percentiles moyens, taux catastrophiques moyens.
|
| 243 |
+
"""
|
| 244 |
+
if not results:
|
| 245 |
+
return {}
|
| 246 |
+
|
| 247 |
+
import statistics as _stats
|
| 248 |
+
|
| 249 |
+
gini_values = [r.gini for r in results]
|
| 250 |
+
mean_cer_values = [r.mean_cer for r in results]
|
| 251 |
+
|
| 252 |
+
# Percentiles moyens
|
| 253 |
+
pct_keys = ["p50", "p75", "p90", "p95", "p99"]
|
| 254 |
+
avg_percentiles = {}
|
| 255 |
+
for k in pct_keys:
|
| 256 |
+
vals = [r.percentiles.get(k, 0.0) for r in results]
|
| 257 |
+
avg_percentiles[k] = round(sum(vals) / len(vals), 6) if vals else 0.0
|
| 258 |
+
|
| 259 |
+
# Taux catastrophiques moyens (union des seuils)
|
| 260 |
+
all_thresholds: set[float] = set()
|
| 261 |
+
for r in results:
|
| 262 |
+
all_thresholds.update(r.catastrophic_rate.keys())
|
| 263 |
+
avg_catastrophic: dict[str, float] = {}
|
| 264 |
+
for t in sorted(all_thresholds):
|
| 265 |
+
vals = [r.catastrophic_rate.get(t, 0.0) for r in results]
|
| 266 |
+
avg_catastrophic[str(t)] = round(sum(vals) / len(vals), 6) if vals else 0.0
|
| 267 |
+
|
| 268 |
+
# Heatmap moyenne (longueur = max des longueurs)
|
| 269 |
+
if results and results[0].heatmap:
|
| 270 |
+
n_bins = len(results[0].heatmap)
|
| 271 |
+
heatmap_avg = []
|
| 272 |
+
for b in range(n_bins):
|
| 273 |
+
vals = [r.heatmap[b] for r in results if b < len(r.heatmap)]
|
| 274 |
+
heatmap_avg.append(round(sum(vals) / len(vals), 6) if vals else 0.0)
|
| 275 |
+
else:
|
| 276 |
+
heatmap_avg = []
|
| 277 |
+
|
| 278 |
+
return {
|
| 279 |
+
"gini_mean": round(sum(gini_values) / len(gini_values), 6),
|
| 280 |
+
"gini_stdev": round(_stats.stdev(gini_values), 6) if len(gini_values) > 1 else 0.0,
|
| 281 |
+
"mean_cer_mean": round(sum(mean_cer_values) / len(mean_cer_values), 6),
|
| 282 |
+
"percentiles": avg_percentiles,
|
| 283 |
+
"catastrophic_rate": avg_catastrophic,
|
| 284 |
+
"heatmap": heatmap_avg,
|
| 285 |
+
"document_count": len(results),
|
| 286 |
+
}
|
|
@@ -46,6 +46,11 @@ class DocumentResult:
|
|
| 46 |
"""Analyse structurelle (segmentation lignes, ordre lecture)."""
|
| 47 |
image_quality: Optional[dict] = None
|
| 48 |
"""Métriques de qualité image."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
def as_dict(self) -> dict:
|
| 51 |
d = {
|
|
@@ -71,6 +76,10 @@ class DocumentResult:
|
|
| 71 |
d["structure"] = self.structure
|
| 72 |
if self.image_quality is not None:
|
| 73 |
d["image_quality"] = self.image_quality
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
return d
|
| 75 |
|
| 76 |
|
|
@@ -99,6 +108,11 @@ class EngineReport:
|
|
| 99 |
"""Métriques structurelles agrégées."""
|
| 100 |
aggregated_image_quality: Optional[dict] = None
|
| 101 |
"""Métriques de qualité image agrégées."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
def __post_init__(self) -> None:
|
| 104 |
if not self.aggregated_metrics and self.document_results:
|
|
@@ -155,6 +169,10 @@ class EngineReport:
|
|
| 155 |
d["aggregated_structure"] = self.aggregated_structure
|
| 156 |
if self.aggregated_image_quality is not None:
|
| 157 |
d["aggregated_image_quality"] = self.aggregated_image_quality
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
return d
|
| 159 |
|
| 160 |
|
|
|
|
| 46 |
"""Analyse structurelle (segmentation lignes, ordre lecture)."""
|
| 47 |
image_quality: Optional[dict] = None
|
| 48 |
"""Métriques de qualité image."""
|
| 49 |
+
# Champs Sprint 10 — distribution des erreurs + hallucinations VLM
|
| 50 |
+
line_metrics: Optional[dict] = None
|
| 51 |
+
"""Distribution CER par ligne (percentiles, Gini, heatmap de position)."""
|
| 52 |
+
hallucination_metrics: Optional[dict] = None
|
| 53 |
+
"""Métriques de détection des hallucinations VLM (ancrage, ratio longueur, blocs)."""
|
| 54 |
|
| 55 |
def as_dict(self) -> dict:
|
| 56 |
d = {
|
|
|
|
| 76 |
d["structure"] = self.structure
|
| 77 |
if self.image_quality is not None:
|
| 78 |
d["image_quality"] = self.image_quality
|
| 79 |
+
if self.line_metrics is not None:
|
| 80 |
+
d["line_metrics"] = self.line_metrics
|
| 81 |
+
if self.hallucination_metrics is not None:
|
| 82 |
+
d["hallucination_metrics"] = self.hallucination_metrics
|
| 83 |
return d
|
| 84 |
|
| 85 |
|
|
|
|
| 108 |
"""Métriques structurelles agrégées."""
|
| 109 |
aggregated_image_quality: Optional[dict] = None
|
| 110 |
"""Métriques de qualité image agrégées."""
|
| 111 |
+
# Sprint 10
|
| 112 |
+
aggregated_line_metrics: Optional[dict] = None
|
| 113 |
+
"""Distribution CER par ligne agrégée (Gini moyen, percentiles, heatmap, taux catastrophiques)."""
|
| 114 |
+
aggregated_hallucination: Optional[dict] = None
|
| 115 |
+
"""Métriques d'hallucination VLM agrégées (ancrage moyen, taux de docs hallucinés…)."""
|
| 116 |
|
| 117 |
def __post_init__(self) -> None:
|
| 118 |
if not self.aggregated_metrics and self.document_results:
|
|
|
|
| 169 |
d["aggregated_structure"] = self.aggregated_structure
|
| 170 |
if self.aggregated_image_quality is not None:
|
| 171 |
d["aggregated_image_quality"] = self.aggregated_image_quality
|
| 172 |
+
if self.aggregated_line_metrics is not None:
|
| 173 |
+
d["aggregated_line_metrics"] = self.aggregated_line_metrics
|
| 174 |
+
if self.aggregated_hallucination is not None:
|
| 175 |
+
d["aggregated_hallucination"] = self.aggregated_hallucination
|
| 176 |
return d
|
| 177 |
|
| 178 |
|
|
@@ -25,6 +25,9 @@ from picarones.core.taxonomy import classify_errors, aggregate_taxonomy
|
|
| 25 |
from picarones.core.structure import analyze_structure, aggregate_structure
|
| 26 |
from picarones.core.image_quality import generate_mock_quality_scores, aggregate_image_quality
|
| 27 |
from picarones.core.char_scores import aggregate_ligature_scores, aggregate_diacritic_scores
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# ---------------------------------------------------------------------------
|
| 30 |
# Textes GT réalistes (documents patrimoniaux BnF)
|
|
@@ -117,6 +120,51 @@ def _llm_correction(text: str, rng: random.Random) -> str:
|
|
| 117 |
return text
|
| 118 |
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
def _bad_engine_errors(text: str, rng: random.Random) -> str:
|
| 121 |
"""Moteur de mauvaise qualité : nombreuses erreurs."""
|
| 122 |
words = text.split()
|
|
@@ -252,6 +300,30 @@ def generate_sample_benchmark(
|
|
| 252 |
],
|
| 253 |
},
|
| 254 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
]
|
| 256 |
|
| 257 |
engine_reports: list[EngineReport] = []
|
|
@@ -297,6 +369,13 @@ def generate_sample_benchmark(
|
|
| 297 |
|
| 298 |
metrics = _make_metrics(gt, hypothesis)
|
| 299 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
# Sprint 5 — métriques avancées patrimoniales
|
| 301 |
cm = build_confusion_matrix(gt, hypothesis)
|
| 302 |
lig_score = compute_ligature_score(gt, hypothesis)
|
|
@@ -326,6 +405,8 @@ def generate_sample_benchmark(
|
|
| 326 |
taxonomy=taxonomy_result.as_dict(),
|
| 327 |
structure=struct_result.as_dict(),
|
| 328 |
image_quality={**iq_result.as_dict(), "script_type": _script_type},
|
|
|
|
|
|
|
| 329 |
)
|
| 330 |
)
|
| 331 |
|
|
@@ -384,6 +465,17 @@ def generate_sample_benchmark(
|
|
| 384 |
for dr in doc_results if dr.image_quality
|
| 385 |
])
|
| 386 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
report = EngineReport(
|
| 388 |
engine_name=engine_name,
|
| 389 |
engine_version=engine_version,
|
|
@@ -395,6 +487,8 @@ def generate_sample_benchmark(
|
|
| 395 |
aggregated_taxonomy=agg_taxonomy,
|
| 396 |
aggregated_structure=agg_structure,
|
| 397 |
aggregated_image_quality=agg_iq,
|
|
|
|
|
|
|
| 398 |
)
|
| 399 |
engine_reports.append(report)
|
| 400 |
|
|
|
|
| 25 |
from picarones.core.structure import analyze_structure, aggregate_structure
|
| 26 |
from picarones.core.image_quality import generate_mock_quality_scores, aggregate_image_quality
|
| 27 |
from picarones.core.char_scores import aggregate_ligature_scores, aggregate_diacritic_scores
|
| 28 |
+
# Sprint 10 — distribution des erreurs + hallucinations VLM
|
| 29 |
+
from picarones.core.line_metrics import compute_line_metrics, aggregate_line_metrics, LineMetrics
|
| 30 |
+
from picarones.core.hallucination import compute_hallucination_metrics, aggregate_hallucination_metrics
|
| 31 |
|
| 32 |
# ---------------------------------------------------------------------------
|
| 33 |
# Textes GT réalistes (documents patrimoniaux BnF)
|
|
|
|
| 120 |
return text
|
| 121 |
|
| 122 |
|
| 123 |
+
def _vlm_hallucinations(text: str, rng: random.Random) -> str:
|
| 124 |
+
"""Simule les hallucinations typiques d'un modèle VLM (vision-language).
|
| 125 |
+
|
| 126 |
+
Le modèle "voit" l'image et génère du texte proche du GT mais :
|
| 127 |
+
- Insère des phrases entières inventées (~30% de contenu supplémentaire)
|
| 128 |
+
- Mélange des graphies modernes avec des graphies médiévales
|
| 129 |
+
- Parfois ajoute des métadonnées (folio, date) inventées
|
| 130 |
+
- Garde une cohérence partielle avec le GT (pas totalement aléatoire)
|
| 131 |
+
"""
|
| 132 |
+
# Correction partielle d'erreurs OCR (le VLM lit l'image directement)
|
| 133 |
+
text = text.replace("ſ", "s").replace("&", "et")
|
| 134 |
+
|
| 135 |
+
# Hallucination : phrases intercalées absentes du GT
|
| 136 |
+
hallucinated_phrases = [
|
| 137 |
+
"Ledit document fut enregistré au greffe le lendemain.",
|
| 138 |
+
"Signé et paraphé par le notaire royal en présence de témoins.",
|
| 139 |
+
"Archives nationales, cote F/7/1234, pièce n° 42.",
|
| 140 |
+
"Transcription réalisée d'après l'original conservé à la BnF.",
|
| 141 |
+
"Le présent acte a été lu et approuvé par toutes les parties.",
|
| 142 |
+
"En foi de quoi nous avons apposé notre sceau et notre signature.",
|
| 143 |
+
"Registre des délibérations du Parlement de Paris, tome III.",
|
| 144 |
+
]
|
| 145 |
+
|
| 146 |
+
words = text.split()
|
| 147 |
+
if len(words) > 8 and rng.random() < 0.65:
|
| 148 |
+
# Insérer une ou deux phrases hallucinées
|
| 149 |
+
n_phrases = rng.randint(1, 2)
|
| 150 |
+
for _ in range(n_phrases):
|
| 151 |
+
phrase = rng.choice(hallucinated_phrases)
|
| 152 |
+
insert_pos = rng.randint(len(words) // 2, len(words))
|
| 153 |
+
words = words[:insert_pos] + phrase.split() + words[insert_pos:]
|
| 154 |
+
|
| 155 |
+
# Modernisation systématique (le VLM normalise)
|
| 156 |
+
result = " ".join(words)
|
| 157 |
+
modern_replacements = [
|
| 158 |
+
("nostre", "notre"), ("maistre", "maître"), ("faictes", "faites"),
|
| 159 |
+
("ledit", "le dit"), ("ladicte", "la dite"), ("icelle", "icelle"),
|
| 160 |
+
("iceluy", "icelui"), ("eſt", "est"), ("ſur", "sur"),
|
| 161 |
+
]
|
| 162 |
+
for src, tgt in modern_replacements:
|
| 163 |
+
result = result.replace(src, tgt)
|
| 164 |
+
|
| 165 |
+
return result
|
| 166 |
+
|
| 167 |
+
|
| 168 |
def _bad_engine_errors(text: str, rng: random.Random) -> str:
|
| 169 |
"""Moteur de mauvaise qualité : nombreuses erreurs."""
|
| 170 |
words = text.split()
|
|
|
|
| 300 |
],
|
| 301 |
},
|
| 302 |
),
|
| 303 |
+
# Sprint 10 — Modèle VLM fictif avec hallucinations simulées
|
| 304 |
+
(
|
| 305 |
+
"gpt-4o-vision (zero-shot)",
|
| 306 |
+
"gpt-4o-2024-11-20",
|
| 307 |
+
{"mode": "zero_shot"},
|
| 308 |
+
_vlm_hallucinations,
|
| 309 |
+
True,
|
| 310 |
+
{
|
| 311 |
+
"pipeline_mode": "zero_shot",
|
| 312 |
+
"prompt_file": "zero_shot_medieval_vlm.txt",
|
| 313 |
+
"llm_model": "gpt-4o-2024-11-20",
|
| 314 |
+
"llm_provider": "openai",
|
| 315 |
+
"pipeline_steps": [
|
| 316 |
+
{
|
| 317 |
+
"type": "llm",
|
| 318 |
+
"model": "gpt-4o-2024-11-20",
|
| 319 |
+
"provider": "openai",
|
| 320 |
+
"mode": "zero_shot",
|
| 321 |
+
"prompt_file": "zero_shot_medieval_vlm.txt",
|
| 322 |
+
},
|
| 323 |
+
],
|
| 324 |
+
"is_vlm": True,
|
| 325 |
+
},
|
| 326 |
+
),
|
| 327 |
]
|
| 328 |
|
| 329 |
engine_reports: list[EngineReport] = []
|
|
|
|
| 369 |
|
| 370 |
metrics = _make_metrics(gt, hypothesis)
|
| 371 |
|
| 372 |
+
# Sprint 10 — distribution des erreurs par ligne
|
| 373 |
+
# Pour simuler des textes multi-lignes, on découpe GT et hypothèse en lignes
|
| 374 |
+
gt_multiline = "\n".join(gt[i:i+30] for i in range(0, len(gt), 30))
|
| 375 |
+
hyp_multiline = "\n".join(hypothesis[i:i+30] for i in range(0, len(hypothesis), 30))
|
| 376 |
+
lm = compute_line_metrics(gt_multiline, hyp_multiline)
|
| 377 |
+
hm = compute_hallucination_metrics(gt, hypothesis)
|
| 378 |
+
|
| 379 |
# Sprint 5 — métriques avancées patrimoniales
|
| 380 |
cm = build_confusion_matrix(gt, hypothesis)
|
| 381 |
lig_score = compute_ligature_score(gt, hypothesis)
|
|
|
|
| 405 |
taxonomy=taxonomy_result.as_dict(),
|
| 406 |
structure=struct_result.as_dict(),
|
| 407 |
image_quality={**iq_result.as_dict(), "script_type": _script_type},
|
| 408 |
+
line_metrics=lm.as_dict(),
|
| 409 |
+
hallucination_metrics=hm.as_dict(),
|
| 410 |
)
|
| 411 |
)
|
| 412 |
|
|
|
|
| 465 |
for dr in doc_results if dr.image_quality
|
| 466 |
])
|
| 467 |
|
| 468 |
+
# Sprint 10 — agrégation distribution des erreurs + hallucinations
|
| 469 |
+
agg_line = aggregate_line_metrics([
|
| 470 |
+
LineMetrics.from_dict(dr.line_metrics)
|
| 471 |
+
for dr in doc_results if dr.line_metrics
|
| 472 |
+
])
|
| 473 |
+
from picarones.core.hallucination import HallucinationMetrics as _HM
|
| 474 |
+
agg_hallucination = aggregate_hallucination_metrics([
|
| 475 |
+
_HM.from_dict(dr.hallucination_metrics)
|
| 476 |
+
for dr in doc_results if dr.hallucination_metrics
|
| 477 |
+
])
|
| 478 |
+
|
| 479 |
report = EngineReport(
|
| 480 |
engine_name=engine_name,
|
| 481 |
engine_version=engine_version,
|
|
|
|
| 487 |
aggregated_taxonomy=agg_taxonomy,
|
| 488 |
aggregated_structure=agg_structure,
|
| 489 |
aggregated_image_quality=agg_iq,
|
| 490 |
+
aggregated_line_metrics=agg_line,
|
| 491 |
+
aggregated_hallucination=agg_hallucination,
|
| 492 |
)
|
| 493 |
engine_reports.append(report)
|
| 494 |
|
|
@@ -115,6 +115,17 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
|
|
| 115 |
"aggregated_taxonomy": report.aggregated_taxonomy,
|
| 116 |
"aggregated_structure": report.aggregated_structure,
|
| 117 |
"aggregated_image_quality": report.aggregated_image_quality,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
}
|
| 119 |
engines_summary.append(entry)
|
| 120 |
|
|
@@ -172,6 +183,11 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
|
|
| 172 |
er_entry["structure"] = dr.structure
|
| 173 |
if dr.image_quality is not None:
|
| 174 |
er_entry["image_quality"] = dr.image_quality
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
engine_results.append(er_entry)
|
| 176 |
|
| 177 |
# CER moyen sur ce document (pour le badge galerie)
|
|
@@ -308,6 +324,32 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
|
|
| 308 |
**corr,
|
| 309 |
})
|
| 310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
return {
|
| 312 |
"meta": {
|
| 313 |
"corpus_name": benchmark.corpus_name,
|
|
@@ -329,6 +371,9 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
|
|
| 329 |
"venn_data": venn_data,
|
| 330 |
"error_clusters": error_clusters,
|
| 331 |
"correlation_per_engine": correlation_per_engine,
|
|
|
|
|
|
|
|
|
|
| 332 |
}
|
| 333 |
|
| 334 |
|
|
@@ -818,6 +863,58 @@ body.present-mode nav .meta {{ display: none; }}
|
|
| 818 |
min-width: 60px;
|
| 819 |
}}
|
| 820 |
.corr-table th {{ background: var(--bg); font-weight: 600; font-size: .75rem; }}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 821 |
</style>
|
| 822 |
</head>
|
| 823 |
|
|
@@ -862,6 +959,8 @@ body.present-mode nav .meta {{ display: none; }}
|
|
| 862 |
<th data-col="wil" class="sortable">WIL<i class="sort-icon">↕</i></th>
|
| 863 |
<th data-col="ligature_score" class="sortable" title="Taux de reconnaissance des ligatures (fi, fl, œ, æ, ff…)">Ligatures<i class="sort-icon">↕</i></th>
|
| 864 |
<th data-col="diacritic_score" class="sortable" title="Taux de conservation des diacritiques (accents, cédilles, trémas…)">Diacritiques<i class="sort-icon">↕</i></th>
|
|
|
|
|
|
|
| 865 |
<th>CER médian</th>
|
| 866 |
<th>CER min</th>
|
| 867 |
<th>CER max</th>
|
|
@@ -973,6 +1072,18 @@ body.present-mode nav .meta {{ display: none; }}
|
|
| 973 |
<h3>Sorties OCR — diff par moteur</h3>
|
| 974 |
<div class="diff-panels" id="doc-diff-panels"></div>
|
| 975 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 976 |
</div>
|
| 977 |
</div>
|
| 978 |
</div>
|
|
@@ -1080,6 +1191,29 @@ body.present-mode nav .meta {{ display: none; }}
|
|
| 1080 |
<div id="error-clusters-container"></div>
|
| 1081 |
</div>
|
| 1082 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1083 |
<!-- Sprint 7 — Matrice de corrélation -->
|
| 1084 |
<div class="chart-card technical" style="grid-column:1/-1">
|
| 1085 |
<h3>Matrice de corrélation entre métriques</h3>
|
|
@@ -1283,11 +1417,29 @@ function renderRanking() {{
|
|
| 1283 |
</td>`;
|
| 1284 |
}}
|
| 1285 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1286 |
return `<tr>
|
| 1287 |
<td><span class="${{badgeClass}}">${{rank}}</span></td>
|
| 1288 |
<td>
|
| 1289 |
<span class="engine-name">${{esc(e.name)}}</span>
|
| 1290 |
${{pipelineBadge}}
|
|
|
|
| 1291 |
<span class="engine-version">v${{esc(e.version)}}</span>
|
| 1292 |
${{pipelineStepsHtml}}
|
| 1293 |
</td>
|
|
@@ -1301,6 +1453,8 @@ function renderRanking() {{
|
|
| 1301 |
<td>${{pct(e.wil)}}</td>
|
| 1302 |
<td>${{_scoreBadge(e.ligature_score, 'Ligatures')}}</td>
|
| 1303 |
<td>${{_scoreBadge(e.diacritic_score, 'Diacritiques')}}</td>
|
|
|
|
|
|
|
| 1304 |
<td style="color:var(--text-muted)">${{pct(e.cer_median)}}</td>
|
| 1305 |
<td style="color:var(--text-muted)">${{pct(e.cer_min)}}</td>
|
| 1306 |
<td style="color:var(--text-muted)">${{pct(e.cer_max)}}</td>
|
|
@@ -1531,6 +1685,240 @@ function loadDocument(docId) {{
|
|
| 1531 |
${{tripleDiffHtml}}
|
| 1532 |
</div>`;
|
| 1533 |
}}).join('');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1534 |
}}
|
| 1535 |
|
| 1536 |
function buildDocList() {{
|
|
@@ -1603,6 +1991,9 @@ function buildCharts() {{
|
|
| 1603 |
buildWilcoxonTable();
|
| 1604 |
buildErrorClusters();
|
| 1605 |
initCorrelationMatrix();
|
|
|
|
|
|
|
|
|
|
| 1606 |
}}
|
| 1607 |
|
| 1608 |
function buildCerHistogram() {{
|
|
@@ -2131,7 +2522,7 @@ function togglePresentMode() {{
|
|
| 2131 |
|
| 2132 |
// ── Sprint 7 — Export CSV ────────────────────────────────────────
|
| 2133 |
function exportCSV() {{
|
| 2134 |
-
const rows = [['doc_id','engine','cer','wer','mer','wil','duration','ligature_score','diacritic_score','difficulty_score']];
|
| 2135 |
DATA.documents.forEach(doc => {{
|
| 2136 |
doc.engine_results.forEach(er => {{
|
| 2137 |
rows.push([
|
|
@@ -2145,6 +2536,10 @@ function exportCSV() {{
|
|
| 2145 |
er.ligature_score !== null ? er.ligature_score : '',
|
| 2146 |
er.diacritic_score !== null ? er.diacritic_score : '',
|
| 2147 |
doc.difficulty_score !== undefined ? (doc.difficulty_score * 100).toFixed(2) : '',
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2148 |
]);
|
| 2149 |
}});
|
| 2150 |
}});
|
|
|
|
| 115 |
"aggregated_taxonomy": report.aggregated_taxonomy,
|
| 116 |
"aggregated_structure": report.aggregated_structure,
|
| 117 |
"aggregated_image_quality": report.aggregated_image_quality,
|
| 118 |
+
# Sprint 10 — distribution des erreurs + hallucinations VLM
|
| 119 |
+
"gini": _safe(report.aggregated_line_metrics.get("gini_mean")) if report.aggregated_line_metrics else None,
|
| 120 |
+
"cer_p90": _safe(report.aggregated_line_metrics.get("percentiles", {}).get("p90")) if report.aggregated_line_metrics else None,
|
| 121 |
+
"cer_p99": _safe(report.aggregated_line_metrics.get("percentiles", {}).get("p99")) if report.aggregated_line_metrics else None,
|
| 122 |
+
"catastrophic_rate_30": _safe(report.aggregated_line_metrics.get("catastrophic_rate", {}).get("0.3")) if report.aggregated_line_metrics else None,
|
| 123 |
+
"aggregated_line_metrics": report.aggregated_line_metrics,
|
| 124 |
+
"anchor_score": _safe(report.aggregated_hallucination.get("anchor_score_mean")) if report.aggregated_hallucination else None,
|
| 125 |
+
"length_ratio": _safe(report.aggregated_hallucination.get("length_ratio_mean")) if report.aggregated_hallucination else None,
|
| 126 |
+
"hallucinating_doc_rate": _safe(report.aggregated_hallucination.get("hallucinating_doc_rate")) if report.aggregated_hallucination else None,
|
| 127 |
+
"aggregated_hallucination": report.aggregated_hallucination,
|
| 128 |
+
"is_vlm": report.pipeline_info.get("is_vlm", False) if report.pipeline_info else False,
|
| 129 |
}
|
| 130 |
engines_summary.append(entry)
|
| 131 |
|
|
|
|
| 183 |
er_entry["structure"] = dr.structure
|
| 184 |
if dr.image_quality is not None:
|
| 185 |
er_entry["image_quality"] = dr.image_quality
|
| 186 |
+
# Sprint 10
|
| 187 |
+
if dr.line_metrics is not None:
|
| 188 |
+
er_entry["line_metrics"] = dr.line_metrics
|
| 189 |
+
if dr.hallucination_metrics is not None:
|
| 190 |
+
er_entry["hallucination_metrics"] = dr.hallucination_metrics
|
| 191 |
engine_results.append(er_entry)
|
| 192 |
|
| 193 |
# CER moyen sur ce document (pour le badge galerie)
|
|
|
|
| 324 |
**corr,
|
| 325 |
})
|
| 326 |
|
| 327 |
+
# ── Sprint 10 — Données scatter plots ─────────────────────────────────
|
| 328 |
+
# Scatter 1 : Gini vs CER moyen (moteurs)
|
| 329 |
+
gini_vs_cer = []
|
| 330 |
+
for report in benchmark.engine_reports:
|
| 331 |
+
gini_val = report.aggregated_line_metrics.get("gini_mean") if report.aggregated_line_metrics else None
|
| 332 |
+
cer_val = report.mean_cer
|
| 333 |
+
if gini_val is not None and cer_val is not None:
|
| 334 |
+
gini_vs_cer.append({
|
| 335 |
+
"engine": report.engine_name,
|
| 336 |
+
"cer": _safe(cer_val),
|
| 337 |
+
"gini": _safe(gini_val),
|
| 338 |
+
"is_pipeline": report.is_pipeline,
|
| 339 |
+
})
|
| 340 |
+
|
| 341 |
+
# Scatter 2 : ratio longueur vs score d'ancrage (moteurs)
|
| 342 |
+
ratio_vs_anchor = []
|
| 343 |
+
for report in benchmark.engine_reports:
|
| 344 |
+
if report.aggregated_hallucination:
|
| 345 |
+
ratio_vs_anchor.append({
|
| 346 |
+
"engine": report.engine_name,
|
| 347 |
+
"length_ratio": _safe(report.aggregated_hallucination.get("length_ratio_mean", 1.0)),
|
| 348 |
+
"anchor_score": _safe(report.aggregated_hallucination.get("anchor_score_mean", 1.0)),
|
| 349 |
+
"hallucinating_rate": _safe(report.aggregated_hallucination.get("hallucinating_doc_rate", 0.0)),
|
| 350 |
+
"is_vlm": report.pipeline_info.get("is_vlm", False) if report.pipeline_info else False,
|
| 351 |
+
})
|
| 352 |
+
|
| 353 |
return {
|
| 354 |
"meta": {
|
| 355 |
"corpus_name": benchmark.corpus_name,
|
|
|
|
| 371 |
"venn_data": venn_data,
|
| 372 |
"error_clusters": error_clusters,
|
| 373 |
"correlation_per_engine": correlation_per_engine,
|
| 374 |
+
# Sprint 10
|
| 375 |
+
"gini_vs_cer": gini_vs_cer,
|
| 376 |
+
"ratio_vs_anchor": ratio_vs_anchor,
|
| 377 |
}
|
| 378 |
|
| 379 |
|
|
|
|
| 863 |
min-width: 60px;
|
| 864 |
}}
|
| 865 |
.corr-table th {{ background: var(--bg); font-weight: 600; font-size: .75rem; }}
|
| 866 |
+
|
| 867 |
+
/* ── Sprint 10 — heatmap erreurs ─────────────────────────────────*/
|
| 868 |
+
.heatmap-wrap {{
|
| 869 |
+
display: flex; gap: 3px; align-items: flex-end;
|
| 870 |
+
height: 60px; margin: .5rem 0;
|
| 871 |
+
}}
|
| 872 |
+
.heatmap-bar {{
|
| 873 |
+
flex: 1; border-radius: 3px 3px 0 0;
|
| 874 |
+
min-height: 4px;
|
| 875 |
+
transition: opacity .15s;
|
| 876 |
+
}}
|
| 877 |
+
.heatmap-bar:hover {{ opacity: .75; }}
|
| 878 |
+
.heatmap-labels {{
|
| 879 |
+
display: flex; justify-content: space-between;
|
| 880 |
+
font-size: .65rem; color: var(--text-muted); margin-top: .15rem;
|
| 881 |
+
}}
|
| 882 |
+
|
| 883 |
+
/* ── Sprint 10 — hallucination badge ──────────────────���──────────*/
|
| 884 |
+
.hallucination-badge {{
|
| 885 |
+
display: inline-flex; align-items: center; gap: .25rem;
|
| 886 |
+
padding: .15rem .45rem; border-radius: 4px;
|
| 887 |
+
font-size: .72rem; font-weight: 700;
|
| 888 |
+
background: #fce7f3; color: #9d174d;
|
| 889 |
+
border: 1px solid #fbcfe8;
|
| 890 |
+
}}
|
| 891 |
+
.hallucination-badge.ok {{
|
| 892 |
+
background: #f0fdf4; color: #15803d;
|
| 893 |
+
border-color: #bbf7d0;
|
| 894 |
+
}}
|
| 895 |
+
|
| 896 |
+
/* ── Sprint 10 — bloc halluciné ──────────────────────────────────*/
|
| 897 |
+
.halluc-block {{
|
| 898 |
+
background: #fce7f3; border: 1px solid #f9a8d4;
|
| 899 |
+
border-radius: 4px; padding: .35rem .6rem;
|
| 900 |
+
margin: .25rem 0; font-size: .78rem;
|
| 901 |
+
font-family: 'Georgia', serif; color: #9d174d;
|
| 902 |
+
}}
|
| 903 |
+
.halluc-block-meta {{
|
| 904 |
+
font-size: .65rem; color: #be185d; font-family: system-ui, sans-serif;
|
| 905 |
+
margin-bottom: .15rem; font-weight: 600;
|
| 906 |
+
}}
|
| 907 |
+
|
| 908 |
+
/* ── Sprint 10 — percentile bars ─────────────────────────────────*/
|
| 909 |
+
.pct-bars {{ display: flex; flex-direction: column; gap: .25rem; margin: .4rem 0; }}
|
| 910 |
+
.pct-bar-row {{ display: flex; align-items: center; gap: .4rem; font-size: .72rem; }}
|
| 911 |
+
.pct-bar-label {{ width: 2.5rem; color: var(--text-muted); text-align: right; flex-shrink: 0; }}
|
| 912 |
+
.pct-bar-track {{
|
| 913 |
+
flex: 1; height: 8px; background: var(--bg);
|
| 914 |
+
border-radius: 4px; overflow: hidden;
|
| 915 |
+
}}
|
| 916 |
+
.pct-bar-fill {{ height: 100%; border-radius: 4px; }}
|
| 917 |
+
.pct-bar-val {{ width: 3rem; color: var(--text); font-weight: 600; }}
|
| 918 |
</style>
|
| 919 |
</head>
|
| 920 |
|
|
|
|
| 959 |
<th data-col="wil" class="sortable">WIL<i class="sort-icon">↕</i></th>
|
| 960 |
<th data-col="ligature_score" class="sortable" title="Taux de reconnaissance des ligatures (fi, fl, œ, æ, ff…)">Ligatures<i class="sort-icon">↕</i></th>
|
| 961 |
<th data-col="diacritic_score" class="sortable" title="Taux de conservation des diacritiques (accents, cédilles, trémas…)">Diacritiques<i class="sort-icon">↕</i></th>
|
| 962 |
+
<th data-col="gini" class="sortable" title="Coefficient de Gini des erreurs CER par ligne — 0 = erreurs uniformes, 1 = erreurs concentrées. Un bon moteur a CER bas ET Gini bas.">Gini<i class="sort-icon">↕</i></th>
|
| 963 |
+
<th data-col="anchor_score" class="sortable" title="Score d'ancrage : proportion des trigrammes de la sortie trouvant un ancrage dans le GT — faible score = hallucinations probables (LLM/VLM)">Ancrage<i class="sort-icon">↕</i></th>
|
| 964 |
<th>CER médian</th>
|
| 965 |
<th>CER min</th>
|
| 966 |
<th>CER max</th>
|
|
|
|
| 1072 |
<h3>Sorties OCR — diff par moteur</h3>
|
| 1073 |
<div class="diff-panels" id="doc-diff-panels"></div>
|
| 1074 |
</div>
|
| 1075 |
+
|
| 1076 |
+
<!-- Sprint 10 — Distribution CER par ligne -->
|
| 1077 |
+
<div class="card" id="doc-line-metrics-card" style="display:none">
|
| 1078 |
+
<h3>Distribution des erreurs par ligne</h3>
|
| 1079 |
+
<div id="doc-line-metrics-content"></div>
|
| 1080 |
+
</div>
|
| 1081 |
+
|
| 1082 |
+
<!-- Sprint 10 — Hallucinations détectées -->
|
| 1083 |
+
<div class="card" id="doc-hallucination-card" style="display:none">
|
| 1084 |
+
<h3>Analyse des hallucinations</h3>
|
| 1085 |
+
<div id="doc-hallucination-content"></div>
|
| 1086 |
+
</div>
|
| 1087 |
</div>
|
| 1088 |
</div>
|
| 1089 |
</div>
|
|
|
|
| 1191 |
<div id="error-clusters-container"></div>
|
| 1192 |
</div>
|
| 1193 |
|
| 1194 |
+
<!-- Sprint 10 — Scatter Gini vs CER moyen -->
|
| 1195 |
+
<div class="chart-card">
|
| 1196 |
+
<h3>Gini vs CER moyen <span style="font-size:.72rem;font-weight:400;color:var(--text-muted)">— idéal : bas-gauche</span></h3>
|
| 1197 |
+
<div class="chart-canvas-wrap">
|
| 1198 |
+
<canvas id="chart-gini-cer"></canvas>
|
| 1199 |
+
</div>
|
| 1200 |
+
<div style="font-size:.72rem;color:var(--text-muted);margin-top:.4rem">
|
| 1201 |
+
Axe X = CER moyen, Axe Y = coefficient de Gini. Un moteur idéal a CER bas ET Gini bas (erreurs rares et uniformes).
|
| 1202 |
+
</div>
|
| 1203 |
+
</div>
|
| 1204 |
+
|
| 1205 |
+
<!-- Sprint 10 — Scatter ratio longueur vs ancrage -->
|
| 1206 |
+
<div class="chart-card">
|
| 1207 |
+
<h3>Ratio longueur vs ancrage <span style="font-size:.72rem;font-weight:400;color:var(--text-muted)">— hallucinations VLM</span></h3>
|
| 1208 |
+
<div class="chart-canvas-wrap">
|
| 1209 |
+
<canvas id="chart-ratio-anchor"></canvas>
|
| 1210 |
+
</div>
|
| 1211 |
+
<div style="font-size:.72rem;color:var(--text-muted);margin-top:.4rem">
|
| 1212 |
+
Axe X = score d'ancrage trigrammes [0–1]. Axe Y = ratio longueur sortie/GT.
|
| 1213 |
+
Zone ⚠️ : ancrage < 0.5 ou ratio > 1.2 → hallucinations probables.
|
| 1214 |
+
</div>
|
| 1215 |
+
</div>
|
| 1216 |
+
|
| 1217 |
<!-- Sprint 7 — Matrice de corrélation -->
|
| 1218 |
<div class="chart-card technical" style="grid-column:1/-1">
|
| 1219 |
<h3>Matrice de corrélation entre métriques</h3>
|
|
|
|
| 1417 |
</td>`;
|
| 1418 |
}}
|
| 1419 |
|
| 1420 |
+
// ── Sprint 10 : Gini + Ancrage ─────────────────────────────────────
|
| 1421 |
+
let giniCell = '<td style="color:var(--text-muted)">—</td>';
|
| 1422 |
+
if (e.gini !== null && e.gini !== undefined) {{
|
| 1423 |
+
const gv = e.gini;
|
| 1424 |
+
const gColor = gv < 0.3 ? '#16a34a' : gv < 0.5 ? '#ca8a04' : '#dc2626';
|
| 1425 |
+
const gBg = gv < 0.3 ? '#f0fdf4' : gv < 0.5 ? '#fefce8' : '#fef2f2';
|
| 1426 |
+
giniCell = `<td><span class="cer-badge" style="color:${{gColor}};background:${{gBg}}"
|
| 1427 |
+
title="Gini=${{gv.toFixed(3)}} — 0=uniforme, 1=concentré">${{gv.toFixed(3)}}</span></td>`;
|
| 1428 |
+
}}
|
| 1429 |
+
let anchorCell = '<td style="color:var(--text-muted)">—</td>';
|
| 1430 |
+
if (e.anchor_score !== null && e.anchor_score !== undefined) {{
|
| 1431 |
+
const av = e.anchor_score;
|
| 1432 |
+
const hallBadge = (e.hallucinating_doc_rate && e.hallucinating_doc_rate > 0.2)
|
| 1433 |
+
? ' <span title="Hallucinations détectées">⚠️</span>' : '';
|
| 1434 |
+
anchorCell = `<td>${{_scoreBadge(av, 'Ancrage trigrammes')}}${{hallBadge}}</td>`;
|
| 1435 |
+
}}
|
| 1436 |
+
|
| 1437 |
return `<tr>
|
| 1438 |
<td><span class="${{badgeClass}}">${{rank}}</span></td>
|
| 1439 |
<td>
|
| 1440 |
<span class="engine-name">${{esc(e.name)}}</span>
|
| 1441 |
${{pipelineBadge}}
|
| 1442 |
+
${{e.is_vlm ? '<span class="pipeline-tag" style="background:#fce7f3;color:#9d174d">👁 VLM</span>' : ''}}
|
| 1443 |
<span class="engine-version">v${{esc(e.version)}}</span>
|
| 1444 |
${{pipelineStepsHtml}}
|
| 1445 |
</td>
|
|
|
|
| 1453 |
<td>${{pct(e.wil)}}</td>
|
| 1454 |
<td>${{_scoreBadge(e.ligature_score, 'Ligatures')}}</td>
|
| 1455 |
<td>${{_scoreBadge(e.diacritic_score, 'Diacritiques')}}</td>
|
| 1456 |
+
${{giniCell}}
|
| 1457 |
+
${{anchorCell}}
|
| 1458 |
<td style="color:var(--text-muted)">${{pct(e.cer_median)}}</td>
|
| 1459 |
<td style="color:var(--text-muted)">${{pct(e.cer_min)}}</td>
|
| 1460 |
<td style="color:var(--text-muted)">${{pct(e.cer_max)}}</td>
|
|
|
|
| 1685 |
${{tripleDiffHtml}}
|
| 1686 |
</div>`;
|
| 1687 |
}}).join('');
|
| 1688 |
+
|
| 1689 |
+
// ── Sprint 10 : distribution CER par ligne ──────────────────────────
|
| 1690 |
+
const lineCard = document.getElementById('doc-line-metrics-card');
|
| 1691 |
+
const lineContent = document.getElementById('doc-line-metrics-content');
|
| 1692 |
+
// Prendre le premier moteur ayant des line_metrics
|
| 1693 |
+
const erWithLine = doc.engine_results.find(er => er.line_metrics);
|
| 1694 |
+
if (erWithLine && erWithLine.line_metrics) {{
|
| 1695 |
+
lineCard.style.display = '';
|
| 1696 |
+
lineContent.innerHTML = renderLineMetrics(doc.engine_results);
|
| 1697 |
+
}} else {{
|
| 1698 |
+
lineCard.style.display = 'none';
|
| 1699 |
+
}}
|
| 1700 |
+
|
| 1701 |
+
// ── Sprint 10 : hallucinations ──────────────────────────────────────
|
| 1702 |
+
const hallCard = document.getElementById('doc-hallucination-card');
|
| 1703 |
+
const hallContent = document.getElementById('doc-hallucination-content');
|
| 1704 |
+
const erWithHall = doc.engine_results.find(er => er.hallucination_metrics && er.hallucination_metrics.is_hallucinating);
|
| 1705 |
+
if (erWithHall || doc.engine_results.some(er => er.hallucination_metrics)) {{
|
| 1706 |
+
hallCard.style.display = '';
|
| 1707 |
+
hallContent.innerHTML = renderHallucinationPanel(doc.engine_results);
|
| 1708 |
+
}} else {{
|
| 1709 |
+
hallCard.style.display = 'none';
|
| 1710 |
+
}}
|
| 1711 |
+
}}
|
| 1712 |
+
|
| 1713 |
+
// ── Sprint 10 : rendu distribution CER par ligne ────────────────
|
| 1714 |
+
function renderLineMetrics(engineResults) {{
|
| 1715 |
+
const heatmapColors = (v) => {{
|
| 1716 |
+
if (v < 0.05) return '#86efac';
|
| 1717 |
+
if (v < 0.15) return '#fde68a';
|
| 1718 |
+
if (v < 0.30) return '#fb923c';
|
| 1719 |
+
return '#f87171';
|
| 1720 |
+
}};
|
| 1721 |
+
|
| 1722 |
+
return engineResults.filter(er => er.line_metrics).map(er => {{
|
| 1723 |
+
const lm = er.line_metrics;
|
| 1724 |
+
const c = cerColor(er.cer); const bg = cerBg(er.cer);
|
| 1725 |
+
|
| 1726 |
+
// Heatmap de position
|
| 1727 |
+
const heatmap = lm.heatmap || [];
|
| 1728 |
+
const maxHeat = Math.max(...heatmap, 0.01);
|
| 1729 |
+
const heatmapHtml = heatmap.length > 0
|
| 1730 |
+
? `<div class="heatmap-wrap">` +
|
| 1731 |
+
heatmap.map((v, i) => {{
|
| 1732 |
+
const h = Math.max(4, Math.round(60 * v / maxHeat));
|
| 1733 |
+
return `<div class="heatmap-bar" style="height:${{h}}px;background:${{heatmapColors(v)}}"
|
| 1734 |
+
title="Tranche ${{i+1}}/${{heatmap.length}} — CER=${{(v*100).toFixed(1)}}%"></div>`;
|
| 1735 |
+
}}).join('') +
|
| 1736 |
+
`</div><div class="heatmap-labels"><span>Début</span><span>Milieu</span><span>Fin</span></div>`
|
| 1737 |
+
: '<em style="color:var(--text-muted)">—</em>';
|
| 1738 |
+
|
| 1739 |
+
// Percentiles
|
| 1740 |
+
const p = lm.percentiles || {{}};
|
| 1741 |
+
const pctBars = ['p50','p75','p90','p95','p99'].map(k => {{
|
| 1742 |
+
const v = p[k] || 0;
|
| 1743 |
+
const w = Math.min(100, v * 100 * 2);
|
| 1744 |
+
const fillColor = v < 0.15 ? '#86efac' : v < 0.30 ? '#fde68a' : '#f87171';
|
| 1745 |
+
return `<div class="pct-bar-row">
|
| 1746 |
+
<span class="pct-bar-label">${{k}}</span>
|
| 1747 |
+
<div class="pct-bar-track"><div class="pct-bar-fill" style="width:${{w}}%;background:${{fillColor}}"></div></div>
|
| 1748 |
+
<span class="pct-bar-val">${{(v*100).toFixed(1)}}%</span>
|
| 1749 |
+
</div>`;
|
| 1750 |
+
}}).join('');
|
| 1751 |
+
|
| 1752 |
+
// Taux catastrophiques
|
| 1753 |
+
const cr = lm.catastrophic_rate || {{}};
|
| 1754 |
+
const crRows = Object.entries(cr).map(([t, rate]) => {{
|
| 1755 |
+
const tPct = (parseFloat(t)*100).toFixed(0);
|
| 1756 |
+
const ratePct = (rate*100).toFixed(1);
|
| 1757 |
+
const color = rate < 0.05 ? '#16a34a' : rate < 0.15 ? '#ca8a04' : '#dc2626';
|
| 1758 |
+
return `<span class="stat"><b style="color:${{color}}">${{ratePct}}%</b> lignes CER>${{tPct}}%</span>`;
|
| 1759 |
+
}}).join('');
|
| 1760 |
+
|
| 1761 |
+
// Gini
|
| 1762 |
+
const gini = lm.gini !== undefined ? lm.gini.toFixed(3) : '—';
|
| 1763 |
+
const giniColor = lm.gini < 0.3 ? '#16a34a' : lm.gini < 0.5 ? '#ca8a04' : '#dc2626';
|
| 1764 |
+
|
| 1765 |
+
return `<div style="margin-bottom:1.25rem;padding-bottom:1rem;border-bottom:1px solid var(--border)">
|
| 1766 |
+
<div style="display:flex;align-items:center;gap:.5rem;margin-bottom:.6rem">
|
| 1767 |
+
<strong>${{esc(er.engine)}}</strong>
|
| 1768 |
+
<span class="cer-badge" style="color:${{c}};background:${{bg}}">${{pct(er.cer)}}</span>
|
| 1769 |
+
<span class="stat">Gini <b style="color:${{giniColor}}">${{gini}}</b></span>
|
| 1770 |
+
<span class="stat">${{lm.line_count}} lignes</span>
|
| 1771 |
+
${{crRows}}
|
| 1772 |
+
</div>
|
| 1773 |
+
<div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem">
|
| 1774 |
+
<div>
|
| 1775 |
+
<div style="font-size:.75rem;font-weight:600;color:var(--text-muted);margin-bottom:.3rem">CARTE THERMIQUE (position)</div>
|
| 1776 |
+
${{heatmapHtml}}
|
| 1777 |
+
</div>
|
| 1778 |
+
<div>
|
| 1779 |
+
<div style="font-size:.75rem;font-weight:600;color:var(--text-muted);margin-bottom:.3rem">PERCENTILES CER</div>
|
| 1780 |
+
<div class="pct-bars">${{pctBars}}</div>
|
| 1781 |
+
</div>
|
| 1782 |
+
</div>
|
| 1783 |
+
</div>`;
|
| 1784 |
+
}}).join('') || '<em style="color:var(--text-muted)">Aucune métrique de ligne disponible.</em>';
|
| 1785 |
+
}}
|
| 1786 |
+
|
| 1787 |
+
// ── Sprint 10 : rendu panneau hallucinations ─────────────────────
|
| 1788 |
+
function renderHallucinationPanel(engineResults) {{
|
| 1789 |
+
const withHall = engineResults.filter(er => er.hallucination_metrics);
|
| 1790 |
+
if (!withHall.length) return '<em style="color:var(--text-muted)">Aucune métrique d\'hallucination disponible.</em>';
|
| 1791 |
+
|
| 1792 |
+
return withHall.map(er => {{
|
| 1793 |
+
const hm = er.hallucination_metrics;
|
| 1794 |
+
const isHall = hm.is_hallucinating;
|
| 1795 |
+
const badgeClass = isHall ? 'hallucination-badge' : 'hallucination-badge ok';
|
| 1796 |
+
const badgeLabel = isHall ? '⚠️ Hallucinations détectées' : '✓ Ancrage satisfaisant';
|
| 1797 |
+
|
| 1798 |
+
const blocksHtml = hm.hallucinated_blocks && hm.hallucinated_blocks.length > 0
|
| 1799 |
+
? hm.hallucinated_blocks.slice(0, 5).map(b =>
|
| 1800 |
+
`<div class="halluc-block">
|
| 1801 |
+
<div class="halluc-block-meta">Bloc halluciné — ${{b.length}} mots (tokens ${{b.start_token}}–${{b.end_token}})</div>
|
| 1802 |
+
${{esc(b.text)}}
|
| 1803 |
+
</div>`
|
| 1804 |
+
).join('') +
|
| 1805 |
+
(hm.hallucinated_blocks.length > 5 ? `<div style="font-size:.72rem;color:var(--text-muted);margin-top:.25rem">… ${{hm.hallucinated_blocks.length - 5}} bloc(s) supplémentaire(s)</div>` : '')
|
| 1806 |
+
: '<em style="color:var(--text-muted);font-size:.8rem">Aucun bloc halluciné détecté.</em>';
|
| 1807 |
+
|
| 1808 |
+
return `<div style="margin-bottom:1.25rem;padding-bottom:1rem;border-bottom:1px solid var(--border)">
|
| 1809 |
+
<div style="display:flex;align-items:center;gap:.5rem;margin-bottom:.6rem;flex-wrap:wrap">
|
| 1810 |
+
<strong>${{esc(er.engine)}}</strong>
|
| 1811 |
+
<span class="${{badgeClass}}">${{badgeLabel}}</span>
|
| 1812 |
+
<span class="stat">Ancrage <b>${{(hm.anchor_score*100).toFixed(1)}}%</b></span>
|
| 1813 |
+
<span class="stat">Ratio longueur <b>${{hm.length_ratio.toFixed(2)}}</b></span>
|
| 1814 |
+
<span class="stat">Insertion nette <b>${{(hm.net_insertion_rate*100).toFixed(1)}}%</b></span>
|
| 1815 |
+
<span class="stat">${{hm.gt_word_count}} mots GT / ${{hm.hyp_word_count}} mots sortie</span>
|
| 1816 |
+
</div>
|
| 1817 |
+
${{isHall ? `<div style="margin-bottom:.5rem;font-size:.82rem;font-weight:600;color:#9d174d">Blocs sans ancrage dans le GT :</div>` : ''}}
|
| 1818 |
+
${{isHall ? blocksHtml : ''}}
|
| 1819 |
+
</div>`;
|
| 1820 |
+
}}).join('');
|
| 1821 |
+
}}
|
| 1822 |
+
|
| 1823 |
+
// ── Sprint 10 — Scatter Gini vs CER moyen ──────────────────────
|
| 1824 |
+
function buildGiniCerScatter() {{
|
| 1825 |
+
const canvas = document.getElementById('chart-gini-cer');
|
| 1826 |
+
if (!canvas) return;
|
| 1827 |
+
const pts = DATA.gini_vs_cer || [];
|
| 1828 |
+
if (!pts.length) {{
|
| 1829 |
+
canvas.parentElement.innerHTML = '<p style="color:var(--text-muted);padding:1rem">Données Gini non disponibles.</p>';
|
| 1830 |
+
return;
|
| 1831 |
+
}}
|
| 1832 |
+
const datasets = pts.map((p, i) => ({{
|
| 1833 |
+
label: p.engine,
|
| 1834 |
+
data: [{{ x: p.cer * 100, y: p.gini }}],
|
| 1835 |
+
backgroundColor: engineColor(DATA.engines.findIndex(e => e.name === p.engine)) + 'cc',
|
| 1836 |
+
borderColor: engineColor(DATA.engines.findIndex(e => e.name === p.engine)),
|
| 1837 |
+
borderWidth: p.is_pipeline ? 2 : 1,
|
| 1838 |
+
pointRadius: p.is_pipeline ? 9 : 7,
|
| 1839 |
+
pointStyle: p.is_pipeline ? 'triangle' : 'circle',
|
| 1840 |
+
}}));
|
| 1841 |
+
|
| 1842 |
+
chartInstances['gini-cer'] = new Chart(canvas.getContext('2d'), {{
|
| 1843 |
+
type: 'scatter',
|
| 1844 |
+
data: {{ datasets }},
|
| 1845 |
+
options: {{
|
| 1846 |
+
responsive: true, maintainAspectRatio: false,
|
| 1847 |
+
plugins: {{
|
| 1848 |
+
legend: {{ position: 'top', labels: {{ font: {{ size: 11 }} }} }},
|
| 1849 |
+
tooltip: {{ callbacks: {{
|
| 1850 |
+
label: ctx => `${{ctx.dataset.label}}: CER=${{ctx.parsed.x.toFixed(2)}}%, Gini=${{ctx.parsed.y.toFixed(3)}}`,
|
| 1851 |
+
}} }},
|
| 1852 |
+
}},
|
| 1853 |
+
scales: {{
|
| 1854 |
+
x: {{ min: 0, title: {{ display: true, text: 'CER moyen (%)', font: {{ size: 11 }} }} }},
|
| 1855 |
+
y: {{ min: 0, max: 1, title: {{ display: true, text: 'Coefficient de Gini', font: {{ size: 11 }} }} }},
|
| 1856 |
+
}},
|
| 1857 |
+
}},
|
| 1858 |
+
}});
|
| 1859 |
+
}}
|
| 1860 |
+
|
| 1861 |
+
// ── Sprint 10 — Scatter ratio longueur vs score d'ancrage ────────
|
| 1862 |
+
function buildRatioAnchorScatter() {{
|
| 1863 |
+
const canvas = document.getElementById('chart-ratio-anchor');
|
| 1864 |
+
if (!canvas) return;
|
| 1865 |
+
const pts = DATA.ratio_vs_anchor || [];
|
| 1866 |
+
if (!pts.length) {{
|
| 1867 |
+
canvas.parentElement.innerHTML = '<p style="color:var(--text-muted);padding:1rem">Données d\'ancrage non disponibles.</p>';
|
| 1868 |
+
return;
|
| 1869 |
+
}}
|
| 1870 |
+
|
| 1871 |
+
// Zone de danger (ancrage < 0.5 OU ratio > 1.2) dessinée via plugin
|
| 1872 |
+
const datasets = pts.map((p, i) => ({{
|
| 1873 |
+
label: p.engine + (p.is_vlm ? ' 👁' : ''),
|
| 1874 |
+
data: [{{ x: p.anchor_score, y: p.length_ratio }}],
|
| 1875 |
+
backgroundColor: engineColor(DATA.engines.findIndex(e => e.name === p.engine)) + 'cc',
|
| 1876 |
+
borderColor: engineColor(DATA.engines.findIndex(e => e.name === p.engine)),
|
| 1877 |
+
borderWidth: p.is_vlm ? 3 : 1,
|
| 1878 |
+
pointRadius: p.is_vlm ? 10 : 7,
|
| 1879 |
+
pointStyle: p.is_vlm ? 'star' : 'circle',
|
| 1880 |
+
}}));
|
| 1881 |
+
|
| 1882 |
+
chartInstances['ratio-anchor'] = new Chart(canvas.getContext('2d'), {{
|
| 1883 |
+
type: 'scatter',
|
| 1884 |
+
data: {{ datasets }},
|
| 1885 |
+
options: {{
|
| 1886 |
+
responsive: true, maintainAspectRatio: false,
|
| 1887 |
+
plugins: {{
|
| 1888 |
+
legend: {{ position: 'top', labels: {{ font: {{ size: 11 }} }} }},
|
| 1889 |
+
tooltip: {{ callbacks: {{
|
| 1890 |
+
label: ctx => `${{ctx.dataset.label}}: ancrage=${{(ctx.parsed.x*100).toFixed(1)}}%, ratio=${{ctx.parsed.y.toFixed(2)}}`,
|
| 1891 |
+
}} }},
|
| 1892 |
+
}},
|
| 1893 |
+
scales: {{
|
| 1894 |
+
x: {{ min: 0, max: 1, title: {{ display: true, text: 'Score d\'ancrage [0–1]', font: {{ size: 11 }} }} }},
|
| 1895 |
+
y: {{ min: 0, title: {{ display: true, text: 'Ratio longueur (sortie/GT)', font: {{ size: 11 }} }} }},
|
| 1896 |
+
}},
|
| 1897 |
+
}},
|
| 1898 |
+
plugins: [{{
|
| 1899 |
+
id: 'danger-zones',
|
| 1900 |
+
beforeDraw(chart) {{
|
| 1901 |
+
const {{ ctx: c, chartArea: {{ left, top, right, bottom }}, scales: {{ x, y }} }} = chart;
|
| 1902 |
+
c.save();
|
| 1903 |
+
// Ancrage < 0.5 (gauche)
|
| 1904 |
+
const xHalf = x.getPixelForValue(0.5);
|
| 1905 |
+
c.fillStyle = 'rgba(239,68,68,0.07)';
|
| 1906 |
+
c.fillRect(left, top, xHalf - left, bottom - top);
|
| 1907 |
+
// Ratio > 1.2 (haut)
|
| 1908 |
+
const y12 = y.getPixelForValue(1.2);
|
| 1909 |
+
if (y12 > top) {{
|
| 1910 |
+
c.fillRect(left, top, right - left, y12 - top);
|
| 1911 |
+
}}
|
| 1912 |
+
// Lignes de seuil
|
| 1913 |
+
c.strokeStyle = 'rgba(239,68,68,0.35)'; c.lineWidth = 1; c.setLineDash([4,4]);
|
| 1914 |
+
c.beginPath(); c.moveTo(xHalf, top); c.lineTo(xHalf, bottom); c.stroke();
|
| 1915 |
+
if (y12 > top) {{
|
| 1916 |
+
c.beginPath(); c.moveTo(left, y12); c.lineTo(right, y12); c.stroke();
|
| 1917 |
+
}}
|
| 1918 |
+
c.restore();
|
| 1919 |
+
}},
|
| 1920 |
+
}}],
|
| 1921 |
+
}});
|
| 1922 |
}}
|
| 1923 |
|
| 1924 |
function buildDocList() {{
|
|
|
|
| 1991 |
buildWilcoxonTable();
|
| 1992 |
buildErrorClusters();
|
| 1993 |
initCorrelationMatrix();
|
| 1994 |
+
// Sprint 10
|
| 1995 |
+
buildGiniCerScatter();
|
| 1996 |
+
buildRatioAnchorScatter();
|
| 1997 |
}}
|
| 1998 |
|
| 1999 |
function buildCerHistogram() {{
|
|
|
|
| 2522 |
|
| 2523 |
// ── Sprint 7 — Export CSV ────────────────────────────────────────
|
| 2524 |
function exportCSV() {{
|
| 2525 |
+
const rows = [['doc_id','engine','cer','wer','mer','wil','duration','ligature_score','diacritic_score','difficulty_score','gini','anchor_score','length_ratio','is_hallucinating']];
|
| 2526 |
DATA.documents.forEach(doc => {{
|
| 2527 |
doc.engine_results.forEach(er => {{
|
| 2528 |
rows.push([
|
|
|
|
| 2536 |
er.ligature_score !== null ? er.ligature_score : '',
|
| 2537 |
er.diacritic_score !== null ? er.diacritic_score : '',
|
| 2538 |
doc.difficulty_score !== undefined ? (doc.difficulty_score * 100).toFixed(2) : '',
|
| 2539 |
+
er.line_metrics ? er.line_metrics.gini.toFixed(6) : '',
|
| 2540 |
+
er.hallucination_metrics ? er.hallucination_metrics.anchor_score.toFixed(6) : '',
|
| 2541 |
+
er.hallucination_metrics ? er.hallucination_metrics.length_ratio.toFixed(4) : '',
|
| 2542 |
+
er.hallucination_metrics ? (er.hallucination_metrics.is_hallucinating ? '1' : '0') : '',
|
| 2543 |
]);
|
| 2544 |
}});
|
| 2545 |
}});
|
|
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests Sprint 10 — Distribution des erreurs par ligne et détection des hallucinations VLM.
|
| 2 |
+
|
| 3 |
+
Classes de tests
|
| 4 |
+
----------------
|
| 5 |
+
TestLineMetrics (12 tests) — compute_line_metrics + aggregate_line_metrics
|
| 6 |
+
TestHallucinationMetrics (12 tests) — compute_hallucination_metrics + aggregate_hallucination_metrics
|
| 7 |
+
TestLineMetricsInResults (4 tests) — intégration dans DocumentResult / EngineReport
|
| 8 |
+
TestFixturesVLM (6 tests) — moteur VLM fictif et génération de données
|
| 9 |
+
TestReportSprint10 (6 tests) — rapport HTML contient les nouvelles métriques
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import math
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
import pytest
|
| 18 |
+
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
# Helpers communs
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
|
| 23 |
+
GT_SIMPLE = "Le renard brun saute par-dessus le chien paresseux."
|
| 24 |
+
HYP_PERFECT = "Le renard brun saute par-dessus le chien paresseux."
|
| 25 |
+
HYP_ERRORS = "Le renrd brin soute par-desous le chen paressux."
|
| 26 |
+
HYP_MISSING = "Le renard brun saute."
|
| 27 |
+
|
| 28 |
+
GT_MULTILINE = "Icy commence le prologue\nde maiſtre Jehan Froiſſart\nſus les croniques de France."
|
| 29 |
+
HYP_MULTILINE_PERFECT = "Icy commence le prologue\nde maiſtre Jehan Froiſſart\nſus les croniques de France."
|
| 30 |
+
HYP_MULTILINE_ERRORS = "Icy commence le prologue\nde maistre Jehan Froissart\nsus les croniques de France."
|
| 31 |
+
|
| 32 |
+
GT_MEDIEVAL = "Icy commence le prologue de maiſtre Jehan Froiſſart ſus les croniques de France & d'Angleterre."
|
| 33 |
+
HYP_HALLUCINATED = (
|
| 34 |
+
"Icy commence le prologue de maistre Jehan Froissart sus les croniques de France et d'Angleterre. "
|
| 35 |
+
"Ledit document fut enregistré au greffe le lendemain. "
|
| 36 |
+
"Signé et paraphé par le notaire royal en présence de témoins. "
|
| 37 |
+
"Archives nationales, cote F/7/1234, pièce n° 42."
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# ===========================================================================
|
| 42 |
+
# TestLineMetrics
|
| 43 |
+
# ===========================================================================
|
| 44 |
+
|
| 45 |
+
class TestLineMetrics:
    """Tests for picarones.core.line_metrics.compute_line_metrics."""

    def test_import(self):
        from picarones.core.line_metrics import compute_line_metrics, LineMetrics
        assert callable(compute_line_metrics)
        assert LineMetrics is not None

    def test_perfect_match_cer_zero(self):
        from picarones.core.line_metrics import compute_line_metrics
        lm = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_PERFECT)
        assert lm.mean_cer == pytest.approx(0.0, abs=1e-9)
        for line_cer in lm.cer_per_line:
            assert line_cer == pytest.approx(0.0, abs=1e-9)

    def test_line_count(self):
        from picarones.core.line_metrics import compute_line_metrics
        lm = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
        assert lm.line_count == 3

    def test_cer_per_line_length(self):
        from picarones.core.line_metrics import compute_line_metrics
        lm = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
        assert len(lm.cer_per_line) == 3

    def test_percentiles_keys(self):
        from picarones.core.line_metrics import compute_line_metrics
        lm = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
        for key in ("p50", "p75", "p90", "p95", "p99"):
            assert key in lm.percentiles
            assert 0.0 <= lm.percentiles[key] <= 1.0

    def test_percentile_ordering(self):
        """p50 <= p75 <= p90 <= p95 <= p99."""
        from picarones.core.line_metrics import compute_line_metrics
        pcts = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS).percentiles
        ladder = [pcts[k] for k in ("p50", "p75", "p90", "p95", "p99")]
        assert ladder == sorted(ladder)

    def test_gini_zero_for_perfect(self):
        from picarones.core.line_metrics import compute_line_metrics
        lm = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_PERFECT)
        assert lm.gini == pytest.approx(0.0, abs=1e-9)

    def test_gini_range(self):
        from picarones.core.line_metrics import compute_line_metrics
        lm = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
        assert 0.0 <= lm.gini <= 1.0

    def test_catastrophic_rate_keys(self):
        from picarones.core.line_metrics import compute_line_metrics
        thresholds = [0.30, 0.50, 1.00]
        lm = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS,
                                  thresholds=thresholds)
        for threshold in thresholds:
            assert threshold in lm.catastrophic_rate
            assert 0.0 <= lm.catastrophic_rate[threshold] <= 1.0

    def test_heatmap_length(self):
        from picarones.core.line_metrics import compute_line_metrics
        lm = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS, heatmap_bins=5)
        assert len(lm.heatmap) == 5

    def test_as_dict_and_from_dict_roundtrip(self):
        from picarones.core.line_metrics import compute_line_metrics, LineMetrics
        original = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
        restored = LineMetrics.from_dict(original.as_dict())
        assert restored.gini == pytest.approx(original.gini, abs=1e-5)
        assert restored.line_count == original.line_count
        assert len(restored.cer_per_line) == len(original.cer_per_line)

    def test_aggregate_line_metrics(self):
        from picarones.core.line_metrics import compute_line_metrics, aggregate_line_metrics, LineMetrics
        agg = aggregate_line_metrics([
            compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_PERFECT),
            compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS),
        ])
        for field in ("gini_mean", "percentiles", "catastrophic_rate", "document_count"):
            assert field in agg
        assert agg["document_count"] == 2
        assert agg["gini_mean"] >= 0.0
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
# ===========================================================================
|
| 129 |
+
# TestHallucinationMetrics
|
| 130 |
+
# ===========================================================================
|
| 131 |
+
|
| 132 |
+
class TestHallucinationMetrics:
    """Tests for picarones.core.hallucination.compute_hallucination_metrics."""

    def test_import(self):
        from picarones.core.hallucination import compute_hallucination_metrics, HallucinationMetrics
        assert callable(compute_hallucination_metrics)
        assert HallucinationMetrics is not None

    def test_perfect_match_anchor_one(self):
        from picarones.core.hallucination import compute_hallucination_metrics
        m = compute_hallucination_metrics(GT_SIMPLE, HYP_PERFECT)
        # A verbatim copy of the reference should anchor (almost) perfectly.
        assert m.anchor_score == pytest.approx(1.0, abs=0.05)
        assert m.is_hallucinating is False

    def test_length_ratio_perfect(self):
        from picarones.core.hallucination import compute_hallucination_metrics
        m = compute_hallucination_metrics(GT_SIMPLE, HYP_PERFECT)
        assert m.length_ratio == pytest.approx(1.0, abs=0.05)

    def test_hallucination_detected(self):
        from picarones.core.hallucination import compute_hallucination_metrics
        m = compute_hallucination_metrics(GT_MEDIEVAL, HYP_HALLUCINATED)
        # The hypothesis carries whole invented sentences, so it is much longer.
        assert m.length_ratio > 1.0
        assert m.is_hallucinating is True

    def test_hallucinated_blocks_detected(self):
        from picarones.core.hallucination import compute_hallucination_metrics
        m = compute_hallucination_metrics(GT_MEDIEVAL, HYP_HALLUCINATED,
                                          anchor_threshold=0.5, min_block_length=3)
        # The invented sentences must surface as hallucinated blocks.
        assert len(m.hallucinated_blocks) > 0

    def test_net_insertion_rate_range(self):
        from picarones.core.hallucination import compute_hallucination_metrics
        m = compute_hallucination_metrics(GT_MEDIEVAL, HYP_HALLUCINATED)
        assert 0.0 <= m.net_insertion_rate <= 1.0

    def test_word_counts(self):
        from picarones.core.hallucination import compute_hallucination_metrics
        m = compute_hallucination_metrics(GT_SIMPLE, HYP_PERFECT)
        assert m.gt_word_count > 0
        assert m.hyp_word_count > 0

    def test_empty_reference(self):
        from picarones.core.hallucination import compute_hallucination_metrics
        m = compute_hallucination_metrics("", "some text here added by model")
        # With an empty reference, every output word is a net insertion.
        assert m.net_insertion_rate == pytest.approx(1.0, abs=0.05)

    def test_empty_hypothesis(self):
        from picarones.core.hallucination import compute_hallucination_metrics
        m = compute_hallucination_metrics(GT_SIMPLE, "")
        assert m.hyp_word_count == 0
        assert m.net_insertion_rate == pytest.approx(0.0)

    def test_as_dict_and_from_dict_roundtrip(self):
        from picarones.core.hallucination import compute_hallucination_metrics, HallucinationMetrics
        original = compute_hallucination_metrics(GT_MEDIEVAL, HYP_HALLUCINATED)
        restored = HallucinationMetrics.from_dict(original.as_dict())
        assert restored.anchor_score == pytest.approx(original.anchor_score, abs=1e-5)
        assert restored.is_hallucinating == original.is_hallucinating
        assert len(restored.hallucinated_blocks) == len(original.hallucinated_blocks)

    def test_aggregate_hallucination_metrics(self):
        from picarones.core.hallucination import compute_hallucination_metrics, aggregate_hallucination_metrics
        agg = aggregate_hallucination_metrics([
            compute_hallucination_metrics(GT_SIMPLE, HYP_PERFECT),
            compute_hallucination_metrics(GT_MEDIEVAL, HYP_HALLUCINATED),
        ])
        for field in ("anchor_score_mean", "length_ratio_mean",
                      "hallucinating_doc_count", "document_count"):
            assert field in agg
        assert agg["document_count"] == 2
        assert agg["hallucinating_doc_count"] >= 1

    def test_anchor_threshold_respected(self):
        """A very low anchor score must trigger the hallucination badge."""
        from picarones.core.hallucination import compute_hallucination_metrics
        m = compute_hallucination_metrics(
            "abc def ghi", "xyz uvw rst opq lmn",
            anchor_threshold=0.5
        )
        assert m.anchor_score < 0.5
        assert m.is_hallucinating is True
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
# ===========================================================================
|
| 222 |
+
# TestLineMetricsInResults
|
| 223 |
+
# ===========================================================================
|
| 224 |
+
|
| 225 |
+
class TestLineMetricsInResults:
    """Integration of the Sprint 10 metrics into the result data models."""

    @staticmethod
    def _metrics(cer, cer_caseless, wer, mer, wil, ref_len, hyp_len):
        """Build a MetricsResult; cer_nfc mirrors cer and wer_normalized
        mirrors wer, as in every fixture this class needs."""
        from picarones.core.metrics import MetricsResult
        return MetricsResult(
            cer=cer, cer_nfc=cer, cer_caseless=cer_caseless,
            wer=wer, wer_normalized=wer,
            mer=mer, wil=wil,
            reference_length=ref_len, hypothesis_length=hyp_len,
        )

    def test_document_result_has_line_metrics_field(self):
        from picarones.core.results import DocumentResult
        dr = DocumentResult(
            doc_id="test_001",
            image_path="/test/img.jpg",
            ground_truth=GT_SIMPLE,
            hypothesis=HYP_ERRORS,
            metrics=self._metrics(0.1, 0.09, 0.2, 0.15, 0.18, 50, 48),
            duration_seconds=1.0,
            line_metrics={"gini": 0.3, "line_count": 3},
        )
        assert dr.line_metrics is not None
        assert dr.line_metrics["gini"] == pytest.approx(0.3)

    def test_document_result_has_hallucination_metrics_field(self):
        from picarones.core.results import DocumentResult
        dr = DocumentResult(
            doc_id="test_002",
            image_path="/test/img.jpg",
            ground_truth=GT_SIMPLE,
            hypothesis=HYP_HALLUCINATED,
            metrics=self._metrics(0.5, 0.5, 0.6, 0.55, 0.65, 50, 100),
            duration_seconds=2.0,
            hallucination_metrics={"anchor_score": 0.3, "is_hallucinating": True},
        )
        assert dr.hallucination_metrics is not None
        assert dr.hallucination_metrics["is_hallucinating"] is True

    def test_document_result_as_dict_includes_sprint10_fields(self):
        from picarones.core.results import DocumentResult
        dr = DocumentResult(
            doc_id="test_003",
            image_path="/test/img.jpg",
            ground_truth=GT_SIMPLE,
            hypothesis=HYP_PERFECT,
            metrics=self._metrics(0.0, 0.0, 0.0, 0.0, 0.0, 50, 50),
            duration_seconds=0.5,
            line_metrics={"gini": 0.0, "line_count": 1},
            hallucination_metrics={"anchor_score": 1.0, "is_hallucinating": False},
        )
        payload = dr.as_dict()
        assert "line_metrics" in payload
        assert "hallucination_metrics" in payload

    def test_engine_report_has_aggregated_sprint10_fields(self):
        from picarones.core.results import EngineReport, DocumentResult
        dr = DocumentResult(
            doc_id="test_004",
            image_path="/test/img.jpg",
            ground_truth=GT_SIMPLE,
            hypothesis=HYP_PERFECT,
            metrics=self._metrics(0.0, 0.0, 0.0, 0.0, 0.0, 50, 50),
            duration_seconds=0.5,
        )
        report = EngineReport(
            engine_name="test_engine",
            engine_version="1.0",
            engine_config={},
            document_results=[dr],
            aggregated_line_metrics={"gini_mean": 0.1, "document_count": 1},
            aggregated_hallucination={"anchor_score_mean": 0.95, "document_count": 1},
        )
        assert report.aggregated_line_metrics is not None
        assert report.aggregated_hallucination is not None
        payload = report.as_dict()
        assert "aggregated_line_metrics" in payload
        assert "aggregated_hallucination" in payload
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
# ===========================================================================
|
| 322 |
+
# TestFixturesVLM
|
| 323 |
+
# ===========================================================================
|
| 324 |
+
|
| 325 |
+
class TestFixturesVLM:
    """Tests for the fictitious VLM engine in picarones.fixtures."""

    @staticmethod
    def _find_vlm_report(benchmark):
        # Return the first report flagged as a VLM pipeline, or None if absent.
        for report in benchmark.engine_reports:
            if report.pipeline_info.get("is_vlm"):
                return report
        return None

    def test_generate_sample_benchmark_has_vlm_engine(self):
        from picarones.fixtures import generate_sample_benchmark

        benchmark = generate_sample_benchmark(n_docs=3, seed=42)
        lowered_names = [r.engine_name.lower() for r in benchmark.engine_reports]
        markers = ("vision", "vlm", "zero-shot")
        assert any(marker in name for name in lowered_names for marker in markers)

    def test_vlm_engine_has_hallucination_metrics(self):
        from picarones.fixtures import generate_sample_benchmark

        benchmark = generate_sample_benchmark(n_docs=3, seed=42)
        vlm_report = self._find_vlm_report(benchmark)
        assert vlm_report is not None, "Moteur VLM non trouvé"
        assert vlm_report.aggregated_hallucination is not None
        assert "anchor_score_mean" in vlm_report.aggregated_hallucination

    def test_all_engines_have_line_metrics(self):
        from picarones.fixtures import generate_sample_benchmark

        benchmark = generate_sample_benchmark(n_docs=3, seed=42)
        for report in benchmark.engine_reports:
            message = f"Pas de line_metrics pour {report.engine_name}"
            assert report.aggregated_line_metrics is not None, message
            assert "gini_mean" in report.aggregated_line_metrics

    def test_all_documents_have_line_metrics(self):
        from picarones.fixtures import generate_sample_benchmark

        benchmark = generate_sample_benchmark(n_docs=3, seed=42)
        for report in benchmark.engine_reports:
            for doc in report.document_results:
                assert doc.line_metrics is not None, (
                    f"{report.engine_name}/{doc.doc_id}: line_metrics manquant"
                )
                assert "gini" in doc.line_metrics

    def test_all_documents_have_hallucination_metrics(self):
        from picarones.fixtures import generate_sample_benchmark

        benchmark = generate_sample_benchmark(n_docs=3, seed=42)
        for report in benchmark.engine_reports:
            for doc in report.document_results:
                assert doc.hallucination_metrics is not None, (
                    f"{report.engine_name}/{doc.doc_id}: hallucination_metrics manquant"
                )
                assert "anchor_score" in doc.hallucination_metrics

    def test_vlm_engine_has_valid_hallucination_aggregation(self):
        """The VLM engine must expose valid aggregated hallucination metrics."""
        from picarones.fixtures import generate_sample_benchmark

        benchmark = generate_sample_benchmark(n_docs=6, seed=42)
        vlm_report = self._find_vlm_report(benchmark)
        if vlm_report is None:
            pytest.skip("Moteur VLM non trouvé")

        aggregate = vlm_report.aggregated_hallucination
        assert aggregate is not None
        # Anchor score is a proportion; length ratio is non-negative by construction.
        assert 0.0 <= aggregate.get("anchor_score_mean", -1) <= 1.0
        assert aggregate.get("length_ratio_mean", 0) >= 0.0
        assert aggregate.get("document_count", 0) == 6
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
# ===========================================================================
|
| 392 |
+
# TestReportSprint10
|
| 393 |
+
# ===========================================================================
|
| 394 |
+
|
| 395 |
+
class TestReportSprint10:
    """HTML report tests — new Sprint 10 metrics."""

    @pytest.fixture(scope="class")
    def html_report(self, tmp_path_factory):
        """Build a demo HTML report once and share it across the class."""
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import ReportGenerator

        benchmark = generate_sample_benchmark(n_docs=3, seed=42)
        out_path = tmp_path_factory.mktemp("report") / "sprint10_test.html"
        ReportGenerator(benchmark).generate(str(out_path))
        return out_path.read_text(encoding="utf-8")

    def test_report_generated_not_empty(self, html_report):
        assert len(html_report) > 50_000

    def test_report_has_gini_column_header(self, html_report):
        assert "Gini" in html_report

    def test_report_has_ancrage_column_header(self, html_report):
        assert "Ancrage" in html_report

    def test_report_has_gini_cer_scatter_canvas(self, html_report):
        assert "chart-gini-cer" in html_report

    def test_report_has_ratio_anchor_scatter_canvas(self, html_report):
        assert "chart-ratio-anchor" in html_report

    def test_report_has_vlm_badge(self, html_report):
        """The VLM badge must show up for the zero-shot engine."""
        found = (
            "VLM" in html_report
            or "zero-shot" in html_report.lower()
            or "zero_shot" in html_report
        )
        assert found
|