Claude committed on
Commit 4b1dc89 · unverified · 1 parent: 35bbb2f

Sprint 10 — Per-line error distribution and VLM hallucination detection


New modules:
- picarones/core/line_metrics.py: per-line CER, percentiles (p50–p99),
Gini coefficient, catastrophic-line rate, positional heatmap,
corpus aggregation
- picarones/core/hallucination.py: net insertion rate, output/GT length ratio,
trigram anchoring score, hallucinated-block detection, hallucination badge,
corpus aggregation

Data models (results.py):
- DocumentResult: new line_metrics + hallucination_metrics fields
- EngineReport: new aggregated_line_metrics + aggregated_hallucination fields

Demo data (fixtures.py):
- New mock VLM engine (gpt-4o-vision zero-shot) with simulated hallucinations
- All interleaved sentences absent from the GT, systematic modernization
- Sprint 10 metrics computed for all 5 engines (3 OCR + 2 LLM/VLM pipelines)

HTML report (report/generator.py):
- Sortable Gini and Anchoring columns in the leaderboard table
- 👁 VLM badge for zero-shot engines
- Document view: positional error heatmap, per-line CER percentiles,
hallucination panel with detected blocks
- Analysis view: Gini vs mean CER scatter plot, length ratio vs anchoring scatter plot
with danger zones (anchoring < 0.5, ratio > 1.2) drawn by a Chart.js plugin
- Enriched CSV export (gini, anchor_score, length_ratio, is_hallucinating)

Tests (tests/test_sprint10_error_distribution.py):
- 40 tests covering the 5 new classes: TestLineMetrics (12),
TestHallucinationMetrics (12), TestLineMetricsInResults (4),
TestFixturesVLM (6), TestReportSprint10 (6)

https://claude.ai/code/session_017gXea9mxBQqDTAsSQd7aAq

picarones/core/hallucination.py ADDED
@@ -0,0 +1,332 @@
+"""VLM/LLM hallucination detection — Sprint 10.
+
+Computed metrics
+----------------
+- Net insertion rate: added words/characters absent from the GT, distinct from the existing WIL
+- Length ratio: len(hyp) / len(gt) — a ratio > 1.2 flags a potential hallucination
+- Anchoring score: share of the output's n-grams (trigrams) present in the GT
+- Hallucinated blocks: contiguous output segments with no GT match beyond a threshold
+- Hallucination badge: True if anchoring is low or the length ratio is abnormal
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+
+
+# ---------------------------------------------------------------------------
+# Text helpers
+# ---------------------------------------------------------------------------
+
+def _tokenize(text: str) -> list[str]:
+    """Split into lowercased whitespace-delimited tokens (punctuation is kept)."""
+    return re.findall(r"[^\s]+", text.lower())
+
+
+def _ngrams(tokens: list[str], n: int) -> list[tuple[str, ...]]:
+    """Generate the n-grams of a token list."""
+    if len(tokens) < n:
+        return [tuple(tokens)] if tokens else []
+    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
+
+
+# ---------------------------------------------------------------------------
+# Hallucinated blocks (contiguous segments with no anchoring)
+# ---------------------------------------------------------------------------
+
+@dataclass
+class HallucinatedBlock:
+    """Contiguous segment of the output with no match in the GT."""
+    start_token: int
+    end_token: int
+    text: str
+    length: int  # number of tokens
+
+    def as_dict(self) -> dict:
+        return {
+            "start_token": self.start_token,
+            "end_token": self.end_token,
+            "text": self.text,
+            "length": self.length,
+        }
+
+
+def _detect_hallucinated_blocks(
+    hyp_tokens: list[str],
+    gt_token_set: set[str],
+    tolerance: int = 3,
+    min_block_length: int = 4,
+) -> list[HallucinatedBlock]:
+    """Detect blocks of hypothesis tokens with no match in the GT.
+
+    A block is a contiguous run of hypothesis tokens none of which appears
+    in the GT vocabulary. Up to ``tolerance - 1`` consecutive known tokens
+    are tolerated inside a block before it is closed.
+
+    Parameters
+    ----------
+    hyp_tokens:
+        Tokens of the OCR/VLM output.
+    gt_token_set:
+        Set of GT tokens (for O(1) lookup).
+    tolerance:
+        Number of consecutive known tokens that close a block.
+    min_block_length:
+        Minimum length (tokens) for a block to be reported.
+
+    Returns
+    -------
+    list[HallucinatedBlock]
+    """
+    blocks: list[HallucinatedBlock] = []
+    if not hyp_tokens:
+        return blocks
+
+    in_block = False
+    block_start = 0
+    consecutive_known = 0
+
+    for i, tok in enumerate(hyp_tokens):
+        is_unknown = tok not in gt_token_set
+        if is_unknown:
+            if not in_block:
+                in_block = True
+                block_start = i
+                consecutive_known = 0
+            else:
+                consecutive_known = 0
+        else:
+            if in_block:
+                consecutive_known += 1
+                if consecutive_known >= tolerance:
+                    # Close the block
+                    end = i - consecutive_known
+                    length = end - block_start + 1
+                    if length >= min_block_length:
+                        text = " ".join(hyp_tokens[block_start:end + 1])
+                        blocks.append(HallucinatedBlock(
+                            start_token=block_start,
+                            end_token=end,
+                            text=text,
+                            length=length,
+                        ))
+                    in_block = False
+                    consecutive_known = 0
+
+    # Unterminated block
+    if in_block:
+        end = len(hyp_tokens) - 1
+        length = end - block_start + 1
+        if length >= min_block_length:
+            text = " ".join(hyp_tokens[block_start:end + 1])
+            blocks.append(HallucinatedBlock(
+                start_token=block_start,
+                end_token=end,
+                text=text,
+                length=length,
+            ))
+
+    return blocks
+
+
+# ---------------------------------------------------------------------------
+# Structured result
+# ---------------------------------------------------------------------------
+
+@dataclass
+class HallucinationMetrics:
+    """Hallucination-detection metrics for one (GT, hypothesis) pair."""
+
+    net_insertion_rate: float
+    """Net insertion rate: hypothesis tokens absent from the GT / total hypothesis tokens."""
+
+    length_ratio: float
+    """Length ratio: len(hyp) / len(gt) in characters. > 1.2 is a hallucination signal."""
+
+    anchor_score: float
+    """Anchoring score: share of hypothesis trigrams present among the GT trigrams.
+    High score → the hypothesis is well anchored in the GT. Low score → likely hallucinations."""
+
+    hallucinated_blocks: list[HallucinatedBlock]
+    """Contiguous output segments with no GT match (beyond the tolerance threshold)."""
+
+    is_hallucinating: bool
+    """True if anchor_score < anchor_threshold OR length_ratio > length_ratio_threshold."""
+
+    # Additional details
+    gt_word_count: int = 0
+    hyp_word_count: int = 0
+    net_inserted_words: int = 0
+    anchor_threshold_used: float = 0.5
+    length_ratio_threshold_used: float = 1.2
+    ngram_size_used: int = 3
+
+    def as_dict(self) -> dict:
+        return {
+            "net_insertion_rate": round(self.net_insertion_rate, 6),
+            "length_ratio": round(self.length_ratio, 6),
+            "anchor_score": round(self.anchor_score, 6),
+            "hallucinated_blocks": [b.as_dict() for b in self.hallucinated_blocks],
+            "is_hallucinating": self.is_hallucinating,
+            "gt_word_count": self.gt_word_count,
+            "hyp_word_count": self.hyp_word_count,
+            "net_inserted_words": self.net_inserted_words,
+            "anchor_threshold_used": self.anchor_threshold_used,
+            "length_ratio_threshold_used": self.length_ratio_threshold_used,
+            "ngram_size_used": self.ngram_size_used,
+        }
+
+    @classmethod
+    def from_dict(cls, d: dict) -> "HallucinationMetrics":
+        blocks = [
+            HallucinatedBlock(**b) for b in d.get("hallucinated_blocks", [])
+        ]
+        return cls(
+            net_insertion_rate=d.get("net_insertion_rate", 0.0),
+            length_ratio=d.get("length_ratio", 1.0),
+            anchor_score=d.get("anchor_score", 1.0),
+            hallucinated_blocks=blocks,
+            is_hallucinating=d.get("is_hallucinating", False),
+            gt_word_count=d.get("gt_word_count", 0),
+            hyp_word_count=d.get("hyp_word_count", 0),
+            net_inserted_words=d.get("net_inserted_words", 0),
+            anchor_threshold_used=d.get("anchor_threshold_used", 0.5),
+            length_ratio_threshold_used=d.get("length_ratio_threshold_used", 1.2),
+            ngram_size_used=d.get("ngram_size_used", 3),
+        )
+
+
+# ---------------------------------------------------------------------------
+# Main computation
+# ---------------------------------------------------------------------------
+
+def compute_hallucination_metrics(
+    reference: str,
+    hypothesis: str,
+    n: int = 3,
+    length_ratio_threshold: float = 1.2,
+    anchor_threshold: float = 0.5,
+    block_tolerance: int = 3,
+    min_block_length: int = 4,
+) -> HallucinationMetrics:
+    """Compute the VLM/LLM hallucination-detection metrics.
+
+    Parameters
+    ----------
+    reference:
+        Ground-truth (GT) text.
+    hypothesis:
+        Text produced by the model.
+    n:
+        N-gram size for the anchoring score (default: trigrams).
+    length_ratio_threshold:
+        Length-ratio threshold above which a potential hallucination is flagged.
+    anchor_threshold:
+        Anchoring-score threshold below which a potential hallucination is flagged.
+    block_tolerance:
+        Number of consecutive known tokens tolerated inside a hallucinated block.
+    min_block_length:
+        Minimum length (tokens) for a hallucinated block to be reported.
+
+    Returns
+    -------
+    HallucinationMetrics
+    """
+    gt_tokens = _tokenize(reference)
+    hyp_tokens = _tokenize(hypothesis)
+
+    gt_len_chars = len(reference.strip())
+    hyp_len_chars = len(hypothesis.strip())
+
+    # ── Length ratio ─────────────────────────────────────────────────────
+    if gt_len_chars == 0:
+        length_ratio = 1.0 if hyp_len_chars == 0 else float("inf")
+    else:
+        length_ratio = hyp_len_chars / gt_len_chars
+
+    # ── Net insertion rate ───────────────────────────────────────────────
+    gt_token_set = set(gt_tokens)
+    hyp_token_count = len(hyp_tokens)
+
+    if hyp_token_count == 0:
+        net_insertion_rate = 0.0
+        net_inserted_words = 0
+    else:
+        net_inserted = [t for t in hyp_tokens if t not in gt_token_set]
+        net_inserted_words = len(net_inserted)
+        net_insertion_rate = net_inserted_words / hyp_token_count
+
+    # ── Anchoring score (n-grams) ────────────────────────────────────────
+    gt_ngrams = set(_ngrams(gt_tokens, n))
+    hyp_ngrams = _ngrams(hyp_tokens, n)
+
+    if not hyp_ngrams:
+        # Empty hypothesis: anchoring is perfect only if the GT is empty too
+        anchor_score = 1.0 if not gt_ngrams else 0.0
+    elif not gt_ngrams:
+        anchor_score = 0.0
+    else:
+        anchored = sum(1 for ng in hyp_ngrams if ng in gt_ngrams)
+        anchor_score = anchored / len(hyp_ngrams)
+
+    # ── Hallucinated blocks ──────────────────────────────────────────────
+    blocks = _detect_hallucinated_blocks(
+        hyp_tokens=hyp_tokens,
+        gt_token_set=gt_token_set,
+        tolerance=block_tolerance,
+        min_block_length=min_block_length,
+    )
+
+    # ── Hallucination badge ──────────────────────────────────────────────
+    is_hallucinating = (
+        anchor_score < anchor_threshold
+        or (length_ratio > length_ratio_threshold and length_ratio != float("inf"))
+    )
+
+    return HallucinationMetrics(
+        net_insertion_rate=net_insertion_rate,
+        length_ratio=min(length_ratio, 9.99),  # cap for serialization
+        anchor_score=anchor_score,
+        hallucinated_blocks=blocks,
+        is_hallucinating=is_hallucinating,
+        gt_word_count=len(gt_tokens),
+        hyp_word_count=hyp_token_count,
+        net_inserted_words=net_inserted_words,
+        anchor_threshold_used=anchor_threshold,
+        length_ratio_threshold_used=length_ratio_threshold,
+        ngram_size_used=n,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Corpus aggregation
+# ---------------------------------------------------------------------------
+
+def aggregate_hallucination_metrics(results: list[HallucinationMetrics]) -> dict:
+    """Aggregate hallucination metrics over a corpus.
+
+    Returns
+    -------
+    dict
+        Aggregated statistics: mean anchor_score, hallucinating-document rate…
+    """
+    if not results:
+        return {}
+
+    n = len(results)
+    anchor_values = [r.anchor_score for r in results]
+    ratio_values = [r.length_ratio for r in results]
+    insertion_values = [r.net_insertion_rate for r in results]
+    hallucinating_count = sum(1 for r in results if r.is_hallucinating)
+
+    return {
+        "anchor_score_mean": round(sum(anchor_values) / n, 6),
+        "anchor_score_min": round(min(anchor_values), 6),
+        "length_ratio_mean": round(sum(ratio_values) / n, 6),
+        "net_insertion_rate_mean": round(sum(insertion_values) / n, 6),
+        "hallucinating_doc_count": hallucinating_count,
+        "hallucinating_doc_rate": round(hallucinating_count / n, 6),
+        "document_count": n,
+    }
picarones/core/line_metrics.py ADDED
@@ -0,0 +1,286 @@
+"""Per-line CER error distribution — Sprint 10.
+
+Computed metrics
+----------------
+- Per-line CER: character edit distance / GT length for each line pair
+- Percentiles: p50, p75, p90, p95, p99 over the per-line CER distribution
+- Catastrophic rates: % of lines exceeding configurable thresholds (30%, 50%, 100%)
+- Gini coefficient: error concentration (0 = uniform, 1 = fully concentrated)
+- Heatmap: mean CER per positional bin across the document
+"""
+
+from __future__ import annotations
+
+import unicodedata
+from dataclasses import dataclass
+from typing import Optional
+
+
+# ---------------------------------------------------------------------------
+# CER of a line pair (normalized Levenshtein edit distance)
+# ---------------------------------------------------------------------------
+
+def _edit_distance(a: str, b: str) -> int:
+    """Levenshtein distance between two strings."""
+    if not a:
+        return len(b)
+    if not b:
+        return len(a)
+    prev = list(range(len(b) + 1))
+    for i, ca in enumerate(a, 1):
+        curr = [i]
+        for j, cb in enumerate(b, 1):
+            cost = 0 if ca == cb else 1
+            curr.append(min(curr[j - 1] + 1, prev[j] + 1, prev[j - 1] + cost))
+        prev = curr
+    return prev[-1]
+
+
+def _line_cer(ref_line: str, hyp_line: str) -> float:
+    """CER for one line pair. Returns 1.0 when the GT line is empty but the hypothesis is not."""
+    ref = unicodedata.normalize("NFC", ref_line.strip())
+    hyp = unicodedata.normalize("NFC", hyp_line.strip())
+    if not ref:
+        return 0.0 if not hyp else 1.0
+    dist = _edit_distance(ref, hyp)
+    return dist / len(ref)
+
+
+# ---------------------------------------------------------------------------
+# Percentiles (pure-Python implementation, no numpy)
+# ---------------------------------------------------------------------------
+
+def _percentile(sorted_values: list[float], p: float) -> float:
+    """Return the p-th percentile (0 ≤ p ≤ 100) of a sorted list."""
+    if not sorted_values:
+        return 0.0
+    n = len(sorted_values)
+    index = p / 100 * (n - 1)
+    lo = int(index)
+    hi = min(lo + 1, n - 1)
+    frac = index - lo
+    return sorted_values[lo] + frac * (sorted_values[hi] - sorted_values[lo])
+
+
+# ---------------------------------------------------------------------------
+# Gini coefficient
+# ---------------------------------------------------------------------------
+
+def _gini(values: list[float]) -> float:
+    """Gini coefficient of the errors (0 = uniform, 1 = fully concentrated).
+
+    Formula: G = (2 * Σ i*x_i) / (n * Σ x_i) - (n+1)/n
+    over the values sorted in ascending order.
+    """
+    if not values:
+        return 0.0
+    xs = sorted(max(v, 0.0) for v in values)
+    n = len(xs)
+    total = sum(xs)
+    if total == 0.0:
+        return 0.0
+    weighted_sum = sum((i + 1) * x for i, x in enumerate(xs))
+    return (2.0 * weighted_sum) / (n * total) - (n + 1) / n
+
+
+# ---------------------------------------------------------------------------
+# Structured result
+# ---------------------------------------------------------------------------
+
+@dataclass
+class LineMetrics:
+    """Per-line CER error distribution for one (GT, hypothesis) pair."""
+
+    cer_per_line: list[float]
+    """CER of each line (length = number of GT lines)."""
+
+    percentiles: dict[str, float]
+    """Percentiles: p50, p75, p90, p95, p99."""
+
+    catastrophic_rate: dict[float, float]
+    """Catastrophic-line rate for each threshold (e.g. {0.3: 0.12, 0.5: 0.07, 1.0: 0.02})."""
+
+    gini: float
+    """Gini coefficient of the errors (0 → uniform, 1 → concentrated)."""
+
+    heatmap: list[float]
+    """Mean CER per positional bin across the document (length = heatmap_bins)."""
+
+    line_count: int
+    """Number of GT lines processed."""
+
+    mean_cer: float
+    """Mean CER over all lines."""
+
+    def as_dict(self) -> dict:
+        return {
+            "cer_per_line": [round(v, 6) for v in self.cer_per_line],
+            "percentiles": {k: round(v, 6) for k, v in self.percentiles.items()},
+            "catastrophic_rate": {str(k): round(v, 6) for k, v in self.catastrophic_rate.items()},
+            "gini": round(self.gini, 6),
+            "heatmap": [round(v, 6) for v in self.heatmap],
+            "line_count": self.line_count,
+            "mean_cer": round(self.mean_cer, 6),
+        }
+
+    @classmethod
+    def from_dict(cls, d: dict) -> "LineMetrics":
+        return cls(
+            cer_per_line=d.get("cer_per_line", []),
+            percentiles=d.get("percentiles", {}),
+            catastrophic_rate={float(k): v for k, v in d.get("catastrophic_rate", {}).items()},
+            gini=d.get("gini", 0.0),
+            heatmap=d.get("heatmap", []),
+            line_count=d.get("line_count", 0),
+            mean_cer=d.get("mean_cer", 0.0),
+        )
+
+
+# ---------------------------------------------------------------------------
+# Main computation
+# ---------------------------------------------------------------------------
+
+def compute_line_metrics(
+    reference: str,
+    hypothesis: str,
+    thresholds: Optional[list[float]] = None,
+    heatmap_bins: int = 10,
+) -> LineMetrics:
+    """Compute the per-line CER error distribution.
+
+    Parameters
+    ----------
+    reference:
+        Ground-truth (GT) text with line breaks.
+    hypothesis:
+        Text produced by the OCR engine.
+    thresholds:
+        CER thresholds for the catastrophic rate. Default: [0.30, 0.50, 1.00].
+    heatmap_bins:
+        Number of positional bins for the heatmap.
+
+    Returns
+    -------
+    LineMetrics
+    """
+    if thresholds is None:
+        thresholds = [0.30, 0.50, 1.00]
+
+    ref_lines = reference.splitlines()
+    hyp_lines = hypothesis.splitlines()
+
+    # Align GT / hypothesis lines — exactly the GT's line count is processed
+    n = len(ref_lines)
+    if n == 0:
+        # No lines: return neutral metrics
+        return LineMetrics(
+            cer_per_line=[],
+            percentiles={f"p{p}": 0.0 for p in (50, 75, 90, 95, 99)},
+            catastrophic_rate={t: 0.0 for t in thresholds},
+            gini=0.0,
+            heatmap=[0.0] * heatmap_bins,
+            line_count=0,
+            mean_cer=0.0,
+        )
+
+    # Align by ignoring extra hypothesis lines
+    # If the hypothesis has fewer lines, missing lines count as deleted (CER = 1.0)
+    cer_per_line: list[float] = []
+    for i, ref_line in enumerate(ref_lines):
+        hyp_line = hyp_lines[i] if i < len(hyp_lines) else ""
+        cer_per_line.append(min(_line_cer(ref_line, hyp_line), 1.0))
+
+    sorted_cer = sorted(cer_per_line)
+
+    # Percentiles
+    percentiles = {
+        f"p{p}": _percentile(sorted_cer, p)
+        for p in (50, 75, 90, 95, 99)
+    }
+
+    # Catastrophic rates
+    catastrophic_rate: dict[float, float] = {}
+    for t in thresholds:
+        count = sum(1 for v in cer_per_line if v > t)
+        catastrophic_rate[t] = count / n
+
+    # Gini
+    gini = _gini(cer_per_line)
+
+    # Positional heatmap
+    bins = heatmap_bins
+    heatmap: list[float] = []
+    for b in range(bins):
+        start = int(b * n / bins)
+        end = int((b + 1) * n / bins)
+        slice_ = cer_per_line[start:end]
+        heatmap.append(sum(slice_) / len(slice_) if slice_ else 0.0)
+
+    mean_cer = sum(cer_per_line) / n
+
+    return LineMetrics(
+        cer_per_line=cer_per_line,
+        percentiles=percentiles,
+        catastrophic_rate=catastrophic_rate,
+        gini=gini,
+        heatmap=heatmap,
+        line_count=n,
+        mean_cer=mean_cer,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Corpus aggregation
+# ---------------------------------------------------------------------------
+
+def aggregate_line_metrics(results: list[LineMetrics]) -> dict:
+    """Aggregate the per-line distribution metrics over a corpus.
+
+    Returns
+    -------
+    dict
+        Aggregated statistics: mean Gini, mean percentiles, mean catastrophic rates.
+    """
+    if not results:
+        return {}
+
+    import statistics as _stats
+
+    gini_values = [r.gini for r in results]
+    mean_cer_values = [r.mean_cer for r in results]
+
+    # Mean percentiles
+    pct_keys = ["p50", "p75", "p90", "p95", "p99"]
+    avg_percentiles = {}
+    for k in pct_keys:
+        vals = [r.percentiles.get(k, 0.0) for r in results]
+        avg_percentiles[k] = round(sum(vals) / len(vals), 6) if vals else 0.0
+
+    # Mean catastrophic rates (union of thresholds)
+    all_thresholds: set[float] = set()
+    for r in results:
+        all_thresholds.update(r.catastrophic_rate.keys())
+    avg_catastrophic: dict[str, float] = {}
+    for t in sorted(all_thresholds):
+        vals = [r.catastrophic_rate.get(t, 0.0) for r in results]
+        avg_catastrophic[str(t)] = round(sum(vals) / len(vals), 6) if vals else 0.0
+
+    # Mean heatmap (bin count taken from the first result)
+    if results and results[0].heatmap:
+        n_bins = len(results[0].heatmap)
+        heatmap_avg = []
+        for b in range(n_bins):
+            vals = [r.heatmap[b] for r in results if b < len(r.heatmap)]
+            heatmap_avg.append(round(sum(vals) / len(vals), 6) if vals else 0.0)
+    else:
+        heatmap_avg = []
+
+    return {
+        "gini_mean": round(sum(gini_values) / len(gini_values), 6),
+        "gini_stdev": round(_stats.stdev(gini_values), 6) if len(gini_values) > 1 else 0.0,
+        "mean_cer_mean": round(sum(mean_cer_values) / len(mean_cer_values), 6),
+        "percentiles": avg_percentiles,
+        "catastrophic_rate": avg_catastrophic,
+        "heatmap": heatmap_avg,
+        "document_count": len(results),
+    }
picarones/core/results.py CHANGED
@@ -46,6 +46,11 @@ class DocumentResult:
     """Structural analysis (line segmentation, reading order)."""
     image_quality: Optional[dict] = None
     """Image-quality metrics."""
+    # Sprint 10 fields — error distribution + VLM hallucinations
+    line_metrics: Optional[dict] = None
+    """Per-line CER distribution (percentiles, Gini, positional heatmap)."""
+    hallucination_metrics: Optional[dict] = None
+    """VLM hallucination-detection metrics (anchoring, length ratio, blocks)."""
 
     def as_dict(self) -> dict:
         d = {
@@ -71,6 +76,10 @@ class DocumentResult:
         d["structure"] = self.structure
         if self.image_quality is not None:
             d["image_quality"] = self.image_quality
+        if self.line_metrics is not None:
+            d["line_metrics"] = self.line_metrics
+        if self.hallucination_metrics is not None:
+            d["hallucination_metrics"] = self.hallucination_metrics
         return d
@@ -99,6 +108,11 @@ class EngineReport:
     """Aggregated structural metrics."""
     aggregated_image_quality: Optional[dict] = None
     """Aggregated image-quality metrics."""
+    # Sprint 10
+    aggregated_line_metrics: Optional[dict] = None
+    """Aggregated per-line CER distribution (mean Gini, percentiles, heatmap, catastrophic rates)."""
+    aggregated_hallucination: Optional[dict] = None
+    """Aggregated VLM hallucination metrics (mean anchoring, hallucinating-doc rate…)."""
 
     def __post_init__(self) -> None:
         if not self.aggregated_metrics and self.document_results:
@@ -155,6 +169,10 @@ class EngineReport:
         d["aggregated_structure"] = self.aggregated_structure
         if self.aggregated_image_quality is not None:
             d["aggregated_image_quality"] = self.aggregated_image_quality
+        if self.aggregated_line_metrics is not None:
+            d["aggregated_line_metrics"] = self.aggregated_line_metrics
+        if self.aggregated_hallucination is not None:
+            d["aggregated_hallucination"] = self.aggregated_hallucination
         return d

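The new fields follow the same guard pattern as the existing ones: an optional dict is serialized only when present, so reports produced before Sprint 10 stay loadable. A pared-down stand-in illustrates the idea (`Doc` here is illustrative, not the real `DocumentResult`):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class Doc:
    # Stand-in for DocumentResult: the optional Sprint 10 metric dicts are
    # serialized only when set, keeping pre-Sprint-10 JSON round-trippable
    line_metrics: Optional[dict] = None
    hallucination_metrics: Optional[dict] = None

    def as_dict(self) -> dict:
        d: dict = {}
        if self.line_metrics is not None:
            d["line_metrics"] = self.line_metrics
        if self.hallucination_metrics is not None:
            d["hallucination_metrics"] = self.hallucination_metrics
        return d

assert Doc().as_dict() == {}
assert Doc(line_metrics={"gini": 0.4}).as_dict() == {"line_metrics": {"gini": 0.4}}
```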
picarones/fixtures.py CHANGED
@@ -25,6 +25,9 @@ from picarones.core.taxonomy import classify_errors, aggregate_taxonomy
25
  from picarones.core.structure import analyze_structure, aggregate_structure
26
  from picarones.core.image_quality import generate_mock_quality_scores, aggregate_image_quality
27
  from picarones.core.char_scores import aggregate_ligature_scores, aggregate_diacritic_scores
 
 
 
28
 
29
  # ---------------------------------------------------------------------------
30
  # Textes GT réalistes (documents patrimoniaux BnF)
@@ -117,6 +120,51 @@ def _llm_correction(text: str, rng: random.Random) -> str:
117
  return text
118
 
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  def _bad_engine_errors(text: str, rng: random.Random) -> str:
121
  """Moteur de mauvaise qualité : nombreuses erreurs."""
122
  words = text.split()
@@ -252,6 +300,30 @@ def generate_sample_benchmark(
252
  ],
253
  },
254
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  ]
256
 
257
  engine_reports: list[EngineReport] = []
@@ -297,6 +369,13 @@ def generate_sample_benchmark(
297
 
298
  metrics = _make_metrics(gt, hypothesis)
299
 
 
 
 
 
 
 
 
300
  # Sprint 5 — métriques avancées patrimoniales
301
  cm = build_confusion_matrix(gt, hypothesis)
302
  lig_score = compute_ligature_score(gt, hypothesis)
@@ -326,6 +405,8 @@ def generate_sample_benchmark(
326
  taxonomy=taxonomy_result.as_dict(),
327
  structure=struct_result.as_dict(),
328
  image_quality={**iq_result.as_dict(), "script_type": _script_type},
 
 
329
  )
330
  )
331
 
@@ -384,6 +465,17 @@ def generate_sample_benchmark(
384
  for dr in doc_results if dr.image_quality
385
  ])
386
 
 
 
 
 
 
 
 
 
 
 
 
387
  report = EngineReport(
388
  engine_name=engine_name,
389
  engine_version=engine_version,
@@ -395,6 +487,8 @@ def generate_sample_benchmark(
395
  aggregated_taxonomy=agg_taxonomy,
396
  aggregated_structure=agg_structure,
397
  aggregated_image_quality=agg_iq,
 
 
398
  )
399
  engine_reports.append(report)
400
 
 
25
  from picarones.core.structure import analyze_structure, aggregate_structure
26
  from picarones.core.image_quality import generate_mock_quality_scores, aggregate_image_quality
27
  from picarones.core.char_scores import aggregate_ligature_scores, aggregate_diacritic_scores
28
+ # Sprint 10 — distribution des erreurs + hallucinations VLM
29
+ from picarones.core.line_metrics import compute_line_metrics, aggregate_line_metrics, LineMetrics
30
+ from picarones.core.hallucination import compute_hallucination_metrics, aggregate_hallucination_metrics
31
 
32
  # ---------------------------------------------------------------------------
33
  # Textes GT réalistes (documents patrimoniaux BnF)
 
  return text


+ def _vlm_hallucinations(text: str, rng: random.Random) -> str:
+     """Simulate the hallucinations typical of a VLM (vision-language model).
+
+     The model "sees" the image and generates text close to the GT, but:
+     - inserts whole invented sentences (~30% extra content)
+     - mixes modern spellings with medieval ones
+     - sometimes adds invented metadata (folio, date)
+     - stays partially consistent with the GT (not completely random)
+     """
+     # Partial correction of OCR errors (the VLM reads the image directly)
+     text = text.replace("ſ", "s").replace("&", "et")
+
+     # Hallucination: interpolated sentences absent from the GT
+     hallucinated_phrases = [
+         "Ledit document fut enregistré au greffe le lendemain.",
+         "Signé et paraphé par le notaire royal en présence de témoins.",
+         "Archives nationales, cote F/7/1234, pièce n° 42.",
+         "Transcription réalisée d'après l'original conservé à la BnF.",
+         "Le présent acte a été lu et approuvé par toutes les parties.",
+         "En foi de quoi nous avons apposé notre sceau et notre signature.",
+         "Registre des délibérations du Parlement de Paris, tome III.",
+     ]
+
+     words = text.split()
+     if len(words) > 8 and rng.random() < 0.65:
+         # Insert one or two hallucinated sentences
+         n_phrases = rng.randint(1, 2)
+         for _ in range(n_phrases):
+             phrase = rng.choice(hallucinated_phrases)
+             insert_pos = rng.randint(len(words) // 2, len(words))
+             words = words[:insert_pos] + phrase.split() + words[insert_pos:]
+
+     # Systematic modernisation (the VLM normalises spelling)
+     result = " ".join(words)
+     modern_replacements = [
+         ("nostre", "notre"), ("maistre", "maître"), ("faictes", "faites"),
+         ("ledit", "le dit"), ("ladicte", "la dite"), ("icelle", "celle-ci"),
+         ("iceluy", "icelui"), ("eſt", "est"), ("ſur", "sur"),
+     ]
+     for src, tgt in modern_replacements:
+         result = result.replace(src, tgt)
+
+     return result


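The `picarones/core/hallucination.py` module this commit adds is not shown in the diff above. As a rough sketch of the "trigram anchor score" the commit describes — the share of word trigrams in the engine output that also occur in the ground truth, where a low score suggests hallucinated content — something like the following could work. The function names and details are illustrative assumptions, not the module's real API.

```python
# Hypothetical sketch of a trigram anchor score: the fraction of word
# trigrams in the OCR/VLM output that also occur in the ground truth.
# A low score suggests hallucinated content. Names are illustrative,
# not the real picarones API.

def trigrams(words: list[str]) -> set[tuple[str, str, str]]:
    return {tuple(words[i:i + 3]) for i in range(len(words) - 2)}

def anchor_score(gt: str, hyp: str) -> float:
    gt_tri = trigrams(gt.lower().split())
    hyp_tri = trigrams(hyp.lower().split())
    if not hyp_tri:
        return 1.0  # output too short to judge: assume anchored
    return len(hyp_tri & gt_tri) / len(hyp_tri)

print(anchor_score("a b c d e", "a b c d e"))  # fully anchored -> 1.0
print(anchor_score("a b c d e", "x y z w v"))  # no overlap -> 0.0
```

A VLM that copies the page faithfully scores near 1.0; interpolated sentences like the fixtures above contribute trigrams with no match in the GT and pull the score down.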
  def _bad_engine_errors(text: str, rng: random.Random) -> str:
      """Moteur de mauvaise qualité : nombreuses erreurs."""
      words = text.split()
          ],
      },
  ),
+ # Sprint 10 — mock VLM engine with simulated hallucinations
+ (
+     "gpt-4o-vision (zero-shot)",
+     "gpt-4o-2024-11-20",
+     {"mode": "zero_shot"},
+     _vlm_hallucinations,
+     True,
+     {
+         "pipeline_mode": "zero_shot",
+         "prompt_file": "zero_shot_medieval_vlm.txt",
+         "llm_model": "gpt-4o-2024-11-20",
+         "llm_provider": "openai",
+         "pipeline_steps": [
+             {
+                 "type": "llm",
+                 "model": "gpt-4o-2024-11-20",
+                 "provider": "openai",
+                 "mode": "zero_shot",
+                 "prompt_file": "zero_shot_medieval_vlm.txt",
+             },
+         ],
+         "is_vlm": True,
+     },
+ ),
  ]

  engine_reports: list[EngineReport] = []

  metrics = _make_metrics(gt, hypothesis)

+ # Sprint 10 — per-line error distribution
+ # To simulate multi-line texts, split GT and hypothesis into lines
+ gt_multiline = "\n".join(gt[i:i+30] for i in range(0, len(gt), 30))
+ hyp_multiline = "\n".join(hypothesis[i:i+30] for i in range(0, len(hypothesis), 30))
+ lm = compute_line_metrics(gt_multiline, hyp_multiline)
+ hm = compute_hallucination_metrics(gt, hypothesis)
+
  # Sprint 5 — métriques avancées patrimoniales
  cm = build_confusion_matrix(gt, hypothesis)
  lig_score = compute_ligature_score(gt, hypothesis)

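`compute_line_metrics` itself lives in `picarones/core/line_metrics.py`, which this diff does not show. A minimal sketch of its two core ingredients as described in the commit message — a normalised per-line error rate and a Gini coefficient over those rates — might look like this; `SequenceMatcher` stands in for a true edit distance, and all names are assumptions, not the real API.

```python
# Hypothetical sketch of per-line metrics: a CER per line, then a Gini
# coefficient over those CERs (0 = errors spread uniformly across
# lines, 1 = errors concentrated on a few lines).
# Names are illustrative, not the real picarones API.
from difflib import SequenceMatcher  # stand-in for a true edit distance

def line_cer(gt_line: str, hyp_line: str) -> float:
    if not gt_line:
        return 0.0 if not hyp_line else 1.0
    # Ratio-based approximation of a normalised edit distance
    return 1.0 - SequenceMatcher(None, gt_line, hyp_line).ratio()

def gini(values: list[float]) -> float:
    n = len(values)
    total = sum(values)
    if n == 0 or total == 0:
        return 0.0
    vals = sorted(values)
    # Standard formula: G = 2 * sum(i * x_i) / (n * sum(x)) - (n + 1) / n
    cum = sum((i + 1) * v for i, v in enumerate(vals))
    return 2 * cum / (n * total) - (n + 1) / n

print(gini([1.0, 1.0, 1.0, 1.0]))  # uniform errors -> 0.0
print(gini([0.0, 0.0, 0.0, 1.0]))  # concentrated -> 0.75
```

This is why the ranking table reads "a good engine has low CER AND low Gini": a low mean CER can still hide a handful of catastrophic lines, which a high Gini exposes.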
      taxonomy=taxonomy_result.as_dict(),
      structure=struct_result.as_dict(),
      image_quality={**iq_result.as_dict(), "script_type": _script_type},
+     line_metrics=lm.as_dict(),
+     hallucination_metrics=hm.as_dict(),
      )
  )

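The `heatmap` field stored in `line_metrics` and rendered as the position heat-map in the Document view is presumably built by bucketing per-line CERs into a fixed number of position slices. A hypothetical sketch, with the bucket count and names as pure assumptions:

```python
# Hypothetical sketch of the position heat-map consumed by the report
# (lm.heatmap): per-line CER values bucketed into a fixed number of
# position slices, averaging the lines that fall in each slice.
# The bucket count and function name are assumptions.

def position_heatmap(line_cers: list[float], buckets: int = 10) -> list[float]:
    if not line_cers:
        return []
    n = len(line_cers)
    out = []
    for b in range(buckets):
        lo = b * n // buckets
        hi = max(lo + 1, (b + 1) * n // buckets)  # at least one line per slice
        chunk = line_cers[lo:hi]
        out.append(sum(chunk) / len(chunk))
    return out
```

A run of high values at the end of such a heat-map is the classic signature of an OCR engine degrading toward the bottom of a page, which is exactly what the "Début / Milieu / Fin" labels in the report are meant to surface.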
      for dr in doc_results if dr.image_quality
  ])

+ # Sprint 10 — aggregate error distribution + hallucination metrics
+ agg_line = aggregate_line_metrics([
+     LineMetrics.from_dict(dr.line_metrics)
+     for dr in doc_results if dr.line_metrics
+ ])
+ from picarones.core.hallucination import HallucinationMetrics as _HM
+ agg_hallucination = aggregate_hallucination_metrics([
+     _HM.from_dict(dr.hallucination_metrics)
+     for dr in doc_results if dr.hallucination_metrics
+ ])
+
  report = EngineReport(
      engine_name=engine_name,
      engine_version=engine_version,
@@ -395,6 +487,8 @@ def generate_sample_benchmark(
      aggregated_taxonomy=agg_taxonomy,
      aggregated_structure=agg_structure,
      aggregated_image_quality=agg_iq,
+     aggregated_line_metrics=agg_line,
+     aggregated_hallucination=agg_hallucination,
  )
  engine_reports.append(report)

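`aggregate_hallucination_metrics` is also not shown here; judging by the keys the report reads below (`anchor_score_mean`, `length_ratio_mean`, `hallucinating_doc_rate`), it plausibly averages per-document scores and counts the share of documents flagged as hallucinating. A hedged sketch under that assumption:

```python
# Hypothetical sketch of the corpus-level aggregation: means of the
# per-document hallucination scores plus the share of documents flagged
# as hallucinating. Field names mirror those read by the report
# generator, but the real implementation may differ.

def aggregate_hallucination(docs: list[dict]) -> dict:
    if not docs:
        return {}
    n = len(docs)
    return {
        "anchor_score_mean": sum(d["anchor_score"] for d in docs) / n,
        "length_ratio_mean": sum(d["length_ratio"] for d in docs) / n,
        "hallucinating_doc_rate": sum(1 for d in docs if d["is_hallucinating"]) / n,
    }
```

The `hallucinating_doc_rate` is what drives the ⚠️ badge in the ranking table (threshold > 0.2 in the JS below), so a VLM that hallucinates on one document in five already gets flagged.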
picarones/report/generator.py CHANGED
@@ -115,6 +115,17 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
      "aggregated_taxonomy": report.aggregated_taxonomy,
      "aggregated_structure": report.aggregated_structure,
      "aggregated_image_quality": report.aggregated_image_quality,
+     # Sprint 10 — error distribution + VLM hallucinations
+     "gini": _safe(report.aggregated_line_metrics.get("gini_mean")) if report.aggregated_line_metrics else None,
+     "cer_p90": _safe(report.aggregated_line_metrics.get("percentiles", {}).get("p90")) if report.aggregated_line_metrics else None,
+     "cer_p99": _safe(report.aggregated_line_metrics.get("percentiles", {}).get("p99")) if report.aggregated_line_metrics else None,
+     "catastrophic_rate_30": _safe(report.aggregated_line_metrics.get("catastrophic_rate", {}).get("0.3")) if report.aggregated_line_metrics else None,
+     "aggregated_line_metrics": report.aggregated_line_metrics,
+     "anchor_score": _safe(report.aggregated_hallucination.get("anchor_score_mean")) if report.aggregated_hallucination else None,
+     "length_ratio": _safe(report.aggregated_hallucination.get("length_ratio_mean")) if report.aggregated_hallucination else None,
+     "hallucinating_doc_rate": _safe(report.aggregated_hallucination.get("hallucinating_doc_rate")) if report.aggregated_hallucination else None,
+     "aggregated_hallucination": report.aggregated_hallucination,
+     "is_vlm": report.pipeline_info.get("is_vlm", False) if report.pipeline_info else False,
  }
  engines_summary.append(entry)

 
@@ -172,6 +183,11 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
      er_entry["structure"] = dr.structure
  if dr.image_quality is not None:
      er_entry["image_quality"] = dr.image_quality
+ # Sprint 10
+ if dr.line_metrics is not None:
+     er_entry["line_metrics"] = dr.line_metrics
+ if dr.hallucination_metrics is not None:
+     er_entry["hallucination_metrics"] = dr.hallucination_metrics
  engine_results.append(er_entry)

  # CER moyen sur ce document (pour le badge galerie)
 
@@ -308,6 +324,32 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
      **corr,
  })

+ # ── Sprint 10 — scatter-plot data ─────────────────────────────────────
+ # Scatter 1: Gini vs mean CER (per engine)
+ gini_vs_cer = []
+ for report in benchmark.engine_reports:
+     gini_val = report.aggregated_line_metrics.get("gini_mean") if report.aggregated_line_metrics else None
+     cer_val = report.mean_cer
+     if gini_val is not None and cer_val is not None:
+         gini_vs_cer.append({
+             "engine": report.engine_name,
+             "cer": _safe(cer_val),
+             "gini": _safe(gini_val),
+             "is_pipeline": report.is_pipeline,
+         })
+
+ # Scatter 2: length ratio vs anchor score (per engine)
+ ratio_vs_anchor = []
+ for report in benchmark.engine_reports:
+     if report.aggregated_hallucination:
+         ratio_vs_anchor.append({
+             "engine": report.engine_name,
+             "length_ratio": _safe(report.aggregated_hallucination.get("length_ratio_mean", 1.0)),
+             "anchor_score": _safe(report.aggregated_hallucination.get("anchor_score_mean", 1.0)),
+             "hallucinating_rate": _safe(report.aggregated_hallucination.get("hallucinating_doc_rate", 0.0)),
+             "is_vlm": report.pipeline_info.get("is_vlm", False) if report.pipeline_info else False,
+         })
+
  return {
      "meta": {
          "corpus_name": benchmark.corpus_name,
 
@@ -329,6 +371,9 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
      "venn_data": venn_data,
      "error_clusters": error_clusters,
      "correlation_per_engine": correlation_per_engine,
+     # Sprint 10
+     "gini_vs_cer": gini_vs_cer,
+     "ratio_vs_anchor": ratio_vs_anchor,
  }

 
@@ -818,6 +863,58 @@ body.present-mode nav .meta {{ display: none; }}
      min-width: 60px;
  }}
  .corr-table th {{ background: var(--bg); font-weight: 600; font-size: .75rem; }}
+
+ /* ── Sprint 10 — error heatmap ───────────────────────────────────*/
+ .heatmap-wrap {{
+   display: flex; gap: 3px; align-items: flex-end;
+   height: 60px; margin: .5rem 0;
+ }}
+ .heatmap-bar {{
+   flex: 1; border-radius: 3px 3px 0 0;
+   min-height: 4px;
+   transition: opacity .15s;
+ }}
+ .heatmap-bar:hover {{ opacity: .75; }}
+ .heatmap-labels {{
+   display: flex; justify-content: space-between;
+   font-size: .65rem; color: var(--text-muted); margin-top: .15rem;
+ }}
+
+ /* ── Sprint 10 — hallucination badge ─────────────────────────────*/
+ .hallucination-badge {{
+   display: inline-flex; align-items: center; gap: .25rem;
+   padding: .15rem .45rem; border-radius: 4px;
+   font-size: .72rem; font-weight: 700;
+   background: #fce7f3; color: #9d174d;
+   border: 1px solid #fbcfe8;
+ }}
+ .hallucination-badge.ok {{
+   background: #f0fdf4; color: #15803d;
+   border-color: #bbf7d0;
+ }}
+
+ /* ── Sprint 10 — hallucinated block ──────────────────────────────*/
+ .halluc-block {{
+   background: #fce7f3; border: 1px solid #f9a8d4;
+   border-radius: 4px; padding: .35rem .6rem;
+   margin: .25rem 0; font-size: .78rem;
+   font-family: 'Georgia', serif; color: #9d174d;
+ }}
+ .halluc-block-meta {{
+   font-size: .65rem; color: #be185d; font-family: system-ui, sans-serif;
+   margin-bottom: .15rem; font-weight: 600;
+ }}
+
+ /* ── Sprint 10 — percentile bars ─────────────────────────────────*/
+ .pct-bars {{ display: flex; flex-direction: column; gap: .25rem; margin: .4rem 0; }}
+ .pct-bar-row {{ display: flex; align-items: center; gap: .4rem; font-size: .72rem; }}
+ .pct-bar-label {{ width: 2.5rem; color: var(--text-muted); text-align: right; flex-shrink: 0; }}
+ .pct-bar-track {{
+   flex: 1; height: 8px; background: var(--bg);
+   border-radius: 4px; overflow: hidden;
+ }}
+ .pct-bar-fill {{ height: 100%; border-radius: 4px; }}
+ .pct-bar-val {{ width: 3rem; color: var(--text); font-weight: 600; }}
  </style>
  </head>

 
@@ -862,6 +959,8 @@ body.present-mode nav .meta {{ display: none; }}
      <th data-col="wil" class="sortable">WIL<i class="sort-icon">↕</i></th>
      <th data-col="ligature_score" class="sortable" title="Taux de reconnaissance des ligatures (fi, fl, œ, æ, ff…)">Ligatures<i class="sort-icon">↕</i></th>
      <th data-col="diacritic_score" class="sortable" title="Taux de conservation des diacritiques (accents, cédilles, trémas…)">Diacritiques<i class="sort-icon">↕</i></th>
+     <th data-col="gini" class="sortable" title="Coefficient de Gini des erreurs CER par ligne — 0 = erreurs uniformes, 1 = erreurs concentrées. Un bon moteur a CER bas ET Gini bas.">Gini<i class="sort-icon">↕</i></th>
+     <th data-col="anchor_score" class="sortable" title="Score d'ancrage : proportion des trigrammes de la sortie trouvant un ancrage dans le GT — faible score = hallucinations probables (LLM/VLM)">Ancrage<i class="sort-icon">↕</i></th>
      <th>CER médian</th>
      <th>CER min</th>
      <th>CER max</th>
 
@@ -973,6 +1072,18 @@ body.present-mode nav .meta {{ display: none; }}
      <h3>Sorties OCR — diff par moteur</h3>
      <div class="diff-panels" id="doc-diff-panels"></div>
  </div>
+
+ <!-- Sprint 10 — Per-line CER distribution -->
+ <div class="card" id="doc-line-metrics-card" style="display:none">
+   <h3>Distribution des erreurs par ligne</h3>
+   <div id="doc-line-metrics-content"></div>
+ </div>
+
+ <!-- Sprint 10 — Detected hallucinations -->
+ <div class="card" id="doc-hallucination-card" style="display:none">
+   <h3>Analyse des hallucinations</h3>
+   <div id="doc-hallucination-content"></div>
+ </div>
  </div>
  </div>
  </div>
 
@@ -1080,6 +1191,29 @@ body.present-mode nav .meta {{ display: none; }}
      <div id="error-clusters-container"></div>
  </div>

+ <!-- Sprint 10 — Scatter: Gini vs mean CER -->
+ <div class="chart-card">
+   <h3>Gini vs CER moyen <span style="font-size:.72rem;font-weight:400;color:var(--text-muted)">— idéal : bas-gauche</span></h3>
+   <div class="chart-canvas-wrap">
+     <canvas id="chart-gini-cer"></canvas>
+   </div>
+   <div style="font-size:.72rem;color:var(--text-muted);margin-top:.4rem">
+     Axe X = CER moyen, Axe Y = coefficient de Gini. Un moteur idéal a CER bas ET Gini bas (erreurs rares et uniformes).
+   </div>
+ </div>
+
+ <!-- Sprint 10 — Scatter: length ratio vs anchor score -->
+ <div class="chart-card">
+   <h3>Ratio longueur vs ancrage <span style="font-size:.72rem;font-weight:400;color:var(--text-muted)">— hallucinations VLM</span></h3>
+   <div class="chart-canvas-wrap">
+     <canvas id="chart-ratio-anchor"></canvas>
+   </div>
+   <div style="font-size:.72rem;color:var(--text-muted);margin-top:.4rem">
+     Axe X = score d'ancrage trigrammes [0–1]. Axe Y = ratio longueur sortie/GT.
+     Zone ⚠️ : ancrage &lt; 0.5 ou ratio &gt; 1.2 → hallucinations probables.
+   </div>
+ </div>
+
  <!-- Sprint 7 — Matrice de corrélation -->
  <div class="chart-card technical" style="grid-column:1/-1">
      <h3>Matrice de corrélation entre métriques</h3>
 
@@ -1283,11 +1417,29 @@ function renderRanking() {{
      </td>`;
  }}

+ // ── Sprint 10: Gini + anchor cells ─────────────────────────────────
+ let giniCell = '<td style="color:var(--text-muted)">—</td>';
+ if (e.gini !== null && e.gini !== undefined) {{
+   const gv = e.gini;
+   const gColor = gv < 0.3 ? '#16a34a' : gv < 0.5 ? '#ca8a04' : '#dc2626';
+   const gBg = gv < 0.3 ? '#f0fdf4' : gv < 0.5 ? '#fefce8' : '#fef2f2';
+   giniCell = `<td><span class="cer-badge" style="color:${{gColor}};background:${{gBg}}"
+     title="Gini=${{gv.toFixed(3)}} — 0=uniforme, 1=concentré">${{gv.toFixed(3)}}</span></td>`;
+ }}
+ let anchorCell = '<td style="color:var(--text-muted)">—</td>';
+ if (e.anchor_score !== null && e.anchor_score !== undefined) {{
+   const av = e.anchor_score;
+   const hallBadge = (e.hallucinating_doc_rate && e.hallucinating_doc_rate > 0.2)
+     ? ' <span title="Hallucinations détectées">⚠️</span>' : '';
+   anchorCell = `<td>${{_scoreBadge(av, 'Ancrage trigrammes')}}${{hallBadge}}</td>`;
+ }}
+
  return `<tr>
      <td><span class="${{badgeClass}}">${{rank}}</span></td>
      <td>
          <span class="engine-name">${{esc(e.name)}}</span>
          ${{pipelineBadge}}
+         ${{e.is_vlm ? '<span class="pipeline-tag" style="background:#fce7f3;color:#9d174d">👁 VLM</span>' : ''}}
          <span class="engine-version">v${{esc(e.version)}}</span>
          ${{pipelineStepsHtml}}
      </td>
 
@@ -1301,6 +1453,8 @@ function renderRanking() {{
      <td>${{pct(e.wil)}}</td>
      <td>${{_scoreBadge(e.ligature_score, 'Ligatures')}}</td>
      <td>${{_scoreBadge(e.diacritic_score, 'Diacritiques')}}</td>
+     ${{giniCell}}
+     ${{anchorCell}}
      <td style="color:var(--text-muted)">${{pct(e.cer_median)}}</td>
      <td style="color:var(--text-muted)">${{pct(e.cer_min)}}</td>
      <td style="color:var(--text-muted)">${{pct(e.cer_max)}}</td>
 
@@ -1531,6 +1685,240 @@ function loadDocument(docId) {{
      ${{tripleDiffHtml}}
      </div>`;
  }}).join('');
+
+   // ── Sprint 10: per-line CER distribution ──────────────────────────
+   const lineCard = document.getElementById('doc-line-metrics-card');
+   const lineContent = document.getElementById('doc-line-metrics-content');
+   // Take the first engine that has line_metrics
+   const erWithLine = doc.engine_results.find(er => er.line_metrics);
+   if (erWithLine) {{
+     lineCard.style.display = '';
+     lineContent.innerHTML = renderLineMetrics(doc.engine_results);
+   }} else {{
+     lineCard.style.display = 'none';
+   }}
+
+   // ── Sprint 10: hallucinations ─────────────────────────────────────
+   const hallCard = document.getElementById('doc-hallucination-card');
+   const hallContent = document.getElementById('doc-hallucination-content');
+   if (doc.engine_results.some(er => er.hallucination_metrics)) {{
+     hallCard.style.display = '';
+     hallContent.innerHTML = renderHallucinationPanel(doc.engine_results);
+   }} else {{
+     hallCard.style.display = 'none';
+   }}
+ }}
+
+ // ── Sprint 10: render per-line CER distribution ───────────────────
+ function renderLineMetrics(engineResults) {{
+   const heatmapColors = (v) => {{
+     if (v < 0.05) return '#86efac';
+     if (v < 0.15) return '#fde68a';
+     if (v < 0.30) return '#fb923c';
+     return '#f87171';
+   }};
+
+   return engineResults.filter(er => er.line_metrics).map(er => {{
+     const lm = er.line_metrics;
+     const c = cerColor(er.cer); const bg = cerBg(er.cer);
+
+     // Position heatmap
+     const heatmap = lm.heatmap || [];
+     const maxHeat = Math.max(...heatmap, 0.01);
+     const heatmapHtml = heatmap.length > 0
+       ? `<div class="heatmap-wrap">` +
+         heatmap.map((v, i) => {{
+           const h = Math.max(4, Math.round(60 * v / maxHeat));
+           return `<div class="heatmap-bar" style="height:${{h}}px;background:${{heatmapColors(v)}}"
+             title="Tranche ${{i+1}}/${{heatmap.length}} — CER=${{(v*100).toFixed(1)}}%"></div>`;
+         }}).join('') +
+         `</div><div class="heatmap-labels"><span>Début</span><span>Milieu</span><span>Fin</span></div>`
+       : '<em style="color:var(--text-muted)">—</em>';
+
+     // CER percentiles
+     const p = lm.percentiles || {{}};
+     const pctBars = ['p50','p75','p90','p95','p99'].map(k => {{
+       const v = p[k] || 0;
+       const w = Math.min(100, v * 100 * 2);
+       const fillColor = v < 0.15 ? '#86efac' : v < 0.30 ? '#fde68a' : '#f87171';
+       return `<div class="pct-bar-row">
+         <span class="pct-bar-label">${{k}}</span>
+         <div class="pct-bar-track"><div class="pct-bar-fill" style="width:${{w}}%;background:${{fillColor}}"></div></div>
+         <span class="pct-bar-val">${{(v*100).toFixed(1)}}%</span>
+       </div>`;
+     }}).join('');
+
+     // Catastrophic-line rates
+     const cr = lm.catastrophic_rate || {{}};
+     const crRows = Object.entries(cr).map(([t, rate]) => {{
+       const tPct = (parseFloat(t)*100).toFixed(0);
+       const ratePct = (rate*100).toFixed(1);
+       const color = rate < 0.05 ? '#16a34a' : rate < 0.15 ? '#ca8a04' : '#dc2626';
+       return `<span class="stat"><b style="color:${{color}}">${{ratePct}}%</b> lignes CER&gt;${{tPct}}%</span>`;
+     }}).join('');
+
+     // Gini (guarded against missing values)
+     const gini = lm.gini !== undefined ? lm.gini.toFixed(3) : '—';
+     const giniColor = lm.gini === undefined ? 'var(--text-muted)'
+       : lm.gini < 0.3 ? '#16a34a' : lm.gini < 0.5 ? '#ca8a04' : '#dc2626';
+
+     return `<div style="margin-bottom:1.25rem;padding-bottom:1rem;border-bottom:1px solid var(--border)">
+       <div style="display:flex;align-items:center;gap:.5rem;margin-bottom:.6rem">
+         <strong>${{esc(er.engine)}}</strong>
+         <span class="cer-badge" style="color:${{c}};background:${{bg}}">${{pct(er.cer)}}</span>
+         <span class="stat">Gini <b style="color:${{giniColor}}">${{gini}}</b></span>
+         <span class="stat">${{lm.line_count}} lignes</span>
+         ${{crRows}}
+       </div>
+       <div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem">
+         <div>
+           <div style="font-size:.75rem;font-weight:600;color:var(--text-muted);margin-bottom:.3rem">CARTE THERMIQUE (position)</div>
+           ${{heatmapHtml}}
+         </div>
+         <div>
+           <div style="font-size:.75rem;font-weight:600;color:var(--text-muted);margin-bottom:.3rem">PERCENTILES CER</div>
+           <div class="pct-bars">${{pctBars}}</div>
+         </div>
+       </div>
+     </div>`;
+   }}).join('') || '<em style="color:var(--text-muted)">Aucune métrique de ligne disponible.</em>';
+ }}
+
+ // ── Sprint 10: render hallucination panel ─────────────────────────
+ function renderHallucinationPanel(engineResults) {{
+   const withHall = engineResults.filter(er => er.hallucination_metrics);
+   if (!withHall.length) return '<em style="color:var(--text-muted)">Aucune métrique d\'hallucination disponible.</em>';
+
+   return withHall.map(er => {{
+     const hm = er.hallucination_metrics;
+     const isHall = hm.is_hallucinating;
+     const badgeClass = isHall ? 'hallucination-badge' : 'hallucination-badge ok';
+     const badgeLabel = isHall ? '⚠️ Hallucinations détectées' : '✓ Ancrage satisfaisant';
+
+     const blocksHtml = hm.hallucinated_blocks && hm.hallucinated_blocks.length > 0
+       ? hm.hallucinated_blocks.slice(0, 5).map(b =>
+           `<div class="halluc-block">
+             <div class="halluc-block-meta">Bloc halluciné — ${{b.length}} mots (tokens ${{b.start_token}}–${{b.end_token}})</div>
+             ${{esc(b.text)}}
+           </div>`
+         ).join('') +
+         (hm.hallucinated_blocks.length > 5 ? `<div style="font-size:.72rem;color:var(--text-muted);margin-top:.25rem">… ${{hm.hallucinated_blocks.length - 5}} bloc(s) supplémentaire(s)</div>` : '')
+       : '<em style="color:var(--text-muted);font-size:.8rem">Aucun bloc halluciné détecté.</em>';
+
+     return `<div style="margin-bottom:1.25rem;padding-bottom:1rem;border-bottom:1px solid var(--border)">
+       <div style="display:flex;align-items:center;gap:.5rem;margin-bottom:.6rem;flex-wrap:wrap">
+         <strong>${{esc(er.engine)}}</strong>
+         <span class="${{badgeClass}}">${{badgeLabel}}</span>
+         <span class="stat">Ancrage <b>${{(hm.anchor_score*100).toFixed(1)}}%</b></span>
+         <span class="stat">Ratio longueur <b>${{hm.length_ratio.toFixed(2)}}</b></span>
+         <span class="stat">Insertion nette <b>${{(hm.net_insertion_rate*100).toFixed(1)}}%</b></span>
+         <span class="stat">${{hm.gt_word_count}} mots GT / ${{hm.hyp_word_count}} mots sortie</span>
+       </div>
+       ${{isHall ? `<div style="margin-bottom:.5rem;font-size:.82rem;font-weight:600;color:#9d174d">Blocs sans ancrage dans le GT :</div>` : ''}}
+       ${{isHall ? blocksHtml : ''}}
+     </div>`;
+   }}).join('');
+ }}
+
+ // ── Sprint 10 — Gini vs mean CER scatter ──────────────────────────
+ function buildGiniCerScatter() {{
+   const canvas = document.getElementById('chart-gini-cer');
+   if (!canvas) return;
+   const pts = DATA.gini_vs_cer || [];
+   if (!pts.length) {{
+     canvas.parentElement.innerHTML = '<p style="color:var(--text-muted);padding:1rem">Données Gini non disponibles.</p>';
+     return;
+   }}
+   const datasets = pts.map(p => ({{
+     label: p.engine,
+     data: [{{ x: p.cer * 100, y: p.gini }}],
+     backgroundColor: engineColor(DATA.engines.findIndex(e => e.name === p.engine)) + 'cc',
+     borderColor: engineColor(DATA.engines.findIndex(e => e.name === p.engine)),
+     borderWidth: p.is_pipeline ? 2 : 1,
+     pointRadius: p.is_pipeline ? 9 : 7,
+     pointStyle: p.is_pipeline ? 'triangle' : 'circle',
+   }}));
+
+   chartInstances['gini-cer'] = new Chart(canvas.getContext('2d'), {{
+     type: 'scatter',
+     data: {{ datasets }},
+     options: {{
+       responsive: true, maintainAspectRatio: false,
+       plugins: {{
+         legend: {{ position: 'top', labels: {{ font: {{ size: 11 }} }} }},
+         tooltip: {{ callbacks: {{
+           label: ctx => `${{ctx.dataset.label}}: CER=${{ctx.parsed.x.toFixed(2)}}%, Gini=${{ctx.parsed.y.toFixed(3)}}`,
+         }} }},
+       }},
+       scales: {{
+         x: {{ min: 0, title: {{ display: true, text: 'CER moyen (%)', font: {{ size: 11 }} }} }},
+         y: {{ min: 0, max: 1, title: {{ display: true, text: 'Coefficient de Gini', font: {{ size: 11 }} }} }},
+       }},
+     }},
+   }});
+ }}
+
+ // ── Sprint 10 — length ratio vs anchor score scatter ──────────────
+ function buildRatioAnchorScatter() {{
+   const canvas = document.getElementById('chart-ratio-anchor');
+   if (!canvas) return;
+   const pts = DATA.ratio_vs_anchor || [];
+   if (!pts.length) {{
+     canvas.parentElement.innerHTML = '<p style="color:var(--text-muted);padding:1rem">Données d\'ancrage non disponibles.</p>';
+     return;
+   }}
+
+   // Danger zone (anchor < 0.5 OR ratio > 1.2) drawn via the plugin below
+   const datasets = pts.map(p => ({{
+     label: p.engine + (p.is_vlm ? ' 👁' : ''),
+     data: [{{ x: p.anchor_score, y: p.length_ratio }}],
+     backgroundColor: engineColor(DATA.engines.findIndex(e => e.name === p.engine)) + 'cc',
+     borderColor: engineColor(DATA.engines.findIndex(e => e.name === p.engine)),
+     borderWidth: p.is_vlm ? 3 : 1,
+     pointRadius: p.is_vlm ? 10 : 7,
+     pointStyle: p.is_vlm ? 'star' : 'circle',
+   }}));
+
+   chartInstances['ratio-anchor'] = new Chart(canvas.getContext('2d'), {{
+     type: 'scatter',
+     data: {{ datasets }},
+     options: {{
+       responsive: true, maintainAspectRatio: false,
+       plugins: {{
+         legend: {{ position: 'top', labels: {{ font: {{ size: 11 }} }} }},
+         tooltip: {{ callbacks: {{
+           label: ctx => `${{ctx.dataset.label}}: ancrage=${{(ctx.parsed.x*100).toFixed(1)}}%, ratio=${{ctx.parsed.y.toFixed(2)}}`,
+         }} }},
+       }},
+       scales: {{
+         x: {{ min: 0, max: 1, title: {{ display: true, text: 'Score d\'ancrage [0–1]', font: {{ size: 11 }} }} }},
+         y: {{ min: 0, title: {{ display: true, text: 'Ratio longueur (sortie/GT)', font: {{ size: 11 }} }} }},
+       }},
+     }},
+     plugins: [{{
+       id: 'danger-zones',
+       beforeDraw(chart) {{
+         const {{ ctx: c, chartArea: {{ left, top, right, bottom }}, scales: {{ x, y }} }} = chart;
+         c.save();
+         // Anchor < 0.5 (left band)
+         const xHalf = x.getPixelForValue(0.5);
+         c.fillStyle = 'rgba(239,68,68,0.07)';
+         c.fillRect(left, top, xHalf - left, bottom - top);
+         // Ratio > 1.2 (top band)
+         const y12 = y.getPixelForValue(1.2);
+         if (y12 > top) {{
+           c.fillRect(left, top, right - left, y12 - top);
+         }}
+         // Threshold lines
+         c.strokeStyle = 'rgba(239,68,68,0.35)'; c.lineWidth = 1; c.setLineDash([4,4]);
+         c.beginPath(); c.moveTo(xHalf, top); c.lineTo(xHalf, bottom); c.stroke();
+         if (y12 > top) {{
+           c.beginPath(); c.moveTo(left, y12); c.lineTo(right, y12); c.stroke();
+         }}
+         c.restore();
+       }},
+     }}],
+   }});
  }}

  function buildDocList() {{
 
@@ -1603,6 +1991,9 @@ function buildCharts() {{
      buildWilcoxonTable();
      buildErrorClusters();
      initCorrelationMatrix();
+     // Sprint 10
+     buildGiniCerScatter();
+     buildRatioAnchorScatter();
  }}

  function buildCerHistogram() {{
 
@@ -2131,7 +2522,7 @@ function togglePresentMode() {{

  // ── Sprint 7 — Export CSV ────────────────────────────────────────
  function exportCSV() {{
-     const rows = [['doc_id','engine','cer','wer','mer','wil','duration','ligature_score','diacritic_score','difficulty_score']];
+     const rows = [['doc_id','engine','cer','wer','mer','wil','duration','ligature_score','diacritic_score','difficulty_score','gini','anchor_score','length_ratio','is_hallucinating']];
      DATA.documents.forEach(doc => {{
          doc.engine_results.forEach(er => {{
              rows.push([
@@ -2145,6 +2536,10 @@ function exportCSV() {{
              er.ligature_score !== null ? er.ligature_score : '',
              er.diacritic_score !== null ? er.diacritic_score : '',
              doc.difficulty_score !== undefined ? (doc.difficulty_score * 100).toFixed(2) : '',
+             er.line_metrics ? er.line_metrics.gini.toFixed(6) : '',
+             er.hallucination_metrics ? er.hallucination_metrics.anchor_score.toFixed(6) : '',
+             er.hallucination_metrics ? er.hallucination_metrics.length_ratio.toFixed(4) : '',
+             er.hallucination_metrics ? (er.hallucination_metrics.is_hallucinating ? '1' : '0') : '',
          ]);
      }});
  }});
tests/test_sprint10_error_distribution.py ADDED
@@ -0,0 +1,426 @@
+ """Tests Sprint 10 — Distribution des erreurs par ligne et détection des hallucinations VLM.
2
+
3
+ Classes de tests
4
+ ----------------
5
+ TestLineMetrics (12 tests) — compute_line_metrics + aggregate_line_metrics
6
+ TestHallucinationMetrics (12 tests) — compute_hallucination_metrics + aggregate_hallucination_metrics
7
+ TestLineMetricsInResults (4 tests) — intégration dans DocumentResult / EngineReport
8
+ TestFixturesVLM (6 tests) — moteur VLM fictif et génération de données
9
+ TestReportSprint10 (6 tests) — rapport HTML contient les nouvelles métriques
10
+ """
11
+
+ from __future__ import annotations
+
+ import pytest
+
+ # ---------------------------------------------------------------------------
+ # Shared helpers
+ # ---------------------------------------------------------------------------
+
23
+ GT_SIMPLE = "Le renard brun saute par-dessus le chien paresseux."
+ HYP_PERFECT = "Le renard brun saute par-dessus le chien paresseux."
+ HYP_ERRORS = "Le renrd brin soute par-desous le chen paressux."
+ HYP_MISSING = "Le renard brun saute."
+
+ GT_MULTILINE = "Icy commence le prologue\nde maiſtre Jehan Froiſſart\nſus les croniques de France."
+ HYP_MULTILINE_PERFECT = "Icy commence le prologue\nde maiſtre Jehan Froiſſart\nſus les croniques de France."
+ HYP_MULTILINE_ERRORS = "Icy commence le prologue\nde maistre Jehan Froissart\nsus les croniques de France."
+
+ GT_MEDIEVAL = "Icy commence le prologue de maiſtre Jehan Froiſſart ſus les croniques de France & d'Angleterre."
+ HYP_HALLUCINATED = (
+     "Icy commence le prologue de maistre Jehan Froissart sus les croniques de France et d'Angleterre. "
+     "Ledit document fut enregistré au greffe le lendemain. "
+     "Signé et paraphé par le notaire royal en présence de témoins. "
+     "Archives nationales, cote F/7/1234, pièce n° 42."
+ )
+
+
41
+ # ===========================================================================
+ # TestLineMetrics
+ # ===========================================================================
+
+ class TestLineMetrics:
+     """Tests for picarones.core.line_metrics.compute_line_metrics."""
+
+     def test_import(self):
+         from picarones.core.line_metrics import compute_line_metrics, LineMetrics
+         assert callable(compute_line_metrics)
+         assert LineMetrics is not None
+
+     def test_perfect_match_cer_zero(self):
+         from picarones.core.line_metrics import compute_line_metrics
+         result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_PERFECT)
+         assert result.mean_cer == pytest.approx(0.0, abs=1e-9)
+         assert all(v == pytest.approx(0.0, abs=1e-9) for v in result.cer_per_line)
+
+     def test_line_count(self):
+         from picarones.core.line_metrics import compute_line_metrics
+         result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
+         assert result.line_count == 3
+
+     def test_cer_per_line_length(self):
+         from picarones.core.line_metrics import compute_line_metrics
+         result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
+         assert len(result.cer_per_line) == 3
+
+     def test_percentiles_keys(self):
+         from picarones.core.line_metrics import compute_line_metrics
+         result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
+         for key in ("p50", "p75", "p90", "p95", "p99"):
+             assert key in result.percentiles
+             assert 0.0 <= result.percentiles[key] <= 1.0
+
+     def test_percentile_ordering(self):
+         """p50 ≤ p75 ≤ p90 ≤ p95 ≤ p99."""
+         from picarones.core.line_metrics import compute_line_metrics
+         result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
+         p = result.percentiles
+         assert p["p50"] <= p["p75"] <= p["p90"] <= p["p95"] <= p["p99"]
+
+     def test_gini_zero_for_perfect(self):
+         from picarones.core.line_metrics import compute_line_metrics
+         result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_PERFECT)
+         assert result.gini == pytest.approx(0.0, abs=1e-9)
+
+     def test_gini_range(self):
+         from picarones.core.line_metrics import compute_line_metrics
+         result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
+         assert 0.0 <= result.gini <= 1.0
+
+     def test_catastrophic_rate_keys(self):
+         from picarones.core.line_metrics import compute_line_metrics
+         result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS,
+                                       thresholds=[0.30, 0.50, 1.00])
+         for t in (0.30, 0.50, 1.00):
+             assert t in result.catastrophic_rate
+             assert 0.0 <= result.catastrophic_rate[t] <= 1.0
+
+     def test_heatmap_length(self):
+         from picarones.core.line_metrics import compute_line_metrics
+         result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS, heatmap_bins=5)
+         assert len(result.heatmap) == 5
+
+     def test_as_dict_and_from_dict_roundtrip(self):
+         from picarones.core.line_metrics import compute_line_metrics, LineMetrics
+         result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
+         d = result.as_dict()
+         restored = LineMetrics.from_dict(d)
+         assert restored.gini == pytest.approx(result.gini, abs=1e-5)
+         assert restored.line_count == result.line_count
+         assert len(restored.cer_per_line) == len(result.cer_per_line)
+
+     def test_aggregate_line_metrics(self):
+         from picarones.core.line_metrics import compute_line_metrics, aggregate_line_metrics
+         r1 = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_PERFECT)
+         r2 = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
+         agg = aggregate_line_metrics([r1, r2])
+         assert "gini_mean" in agg
+         assert "percentiles" in agg
+         assert "catastrophic_rate" in agg
+         assert "document_count" in agg
+         assert agg["document_count"] == 2
+         assert agg["gini_mean"] >= 0.0
+
+
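For reference, the Gini semantics these tests rely on (0.0 when per-line CERs are all equal or all zero, approaching 1.0 when errors concentrate in a few lines) can be sketched as follows. This is a hypothetical illustration of the expected behavior, not the actual `picarones.core.line_metrics` implementation:

```python
# Hypothetical sketch of a Gini coefficient over per-line CER values,
# matching the semantics the tests above assume. Not the picarones code.
def gini(values: list[float]) -> float:
    """0.0 for equal (or all-zero) values; higher when errors concentrate."""
    n = len(values)
    total = sum(values)
    if n == 0 or total == 0.0:
        return 0.0  # perfect transcription or empty input: no inequality
    sorted_v = sorted(values)
    # Standard formula: G = 2 * sum(i * x_i) / (n * sum(x)) - (n + 1) / n
    weighted = sum((i + 1) * v for i, v in enumerate(sorted_v))
    return (2.0 * weighted) / (n * total) - (n + 1) / n
```

With this definition, three equally noisy lines give a Gini of 0.0, while one catastrophic line among otherwise clean lines pushes the value toward 1.0, which is exactly what `test_gini_zero_for_perfect` and `test_gini_range` exercise.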
128
+ # ===========================================================================
+ # TestHallucinationMetrics
+ # ===========================================================================
+
+ class TestHallucinationMetrics:
+     """Tests for picarones.core.hallucination.compute_hallucination_metrics."""
+
+     def test_import(self):
+         from picarones.core.hallucination import compute_hallucination_metrics, HallucinationMetrics
+         assert callable(compute_hallucination_metrics)
+         assert HallucinationMetrics is not None
+
+     def test_perfect_match_anchor_one(self):
+         from picarones.core.hallucination import compute_hallucination_metrics
+         result = compute_hallucination_metrics(GT_SIMPLE, HYP_PERFECT)
+         # Perfect anchoring → score close to 1.0
+         assert result.anchor_score == pytest.approx(1.0, abs=0.05)
+         assert result.is_hallucinating is False
+
+     def test_length_ratio_perfect(self):
+         from picarones.core.hallucination import compute_hallucination_metrics
+         result = compute_hallucination_metrics(GT_SIMPLE, HYP_PERFECT)
+         assert result.length_ratio == pytest.approx(1.0, abs=0.05)
+
+     def test_hallucination_detected(self):
+         from picarones.core.hallucination import compute_hallucination_metrics
+         result = compute_hallucination_metrics(GT_MEDIEVAL, HYP_HALLUCINATED)
+         # The hypothesis is much longer than the ground truth
+         assert result.length_ratio > 1.0
+         assert result.is_hallucinating is True
+
+     def test_hallucinated_blocks_detected(self):
+         from picarones.core.hallucination import compute_hallucination_metrics
+         result = compute_hallucination_metrics(GT_MEDIEVAL, HYP_HALLUCINATED,
+                                                anchor_threshold=0.5, min_block_length=3)
+         # Hallucinated blocks must be detected
+         assert len(result.hallucinated_blocks) > 0
+
+     def test_net_insertion_rate_range(self):
+         from picarones.core.hallucination import compute_hallucination_metrics
+         result = compute_hallucination_metrics(GT_MEDIEVAL, HYP_HALLUCINATED)
+         assert 0.0 <= result.net_insertion_rate <= 1.0
+
+     def test_word_counts(self):
+         from picarones.core.hallucination import compute_hallucination_metrics
+         result = compute_hallucination_metrics(GT_SIMPLE, HYP_PERFECT)
+         assert result.gt_word_count > 0
+         assert result.hyp_word_count > 0
+
+     def test_empty_reference(self):
+         from picarones.core.hallucination import compute_hallucination_metrics
+         result = compute_hallucination_metrics("", "some text here added by model")
+         # Empty reference: maximal net insertion
+         assert result.net_insertion_rate == pytest.approx(1.0, abs=0.05)
+
+     def test_empty_hypothesis(self):
+         from picarones.core.hallucination import compute_hallucination_metrics
+         result = compute_hallucination_metrics(GT_SIMPLE, "")
+         assert result.hyp_word_count == 0
+         assert result.net_insertion_rate == pytest.approx(0.0)
+
+     def test_as_dict_and_from_dict_roundtrip(self):
+         from picarones.core.hallucination import compute_hallucination_metrics, HallucinationMetrics
+         result = compute_hallucination_metrics(GT_MEDIEVAL, HYP_HALLUCINATED)
+         d = result.as_dict()
+         restored = HallucinationMetrics.from_dict(d)
+         assert restored.anchor_score == pytest.approx(result.anchor_score, abs=1e-5)
+         assert restored.is_hallucinating == result.is_hallucinating
+         assert len(restored.hallucinated_blocks) == len(result.hallucinated_blocks)
+
+     def test_aggregate_hallucination_metrics(self):
+         from picarones.core.hallucination import compute_hallucination_metrics, aggregate_hallucination_metrics
+         r1 = compute_hallucination_metrics(GT_SIMPLE, HYP_PERFECT)
+         r2 = compute_hallucination_metrics(GT_MEDIEVAL, HYP_HALLUCINATED)
+         agg = aggregate_hallucination_metrics([r1, r2])
+         assert "anchor_score_mean" in agg
+         assert "length_ratio_mean" in agg
+         assert "hallucinating_doc_count" in agg
+         assert "document_count" in agg
+         assert agg["document_count"] == 2
+         assert agg["hallucinating_doc_count"] >= 1
+
+     def test_anchor_threshold_respected(self):
+         """A very low anchor score triggers the hallucination badge."""
+         from picarones.core.hallucination import compute_hallucination_metrics
+         result = compute_hallucination_metrics(
+             "abc def ghi", "xyz uvw rst opq lmn",
+             anchor_threshold=0.5
+         )
+         assert result.anchor_score < 0.5
+         assert result.is_hallucinating is True
+
+
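The anchoring behavior exercised above is consistent with a character-trigram containment score: the fraction of hypothesis trigrams that also occur in the ground truth. A minimal sketch of that idea (hypothetical, not the actual `picarones.core.hallucination` code):

```python
# Hypothetical trigram anchoring sketch: the fraction of hypothesis
# character trigrams that also appear in the reference. Faithful output
# scores near 1.0; invented text shares few trigrams and scores low.
def trigram_anchor_score(reference: str, hypothesis: str) -> float:
    def trigrams(text: str) -> set[str]:
        t = text.lower()
        return {t[i:i + 3] for i in range(len(t) - 2)}

    hyp_tris = trigrams(hypothesis)
    if not hyp_tris:
        return 0.0  # an empty hypothesis anchors nothing
    return len(hyp_tris & trigrams(reference)) / len(hyp_tris)
```

Under this sketch the `"abc def ghi"` vs `"xyz uvw rst opq lmn"` pair from `test_anchor_threshold_respected` shares no trigram at all and scores 0.0, well below the 0.5 threshold.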
221
+ # ===========================================================================
+ # TestLineMetricsInResults
+ # ===========================================================================
+
+ class TestLineMetricsInResults:
+     """Tests for the integration of the Sprint 10 metrics into the data models."""
+
+     def test_document_result_has_line_metrics_field(self):
+         from picarones.core.results import DocumentResult
+         from picarones.core.metrics import MetricsResult
+         dr = DocumentResult(
+             doc_id="test_001",
+             image_path="/test/img.jpg",
+             ground_truth=GT_SIMPLE,
+             hypothesis=HYP_ERRORS,
+             metrics=MetricsResult(
+                 cer=0.1, cer_nfc=0.1, cer_caseless=0.09,
+                 wer=0.2, wer_normalized=0.2,
+                 mer=0.15, wil=0.18,
+                 reference_length=50, hypothesis_length=48,
+             ),
+             duration_seconds=1.0,
+             line_metrics={"gini": 0.3, "line_count": 3},
+         )
+         assert dr.line_metrics is not None
+         assert dr.line_metrics["gini"] == pytest.approx(0.3)
+
+     def test_document_result_has_hallucination_metrics_field(self):
+         from picarones.core.results import DocumentResult
+         from picarones.core.metrics import MetricsResult
+         dr = DocumentResult(
+             doc_id="test_002",
+             image_path="/test/img.jpg",
+             ground_truth=GT_SIMPLE,
+             hypothesis=HYP_HALLUCINATED,
+             metrics=MetricsResult(
+                 cer=0.5, cer_nfc=0.5, cer_caseless=0.5,
+                 wer=0.6, wer_normalized=0.6,
+                 mer=0.55, wil=0.65,
+                 reference_length=50, hypothesis_length=100,
+             ),
+             duration_seconds=2.0,
+             hallucination_metrics={"anchor_score": 0.3, "is_hallucinating": True},
+         )
+         assert dr.hallucination_metrics is not None
+         assert dr.hallucination_metrics["is_hallucinating"] is True
+
+     def test_document_result_as_dict_includes_sprint10_fields(self):
+         from picarones.core.results import DocumentResult
+         from picarones.core.metrics import MetricsResult
+         dr = DocumentResult(
+             doc_id="test_003",
+             image_path="/test/img.jpg",
+             ground_truth=GT_SIMPLE,
+             hypothesis=HYP_PERFECT,
+             metrics=MetricsResult(
+                 cer=0.0, cer_nfc=0.0, cer_caseless=0.0,
+                 wer=0.0, wer_normalized=0.0,
+                 mer=0.0, wil=0.0,
+                 reference_length=50, hypothesis_length=50,
+             ),
+             duration_seconds=0.5,
+             line_metrics={"gini": 0.0, "line_count": 1},
+             hallucination_metrics={"anchor_score": 1.0, "is_hallucinating": False},
+         )
+         d = dr.as_dict()
+         assert "line_metrics" in d
+         assert "hallucination_metrics" in d
+
+     def test_engine_report_has_aggregated_sprint10_fields(self):
+         from picarones.core.results import EngineReport, DocumentResult
+         from picarones.core.metrics import MetricsResult
+         dr = DocumentResult(
+             doc_id="test_004",
+             image_path="/test/img.jpg",
+             ground_truth=GT_SIMPLE,
+             hypothesis=HYP_PERFECT,
+             metrics=MetricsResult(
+                 cer=0.0, cer_nfc=0.0, cer_caseless=0.0,
+                 wer=0.0, wer_normalized=0.0,
+                 mer=0.0, wil=0.0,
+                 reference_length=50, hypothesis_length=50,
+             ),
+             duration_seconds=0.5,
+         )
+         report = EngineReport(
+             engine_name="test_engine",
+             engine_version="1.0",
+             engine_config={},
+             document_results=[dr],
+             aggregated_line_metrics={"gini_mean": 0.1, "document_count": 1},
+             aggregated_hallucination={"anchor_score_mean": 0.95, "document_count": 1},
+         )
+         assert report.aggregated_line_metrics is not None
+         assert report.aggregated_hallucination is not None
+         d = report.as_dict()
+         assert "aggregated_line_metrics" in d
+         assert "aggregated_hallucination" in d
+
+
321
+ # ===========================================================================
+ # TestFixturesVLM
+ # ===========================================================================
+
+ class TestFixturesVLM:
+     """Tests for the mock VLM engine in picarones.fixtures."""
+
+     def test_generate_sample_benchmark_has_vlm_engine(self):
+         from picarones.fixtures import generate_sample_benchmark
+         bm = generate_sample_benchmark(n_docs=3, seed=42)
+         engine_names = [r.engine_name for r in bm.engine_reports]
+         assert any("vision" in name.lower() or "vlm" in name.lower() or "zero-shot" in name.lower()
+                    for name in engine_names)
+
+     def test_vlm_engine_has_hallucination_metrics(self):
+         from picarones.fixtures import generate_sample_benchmark
+         bm = generate_sample_benchmark(n_docs=3, seed=42)
+         vlm_report = next(
+             (r for r in bm.engine_reports
+              if r.pipeline_info.get("is_vlm")),
+             None
+         )
+         assert vlm_report is not None, "VLM engine not found"
+         assert vlm_report.aggregated_hallucination is not None
+         assert "anchor_score_mean" in vlm_report.aggregated_hallucination
+
+     def test_all_engines_have_line_metrics(self):
+         from picarones.fixtures import generate_sample_benchmark
+         bm = generate_sample_benchmark(n_docs=3, seed=42)
+         for report in bm.engine_reports:
+             assert report.aggregated_line_metrics is not None, \
+                 f"No line_metrics for {report.engine_name}"
+             assert "gini_mean" in report.aggregated_line_metrics
+
+     def test_all_documents_have_line_metrics(self):
+         from picarones.fixtures import generate_sample_benchmark
+         bm = generate_sample_benchmark(n_docs=3, seed=42)
+         for report in bm.engine_reports:
+             for dr in report.document_results:
+                 assert dr.line_metrics is not None, \
+                     f"{report.engine_name}/{dr.doc_id}: missing line_metrics"
+                 assert "gini" in dr.line_metrics
+
+     def test_all_documents_have_hallucination_metrics(self):
+         from picarones.fixtures import generate_sample_benchmark
+         bm = generate_sample_benchmark(n_docs=3, seed=42)
+         for report in bm.engine_reports:
+             for dr in report.document_results:
+                 assert dr.hallucination_metrics is not None, \
+                     f"{report.engine_name}/{dr.doc_id}: missing hallucination_metrics"
+                 assert "anchor_score" in dr.hallucination_metrics
+
+     def test_vlm_engine_has_valid_hallucination_aggregation(self):
+         """The VLM engine must have valid aggregated hallucination metrics."""
+         from picarones.fixtures import generate_sample_benchmark
+         bm = generate_sample_benchmark(n_docs=6, seed=42)
+         vlm_report = next(
+             (r for r in bm.engine_reports if r.pipeline_info.get("is_vlm")),
+             None
+         )
+         if vlm_report is None:
+             pytest.skip("VLM engine not found")
+
+         agg = vlm_report.aggregated_hallucination
+         assert agg is not None
+         assert 0.0 <= agg.get("anchor_score_mean", -1) <= 1.0
+         assert agg.get("length_ratio_mean", 0) >= 0.0
+         assert agg.get("document_count", 0) == 6
+
+
391
+ # ===========================================================================
+ # TestReportSprint10
+ # ===========================================================================
+
+ class TestReportSprint10:
+     """Tests for the HTML report — new Sprint 10 metrics."""
+
+     @pytest.fixture(scope="class")
+     def html_report(self, tmp_path_factory):
+         """Generate a demo HTML report."""
+         from picarones.fixtures import generate_sample_benchmark
+         from picarones.report.generator import ReportGenerator
+         bm = generate_sample_benchmark(n_docs=3, seed=42)
+         tmp = tmp_path_factory.mktemp("report")
+         out = tmp / "sprint10_test.html"
+         ReportGenerator(bm).generate(str(out))
+         return out.read_text(encoding="utf-8")
+
+     def test_report_generated_not_empty(self, html_report):
+         assert len(html_report) > 50_000
+
+     def test_report_has_gini_column_header(self, html_report):
+         assert "Gini" in html_report
+
+     def test_report_has_ancrage_column_header(self, html_report):
+         # The leaderboard column header is rendered in French ("Ancrage" = anchoring)
+         assert "Ancrage" in html_report
+
+     def test_report_has_gini_cer_scatter_canvas(self, html_report):
+         assert "chart-gini-cer" in html_report
+
+     def test_report_has_ratio_anchor_scatter_canvas(self, html_report):
+         assert "chart-ratio-anchor" in html_report
+
+     def test_report_has_vlm_badge(self, html_report):
+         """The VLM badge must appear for the zero-shot engine."""
+         assert "VLM" in html_report or "zero-shot" in html_report.lower() or "zero_shot" in html_report