"""Sprint 10 tests — per-line error distribution and VLM hallucination detection.

Test classes
------------
TestLineMetrics                   (12 tests) — compute_line_metrics + aggregate_line_metrics
TestHallucinationMetrics          (12 tests) — compute_hallucination_metrics + aggregate_hallucination_metrics
TestLineMetricsInResults          (4 tests)  — integration into DocumentResult / EngineReport
TestFixturesVLM                   (6 tests)  — dummy VLM engine and sample-data generation
TestReportSprint10                (6 tests)  — HTML report contains the new metrics
TestLineMetricsNoQuadraticBlowup  (2 tests)  — performance guard on the line-metrics edit distance
"""

from __future__ import annotations

import pytest

# ---------------------------------------------------------------------------
# Shared helpers
# ---------------------------------------------------------------------------

GT_SIMPLE = "Le renard brun saute par-dessus le chien paresseux."
HYP_PERFECT = "Le renard brun saute par-dessus le chien paresseux."
HYP_ERRORS = "Le renrd brin soute par-desous le chen paressux."
HYP_MISSING = "Le renard brun saute."

GT_MULTILINE = "Icy commence le prologue\nde maiſtre Jehan Froiſſart\nſus les croniques de France."
HYP_MULTILINE_PERFECT = "Icy commence le prologue\nde maiſtre Jehan Froiſſart\nſus les croniques de France."
HYP_MULTILINE_ERRORS = "Icy commence le prologue\nde maistre Jehan Froissart\nsus les croniques de France."

GT_MEDIEVAL = "Icy commence le prologue de maiſtre Jehan Froiſſart ſus les croniques de France & d'Angleterre."
# Faithful first sentence followed by three sentences absent from GT_MEDIEVAL.
HYP_HALLUCINATED = (
    "Icy commence le prologue de maistre Jehan Froissart sus les croniques de France et d'Angleterre. "
    "Ledit document fut enregistré au greffe le lendemain. "
    "Signé et paraphé par le notaire royal en présence de témoins. "
    "Archives nationales, cote F/7/1234, pièce n° 42."
)


# ===========================================================================
# TestLineMetrics
# ===========================================================================

class TestLineMetrics:
    """Tests for picarones.evaluation.metrics.line_metrics.compute_line_metrics.

    Every test imports the module under test locally, inside the test body.
    """

    def test_import(self):
        """The public names can be imported and the function is callable."""
        from picarones.evaluation.metrics.line_metrics import compute_line_metrics, LineMetrics

        assert callable(compute_line_metrics)
        assert LineMetrics is not None

    def test_perfect_match_cer_zero(self):
        """An identical hypothesis yields a zero mean CER and zero CER on every line."""
        from picarones.evaluation.metrics.line_metrics import compute_line_metrics

        result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_PERFECT)
        assert result.mean_cer == pytest.approx(0.0, abs=1e-9)
        # Guard against a vacuous pass: all() over an empty sequence is True.
        assert len(result.cer_per_line) > 0, "cer_per_line vide"
        assert all(v == pytest.approx(0.0, abs=1e-9) for v in result.cer_per_line)

    def test_line_count(self):
        """The three-line ground truth is reported as three lines."""
        from picarones.evaluation.metrics.line_metrics import compute_line_metrics

        result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
        assert result.line_count == 3

    def test_cer_per_line_length(self):
        """One CER value is produced per ground-truth line."""
        from picarones.evaluation.metrics.line_metrics import compute_line_metrics

        result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
        assert len(result.cer_per_line) == 3

    def test_percentiles_keys(self):
        """All expected percentile keys are present with values in [0, 1]."""
        from picarones.evaluation.metrics.line_metrics import compute_line_metrics

        result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
        for key in ("p50", "p75", "p90", "p95", "p99"):
            assert key in result.percentiles
            assert 0.0 <= result.percentiles[key] <= 1.0

    def test_percentile_ordering(self):
        """p50 ≤ p75 ≤ p90 ≤ p95 ≤ p99."""
        from picarones.evaluation.metrics.line_metrics import compute_line_metrics

        result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
        p = result.percentiles
        assert p["p50"] <= p["p75"] <= p["p90"] <= p["p95"] <= p["p99"]

    def test_gini_zero_for_perfect(self):
        """The Gini coefficient is zero when the hypothesis equals the ground truth."""
        from picarones.evaluation.metrics.line_metrics import compute_line_metrics

        result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_PERFECT)
        assert result.gini == pytest.approx(0.0, abs=1e-9)

    def test_gini_range(self):
        """The Gini coefficient stays within [0, 1]."""
        from picarones.evaluation.metrics.line_metrics import compute_line_metrics

        result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
        assert 0.0 <= result.gini <= 1.0

    def test_catastrophic_rate_keys(self):
        """Each requested threshold is a key of catastrophic_rate, with a rate in [0, 1]."""
        from picarones.evaluation.metrics.line_metrics import compute_line_metrics

        result = compute_line_metrics(
            GT_MULTILINE, HYP_MULTILINE_ERRORS, thresholds=[0.30, 0.50, 1.00]
        )
        for t in (0.30, 0.50, 1.00):
            assert t in result.catastrophic_rate
            assert 0.0 <= result.catastrophic_rate[t] <= 1.0

    def test_heatmap_length(self):
        """The heatmap has exactly as many entries as requested bins."""
        from picarones.evaluation.metrics.line_metrics import compute_line_metrics

        result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS, heatmap_bins=5)
        assert len(result.heatmap) == 5

    def test_as_dict_and_from_dict_roundtrip(self):
        """as_dict() followed by from_dict() preserves gini, line_count and the CER list length."""
        from picarones.evaluation.metrics.line_metrics import compute_line_metrics, LineMetrics

        result = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
        d = result.as_dict()
        restored = LineMetrics.from_dict(d)
        assert restored.gini == pytest.approx(result.gini, abs=1e-5)
        assert restored.line_count == result.line_count
        assert len(restored.cer_per_line) == len(result.cer_per_line)

    def test_aggregate_line_metrics(self):
        """Aggregating two documents exposes the expected keys and a document count of 2."""
        from picarones.evaluation.metrics.line_metrics import compute_line_metrics, aggregate_line_metrics

        r1 = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_PERFECT)
        r2 = compute_line_metrics(GT_MULTILINE, HYP_MULTILINE_ERRORS)
        agg = aggregate_line_metrics([r1, r2])
        assert "gini_mean" in agg
        assert "percentiles" in agg
        assert "catastrophic_rate" in agg
        assert "document_count" in agg
        assert agg["document_count"] == 2
        assert agg["gini_mean"] >= 0.0


# ===========================================================================
# TestHallucinationMetrics
# ===========================================================================

class TestHallucinationMetrics:
    """Tests for picarones.evaluation.metrics.hallucination.compute_hallucination_metrics.

    Every test imports the module under test locally, inside the test body.
    """

    def test_import(self):
        """The public names can be imported and the function is callable."""
        from picarones.evaluation.metrics.hallucination import compute_hallucination_metrics, HallucinationMetrics

        assert callable(compute_hallucination_metrics)
        assert HallucinationMetrics is not None

    def test_perfect_match_anchor_one(self):
        """An identical hypothesis is fully anchored and not flagged as hallucinating."""
        from picarones.evaluation.metrics.hallucination import compute_hallucination_metrics

        result = compute_hallucination_metrics(GT_SIMPLE, HYP_PERFECT)
        # Perfect anchoring → score close to 1.0
        assert result.anchor_score == pytest.approx(1.0, abs=0.05)
        assert result.is_hallucinating is False

    def test_length_ratio_perfect(self):
        """An identical hypothesis has a length ratio close to 1.0."""
        from picarones.evaluation.metrics.hallucination import compute_hallucination_metrics

        result = compute_hallucination_metrics(GT_SIMPLE, HYP_PERFECT)
        assert result.length_ratio == pytest.approx(1.0, abs=0.05)

    def test_hallucination_detected(self):
        """A hypothesis padded with invented sentences is flagged as hallucinating."""
        from picarones.evaluation.metrics.hallucination import compute_hallucination_metrics

        result = compute_hallucination_metrics(GT_MEDIEVAL, HYP_HALLUCINATED)
        # The hypothesis is much longer than the reference
        assert result.length_ratio > 1.0
        assert result.is_hallucinating is True

    def test_hallucinated_blocks_detected(self):
        """At least one hallucinated block is reported for the padded hypothesis."""
        from picarones.evaluation.metrics.hallucination import compute_hallucination_metrics

        result = compute_hallucination_metrics(
            GT_MEDIEVAL, HYP_HALLUCINATED, anchor_threshold=0.5, min_block_length=3
        )
        # Hallucinated blocks must be detected
        assert len(result.hallucinated_blocks) > 0

    def test_net_insertion_rate_range(self):
        """The net insertion rate stays within [0, 1]."""
        from picarones.evaluation.metrics.hallucination import compute_hallucination_metrics

        result = compute_hallucination_metrics(GT_MEDIEVAL, HYP_HALLUCINATED)
        assert 0.0 <= result.net_insertion_rate <= 1.0

    def test_word_counts(self):
        """Word counts are positive for non-empty reference and hypothesis."""
        from picarones.evaluation.metrics.hallucination import compute_hallucination_metrics

        result = compute_hallucination_metrics(GT_SIMPLE, HYP_PERFECT)
        assert result.gt_word_count > 0
        assert result.hyp_word_count > 0

    def test_empty_reference(self):
        """With an empty reference, the net insertion rate is close to 1.0."""
        from picarones.evaluation.metrics.hallucination import compute_hallucination_metrics

        result = compute_hallucination_metrics("", "some text here added by model")
        # Empty reference: net insertion is maximal
        assert result.net_insertion_rate == pytest.approx(1.0, abs=0.05)

    def test_empty_hypothesis(self):
        """With an empty hypothesis, the word count and net insertion rate are both zero."""
        from picarones.evaluation.metrics.hallucination import compute_hallucination_metrics

        result = compute_hallucination_metrics(GT_SIMPLE, "")
        assert result.hyp_word_count == 0
        assert result.net_insertion_rate == pytest.approx(0.0)

    def test_as_dict_and_from_dict_roundtrip(self):
        """as_dict() followed by from_dict() preserves the score, the flag and the block count."""
        from picarones.evaluation.metrics.hallucination import compute_hallucination_metrics, HallucinationMetrics

        result = compute_hallucination_metrics(GT_MEDIEVAL, HYP_HALLUCINATED)
        d = result.as_dict()
        restored = HallucinationMetrics.from_dict(d)
        assert restored.anchor_score == pytest.approx(result.anchor_score, abs=1e-5)
        assert restored.is_hallucinating == result.is_hallucinating
        assert len(restored.hallucinated_blocks) == len(result.hallucinated_blocks)

    def test_aggregate_hallucination_metrics(self):
        """Aggregating two documents exposes the expected keys and counts the hallucinating one."""
        from picarones.evaluation.metrics.hallucination import compute_hallucination_metrics, aggregate_hallucination_metrics

        r1 = compute_hallucination_metrics(GT_SIMPLE, HYP_PERFECT)
        r2 = compute_hallucination_metrics(GT_MEDIEVAL, HYP_HALLUCINATED)
        agg = aggregate_hallucination_metrics([r1, r2])
        assert "anchor_score_mean" in agg
        assert "length_ratio_mean" in agg
        assert "hallucinating_doc_count" in agg
        assert "document_count" in agg
        assert agg["document_count"] == 2
        assert agg["hallucinating_doc_count"] >= 1

    def test_anchor_threshold_respected(self):
        """A very low anchor score triggers the hallucination badge."""
        from picarones.evaluation.metrics.hallucination import compute_hallucination_metrics

        result = compute_hallucination_metrics(
            "abc def ghi", "xyz uvw rst opq lmn", anchor_threshold=0.5
        )
        assert result.anchor_score < 0.5
        assert result.is_hallucinating is True
# ===========================================================================
# TestLineMetricsInResults
# ===========================================================================

class TestLineMetricsInResults:
    """Tests for the integration of the Sprint 10 metrics into the data models.

    The numeric values passed to ``MetricsResult`` are arbitrary placeholders;
    the assertions only look at the Sprint 10 fields.
    """

    def test_document_result_has_line_metrics_field(self):
        """DocumentResult accepts and exposes a ``line_metrics`` dict."""
        from picarones.evaluation.benchmark_result import DocumentResult
        from picarones.evaluation.metric_result import MetricsResult

        dr = DocumentResult(
            doc_id="test_001",
            image_path="/test/img.jpg",
            ground_truth=GT_SIMPLE,
            hypothesis=HYP_ERRORS,
            metrics=MetricsResult(
                cer=0.1, cer_nfc=0.1, cer_caseless=0.09,
                wer=0.2, wer_normalized=0.2, mer=0.15, wil=0.18,
                reference_length=50, hypothesis_length=48,
            ),
            duration_seconds=1.0,
            line_metrics={"gini": 0.3, "line_count": 3},
        )
        assert dr.line_metrics is not None
        assert dr.line_metrics["gini"] == pytest.approx(0.3)

    def test_document_result_has_hallucination_metrics_field(self):
        """DocumentResult accepts and exposes a ``hallucination_metrics`` dict."""
        from picarones.evaluation.benchmark_result import DocumentResult
        from picarones.evaluation.metric_result import MetricsResult

        dr = DocumentResult(
            doc_id="test_002",
            image_path="/test/img.jpg",
            ground_truth=GT_SIMPLE,
            hypothesis=HYP_HALLUCINATED,
            metrics=MetricsResult(
                cer=0.5, cer_nfc=0.5, cer_caseless=0.5,
                wer=0.6, wer_normalized=0.6, mer=0.55, wil=0.65,
                reference_length=50, hypothesis_length=100,
            ),
            duration_seconds=2.0,
            hallucination_metrics={"anchor_score": 0.3, "is_hallucinating": True},
        )
        assert dr.hallucination_metrics is not None
        assert dr.hallucination_metrics["is_hallucinating"] is True

    def test_document_result_as_dict_includes_sprint10_fields(self):
        """DocumentResult.as_dict() carries both Sprint 10 keys."""
        from picarones.evaluation.benchmark_result import DocumentResult
        from picarones.evaluation.metric_result import MetricsResult

        dr = DocumentResult(
            doc_id="test_003",
            image_path="/test/img.jpg",
            ground_truth=GT_SIMPLE,
            hypothesis=HYP_PERFECT,
            metrics=MetricsResult(
                cer=0.0, cer_nfc=0.0, cer_caseless=0.0,
                wer=0.0, wer_normalized=0.0, mer=0.0, wil=0.0,
                reference_length=50, hypothesis_length=50,
            ),
            duration_seconds=0.5,
            line_metrics={"gini": 0.0, "line_count": 1},
            hallucination_metrics={"anchor_score": 1.0, "is_hallucinating": False},
        )
        d = dr.as_dict()
        assert "line_metrics" in d
        assert "hallucination_metrics" in d

    def test_engine_report_has_aggregated_sprint10_fields(self):
        """EngineReport exposes the aggregated Sprint 10 fields, also through as_dict()."""
        from picarones.evaluation.benchmark_result import EngineReport, DocumentResult
        from picarones.evaluation.metric_result import MetricsResult

        dr = DocumentResult(
            doc_id="test_004",
            image_path="/test/img.jpg",
            ground_truth=GT_SIMPLE,
            hypothesis=HYP_PERFECT,
            metrics=MetricsResult(
                cer=0.0, cer_nfc=0.0, cer_caseless=0.0,
                wer=0.0, wer_normalized=0.0, mer=0.0, wil=0.0,
                reference_length=50, hypothesis_length=50,
            ),
            duration_seconds=0.5,
        )
        report = EngineReport(
            engine_name="test_engine",
            engine_version="1.0",
            engine_config={},
            document_results=[dr],
            aggregated_line_metrics={"gini_mean": 0.1, "document_count": 1},
            aggregated_hallucination={"anchor_score_mean": 0.95, "document_count": 1},
        )
        assert report.aggregated_line_metrics is not None
        assert report.aggregated_hallucination is not None
        d = report.as_dict()
        assert "aggregated_line_metrics" in d
        assert "aggregated_hallucination" in d


# ===========================================================================
# TestFixturesVLM
# ===========================================================================

def _find_vlm_report(benchmark):
    """Return the first engine report whose ``pipeline_info["is_vlm"]`` is truthy, or ``None``."""
    return next(
        (r for r in benchmark.engine_reports if r.pipeline_info.get("is_vlm")),
        None,
    )


class TestFixturesVLM:
    """Tests for the dummy VLM engine produced by picarones.evaluation.synthetic."""

    def test_generate_sample_benchmark_has_vlm_engine(self):
        """At least one engine name mentions "vision", "vlm" or "zero-shot" (case-insensitive)."""
        from picarones.evaluation.synthetic import generate_sample_benchmark

        bm = generate_sample_benchmark(n_docs=3, seed=42)
        engine_names = [r.engine_name for r in bm.engine_reports]
        assert any(
            "vision" in name.lower() or "vlm" in name.lower() or "zero-shot" in name.lower()
            for name in engine_names
        )

    def test_vlm_engine_has_hallucination_metrics(self):
        """The VLM engine report carries aggregated hallucination metrics."""
        from picarones.evaluation.synthetic import generate_sample_benchmark

        bm = generate_sample_benchmark(n_docs=3, seed=42)
        vlm_report = _find_vlm_report(bm)
        assert vlm_report is not None, "Moteur VLM non trouvé"
        assert vlm_report.aggregated_hallucination is not None
        assert "anchor_score_mean" in vlm_report.aggregated_hallucination

    def test_all_engines_have_line_metrics(self):
        """Every engine report has aggregated line metrics including ``gini_mean``."""
        from picarones.evaluation.synthetic import generate_sample_benchmark

        bm = generate_sample_benchmark(n_docs=3, seed=42)
        for report in bm.engine_reports:
            assert report.aggregated_line_metrics is not None, (
                f"Pas de line_metrics pour {report.engine_name}"
            )
            assert "gini_mean" in report.aggregated_line_metrics

    def test_all_documents_have_line_metrics(self):
        """Every document result of every engine has line metrics including ``gini``."""
        from picarones.evaluation.synthetic import generate_sample_benchmark

        bm = generate_sample_benchmark(n_docs=3, seed=42)
        for report in bm.engine_reports:
            for dr in report.document_results:
                assert dr.line_metrics is not None, (
                    f"{report.engine_name}/{dr.doc_id}: line_metrics manquant"
                )
                assert "gini" in dr.line_metrics

    def test_all_documents_have_hallucination_metrics(self):
        """Every document result of every engine has hallucination metrics including ``anchor_score``."""
        from picarones.evaluation.synthetic import generate_sample_benchmark

        bm = generate_sample_benchmark(n_docs=3, seed=42)
        for report in bm.engine_reports:
            for dr in report.document_results:
                assert dr.hallucination_metrics is not None, (
                    f"{report.engine_name}/{dr.doc_id}: hallucination_metrics manquant"
                )
                assert "anchor_score" in dr.hallucination_metrics

    def test_vlm_engine_has_valid_hallucination_aggregation(self):
        """The VLM engine must have valid aggregated hallucination metrics."""
        from picarones.evaluation.synthetic import generate_sample_benchmark

        bm = generate_sample_benchmark(n_docs=6, seed=42)
        vlm_report = _find_vlm_report(bm)
        # Fail rather than skip: test_vlm_engine_has_hallucination_metrics
        # already requires the VLM engine, so skipping here would only hide
        # the same regression.
        assert vlm_report is not None, "Moteur VLM non trouvé"
        agg = vlm_report.aggregated_hallucination
        assert agg is not None
        # The -1 default makes a missing key fail the range check.
        assert 0.0 <= agg.get("anchor_score_mean", -1) <= 1.0
        assert agg.get("length_ratio_mean", 0) >= 0.0
        assert agg.get("document_count", 0) == 6
# ===========================================================================
# TestReportSprint10
# ===========================================================================

class TestReportSprint10:
    """Tests for the HTML report — new Sprint 10 metrics."""

    @pytest.fixture(scope="class")
    def html_report(self, tmp_path_factory):
        """Generate a demo HTML report once per class and return its text."""
        from picarones.evaluation.synthetic import generate_sample_benchmark
        from picarones.reports.html.generator import ReportGenerator

        bm = generate_sample_benchmark(n_docs=3, seed=42)
        tmp = tmp_path_factory.mktemp("report")
        out = tmp / "sprint10_test.html"
        ReportGenerator(bm).generate(str(out))
        return out.read_text(encoding="utf-8")

    @pytest.fixture(scope="class")
    def active_html(self, html_report):
        """Return ``strip_comments(html_report, "html")``, computed once per class.

        The exact-string assertions below run against this text so that a
        decoy placed inside an HTML comment cannot satisfy them (X2 + Z3).
        """
        from tests._strip_helpers import strip_comments

        return strip_comments(html_report, "html")

    def test_report_generated_not_empty(self, html_report):
        """The generated report is longer than 50,000 characters."""
        assert len(html_report) > 50_000

    def test_report_has_gini_column_header(self, html_report):
        """The report mentions "Gini"."""
        assert "Gini" in html_report

    def test_report_has_ancrage_column_header(self, html_report):
        """The report mentions "Ancrage"."""
        assert "Ancrage" in html_report

    def test_report_has_gini_cer_scatter_canvas(self, active_html):
        """The CER-vs-Gini scatter plot is present in the comment-stripped HTML."""
        # X2 + Z3: exact aria-label of the SVG generated by
        # _build_cer_gini_scatter, checked after stripping HTML comments.
        assert 'aria-label="Croisement CER vs Gini"' in active_html

    def test_report_has_ratio_anchor_scatter_canvas(self, active_html):
        """The anchor-vs-length-ratio scatter plot is present in the comment-stripped HTML."""
        # X2 + Z3: exact, unique aria-label, checked after stripping HTML comments.
        assert 'aria-label="Croisement ancrage vs ratio de longueur"' in active_html

    def test_report_has_vlm_badge(self, active_html):
        """The VLM badge must appear for the zero-shot engine.

        X2 (audit): hardened assertion — the former
        ``"VLM" in html or "zero-shot".lower() or "zero_shot"`` was trivially
        true: ``"VLM"`` matches the « pipelines VLM » section title and
        ``zero_shot`` appears in the embedded JSON configs (~12 times), so the
        test passed even without a rendered badge. The exact string of the
        HTML badge is now checked (pipeline-tag chip rendered by the
        engines.cer_distribution renderer).
        """
        assert "👁 VLM" in active_html, (
            "Badge VLM (chip 👁 VLM) absent du HTML actif — pipeline zero-shot "
            "non rendu correctement (Z3 : strip HTML comments contre decoy)"
        )


class TestLineMetricsNoQuadraticBlowup:
    """Performance anti-regression guard for ``_edit_distance``.

    ``_edit_distance`` must stay effectively O(n) (rapidfuzz/C) and never fall
    back to a pure-Python O(n·m) dynamic programme. The ``line_metrics`` hook
    runs on EVERY document of EVERY benchmark — a quadratic DP turned a
    6-document run from < 5 min into 45 min (audit ``bc7e13c``).

    The threshold is deliberately wide (non-flaky): the pure-Python DP took
    > 25 s on this input, rapidfuzz < 50 ms.
    """

    def test_edit_distance_matches_reference_levenshtein(self) -> None:
        """``_edit_distance`` returns the expected distances on a few edge cases."""
        from picarones.evaluation.metrics.line_metrics import _edit_distance

        # Numeric identity on edge cases (result unchanged).
        assert _edit_distance("", "") == 0
        assert _edit_distance("", "abc") == 3
        assert _edit_distance("abc", "") == 3
        assert _edit_distance("kitten", "sitting") == 3
        assert _edit_distance("café", "cafe") == 1

    def test_large_page_pair_is_fast(self) -> None:
        """``compute_line_metrics`` finishes in under 5 seconds on a long input."""
        import time

        from picarones.evaluation.metrics.line_metrics import (
            compute_line_metrics,
        )

        # Dense heritage-style transcription: 200 repetitions of a
        # 99-character sentence, with no newline in between.
        gt = ("Au nom de Dieu soit fait. L an mil six cent trente "
              "et un, par devant nous notaire royal soubsigne. ") * 200
        hyp = gt.replace("Dieu", "Dîeu").replace("nous", "nons")
        start = time.perf_counter()
        compute_line_metrics(gt, hyp)
        elapsed = time.perf_counter() - start
        assert elapsed < 5.0, (
            f"compute_line_metrics a pris {elapsed:.1f}s sur ~13 ko — "
            "régression O(n·m) probable dans _edit_distance "
            "(doit déléguer à rapidfuzz)."
        )