# Picarones/tests/evaluation/metrics/test_over_normalization_hook.py
# Claude
# feat(audit): Phase 3 partielle — câblage des features inachevées (S2, S4, S6)
# bd5c812 unverified
"""Phase 3.4 audit code-quality — la sur-normalisation LLM est
désormais agrégée automatiquement via le registre
:mod:`picarones.evaluation.metric_hooks`.
Avant la Phase 3.4, ``aggregate_over_normalization`` existait dans
``picarones/evaluation/metrics/over_normalization.py`` mais :
- n'avait aucun ``@register_corpus_aggregator`` ;
- le module n'était même pas importé par ``evaluation/metrics/__init__.py``
(mentionné en docstring uniquement) ;
- ``synthetic.py`` réimplémentait l'agrégation manuellement
(duplication silencieuse).
Le hook ``_aggregate_over_normalization_hook`` (auto-enregistré)
extrait désormais l'info depuis
``DocumentResult.pipeline_metadata["over_normalization"]`` et
alimente ``EngineReport.aggregated_over_normalization`` pour les
profils ``philological``, ``diagnostics`` et ``full``.
"""
from __future__ import annotations
from picarones.evaluation.benchmark_result import DocumentResult, EngineReport
from picarones.evaluation.metric_hooks import (
PROFILE_DIAGNOSTICS,
PROFILE_FULL,
PROFILE_MINIMAL,
PROFILE_PHILOLOGICAL,
PROFILE_STANDARD,
_all_corpus_aggregator_names,
run_corpus_aggregators,
select_corpus_aggregators,
)
from picarones.evaluation.metric_result import MetricsResult
from picarones.evaluation.metrics.over_normalization import (
OverNormalizationResult,
aggregate_over_normalization,
)
# --------------------------------------------------------------------------
# Auto-enregistrement
# --------------------------------------------------------------------------
def test_over_normalization_aggregator_is_registered() -> None:
    """Check that importing ``picarones.evaluation.metrics`` registers the
    ``over_normalization`` corpus aggregator.

    The import below is performed purely for its registration side effect;
    the assertion then looks the aggregator name up in the registry.
    """
    import picarones.evaluation.metrics  # noqa: F401 — triggers registration

    registered_names = _all_corpus_aggregator_names()
    failure_hint = (
        "Le hook ``_aggregate_over_normalization_hook`` n'est pas "
        "enregistré. Vérifier que ``over_normalization`` est dans "
        "``picarones/evaluation/metrics/__init__.py`` (Phase 3.4)."
    )
    assert "over_normalization" in registered_names, failure_hint
def test_aggregator_in_correct_profiles() -> None:
    """Check which profiles select the ``over_normalization`` aggregator.

    It must be selected for the ``philological``, ``diagnostics`` and
    ``full`` profiles, and must not be selected for ``minimal`` or
    ``standard``.
    """
    import picarones.evaluation.metrics  # noqa: F401

    def _selected_names(profile):
        # Names of every corpus aggregator selected for the given profile.
        return [aggregator.name for aggregator in select_corpus_aggregators(profile)]

    enabled_profiles = (PROFILE_PHILOLOGICAL, PROFILE_DIAGNOSTICS, PROFILE_FULL)
    disabled_profiles = (PROFILE_MINIMAL, PROFILE_STANDARD)

    for profile in enabled_profiles:
        assert "over_normalization" in _selected_names(profile), (
            f"Profil ``{profile}`` n'inclut pas l'agrégateur over_normalization."
        )

    for profile in disabled_profiles:
        assert "over_normalization" not in _selected_names(profile), (
            f"Profil ``{profile}`` ne devrait pas inclure over_normalization."
        )
# --------------------------------------------------------------------------
# Fonction pure aggregate_over_normalization (rétrocompat)
# --------------------------------------------------------------------------
def test_pure_aggregate_empty_list_returns_zero() -> None:
    """Check the pure helper's result for an empty document list.

    With no documents the score is expected to be ``None`` and both
    counters zero (backward-compatible contract of the utility function).
    """
    expected_empty = {
        "score": None,
        "total_correct_ocr_words": 0,
        "over_normalized_count": 0,
    }
    assert aggregate_over_normalization([]) == expected_empty
def test_pure_aggregate_sums_counts() -> None:
    """Check that aggregation sums the raw counters and recomputes the score.

    Two per-document results plus a ``None`` entry are passed in; the
    expected output counts only the two real results.
    """
    per_document = [
        OverNormalizationResult(
            total_correct_ocr_words=correct_words,
            over_normalized_count=flagged,
        )
        for correct_words, flagged in ((100, 10), (50, 5))
    ]

    # The trailing None stands for a document without a result; the expected
    # document_count of 2 shows it must be ignored.
    aggregated = aggregate_over_normalization([*per_document, None])

    assert aggregated == {
        "score": 0.1,  # 15 / 150
        "total_correct_ocr_words": 150,
        "over_normalized_count": 15,
        "document_count": 2,
    }
# --------------------------------------------------------------------------
# Hook décoré — extraction depuis DocumentResult.pipeline_metadata
# --------------------------------------------------------------------------
def _make_dr(doc_id: str, over_norm_dict: dict | None) -> DocumentResult:
    """Build a minimal ``DocumentResult`` fixture for the hook tests.

    Args:
        doc_id: Identifier of the document; also used to derive the fake
            image path ``/tmp/<doc_id>.png``.
        over_norm_dict: Payload stored under the ``"over_normalization"``
            key of ``pipeline_metadata``. When ``None``, the metadata dict
            is left empty.

    Returns:
        A ``DocumentResult`` with zero CER/WER, identical ground truth and
        hypothesis (``"fait"``) and an OCR intermediate of ``"faict"``.
    """
    if over_norm_dict is None:
        metadata = {}
    else:
        metadata = {"over_normalization": over_norm_dict}

    perfect_metrics = MetricsResult(cer=0.0, wer=0.0)
    return DocumentResult(
        doc_id=doc_id,
        image_path=f"/tmp/{doc_id}.png",
        ground_truth="fait",
        hypothesis="fait",
        metrics=perfect_metrics,
        duration_seconds=1.0,
        ocr_intermediate="faict",
        pipeline_metadata=metadata,
    )
def test_hook_returns_none_when_no_pipeline_metadata() -> None:
    """Check the OCR-only case (no LLM stage, empty ``pipeline_metadata``).

    When no document carries over-normalization metadata, the aggregator
    output must not contain the ``aggregated_over_normalization`` key.
    """
    import picarones.evaluation.metrics  # noqa: F401

    ocr_only_docs = [_make_dr(doc_id, None) for doc_id in ("d1", "d2")]
    aggregated = run_corpus_aggregators(PROFILE_FULL, ocr_only_docs)

    assert "aggregated_over_normalization" not in aggregated
def test_hook_aggregates_from_pipeline_metadata() -> None:
    """Check the OCR+LLM case: per-document payloads are extracted and summed.

    Each document stores its payload under
    ``pipeline_metadata["over_normalization"]``; the aggregated counters
    and score are then checked against hand-computed values.
    """
    import picarones.evaluation.metrics  # noqa: F401

    def _payload(score, correct_words, flagged):
        # Per-document over-normalization dict in the shape used by this test.
        return {
            "score": score,
            "total_correct_ocr_words": correct_words,
            "over_normalized_count": flagged,
            "over_normalized_passages": [],
        }

    llm_docs = [
        _make_dr("d1", _payload(0.1, 100, 10)),
        _make_dr("d2", _payload(0.2, 50, 10)),
    ]
    aggregated = run_corpus_aggregators(PROFILE_PHILOLOGICAL, llm_docs)

    assert "aggregated_over_normalization" in aggregated
    summary = aggregated["aggregated_over_normalization"]

    # 20 over-normalized / 150 correct OCR words = 0.1333...
    assert summary["over_normalized_count"] == 20
    assert summary["total_correct_ocr_words"] == 150
    assert summary["document_count"] == 2
    assert 0.13 < summary["score"] < 0.14
def test_hook_resilient_to_malformed_dict() -> None:
    """Check that a malformed per-document payload does not break aggregation.

    One document has a valid payload, one has a non-numeric
    ``total_correct_ocr_words`` and one has no payload at all. The
    aggregator is expected to skip the malformed one (with a warning, which
    this test does not assert) and still return a result.
    """
    import picarones.evaluation.metrics  # noqa: F401

    valid_payload = {"total_correct_ocr_words": 100, "over_normalized_count": 5}
    malformed_payload = {"total_correct_ocr_words": "garbage", "over_normalized_count": 0}
    mixed_docs = [
        _make_dr("d1", valid_payload),
        _make_dr("d2", malformed_payload),
        _make_dr("d3", None),
    ]

    aggregated = run_corpus_aggregators(PROFILE_FULL, mixed_docs)

    # d1 is valid, so a dict is returned even though d2 is ignored.
    assert "aggregated_over_normalization" in aggregated
    summary = aggregated["aggregated_over_normalization"]
    assert summary["over_normalized_count"] == 5
# --------------------------------------------------------------------------
# Sérialisation EngineReport
# --------------------------------------------------------------------------
def test_engine_report_round_trip_with_over_normalization() -> None:
    """Check that ``aggregated_over_normalization`` survives serialization.

    The field must appear in ``EngineReport.as_dict()`` and be restored
    unchanged by ``EngineReport.from_dict()``.
    """
    over_norm_summary = {
        "score": 0.15,
        "total_correct_ocr_words": 200,
        "over_normalized_count": 30,
        "document_count": 5,
    }
    report = EngineReport(
        engine_name="tesseract+ministral",
        engine_version="5.3.0",
        engine_config={},
        document_results=[],
        aggregated_over_normalization=over_norm_summary,
    )

    serialized = report.as_dict()
    assert serialized["aggregated_over_normalization"]["score"] == 0.15

    restored = EngineReport.from_dict(serialized)
    assert restored.aggregated_over_normalization == report.aggregated_over_normalization