"""Phase 3.4 audit code-quality — la sur-normalisation LLM est
désormais agrégée automatiquement via le registre
:mod:`picarones.evaluation.metric_hooks`.
Avant la Phase 3.4, ``aggregate_over_normalization`` existait dans
``picarones/evaluation/metrics/over_normalization.py`` mais :
- n'avait aucun ``@register_corpus_aggregator`` ;
- le module n'était même pas importé par ``evaluation/metrics/__init__.py``
(mentionné en docstring uniquement) ;
- ``synthetic.py`` réimplémentait l'agrégation manuellement
(duplication silencieuse).
Le hook ``_aggregate_over_normalization_hook`` (auto-enregistré)
extrait désormais l'info depuis
``DocumentResult.pipeline_metadata["over_normalization"]`` et
alimente ``EngineReport.aggregated_over_normalization`` pour les
profils ``philological``, ``diagnostics`` et ``full``.
"""
from __future__ import annotations
from picarones.evaluation.benchmark_result import DocumentResult, EngineReport
from picarones.evaluation.metric_hooks import (
PROFILE_DIAGNOSTICS,
PROFILE_FULL,
PROFILE_MINIMAL,
PROFILE_PHILOLOGICAL,
PROFILE_STANDARD,
_all_corpus_aggregator_names,
run_corpus_aggregators,
select_corpus_aggregators,
)
from picarones.evaluation.metric_result import MetricsResult
from picarones.evaluation.metrics.over_normalization import (
OverNormalizationResult,
aggregate_over_normalization,
)
# --------------------------------------------------------------------------
# Auto-enregistrement
# --------------------------------------------------------------------------
def test_over_normalization_aggregator_is_registered() -> None:
    """Check that importing ``picarones.evaluation.metrics`` registers
    the ``over_normalization`` corpus aggregator."""
    # Imported purely for its side effect (triggering registration).
    import picarones.evaluation.metrics  # noqa: F401

    registered_names = _all_corpus_aggregator_names()
    failure_hint = (
        "Le hook ``_aggregate_over_normalization_hook`` n'est pas "
        "enregistré. Vérifier que ``over_normalization`` est dans "
        "``picarones/evaluation/metrics/__init__.py`` (Phase 3.4)."
    )
    assert "over_normalization" in registered_names, failure_hint
def test_aggregator_in_correct_profiles() -> None:
    """Check which profiles select the ``over_normalization`` aggregator.

    It must be selected for ``philological``, ``diagnostics`` and
    ``full``, and must not be selected for ``minimal`` or ``standard``.
    """
    import picarones.evaluation.metrics  # noqa: F401

    def selected_names(profile_id):
        # Names of every corpus aggregator selected for the given profile.
        return [agg.name for agg in select_corpus_aggregators(profile_id)]

    enabled_profiles = (PROFILE_PHILOLOGICAL, PROFILE_DIAGNOSTICS, PROFILE_FULL)
    disabled_profiles = (PROFILE_MINIMAL, PROFILE_STANDARD)

    for profile in enabled_profiles:
        assert "over_normalization" in selected_names(profile), (
            f"Profil ``{profile}`` n'inclut pas l'agrégateur over_normalization."
        )

    for profile in disabled_profiles:
        assert "over_normalization" not in selected_names(profile), (
            f"Profil ``{profile}`` ne devrait pas inclure over_normalization."
        )
# --------------------------------------------------------------------------
# Fonction pure aggregate_over_normalization (rétrocompat)
# --------------------------------------------------------------------------
def test_pure_aggregate_empty_list_returns_zero() -> None:
    """With no documents, the pure helper must report a ``None`` score
    and zeroed counters (backward-compatibility check)."""
    expected = {
        "score": None,
        "total_correct_ocr_words": 0,
        "over_normalized_count": 0,
    }
    assert aggregate_over_normalization([]) == expected
def test_pure_aggregate_sums_counts() -> None:
    """Aggregation must sum the raw counters and recompute the score
    from the totals."""
    first = OverNormalizationResult(
        total_correct_ocr_words=100,
        over_normalized_count=10,
    )
    second = OverNormalizationResult(
        total_correct_ocr_words=50,
        over_normalized_count=5,
    )
    # The trailing ``None`` entry is expected to be ignored.
    per_document = [first, second, None]

    expected = {
        "score": 0.1,  # 15 / 150
        "total_correct_ocr_words": 150,
        "over_normalized_count": 15,
        "document_count": 2,
    }
    assert aggregate_over_normalization(per_document) == expected
# --------------------------------------------------------------------------
# Hook décoré — extraction depuis DocumentResult.pipeline_metadata
# --------------------------------------------------------------------------
def _make_dr(
    doc_id: str,
    over_norm_dict: dict | None,
) -> DocumentResult:
    """Build a minimal ``DocumentResult`` fixture for the hook tests.

    When ``over_norm_dict`` is ``None`` the document gets an empty
    ``pipeline_metadata``; otherwise the dict is stored under the
    ``"over_normalization"`` key.
    """
    if over_norm_dict is None:
        metadata = {}
    else:
        metadata = {"over_normalization": over_norm_dict}

    perfect_metrics = MetricsResult(cer=0.0, wer=0.0)
    return DocumentResult(
        doc_id=doc_id,
        image_path=f"/tmp/{doc_id}.png",
        ground_truth="fait",
        hypothesis="fait",
        metrics=perfect_metrics,
        duration_seconds=1.0,
        ocr_intermediate="faict",
        pipeline_metadata=metadata,
    )
def test_hook_returns_none_when_no_pipeline_metadata() -> None:
    """OCR-only benchmark (no LLM): the documents carry no
    over-normalization metadata, so the aggregators' output must not
    contain an ``aggregated_over_normalization`` entry."""
    import picarones.evaluation.metrics  # noqa: F401

    ocr_only_docs = [_make_dr(doc_id, None) for doc_id in ("d1", "d2")]
    aggregated = run_corpus_aggregators(PROFILE_FULL, ocr_only_docs)
    assert "aggregated_over_normalization" not in aggregated
def test_hook_aggregates_from_pipeline_metadata() -> None:
    """OCR+LLM pipeline: ``pipeline_metadata["over_normalization"]`` is
    read from each document and aggregated over the corpus."""
    import picarones.evaluation.metrics  # noqa: F401

    first_payload = {
        "score": 0.1,
        "total_correct_ocr_words": 100,
        "over_normalized_count": 10,
        "over_normalized_passages": [],
    }
    second_payload = {
        "score": 0.2,
        "total_correct_ocr_words": 50,
        "over_normalized_count": 10,
        "over_normalized_passages": [],
    }
    corpus = [_make_dr("d1", first_payload), _make_dr("d2", second_payload)]

    aggregated = run_corpus_aggregators(PROFILE_PHILOLOGICAL, corpus)
    assert "aggregated_over_normalization" in aggregated

    summary = aggregated["aggregated_over_normalization"]
    # 20 over-normalized words out of 150 correct OCR words, i.e. ~0.1333.
    assert summary["over_normalized_count"] == 20
    assert summary["total_correct_ocr_words"] == 150
    assert summary["document_count"] == 2
    assert 0.13 < summary["score"] < 0.14
def test_hook_resilient_to_malformed_dict() -> None:
    """A malformed ``pipeline_metadata["over_normalization"]`` entry
    (missing field, non-castable value) must not make the aggregator fail.

    The malformed document is expected to be skipped; the original intent
    also mentions a warning, which this test does not check.
    """
    import picarones.evaluation.metrics  # noqa: F401

    valid_payload = {"total_correct_ocr_words": 100, "over_normalized_count": 5}
    broken_payload = {"total_correct_ocr_words": "garbage", "over_normalized_count": 0}
    corpus = [
        _make_dr("d1", valid_payload),
        _make_dr("d2", broken_payload),
        _make_dr("d3", None),
    ]

    aggregated = run_corpus_aggregators(PROFILE_FULL, corpus)

    # d1 is valid, so a result dict is still produced even though d2 is dropped.
    assert "aggregated_over_normalization" in aggregated
    summary = aggregated["aggregated_over_normalization"]
    assert summary["over_normalized_count"] == 5
# --------------------------------------------------------------------------
# Sérialisation EngineReport
# --------------------------------------------------------------------------
def test_engine_report_round_trip_with_over_normalization() -> None:
    """The ``aggregated_over_normalization`` field must survive an
    ``as_dict`` / ``from_dict`` round trip."""
    aggregated = {
        "score": 0.15,
        "total_correct_ocr_words": 200,
        "over_normalized_count": 30,
        "document_count": 5,
    }
    report = EngineReport(
        engine_name="tesseract+ministral",
        engine_version="5.3.0",
        engine_config={},
        document_results=[],
        aggregated_over_normalization=aggregated,
    )

    serialized = report.as_dict()
    assert serialized["aggregated_over_normalization"]["score"] == 0.15

    restored = EngineReport.from_dict(serialized)
    assert (
        restored.aggregated_over_normalization
        == report.aggregated_over_normalization
    )
|