File size: 7,986 Bytes
bd5c812
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
"""Phase 3.4 audit code-quality — la sur-normalisation LLM est
désormais agrégée automatiquement via le registre
:mod:`picarones.evaluation.metric_hooks`.

Avant la Phase 3.4, ``aggregate_over_normalization`` existait dans
``picarones/evaluation/metrics/over_normalization.py`` mais :

- n'avait aucun ``@register_corpus_aggregator`` ;
- le module n'était même pas importé par ``evaluation/metrics/__init__.py``
  (mentionné en docstring uniquement) ;
- ``synthetic.py`` réimplémentait l'agrégation manuellement
  (duplication silencieuse).

Le hook ``_aggregate_over_normalization_hook`` (auto-enregistré)
extrait désormais l'info depuis
``DocumentResult.pipeline_metadata["over_normalization"]`` et
alimente ``EngineReport.aggregated_over_normalization`` pour les
profils ``philological``, ``diagnostics`` et ``full``.
"""

from __future__ import annotations

from picarones.evaluation.benchmark_result import DocumentResult, EngineReport
from picarones.evaluation.metric_hooks import (
    PROFILE_DIAGNOSTICS,
    PROFILE_FULL,
    PROFILE_MINIMAL,
    PROFILE_PHILOLOGICAL,
    PROFILE_STANDARD,
    _all_corpus_aggregator_names,
    run_corpus_aggregators,
    select_corpus_aggregators,
)
from picarones.evaluation.metric_result import MetricsResult
from picarones.evaluation.metrics.over_normalization import (
    OverNormalizationResult,
    aggregate_over_normalization,
)


# --------------------------------------------------------------------------
# Auto-registration
# --------------------------------------------------------------------------


def test_over_normalization_aggregator_is_registered() -> None:
    """Check that importing ``picarones.evaluation.metrics`` registers
    the ``over_normalization`` corpus aggregator.

    The assertion looks the name up in the list returned by
    ``_all_corpus_aggregator_names()``.
    """
    # Imported only for its side effect (triggering the registration).
    import picarones.evaluation.metrics  # noqa: F401

    registered_names = _all_corpus_aggregator_names()
    failure_hint = (
        "Le hook ``_aggregate_over_normalization_hook`` n'est pas "
        "enregistré.  Vérifier que ``over_normalization`` est dans "
        "``picarones/evaluation/metrics/__init__.py`` (Phase 3.4)."
    )
    assert "over_normalization" in registered_names, failure_hint


def test_aggregator_in_correct_profiles() -> None:
    """Check which profiles select the ``over_normalization`` aggregator.

    It must be selected for the ``philological``, ``diagnostics`` and
    ``full`` profiles, and must not be selected for ``minimal`` or
    ``standard``.
    """
    # Imported only for its side effect (triggering the registration).
    import picarones.evaluation.metrics  # noqa: F401

    def selected_names(profile):
        # Names of every corpus aggregator selected for ``profile``.
        return [agg.name for agg in select_corpus_aggregators(profile)]

    profiles_with = (PROFILE_PHILOLOGICAL, PROFILE_DIAGNOSTICS, PROFILE_FULL)
    profiles_without = (PROFILE_MINIMAL, PROFILE_STANDARD)

    for profile in profiles_with:
        assert "over_normalization" in selected_names(profile), (
            f"Profil ``{profile}`` n'inclut pas l'agrégateur over_normalization."
        )

    for profile in profiles_without:
        assert "over_normalization" not in selected_names(profile), (
            f"Profil ``{profile}`` ne devrait pas inclure over_normalization."
        )


# --------------------------------------------------------------------------
# Pure function aggregate_over_normalization (backward compatibility)
# --------------------------------------------------------------------------


def test_pure_aggregate_empty_list_returns_zero() -> None:
    """Check the pure helper on an empty list (backward compatibility).

    With no documents, the expected result has a ``None`` score and both
    counters set to zero.
    """
    expected = {
        "score": None,
        "total_correct_ocr_words": 0,
        "over_normalized_count": 0,
    }
    assert aggregate_over_normalization([]) == expected


def test_pure_aggregate_sums_counts() -> None:
    """Check that aggregation sums the raw counters and derives the
    score from those sums.

    Two results (100/10 and 50/5) plus a ``None`` entry are passed in;
    the expected output counts two documents, so the ``None`` entry is
    expected to be ignored.
    """
    per_document = [
        OverNormalizationResult(
            total_correct_ocr_words=100,
            over_normalized_count=10,
        ),
        OverNormalizationResult(
            total_correct_ocr_words=50,
            over_normalized_count=5,
        ),
        None,  # expected to be ignored by the aggregation
    ]
    expected = {
        "score": 0.1,  # 15 / 150
        "total_correct_ocr_words": 150,
        "over_normalized_count": 15,
        "document_count": 2,
    }
    aggregated = aggregate_over_normalization(per_document)
    assert aggregated == expected


# --------------------------------------------------------------------------
# Decorated hook — extraction from DocumentResult.pipeline_metadata
# --------------------------------------------------------------------------


def _make_dr(
    doc_id: str,
    over_norm_dict: dict | None,
) -> DocumentResult:
    """Build a minimal ``DocumentResult`` fixture for the hook tests.

    Args:
        doc_id: Document identifier; also used to build the image path
            ``/tmp/<doc_id>.png``.
        over_norm_dict: Value stored under the ``"over_normalization"``
            key of ``pipeline_metadata``. When ``None``, the metadata
            dict is left empty.

    Returns:
        A ``DocumentResult`` with fixed ground truth, hypothesis, metrics,
        duration and OCR intermediate text.
    """
    metadata: dict = {}
    if over_norm_dict is not None:
        metadata["over_normalization"] = over_norm_dict

    return DocumentResult(
        doc_id=doc_id,
        image_path=f"/tmp/{doc_id}.png",
        ground_truth="fait",
        hypothesis="fait",
        metrics=MetricsResult(cer=0.0, wer=0.0),
        duration_seconds=1.0,
        ocr_intermediate="faict",
        pipeline_metadata=metadata,
    )


def test_hook_returns_none_when_no_pipeline_metadata() -> None:
    """Check the OCR-only case (no LLM step, empty ``pipeline_metadata``).

    When no document carries over-normalization metadata, the output of
    ``run_corpus_aggregators`` must not contain the
    ``aggregated_over_normalization`` key.
    """
    # Imported only for its side effect (triggering the registration).
    import picarones.evaluation.metrics  # noqa: F401

    documents = [_make_dr(doc_id, None) for doc_id in ("d1", "d2")]
    aggregated = run_corpus_aggregators(PROFILE_FULL, documents)
    assert "aggregated_over_normalization" not in aggregated


def test_hook_aggregates_from_pipeline_metadata() -> None:
    """Check the OCR+LLM case: ``pipeline_metadata["over_normalization"]``
    is read from each document and aggregated.

    Two documents (100/10 and 50/10) are expected to aggregate to 150
    correct OCR words, 20 over-normalized words, 2 documents and a score
    of about 0.1333.
    """
    # Imported only for its side effect (triggering the registration).
    import picarones.evaluation.metrics  # noqa: F401

    payload_by_doc = {
        "d1": {
            "score": 0.1,
            "total_correct_ocr_words": 100,
            "over_normalized_count": 10,
            "over_normalized_passages": [],
        },
        "d2": {
            "score": 0.2,
            "total_correct_ocr_words": 50,
            "over_normalized_count": 10,
            "over_normalized_passages": [],
        },
    }
    documents = [
        _make_dr(doc_id, payload) for doc_id, payload in payload_by_doc.items()
    ]

    aggregated = run_corpus_aggregators(PROFILE_PHILOLOGICAL, documents)
    assert "aggregated_over_normalization" in aggregated

    summary = aggregated["aggregated_over_normalization"]
    # 20 over-normalized / 150 correct OCR = 0.1333
    assert summary["over_normalized_count"] == 20
    assert summary["total_correct_ocr_words"] == 150
    assert summary["document_count"] == 2
    score = summary["score"]
    assert 0.13 < score < 0.14


def test_hook_resilient_to_malformed_dict() -> None:
    """Check that a malformed ``pipeline_metadata["over_normalization"]``
    entry does not make the aggregator fail.

    One document has usable counters, one has a non-numeric
    ``total_correct_ocr_words`` and one has no metadata at all. The
    aggregated result must still be present and must count only the 5
    over-normalized words of the usable document. This test does not
    assert on any warning being emitted.
    """
    # Imported only for its side effect (triggering the registration).
    import picarones.evaluation.metrics  # noqa: F401

    usable = {"total_correct_ocr_words": 100, "over_normalized_count": 5}
    malformed = {"total_correct_ocr_words": "garbage", "over_normalized_count": 0}
    documents = [
        _make_dr("d1", usable),
        _make_dr("d2", malformed),
        _make_dr("d3", None),
    ]

    aggregated = run_corpus_aggregators(PROFILE_FULL, documents)
    # d1 is valid, so the aggregator returns a dict even though d2 is ignored.
    assert "aggregated_over_normalization" in aggregated
    summary = aggregated["aggregated_over_normalization"]
    assert summary["over_normalized_count"] == 5


# --------------------------------------------------------------------------
# EngineReport serialization
# --------------------------------------------------------------------------


def test_engine_report_round_trip_with_over_normalization() -> None:
    """Check that ``aggregated_over_normalization`` survives an
    ``as_dict`` / ``from_dict`` round trip on ``EngineReport``."""
    over_normalization = {
        "score": 0.15,
        "total_correct_ocr_words": 200,
        "over_normalized_count": 30,
        "document_count": 5,
    }
    report = EngineReport(
        engine_name="tesseract+ministral",
        engine_version="5.3.0",
        engine_config={},
        document_results=[],
        aggregated_over_normalization=over_normalization,
    )

    serialized = report.as_dict()
    assert serialized["aggregated_over_normalization"]["score"] == 0.15

    restored = EngineReport.from_dict(serialized)
    assert (
        restored.aggregated_over_normalization
        == report.aggregated_over_normalization
    )