File size: 15,925 Bytes
33ddb61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
"""
Unit tests for the post-processing layer in `4_inference.py`:
  - the regex constants (_RE_REFURB, _RE_PHONE_FR, _RE_EMAIL, _RE_INTEGER,
    _RE_ADDR_FR)
  - `_mandat_checkbox_score` + `_detect_mandat_checkbox`
  - `_clean_field_extractions` on synthetic raw model outputs

These tests don't load the model — we exercise the pure functions directly.
"""
from __future__ import annotations

import pytest


# ──────────────────────────────────────────────────────────────────────────
# _RE_REFURB — urbanism reference detection
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize(
    "text, expected_match",
    [
        # Positive cases: a PC / PA / DP prefix followed by a digit body.
        ("PC 044 035 25 00035", True),
        ("PC0440352500035", True),
        ("Pc0440352500035", True),  # prefix written in mixed case
        ("PA 022 360 22 00027", True),
        ("DP 044 035", True),
        # Negative case: the French word "rue" must not be read as an RU prefix.
        ("rue Abbรฉ Guinard", False),
        # Negative cases: ordinary words starting with "Pa" are not PA references.
        ("Parcelle", False),
        ("Paysagiste Bureau de contrรดle", False),
        # Negative case: empty input.
        ("", False),
    ],
)
def test_re_refurb_strict_prefix(inference_mod, text, expected_match):
    """Check that `_RE_REFURB.search(text)` hits exactly when expected."""
    found = inference_mod._RE_REFURB.search(text) is not None
    assert found is expected_match


# ──────────────────────────────────────────────────────────────────────────
# _RE_PHONE_FR — French phone number patterns
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize(
    "text, has_match",
    [
        # Ten-digit numbers: contiguous, then space-, dot- and dash-separated.
        ("Tel : 0670934655 disponible", True),
        ("06 85 46 87 86 Mail", True),
        ("06.85.46.87.86", True),
        ("07-85-62-03-00", True),
        # Inputs that must not be taken for a phone number.
        ("Code postal 44240", False),  # five-digit postal code, not ten digits
        ("1234", False),
        ("01 02", False),  # not enough digits
    ],
)
def test_re_phone_fr(inference_mod, text, has_match):
    """Check that `_RE_PHONE_FR.search(text)` hits exactly when expected."""
    found = inference_mod._RE_PHONE_FR.search(text) is not None
    assert found is has_match


# ──────────────────────────────────────────────────────────────────────────
# _RE_EMAIL — email validation
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize(
    "text, has_match",
    [
        # Well-formed addresses, including "-" and "+" in the local part.
        ("sebastien.gue@orange.com", True),
        ("immobilier.be-orange@orange.com", True),
        ("marine.pascalin+test@orange.com", True),
        # Inputs that must not be taken for an email address.
        ("Pas un email", False),
        ("@orange.com sans prefix", False),
        ("user@", False),
    ],
)
def test_re_email(inference_mod, text, has_match):
    """Check that `_RE_EMAIL.search(text)` hits exactly when expected."""
    found = inference_mod._RE_EMAIL.search(text) is not None
    assert found is has_match


# ──────────────────────────────────────────────────────────────────────────
# _mandat_checkbox_score — strict scorer for OCR-rendered checkbox markers
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize(
    "marker, expected_min_score",
    [
        # An explicit X, bare or with OCR-damaged brackets.
        ("[X]", 5),
        ("X", 5),
        ("PX", 5),  # recorded as an OCR misread of [X]
        ("FX", 5),
        # A digit in the box (recorded note: Tesseract often reads X as 1 or 9).
        ("C1]", 3),
        ("[1]", 3),
        ("9", 3),
        # Multi-character, mark-like content.
        ("**[]", 3),
        # Orphan bracket.
        ("C]", 2),
    ],
)
def test_mandat_score_strong(inference_mod, marker, expected_min_score):
    """Check that each marker scores at least the stated minimum."""
    score = inference_mod._mandat_checkbox_score(marker)
    assert score >= expected_min_score


@pytest.mark.parametrize(
    "marker",
    [
        "",  # nothing at all
        "[]",  # canonical unticked box
        "()",
        "D",  # lone letter (recorded note: Tesseract often reads [] as D)
        "O",
        "Q",
        "!",  # lone punctuation; regression case for the PF0442 bug
        "si",  # OCR noise; regression case for the PF0442 bug
        "DA",  # two unrelated letters
    ],
)
def test_mandat_score_weak_or_empty(inference_mod, marker):
    """Check that ambiguous or empty markers score exactly 0.

    None of these strings is treated as evidence of an X-mark.
    """
    score = inference_mod._mandat_checkbox_score(marker)
    assert score == 0


# ──────────────────────────────────────────────────────────────────────────
# _detect_mandat_checkbox — full pipeline on synthetic OCR strings
# ──────────────────────────────────────────────────────────────────────────
def test_detect_mandat_oui_clear(inference_mod):
    """`[X]` after OUI and `[]` after NON must be detected as "OUI"."""
    detected = inference_mod._detect_mandat_checkbox(
        "Je dispose d'un mandat de reprรฉsentation du Maรฎtre d'ouvrage : OUI [X] / NON [] si oui fournir le mandat"
    )
    assert detected == "OUI"


def test_detect_mandat_non_clear(inference_mod):
    """`[]` after OUI and `[X]` after NON must be detected as "NON"."""
    detected = inference_mod._detect_mandat_checkbox(
        "Je dispose d'un mandat de reprรฉsentation du Maรฎtre d'ouvrage : OUI [] / NON [X] si oui fournir le mandat"
    )
    assert detected == "NON"


def test_detect_mandat_oui_garbled(inference_mod):
    """A garbled OUI marker (`C1]` instead of `[X]`) still yields "OUI".

    The pattern is recorded as coming from the real OCR of PF0090002500001.
    """
    detected = inference_mod._detect_mandat_checkbox(
        "Je dispose d'un mandat de reprรฉsentation du Maรฎtre d'ouvrage : OUI C1] / NON [] si oui"
    )
    assert detected == "OUI"


def test_detect_mandat_ambiguous_returns_none(inference_mod):
    """Two weak markers (`!` after OUI, `si` after NON) must yield None.

    This is the PF0442 case: the detector should decline to answer rather
    than pick a side on a coin flip.
    """
    detected = inference_mod._detect_mandat_checkbox(
        "Je dispose d'un mandat de reprรฉsentation du Maรฎtre d'ouvrage : OUI ! / NON si oui fournir le mandat"
    )
    assert detected is None


def test_detect_mandat_no_anchor(inference_mod):
    """An OUI/NON pair with no mandat wording around it must yield None.

    The sample is the urbanism-authorisation (AU) question, which carries a
    ticked OUI box but none of the 'mandat' / 'ouvrage' / 'dispose' keywords.
    """
    detected = inference_mod._detect_mandat_checkbox(
        "Autorisation d'urbanisme requise : OUI [X] / NON [] indiquer la rรฉfรฉrence"
    )
    assert detected is None


def test_detect_mandat_picks_right_pair(inference_mod):
    """With two OUI/NON pairs present, the mandat pair must be the one read.

    The AU question (NON ticked) precedes the mandat question (OUI marked
    with a garbled `C1]`); the expected answer "OUI" can only come from the
    second pair.
    """
    detected = inference_mod._detect_mandat_checkbox(
        "Autorisation d'Urbanisme OUI [] / NON [X] indiquer la rรฉfรฉrence ..."
        " Coordonnรฉes du futur syndic ..."
        " Je dispose d'un mandat de reprรฉsentation du Maรฎtre d'ouvrage : OUI C1] / NON [] si oui"
    )
    assert detected == "OUI"


# ──────────────────────────────────────────────────────────────────────────
# _clean_field_extractions — end-to-end cleaner behaviour
# ──────────────────────────────────────────────────────────────────────────
def _ext(inference_mod, value, conf=0.9):
    """Build a `FieldExtraction` from *value* with confidence *conf*.

    Shorthand used by the cleaner tests to fabricate raw model outputs.
    """
    make_extraction = inference_mod.FieldExtraction
    return make_extraction(value=value, confidence=conf)


def test_clean_strips_trailing_noise_from_name(inference_mod):
    """Role keywords trailing a name must be removed, the name itself kept.

    Input span: 'GUE Sรฉbastien Conseiller Neuf Mobile' at confidence 0.62.
    """
    field = "Representant_Nom_Complet"
    noisy = _ext(inference_mod, "GUE Sรฉbastien Conseiller Neuf Mobile", conf=0.62)
    cleaned = inference_mod._clean_field_extractions({field: noisy}, ocr_text="")
    assert field in cleaned
    name = cleaned[field].value
    for role_word in ("Conseiller", "Mobile"):
        assert role_word not in name
    assert "Sรฉbastien" in name


def test_clean_extracts_phone_from_noisy_span(inference_mod):
    """A phone span with a trailing 'Mail' word must be reduced to the number.

    The cleaned value has to start with the phone digits and must no longer
    contain the stray word.
    """
    raw = {"Representant_Telephone": _ext(inference_mod, "06 85 46 87 86 Mail")}
    cleaned = inference_mod._clean_field_extractions(raw, ocr_text="")
    # Guard first, like the sibling cleaner tests: a dropped field should show
    # up as an assertion failure, not as a KeyError on the lookup below.
    assert "Representant_Telephone" in cleaned
    phone = cleaned["Representant_Telephone"].value
    assert phone.startswith("06 85 46 87 86")
    assert "Mail" not in phone


def test_clean_extracts_pc_code_from_bundled_text(inference_mod):
    """A PC reference bundled with unrelated tokens must be isolated.

    Input span: 'Vv01092025 OPERATION PC0651002500019'. The cleaned value has
    to contain the PC code and must no longer contain the leading 'Vv' token.
    """
    raw = {"Reference_Urbanisme": _ext(inference_mod, "Vv01092025 OPERATION PC0651002500019")}
    cleaned = inference_mod._clean_field_extractions(raw, ocr_text="")
    # Guard first, like the sibling cleaner tests: a dropped field should show
    # up as an assertion failure, not as a KeyError on the lookup below.
    assert "Reference_Urbanisme" in cleaned
    reference = cleaned["Reference_Urbanisme"].value
    assert "PC0651002500019" in reference
    assert "Vv" not in reference


def test_clean_drops_low_confidence_freetext_fields(inference_mod):
    """A low-confidence free-text field must be removed from the output.

    Exercised here with `cabinet_conseil` at confidence 0.22. The original
    note states the rule covers cabinet_conseil, Batiment_Adresse and
    Representant_Nom_Complet below 0.40; only the first is checked.
    """
    doubtful = _ext(inference_mod, "pour Vu la demande", conf=0.22)
    cleaned = inference_mod._clean_field_extractions(
        {"cabinet_conseil": doubtful}, ocr_text=""
    )
    assert "cabinet_conseil" not in cleaned


def test_clean_email_backstop_from_ocr_text(inference_mod):
    """With no model output, an email present in the OCR text is recovered."""
    ocr = "Email: test.user@orange.com Tel: 0670934655"
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert "Representant_Email" in cleaned
    recovered = cleaned["Representant_Email"].value
    assert recovered == "test.user@orange.com"


def test_clean_logement_total_backstop_from_ocr(inference_mod):
    """With no model output, `nb_log_totale` is read from the form text.

    The OCR sample contains 'logements/locaux/lots : 1', so the expected
    value is the string "1".
    """
    cleaned = inference_mod._clean_field_extractions(
        {},
        ocr_text=(
            "Nb total de Nb total de lots : Nb total de macrolots : "
            "logements/locaux/lots : 1 Nb total de macrolots <= 3 logements : Dont"
        ),
    )
    total = cleaned.get("nb_log_totale")
    assert total is not None
    assert total.value == "1"


def test_clean_disposition_mandat_uses_checkbox_detector(inference_mod):
    """With no model output, a ticked OUI box fills `Disposition_Mandat`.

    Only the empty-extraction path is exercised here; no conflicting
    model-supplied value is passed in.
    """
    cleaned = inference_mod._clean_field_extractions(
        {},
        ocr_text="Je dispose d'un mandat de reprรฉsentation du Maรฎtre d'ouvrage : OUI [X] / NON [] si oui",
    )
    mandat = cleaned.get("Disposition_Mandat")
    assert mandat is not None
    assert mandat.value == "OUI"


def test_clean_disposition_mandat_dropped_when_ambiguous(inference_mod):
    """Ambiguous markers on both sides leave `Disposition_Mandat` unset.

    This is the PF0442 case; per the original note the gap is meant to be
    picked up by manual review at engine level.
    """
    cleaned = inference_mod._clean_field_extractions(
        {},
        ocr_text="Je dispose d'un mandat de reprรฉsentation du Maรฎtre d'ouvrage : OUI ! / NON si oui fournir le mandat",
    )
    assert "Disposition_Mandat" not in cleaned


# ──────────────────────────────────────────────────────────────────────────
# Batiment_Adresse — stopword stripping + OCR backstop
# ──────────────────────────────────────────────────────────────────────────
def test_address_regex_matches_typical_french_addresses(inference_mod):
    """`_RE_ADDR_FR` must find a match in each sample street address."""
    samples = (
        "10 rue de Cotalard, 44240 La Chapelle-sur-Erdre",
        "Adresse 1 rue Abbรฉ Guinard 44100",
        "350 BIS AVENUE J R G GAUTIER, 13290 AIX EN PROVENCE",
        "Sis ร  5 avenue de la Gare 31000 Toulouse",
    )
    address_re = inference_mod._RE_ADDR_FR
    for sample in samples:
        assert address_re.search(sample)


def test_address_regex_rejects_non_addresses(inference_mod):
    """`_RE_ADDR_FR` must find nothing in text that is not an address."""
    non_addresses = (
        "PC0440352500035",  # urbanism reference
        "FICHE DE RENSEIGNEMENT",  # form header
        "Tel mobile 0670123456",  # phone number
    )
    address_re = inference_mod._RE_ADDR_FR
    for sample in non_addresses:
        assert address_re.search(sample) is None


def test_clean_address_strips_form_header_noise(inference_mod):
    """A form header glued to an address is stripped; the address survives.

    The span starts with "MAITRE D'OUVRAGE / PROPRIETAIRE" followed by a real
    street address. The field must be kept, without the header wording.
    """
    bundled = _ext(
        inference_mod,
        "MAITRE D'OUVRAGE / PROPRIETAIRE 10 rue de Cotalard, 44240 La Chapelle",
        conf=0.8,
    )
    cleaned = inference_mod._clean_field_extractions(
        {"Batiment_Adresse": bundled}, ocr_text=""
    )
    assert "Batiment_Adresse" in cleaned
    address = cleaned["Batiment_Adresse"].value
    # Compare case-insensitively, with apostrophes removed.
    normalised = address.upper().replace("'", "")
    assert "MAITRE" not in normalised
    assert "Cotalard" in address


def test_clean_address_dropped_when_only_headers(inference_mod):
    """A span made only of header wording leaves no `Batiment_Adresse`.

    Per the original note, the drop is expected to come from a length check
    after stopword stripping, not from rejecting any span that contains a
    stopword.
    """
    headers_only = _ext(
        inference_mod,
        "FICHE DESCRIPTION MAITRE D'OUVRAGE / MAITRE D'OEUVRE / CABINET CONSEIL BUREAU",
        conf=0.4,
    )
    cleaned = inference_mod._clean_field_extractions(
        {"Batiment_Adresse": headers_only}, ocr_text=""
    )
    # Presumably only the "/" separators are left once the stopwords are gone.
    assert "Batiment_Adresse" not in cleaned


def test_clean_address_backstop_from_ocr(inference_mod):
    """With no model output, an address present in the OCR text is recovered."""
    cleaned = inference_mod._clean_field_extractions(
        {},
        ocr_text=(
            "DESCRIPTION DE L'OPERATION ... "
            "Adresse: 10 rue de Cotalard, 44240 La Chapelle-sur-Erdre ... "
            "DLPI: 01/09/2026"
        ),
    )
    assert "Batiment_Adresse" in cleaned
    recovered = cleaned["Batiment_Adresse"].value
    assert "Cotalard" in recovered


def test_clean_address_backstop_no_match_leaves_empty(inference_mod):
    """OCR text without an address pattern must not produce an address."""
    ocr = "Reference PC1234 DLPI 01/09/2026 random text"
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert "Batiment_Adresse" not in cleaned