Spaces:
Configuration error
Configuration error
File size: 15,925 Bytes
33ddb61 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 | """
Unit tests for the post-processing layer in `4_inference.py`:
- the regex constants (_RE_REFURB, _RE_PHONE_FR, _RE_EMAIL, _RE_INTEGER)
- `_mandat_checkbox_score` + `_detect_mandat_checkbox`
- `_clean_field_extractions` on synthetic raw model outputs
These tests don't load the model — we exercise the pure functions directly.
"""
from __future__ import annotations
import pytest
# ──────────────────────────────────────────────────────────────────────────
# _RE_REFURB — urbanism reference detection
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize("text, expected_match", [
    # Should match (valid PC / PA / DP / CU + digit body)
    ("PC 044 035 25 00035", True),
    ("PC0440352500035", True),
    ("Pc0440352500035", True),  # case-insensitive prefix
    ("PA 022 360 22 00027", True),
    ("DP 044 035", True),
    # Should NOT match — French word "rue" must not trigger RU prefix
    ("rue Abbé Guinard", False),
    # Should NOT match — "Parcelle" must not trigger PA prefix
    ("Parcelle", False),
    ("Paysagiste Bureau de contrôle", False),
    # Empty
    ("", False),
])
def test_re_refurb_strict_prefix(inference_mod, text, expected_match):
    """Check that `_RE_REFURB.search` matches exactly the flagged inputs.

    The French sample strings were mis-decoded (UTF-8 read as a Thai
    single-byte code page); they are restored to the accented text so the
    negative cases exercise genuine French words.
    """
    m = inference_mod._RE_REFURB.search(text)
    assert (m is not None) is expected_match
# ──────────────────────────────────────────────────────────────────────────
# _RE_PHONE_FR — French phone number patterns
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize(
    ("text", "has_match"),
    [
        ("Tel : 0670934655 disponible", True),
        ("06 85 46 87 86 Mail", True),
        ("06.85.46.87.86", True),
        ("07-85-62-03-00", True),
        # Negatives
        ("Code postal 44240", False),  # 5 digits ≠ 10-digit phone
        ("1234", False),
        ("01 02", False),  # too short
    ],
)
def test_re_phone_fr(inference_mod, text, has_match):
    """`_RE_PHONE_FR.search` finds a match only in the positive samples."""
    found = inference_mod._RE_PHONE_FR.search(text) is not None
    assert found is has_match
# ──────────────────────────────────────────────────────────────────────────
# _RE_EMAIL — email validation
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize(
    ("text", "has_match"),
    [
        ("sebastien.gue@orange.com", True),
        ("immobilier.be-orange@orange.com", True),
        ("marine.pascalin+test@orange.com", True),
        # Negatives
        ("Pas un email", False),
        ("@orange.com sans prefix", False),
        ("user@", False),
    ],
)
def test_re_email(inference_mod, text, has_match):
    """`_RE_EMAIL.search` matches the valid addresses and none of the negatives."""
    hit = inference_mod._RE_EMAIL.search(text)
    matched = hit is not None
    assert matched is has_match
# ──────────────────────────────────────────────────────────────────────────
# _mandat_checkbox_score — strict scorer for OCR-rendered checkbox markers
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize(
    ("marker", "expected_min_score"),
    [
        # Strong: explicit X
        ("[X]", 5),
        ("X", 5),
        ("PX", 5),  # OCR misread of [X]
        ("FX", 5),
        # Strong: digit (Tesseract often reads X as 1 or 9)
        ("C1]", 3),
        ("[1]", 3),
        ("9", 3),
        # Mark-like multi-chars
        ("**[]", 3),
        # Orphan bracket
        ("C]", 2),
    ],
)
def test_mandat_score_strong(inference_mod, marker, expected_min_score):
    """Each marker must score at least the minimum listed beside it."""
    score = inference_mod._mandat_checkbox_score(marker)
    assert score >= expected_min_score
@pytest.mark.parametrize(
    "marker",
    [
        "",  # empty
        "[]",  # canonical empty box
        "()",
        "D",  # single letter (Tesseract often reads [] as D)
        "O",
        "Q",
        "!",  # single punctuation — was the PF0442 bug, must score 0
        "si",  # OCR noise — was the PF0442 bug, must score 0
        "DA",  # two random letters
    ],
)
def test_mandat_score_weak_or_empty(inference_mod, marker):
    """Ambiguous OCR garble is not evidence of an X-mark: every marker
    listed here must score exactly 0."""
    score = inference_mod._mandat_checkbox_score(marker)
    assert score == 0
# ──────────────────────────────────────────────────────────────────────────
# _detect_mandat_checkbox — full pipeline on synthetic OCR strings
# ──────────────────────────────────────────────────────────────────────────
def test_detect_mandat_oui_clear(inference_mod):
    """A clearly ticked OUI box (`[X]`) next to an empty NON box yields "OUI".

    The OCR sample's accented characters were mis-decoded; they are restored
    to the real French wording of the form.
    """
    ocr = "Je dispose d'un mandat de représentation du Maître d'ouvrage : OUI [X] / NON [] si oui fournir le mandat"
    assert inference_mod._detect_mandat_checkbox(ocr) == "OUI"
def test_detect_mandat_non_clear(inference_mod):
    """A clearly ticked NON box (`[X]`) next to an empty OUI box yields "NON".

    The OCR sample's accented characters were mis-decoded; they are restored
    to the real French wording of the form.
    """
    ocr = "Je dispose d'un mandat de représentation du Maître d'ouvrage : OUI [] / NON [X] si oui fournir le mandat"
    assert inference_mod._detect_mandat_checkbox(ocr) == "NON"
def test_detect_mandat_oui_garbled(inference_mod):
    """Real OCR pattern from PF0090002500001: '[X]' becomes 'C1]'.

    The detector must still answer "OUI". The sample's accented characters
    were mis-decoded and are restored here; the 'C1]' marker is intentional
    OCR noise and is kept as-is.
    """
    ocr = "Je dispose d'un mandat de représentation du Maître d'ouvrage : OUI C1] / NON [] si oui"
    assert inference_mod._detect_mandat_checkbox(ocr) == "OUI"
def test_detect_mandat_ambiguous_returns_none(inference_mod):
    """The PF0442 case: both markers are weak (`!` vs `si`). Return None
    rather than commit on a coin flip.

    The sample's accented characters were mis-decoded and are restored here.
    """
    ocr = "Je dispose d'un mandat de représentation du Maître d'ouvrage : OUI ! / NON si oui fournir le mandat"
    assert inference_mod._detect_mandat_checkbox(ocr) is None
def test_detect_mandat_no_anchor(inference_mod):
    """No 'mandat' / 'ouvrage' / 'dispose' keywords nearby — return None
    rather than match an unrelated OUI/NON pair (e.g., the AU question).

    The sample's accented characters were mis-decoded and are restored here.
    """
    ocr = "Autorisation d'urbanisme requise : OUI [X] / NON [] indiquer la référence"
    assert inference_mod._detect_mandat_checkbox(ocr) is None
def test_detect_mandat_picks_right_pair(inference_mod):
    """Real form: AU question (OUI/NON) comes BEFORE mandat (OUI/NON).
    Detector must skip the AU pair and find the mandat one.

    The AU pair is ticked NON while the mandat pair is ticked OUI, so a
    detector that latched onto the first pair would return the wrong answer.
    The sample's accented characters were mis-decoded and are restored here.
    """
    ocr = (
        "Autorisation d'Urbanisme OUI [] / NON [X] indiquer la référence ..."
        " Coordonnées du futur syndic ..."
        " Je dispose d'un mandat de représentation du Maître d'ouvrage : OUI C1] / NON [] si oui"
    )
    assert inference_mod._detect_mandat_checkbox(ocr) == "OUI"
# ──────────────────────────────────────────────────────────────────────────
# _clean_field_extractions — end-to-end cleaner behaviour
# ──────────────────────────────────────────────────────────────────────────
def _ext(inference_mod, value, conf=0.9):
    """Build an `inference_mod.FieldExtraction` holding `value`, with `conf`
    passed as its `confidence` (0.9 unless overridden)."""
    make_extraction = inference_mod.FieldExtraction
    return make_extraction(value=value, confidence=conf)
def test_clean_strips_trailing_noise_from_name(inference_mod):
    """Model returns 'GUE Sébastien Conseiller Neuf Mobile' — cleaner should
    keep the name and drop the trailing role keywords.

    The accented first name was mis-decoded in both the input and the
    asserted substring; it is restored to 'Sébastien' in both places.
    """
    raw = {"Representant_Nom_Complet": _ext(inference_mod, "GUE Sébastien Conseiller Neuf Mobile", conf=0.62)}
    cleaned = inference_mod._clean_field_extractions(raw, ocr_text="")
    assert "Representant_Nom_Complet" in cleaned
    val = cleaned["Representant_Nom_Complet"].value
    assert "Conseiller" not in val
    assert "Mobile" not in val
    assert "Sébastien" in val
def test_clean_extracts_phone_from_noisy_span(inference_mod):
    """The model span is a phone number followed by the stray word 'Mail';
    the cleaned value must start with the phone number and not contain 'Mail'."""
    field = "Representant_Telephone"
    model_output = {field: _ext(inference_mod, "06 85 46 87 86 Mail")}
    cleaned = inference_mod._clean_field_extractions(model_output, ocr_text="")
    assert cleaned[field].value.startswith("06 85 46 87 86")
    assert "Mail" not in cleaned[field].value
def test_clean_extracts_pc_code_from_bundled_text(inference_mod):
    """The model span bundles 'Vv01092025 OPERATION' with the PC code; the
    cleaned value must contain the PC code and no 'Vv' prefix."""
    field = "Reference_Urbanisme"
    model_output = {field: _ext(inference_mod, "Vv01092025 OPERATION PC0651002500019")}
    cleaned = inference_mod._clean_field_extractions(model_output, ocr_text="")
    assert "PC0651002500019" in cleaned[field].value
    assert "Vv" not in cleaned[field].value
def test_clean_drops_low_confidence_freetext_fields(inference_mod):
    """Free-text fields (cabinet_conseil, Batiment_Adresse,
    Representant_Nom_Complet) with confidence < 0.40 should be dropped
    entirely — they're typically the model hallucinating on uncertain
    inputs. Exercised here with cabinet_conseil at confidence 0.22."""
    low_confidence = _ext(inference_mod, "pour Vu la demande", conf=0.22)
    cleaned = inference_mod._clean_field_extractions(
        {"cabinet_conseil": low_confidence},
        ocr_text="",
    )
    assert "cabinet_conseil" not in cleaned
def test_clean_email_backstop_from_ocr_text(inference_mod):
    """Model returned nothing for email, but the OCR text has a valid
    email — the backstop is expected to fill Representant_Email from it."""
    ocr = "Email: test.user@orange.com Tel: 0670934655"
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert "Representant_Email" in cleaned
    email = cleaned["Representant_Email"]
    assert email.value == "test.user@orange.com"
def test_clean_logement_total_backstop_from_ocr(inference_mod):
    """`nb_log_totale` not extracted by the model — the backstop is expected
    to read it from the form text 'logements/locaux/lots : 1'."""
    form_text = (
        "Nb total de Nb total de lots : Nb total de macrolots : "
        "logements/locaux/lots : 1 Nb total de macrolots <= 3 logements : Dont"
    )
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=form_text)
    assert cleaned.get("nb_log_totale") is not None
    total = cleaned["nb_log_totale"]
    assert total.value == "1"
def test_clean_disposition_mandat_uses_checkbox_detector(inference_mod):
    """The cleaner's Disposition_Mandat handling should call the checkbox
    detector and prefer its result over any model-supplied value.

    This test passes an empty model output, so it only checks that the field
    is filled from the OCR text; precedence over a conflicting model value
    is not exercised here. The sample's accented characters were mis-decoded
    and are restored.
    """
    ocr = "Je dispose d'un mandat de représentation du Maître d'ouvrage : OUI [X] / NON [] si oui"
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert cleaned.get("Disposition_Mandat") is not None
    assert cleaned["Disposition_Mandat"].value == "OUI"
def test_clean_disposition_mandat_dropped_when_ambiguous(inference_mod):
    """The PF0442 case — both markers ambiguous — field dropped entirely,
    consultant flags it via manual_review at engine level.

    The sample's accented characters were mis-decoded and are restored here.
    """
    ocr = "Je dispose d'un mandat de représentation du Maître d'ouvrage : OUI ! / NON si oui fournir le mandat"
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert "Disposition_Mandat" not in cleaned
# ──────────────────────────────────────────────────────────────────────────
# Batiment_Adresse — stopword stripping + OCR backstop
# ──────────────────────────────────────────────────────────────────────────
def test_address_regex_matches_typical_french_addresses(inference_mod):
    """`_RE_ADDR_FR.search` must find a match in each typical French address.

    Two samples had mis-decoded accented characters ('Abbé', 'à'); they are
    restored so the regex is tested against real French text.
    """
    pattern = inference_mod._RE_ADDR_FR
    assert pattern.search("10 rue de Cotalard, 44240 La Chapelle-sur-Erdre")
    assert pattern.search("Adresse 1 rue Abbé Guinard 44100")
    assert pattern.search("350 BIS AVENUE J R G GAUTIER, 13290 AIX EN PROVENCE")
    assert pattern.search("Sis à 5 avenue de la Gare 31000 Toulouse")
def test_address_regex_rejects_non_addresses(inference_mod):
    """`_RE_ADDR_FR.search` must return None for strings that are not addresses."""
    search = inference_mod._RE_ADDR_FR.search
    non_addresses = (
        "PC0440352500035",  # urbanism ref
        "FICHE DE RENSEIGNEMENT",  # form header
        "Tel mobile 0670123456",  # phone
    )
    for sample in non_addresses:
        assert search(sample) is None
def test_clean_address_strips_form_header_noise(inference_mod):
    """A real model output bundles MAITRE D'OUVRAGE with the address —
    the header should be stripped, not the whole field rejected."""
    field = "Batiment_Adresse"
    bundled = _ext(
        inference_mod,
        "MAITRE D'OUVRAGE / PROPRIETAIRE 10 rue de Cotalard, 44240 La Chapelle",
        conf=0.8,
    )
    cleaned = inference_mod._clean_field_extractions({field: bundled}, ocr_text="")
    assert field in cleaned
    val = cleaned[field].value
    # Compare case-insensitively and with apostrophes removed.
    normalised = val.upper().replace("'", "")
    assert "MAITRE" not in normalised
    assert "Cotalard" in val
def test_clean_address_dropped_when_only_headers(inference_mod):
    """If the entire span is header noise with no real address content,
    the field should still be dropped — but via a length check, not a
    blanket rejection of every span containing a stopword."""
    field = "Batiment_Adresse"
    # NOTE(review): conf=0.4 presumably sits at, not below, the cleaner's
    # low-confidence cutoff, so the drop comes from the length check — confirm.
    header_only = _ext(
        inference_mod,
        "FICHE DESCRIPTION MAITRE D'OUVRAGE / MAITRE D'OEUVRE / CABINET CONSEIL BUREAU",
        conf=0.4,
    )
    cleaned = inference_mod._clean_field_extractions({field: header_only}, ocr_text="")
    # After stripping all the stopwords, only "/" separators remain — dropped
    assert field not in cleaned
def test_clean_address_backstop_from_ocr(inference_mod):
    """Model returned nothing for Batiment_Adresse — the OCR text contains
    an address, and the regex backstop is expected to fill it in."""
    field = "Batiment_Adresse"
    ocr_text = (
        "DESCRIPTION DE L'OPERATION ... "
        "Adresse: 10 rue de Cotalard, 44240 La Chapelle-sur-Erdre ... "
        "DLPI: 01/09/2026"
    )
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr_text)
    assert field in cleaned
    assert "Cotalard" in cleaned[field].value
def test_clean_address_backstop_no_match_leaves_empty(inference_mod):
    """If the OCR has no recognisable address pattern, the cleaner must not
    fabricate a Batiment_Adresse."""
    ocr_without_address = "Reference PC1234 DLPI 01/09/2026 random text"
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr_without_address)
    assert "Batiment_Adresse" not in cleaned
|