"""Unit tests for the post-processing layer in `4_inference.py`.

Covered here:
  - the regex constants (_RE_REFURB, _RE_PHONE_FR, _RE_EMAIL, _RE_INTEGER)
  - `_mandat_checkbox_score` + `_detect_mandat_checkbox`
  - `_clean_field_extractions` on synthetic raw model outputs

These tests don't load the model — the pure functions are exercised directly
through the `inference_mod` argument (presumably a fixture defined outside
this file that exposes the inference module).
"""
from __future__ import annotations

import pytest


# --------------------------------------------------------------------------
# _RE_REFURB — urbanism reference detection
# --------------------------------------------------------------------------

@pytest.mark.parametrize(
    ("text", "expected_match"),
    [
        # Expected to match: valid PC / PA / DP / CU prefix + digit body
        ("PC 044 035 25 00035", True),
        ("PC0440352500035", True),
        ("Pc0440352500035", True),  # case-insensitive prefix
        ("PA 022 360 22 00027", True),
        ("DP 044 035", True),
        # Expected NOT to match — the French word "rue" must not trigger an RU prefix
        ("rue Abbé Guinard", False),
        # Expected NOT to match — "Parcelle" must not trigger the PA prefix
        ("Parcelle", False),
        ("Paysagiste Bureau de contrôle", False),
        # Empty input
        ("", False),
    ],
)
def test_re_refurb_strict_prefix(inference_mod, text, expected_match):
    """`_RE_REFURB.search` finds a hit exactly when `expected_match` is True."""
    found = inference_mod._RE_REFURB.search(text) is not None
    assert found is expected_match


# --------------------------------------------------------------------------
# _RE_PHONE_FR — French phone number patterns
# --------------------------------------------------------------------------

@pytest.mark.parametrize(
    ("text", "has_match"),
    [
        ("Tel : 0670934655 disponible", True),
        ("06 85 46 87 86 Mail", True),
        ("06.85.46.87.86", True),
        ("07-85-62-03-00", True),
        # Negatives
        ("Code postal 44240", False),  # 5 digits is not a 10-digit phone
        ("1234", False),
        ("01 02", False),  # too short
    ],
)
def test_re_phone_fr(inference_mod, text, has_match):
    """`_RE_PHONE_FR.search` finds a hit exactly when `has_match` is True."""
    found = inference_mod._RE_PHONE_FR.search(text) is not None
    assert found is has_match


# --------------------------------------------------------------------------
# _RE_EMAIL — email validation
# --------------------------------------------------------------------------

@pytest.mark.parametrize(
    ("text", "has_match"),
    [
        ("sebastien.gue@orange.com", True),
        ("immobilier.be-orange@orange.com", True),
        ("marine.pascalin+test@orange.com", True),
        # Negatives
        ("Pas un email", False),
        ("@orange.com sans prefix", False),
        ("user@", False),
    ],
)
def test_re_email(inference_mod, text, has_match):
    """`_RE_EMAIL.search` finds a hit exactly when `has_match` is True."""
    found = inference_mod._RE_EMAIL.search(text) is not None
    assert found is has_match


# --------------------------------------------------------------------------
# _mandat_checkbox_score — strict scorer for OCR-rendered checkbox markers
# --------------------------------------------------------------------------

@pytest.mark.parametrize(
    ("marker", "expected_min_score"),
    [
        # Strong: explicit X
        ("[X]", 5),
        ("X", 5),
        ("PX", 5),  # OCR misread of [X]
        ("FX", 5),
        # Strong: digit (Tesseract often reads X as 1 or 9)
        ("C1]", 3),
        ("[1]", 3),
        ("9", 3),
        # Mark-like multi-chars
        ("**[]", 3),
        # Orphan bracket
        ("C]", 2),
    ],
)
def test_mandat_score_strong(inference_mod, marker, expected_min_score):
    """Markers that look like a ticked box must reach at least the listed score."""
    score = inference_mod._mandat_checkbox_score(marker)
    assert score >= expected_min_score


@pytest.mark.parametrize(
    "marker",
    [
        "",  # empty
        "[]",  # canonical empty box
        "()",
        "D",  # single letter (Tesseract often reads [] as D)
        "O",
        "Q",
        "!",  # single punctuation — was the PF0442 bug, must score 0
        "si",  # OCR noise — was the PF0442 bug, must score 0
        "DA",  # two random letters
    ],
)
def test_mandat_score_weak_or_empty(inference_mod, marker):
    """Ambiguous OCR garble is not evidence of an X-mark and must score 0."""
    score = inference_mod._mandat_checkbox_score(marker)
    assert score == 0


# --------------------------------------------------------------------------
# _detect_mandat_checkbox — full pipeline on synthetic OCR strings
# --------------------------------------------------------------------------

def test_detect_mandat_oui_clear(inference_mod):
    """A cleanly ticked OUI box is reported as "OUI"."""
    ocr_text = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI [X] / NON [] si oui fournir le mandat"
    )
    detected = inference_mod._detect_mandat_checkbox(ocr_text)
    assert detected == "OUI"


def test_detect_mandat_non_clear(inference_mod):
    """A cleanly ticked NON box is reported as "NON"."""
    ocr_text = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI [] / NON [X] si oui fournir le mandat"
    )
    detected = inference_mod._detect_mandat_checkbox(ocr_text)
    assert detected == "NON"


def test_detect_mandat_oui_garbled(inference_mod):
    """Real OCR pattern from PF0090002500001: '[X]' becomes 'C1]'."""
    ocr_text = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI C1] / NON [] si oui"
    )
    detected = inference_mod._detect_mandat_checkbox(ocr_text)
    assert detected == "OUI"


def test_detect_mandat_ambiguous_returns_none(inference_mod):
    """The PF0442 case: both markers are weak (`!` vs `si`).

    The detector is expected to return None rather than commit on a coin flip.
    """
    ocr_text = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI ! / NON si oui fournir le mandat"
    )
    detected = inference_mod._detect_mandat_checkbox(ocr_text)
    assert detected is None


def test_detect_mandat_no_anchor(inference_mod):
    """No 'mandat' / 'ouvrage' / 'dispose' keywords nearby → None.

    An unrelated OUI/NON pair (e.g. the AU question) must not be matched.
    """
    ocr_text = "Autorisation d'urbanisme requise : OUI [X] / NON [] indiquer la référence"
    detected = inference_mod._detect_mandat_checkbox(ocr_text)
    assert detected is None


def test_detect_mandat_picks_right_pair(inference_mod):
    """Real form: the AU question (OUI/NON) comes BEFORE the mandat one.

    The detector must skip the AU pair and use the mandat pair.
    """
    ocr_text = (
        "Autorisation d'Urbanisme OUI [] / NON [X] indiquer la référence ..."
        " Coordonnées du futur syndic ..."
        " Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI C1] / NON [] si oui"
    )
    detected = inference_mod._detect_mandat_checkbox(ocr_text)
    assert detected == "OUI"


# --------------------------------------------------------------------------
# _clean_field_extractions — end-to-end cleaner behaviour
# --------------------------------------------------------------------------

def _ext(inference_mod, value, conf=0.9):
    """Build a `FieldExtraction` carrying `value` with confidence `conf`."""
    extraction_cls = inference_mod.FieldExtraction
    return extraction_cls(value=value, confidence=conf)


def test_clean_strips_trailing_noise_from_name(inference_mod):
    """Model returns 'GUE Sébastien Conseiller Neuf Mobile'.

    The cleaner should keep the name and drop the trailing role keywords.
    """
    field = "Representant_Nom_Complet"
    extraction = _ext(inference_mod, "GUE Sébastien Conseiller Neuf Mobile", conf=0.62)
    result = inference_mod._clean_field_extractions({field: extraction}, ocr_text="")
    assert field in result
    name = result[field].value
    assert "Conseiller" not in name
    assert "Mobile" not in name
    assert "Sébastien" in name


def test_clean_extracts_phone_from_noisy_span(inference_mod):
    """Model returns a phone followed by the stray word 'Mail'.

    The cleaner should keep only the phone digits.
    """
    field = "Representant_Telephone"
    payload = {field: _ext(inference_mod, "06 85 46 87 86 Mail")}
    result = inference_mod._clean_field_extractions(payload, ocr_text="")
    assert result[field].value.startswith("06 85 46 87 86")
    assert "Mail" not in result[field].value


def test_clean_extracts_pc_code_from_bundled_text(inference_mod):
    """Model returns 'Vv01092025 OPERATION PC0651002500019'.

    The cleaner should extract just the PC code.
    """
    field = "Reference_Urbanisme"
    payload = {field: _ext(inference_mod, "Vv01092025 OPERATION PC0651002500019")}
    result = inference_mod._clean_field_extractions(payload, ocr_text="")
    assert "PC0651002500019" in result[field].value
    assert "Vv" not in result[field].value


def test_clean_drops_low_confidence_freetext_fields(inference_mod):
    """Low-confidence free-text fields are expected to be dropped entirely.

    Applies to cabinet_conseil, Batiment_Adresse and Representant_Nom_Complet
    with confidence < 0.40 — typically the model hallucinating on uncertain
    inputs.
    """
    payload = {"cabinet_conseil": _ext(inference_mod, "pour Vu la demande", conf=0.22)}
    result = inference_mod._clean_field_extractions(payload, ocr_text="")
    assert "cabinet_conseil" not in result


def test_clean_email_backstop_from_ocr_text(inference_mod):
    """No email from the model, but the OCR has a valid one → backstop fills it in."""
    ocr_text = "Email: test.user@orange.com Tel: 0670934655"
    result = inference_mod._clean_field_extractions({}, ocr_text=ocr_text)
    assert "Representant_Email" in result
    assert result["Representant_Email"].value == "test.user@orange.com"


def test_clean_logement_total_backstop_from_ocr(inference_mod):
    """`nb_log_totale` not extracted by the model.

    The backstop should read it from the form text 'logements/locaux/lots : 1'.
    """
    ocr_text = (
        "Nb total de Nb total de lots : Nb total de macrolots : "
        "logements/locaux/lots : 1 Nb total de macrolots <= 3 logements : Dont"
    )
    result = inference_mod._clean_field_extractions({}, ocr_text=ocr_text)
    assert result.get("nb_log_totale") is not None
    assert result["nb_log_totale"].value == "1"


def test_clean_disposition_mandat_uses_checkbox_detector(inference_mod):
    """Disposition_Mandat is expected to come from the checkbox detector.

    The detector's result should be preferred over any model-supplied value.
    """
    ocr_text = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI [X] / NON [] si oui"
    )
    result = inference_mod._clean_field_extractions({}, ocr_text=ocr_text)
    assert result.get("Disposition_Mandat") is not None
    assert result["Disposition_Mandat"].value == "OUI"


def test_clean_disposition_mandat_dropped_when_ambiguous(inference_mod):
    """The PF0442 case — both markers ambiguous → field dropped entirely.

    The consultant flags it via manual_review at engine level.
    """
    ocr_text = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI ! / NON si oui fournir le mandat"
    )
    result = inference_mod._clean_field_extractions({}, ocr_text=ocr_text)
    assert "Disposition_Mandat" not in result


# --------------------------------------------------------------------------
# Batiment_Adresse — stopword stripping + OCR backstop
# --------------------------------------------------------------------------

def test_address_regex_matches_typical_french_addresses(inference_mod):
    """`_RE_ADDR_FR` must hit on each of these typical French addresses."""
    addr_re = inference_mod._RE_ADDR_FR
    samples = (
        "10 rue de Cotalard, 44240 La Chapelle-sur-Erdre",
        "Adresse 1 rue Abbé Guinard 44100",
        "350 BIS AVENUE J R G GAUTIER, 13290 AIX EN PROVENCE",
        "Sis à 5 avenue de la Gare 31000 Toulouse",
    )
    for sample in samples:
        assert addr_re.search(sample)


def test_address_regex_rejects_non_addresses(inference_mod):
    """`_RE_ADDR_FR` must not hit on references, headers or phone numbers."""
    addr_re = inference_mod._RE_ADDR_FR
    non_addresses = (
        "PC0440352500035",  # urbanism ref
        "FICHE DE RENSEIGNEMENT",  # form header
        "Tel mobile 0670123456",  # phone
    )
    for sample in non_addresses:
        assert addr_re.search(sample) is None


def test_clean_address_strips_form_header_noise(inference_mod):
    """A real model output bundles MAITRE D'OUVRAGE with the address.

    The header should be stripped, not the whole field rejected.
    """
    field = "Batiment_Adresse"
    extraction = _ext(
        inference_mod,
        "MAITRE D'OUVRAGE / PROPRIETAIRE 10 rue de Cotalard, 44240 La Chapelle",
        conf=0.8,
    )
    result = inference_mod._clean_field_extractions({field: extraction}, ocr_text="")
    assert field in result
    address = result[field].value
    assert "MAITRE" not in address.upper().replace("'", "")
    assert "Cotalard" in address


def test_clean_address_dropped_when_only_headers(inference_mod):
    """A span made only of header noise should still be dropped.

    The drop is expected to come from a length check, not from blanket
    rejection of every span containing a stopword.
    """
    field = "Batiment_Adresse"
    extraction = _ext(
        inference_mod,
        "FICHE DESCRIPTION MAITRE D'OUVRAGE / MAITRE D'OEUVRE / CABINET CONSEIL BUREAU",
        conf=0.4,
    )
    result = inference_mod._clean_field_extractions({field: extraction}, ocr_text="")
    # After stripping all the stopwords, only "/" separators remain → dropped
    assert field not in result


def test_clean_address_backstop_from_ocr(inference_mod):
    """No Batiment_Adresse from the model, but the OCR text contains an address.

    The regex backstop should fill it in.
    """
    ocr_text = (
        "DESCRIPTION DE L'OPERATION ... "
        "Adresse: 10 rue de Cotalard, 44240 La Chapelle-sur-Erdre ... "
        "DLPI: 01/09/2026"
    )
    result = inference_mod._clean_field_extractions({}, ocr_text=ocr_text)
    assert "Batiment_Adresse" in result
    assert "Cotalard" in result["Batiment_Adresse"].value


def test_clean_address_backstop_no_match_leaves_empty(inference_mod):
    """If the OCR has no recognisable address pattern, don't fabricate one."""
    ocr_text = "Reference PC1234 DLPI 01/09/2026 random text"
    result = inference_mod._clean_field_extractions({}, ocr_text=ocr_text)
    assert "Batiment_Adresse" not in result