Spaces:
Sleeping
Sleeping
"""
Unit tests for the post-processing layer in `4_inference.py`:
- the regex constants (_RE_REFURB, _RE_PHONE_FR, _RE_EMAIL, _RE_INTEGER)
- `_mandat_checkbox_score` + `_detect_mandat_checkbox`
- `_clean_field_extractions` on synthetic raw model outputs
These tests don't load the model — we exercise the pure functions directly.
"""
| from __future__ import annotations | |
| import pytest | |
# ──────────────────────────────────────────────────────────────────────────
# _RE_REFURB — urbanism reference detection
# ──────────────────────────────────────────────────────────────────────────
def test_re_refurb_strict_prefix(inference_mod, text, expected_match):
    """Assert that `_RE_REFURB.search(text)` matches exactly when `expected_match` is true.

    NOTE(review): nothing visible here supplies `text` / `expected_match` —
    presumably a `@pytest.mark.parametrize` decorator or fixtures do;
    confirm against the full file.
    """
    found = inference_mod._RE_REFURB.search(text) is not None
    assert found is expected_match
# ──────────────────────────────────────────────────────────────────────────
# _RE_PHONE_FR — French phone number patterns
# ──────────────────────────────────────────────────────────────────────────
def test_re_phone_fr(inference_mod, text, has_match):
    """Assert that `_RE_PHONE_FR.search(text)` matches exactly when `has_match` is true.

    NOTE(review): nothing visible here supplies `text` / `has_match` —
    presumably a `@pytest.mark.parametrize` decorator or fixtures do;
    confirm against the full file.
    """
    found = inference_mod._RE_PHONE_FR.search(text) is not None
    assert found is has_match
# ──────────────────────────────────────────────────────────────────────────
# _RE_EMAIL — email validation
# ──────────────────────────────────────────────────────────────────────────
def test_re_email(inference_mod, text, has_match):
    """Assert that `_RE_EMAIL.search(text)` matches exactly when `has_match` is true.

    NOTE(review): nothing visible here supplies `text` / `has_match` —
    presumably a `@pytest.mark.parametrize` decorator or fixtures do;
    confirm against the full file.
    """
    found = inference_mod._RE_EMAIL.search(text) is not None
    assert found is has_match
# ──────────────────────────────────────────────────────────────────────────
# _mandat_checkbox_score — strict scorer for OCR-rendered checkbox markers
# ──────────────────────────────────────────────────────────────────────────
def test_mandat_score_strong(inference_mod, marker, expected_min_score):
    """Assert that `_mandat_checkbox_score(marker)` reaches at least `expected_min_score`.

    NOTE(review): nothing visible here supplies `marker` /
    `expected_min_score` — presumably a `@pytest.mark.parametrize` decorator
    or fixtures do; confirm against the full file.
    """
    score = inference_mod._mandat_checkbox_score(marker)
    assert score >= expected_min_score
def test_mandat_score_weak_or_empty(inference_mod, marker):
    """Assert that a weak or empty marker scores exactly 0.

    The markers fed to this test are meant to be ambiguous OCR garble rather
    than evidence of an X-mark.

    NOTE(review): nothing visible here supplies `marker` — presumably a
    `@pytest.mark.parametrize` decorator or a fixture does; confirm against
    the full file.
    """
    score = inference_mod._mandat_checkbox_score(marker)
    assert score == 0
# ──────────────────────────────────────────────────────────────────────────
# _detect_mandat_checkbox — full pipeline on synthetic OCR strings
# ──────────────────────────────────────────────────────────────────────────
def test_detect_mandat_oui_clear(inference_mod):
    """A clean `[X]` after OUI on the mandat line must be detected as "OUI"."""
    # French accents restored: the literal was mojibake
    # (UTF-8 bytes decoded with the wrong code page).
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI [X] / NON [] si oui fournir le mandat"
    )
    assert inference_mod._detect_mandat_checkbox(ocr) == "OUI"
def test_detect_mandat_non_clear(inference_mod):
    """A clean `[X]` after NON on the mandat line must be detected as "NON"."""
    # French accents restored: the literal was mojibake
    # (UTF-8 bytes decoded with the wrong code page).
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI [] / NON [X] si oui fournir le mandat"
    )
    assert inference_mod._detect_mandat_checkbox(ocr) == "NON"
def test_detect_mandat_oui_garbled(inference_mod):
    """Real OCR pattern from PF0090002500001: '[X]' becomes 'C1]'.

    The garbled marker after OUI must still be detected as "OUI".
    """
    # French accents restored: the literal was mojibake
    # (UTF-8 bytes decoded with the wrong code page).
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI C1] / NON [] si oui"
    )
    assert inference_mod._detect_mandat_checkbox(ocr) == "OUI"
def test_detect_mandat_ambiguous_returns_none(inference_mod):
    """The PF0442 case: both markers are weak (`!` vs `si`).

    The detector must return None rather than commit on a coin flip.
    """
    # French accents restored: the literal was mojibake
    # (UTF-8 bytes decoded with the wrong code page).
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI ! / NON si oui fournir le mandat"
    )
    assert inference_mod._detect_mandat_checkbox(ocr) is None
def test_detect_mandat_no_anchor(inference_mod):
    """No 'mandat' / 'ouvrage' / 'dispose' keywords nearby: expect None.

    The detector must not match an unrelated OUI/NON pair (e.g., the AU
    question).
    """
    # French accents restored: the literal was mojibake
    # (UTF-8 bytes decoded with the wrong code page).
    ocr = "Autorisation d'urbanisme requise : OUI [X] / NON [] indiquer la référence"
    assert inference_mod._detect_mandat_checkbox(ocr) is None
def test_detect_mandat_picks_right_pair(inference_mod):
    """Real form: the AU question (OUI/NON) comes BEFORE the mandat one.

    The detector must skip the AU pair and report the mandat pair's answer.
    """
    # French accents restored: the literals were mojibake
    # (UTF-8 bytes decoded with the wrong code page).
    ocr = (
        "Autorisation d'Urbanisme OUI [] / NON [X] indiquer la référence ..."
        " Coordonnées du futur syndic ..."
        " Je dispose d'un mandat de représentation du Maître d'ouvrage : OUI C1] / NON [] si oui"
    )
    assert inference_mod._detect_mandat_checkbox(ocr) == "OUI"
# ──────────────────────────────────────────────────────────────────────────
# _clean_field_extractions — end-to-end cleaner behaviour
# ──────────────────────────────────────────────────────────────────────────
def _ext(inference_mod, value, conf=0.9):
    """Build a `FieldExtraction` from the module under test.

    Args:
        inference_mod: module object exposing `FieldExtraction`.
        value: passed through as the `value` keyword.
        conf: passed through as the `confidence` keyword (0.9 by default).
    """
    extraction_cls = inference_mod.FieldExtraction
    return extraction_cls(value=value, confidence=conf)
def test_clean_strips_trailing_noise_from_name(inference_mod):
    """Model returns 'GUE Sébastien Conseiller Neuf Mobile'.

    The cleaner should keep the name and drop the trailing role keywords.
    """
    # French accents restored in both the input and the asserted substring:
    # the literals were mojibake (UTF-8 bytes decoded with the wrong code page).
    field = "Representant_Nom_Complet"
    raw = {
        field: _ext(inference_mod, "GUE Sébastien Conseiller Neuf Mobile", conf=0.62),
    }
    cleaned = inference_mod._clean_field_extractions(raw, ocr_text="")
    assert field in cleaned
    val = cleaned[field].value
    assert "Conseiller" not in val
    assert "Mobile" not in val
    assert "Sébastien" in val
def test_clean_extracts_phone_from_noisy_span(inference_mod):
    """Model returns a phone number followed by the stray word 'Mail'.

    The cleaned value must start with the phone digits and no longer
    contain 'Mail'.
    """
    field = "Representant_Telephone"
    noisy = {field: _ext(inference_mod, "06 85 46 87 86 Mail")}
    cleaned = inference_mod._clean_field_extractions(noisy, ocr_text="")
    assert cleaned[field].value.startswith("06 85 46 87 86")
    assert "Mail" not in cleaned[field].value
def test_clean_extracts_pc_code_from_bundled_text(inference_mod):
    """Model returns 'Vv01092025 OPERATION PC0651002500019'.

    The cleaned value must contain the PC code and must not contain the
    leading 'Vv' noise.
    """
    field = "Reference_Urbanisme"
    bundled = {field: _ext(inference_mod, "Vv01092025 OPERATION PC0651002500019")}
    cleaned = inference_mod._clean_field_extractions(bundled, ocr_text="")
    assert "PC0651002500019" in cleaned[field].value
    assert "Vv" not in cleaned[field].value
def test_clean_drops_low_confidence_freetext_fields(inference_mod):
    """A low-confidence free-text field must be absent from the cleaned output.

    Per the original author's note, free-text fields (cabinet_conseil,
    Batiment_Adresse, Representant_Nom_Complet) with confidence < 0.40 are
    dropped entirely because they are typically hallucinations on uncertain
    inputs. The threshold lives in the module under test, not in this file.
    """
    field = "cabinet_conseil"
    shaky = {field: _ext(inference_mod, "pour Vu la demande", conf=0.22)}
    cleaned = inference_mod._clean_field_extractions(shaky, ocr_text="")
    assert field not in cleaned
def test_clean_email_backstop_from_ocr_text(inference_mod):
    """Model returned nothing for the email, but the OCR text has a valid one.

    The backstop is expected to fill `Representant_Email` from the OCR text.
    """
    ocr = "Email: test.user@orange.com Tel: 0670934655"
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert "Representant_Email" in cleaned
    assert cleaned["Representant_Email"].value == "test.user@orange.com"
def test_clean_logement_total_backstop_from_ocr(inference_mod):
    """`nb_log_totale` was not extracted by the model.

    The backstop is expected to read it from the form text
    'logements/locaux/lots : 1'.
    """
    form_text = (
        "Nb total de Nb total de lots : Nb total de macrolots : "
        "logements/locaux/lots : 1 Nb total de macrolots <= 3 logements : Dont"
    )
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=form_text)
    assert cleaned.get("nb_log_totale") is not None
    assert cleaned["nb_log_totale"].value == "1"
def test_clean_disposition_mandat_uses_checkbox_detector(inference_mod):
    """With no model value, `Disposition_Mandat` must be filled from the OCR checkbox.

    Per the original author's note, the cleaner should call the checkbox
    detector and prefer its result over any model-supplied value. This test
    only exercises the empty-model-output path.
    """
    # French accents restored: the literal was mojibake
    # (UTF-8 bytes decoded with the wrong code page).
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI [X] / NON [] si oui"
    )
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert cleaned.get("Disposition_Mandat") is not None
    assert cleaned["Disposition_Mandat"].value == "OUI"
def test_clean_disposition_mandat_dropped_when_ambiguous(inference_mod):
    """The PF0442 case: both markers are ambiguous, so the field is dropped.

    Per the original author's note, the consultant then flags it via
    manual_review at engine level (not exercised here).
    """
    # French accents restored: the literal was mojibake
    # (UTF-8 bytes decoded with the wrong code page).
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI ! / NON si oui fournir le mandat"
    )
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert "Disposition_Mandat" not in cleaned
# ──────────────────────────────────────────────────────────────────────────
# Batiment_Adresse — stopword stripping + OCR backstop
# ──────────────────────────────────────────────────────────────────────────
def test_address_regex_matches_typical_french_addresses(inference_mod):
    """`_RE_ADDR_FR` must find a match in each sample French address."""
    pattern = inference_mod._RE_ADDR_FR
    # French accents restored ("Abbé", "à"): the literals were mojibake
    # (UTF-8 bytes decoded with the wrong code page).
    assert pattern.search("10 rue de Cotalard, 44240 La Chapelle-sur-Erdre")
    assert pattern.search("Adresse 1 rue Abbé Guinard 44100")
    assert pattern.search("350 BIS AVENUE J R G GAUTIER, 13290 AIX EN PROVENCE")
    assert pattern.search("Sis à 5 avenue de la Gare 31000 Toulouse")
def test_address_regex_rejects_non_addresses(inference_mod):
    """`_RE_ADDR_FR` must find no match in strings that are not addresses."""
    non_addresses = (
        "PC0440352500035",  # urbanism ref
        "FICHE DE RENSEIGNEMENT",  # form header
        "Tel mobile 0670123456",  # phone
    )
    address_re = inference_mod._RE_ADDR_FR
    for sample in non_addresses:
        assert address_re.search(sample) is None
def test_clean_address_strips_form_header_noise(inference_mod):
    """A model output that bundles MAITRE D'OUVRAGE with the address.

    The header should be stripped while the field itself is kept, rather
    than the whole field being rejected.
    """
    field = "Batiment_Adresse"
    bundled = _ext(
        inference_mod,
        "MAITRE D'OUVRAGE / PROPRIETAIRE 10 rue de Cotalard, 44240 La Chapelle",
        conf=0.8,
    )
    cleaned = inference_mod._clean_field_extractions({field: bundled}, ocr_text="")
    assert field in cleaned
    address = cleaned[field].value
    # Compare case-insensitively and without apostrophes.
    normalised = address.upper().replace("'", "")
    assert "MAITRE" not in normalised
    assert "Cotalard" in address
def test_clean_address_dropped_when_only_headers(inference_mod):
    """A span that is entirely header noise must not survive cleaning.

    Per the original author's note, the drop is meant to happen via a
    length check after stopword stripping, not via blanket rejection of
    every span containing a stopword. This test only observes that the
    field is absent.
    """
    field = "Batiment_Adresse"
    headers_only = _ext(
        inference_mod,
        "FICHE DESCRIPTION MAITRE D'OUVRAGE / MAITRE D'OEUVRE / CABINET CONSEIL BUREAU",
        conf=0.4,
    )
    cleaned = inference_mod._clean_field_extractions({field: headers_only}, ocr_text="")
    # Expected: once the stopwords are stripped only "/" separators remain, so it is dropped.
    assert field not in cleaned
def test_clean_address_backstop_from_ocr(inference_mod):
    """Model returned nothing for Batiment_Adresse, but the OCR text has an address.

    The regex backstop is expected to fill the field from the OCR text.
    """
    form_text = (
        "DESCRIPTION DE L'OPERATION ... "
        "Adresse: 10 rue de Cotalard, 44240 La Chapelle-sur-Erdre ... "
        "DLPI: 01/09/2026"
    )
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=form_text)
    assert "Batiment_Adresse" in cleaned
    assert "Cotalard" in cleaned["Batiment_Adresse"].value
def test_clean_address_backstop_no_match_leaves_empty(inference_mod):
    """OCR text with no recognisable address must not yield a fabricated one."""
    addressless_text = "Reference PC1234 DLPI 01/09/2026 random text"
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=addressless_text)
    assert "Batiment_Adresse" not in cleaned