FiberGate / tests /test_inference_postprocess.py
AzizMiladi's picture
fix(ci): make ruff + mypy green on the new src/ layout
dc73111
Raw
History Blame
15.9 kB
"""
Unit tests for the post-processing layer in `4_inference.py`:
- the regex constants (_RE_REFURB, _RE_PHONE_FR, _RE_EMAIL, _RE_INTEGER)
- `_mandat_checkbox_score` + `_detect_mandat_checkbox`
- `_clean_field_extractions` on synthetic raw model outputs
These tests don't load the model — we exercise the pure functions directly.
"""
from __future__ import annotations
import pytest
# ──────────────────────────────────────────────────────────────────────────
# _RE_REFURB — urbanism reference detection
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize(
    ("text", "expected_match"),
    [
        # Valid references: PC / PA / DP prefix followed by a digit body.
        # NOTE(review): the original comment also listed CU, but no CU case
        # is exercised here — confirm whether one should be added.
        ("PC 044 035 25 00035", True),
        ("PC0440352500035", True),
        ("Pc0440352500035", True),  # prefix is expected to match case-insensitively
        ("PA 022 360 22 00027", True),
        ("DP 044 035", True),
        # The French word "rue" must not be mistaken for an "RU" prefix.
        ("rue Abbé Guinard", False),
        # Ordinary words starting with "Pa" must not be mistaken for "PA".
        ("Parcelle", False),
        ("Paysagiste Bureau de contrôle", False),
        # Empty input.
        ("", False),
    ],
)
def test_re_refurb_strict_prefix(inference_mod, text, expected_match):
    """Check that `_RE_REFURB.search` matches exactly where expected.

    `inference_mod` is a fixture supplied outside this file; `text` is the
    string searched and `expected_match` says whether a match must be found.
    """
    found = inference_mod._RE_REFURB.search(text) is not None
    assert found is expected_match
# ──────────────────────────────────────────────────────────────────────────
# _RE_PHONE_FR — French phone number patterns
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize(
    ("text", "has_match"),
    [
        # Positives: compact, space-, dot- and dash-separated numbers.
        ("Tel : 0670934655 disponible", True),
        ("06 85 46 87 86 Mail", True),
        ("06.85.46.87.86", True),
        ("07-85-62-03-00", True),
        # Negatives.
        ("Code postal 44240", False),  # 5 digits, not a 10-digit phone
        ("1234", False),
        ("01 02", False),  # too short
    ],
)
def test_re_phone_fr(inference_mod, text, has_match):
    """Check that `_RE_PHONE_FR.search` finds a number only when expected."""
    found = inference_mod._RE_PHONE_FR.search(text) is not None
    assert found is has_match
# ──────────────────────────────────────────────────────────────────────────
# _RE_EMAIL — email validation
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize(
    ("text", "has_match"),
    [
        # Positives: dotted, hyphenated and "+tag" local parts.
        ("sebastien.gue@orange.com", True),
        ("immobilier.be-orange@orange.com", True),
        ("marine.pascalin+test@orange.com", True),
        # Negatives: no "@", missing local part, missing domain.
        ("Pas un email", False),
        ("@orange.com sans prefix", False),
        ("user@", False),
    ],
)
def test_re_email(inference_mod, text, has_match):
    """Check that `_RE_EMAIL.search` finds an address only when expected."""
    found = inference_mod._RE_EMAIL.search(text) is not None
    assert found is has_match
# ──────────────────────────────────────────────────────────────────────────
# _mandat_checkbox_score — strict scorer for OCR-rendered checkbox markers
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize(
    ("marker", "expected_min_score"),
    [
        # Strong: explicit X.
        ("[X]", 5),
        ("X", 5),
        ("PX", 5),  # OCR misread of [X]
        ("FX", 5),
        # Strong: digit (Tesseract often reads X as 1 or 9).
        ("C1]", 3),
        ("[1]", 3),
        ("9", 3),
        # Mark-like multi-character sequences.
        ("**[]", 3),
        # Orphan bracket.
        ("C]", 2),
    ],
)
def test_mandat_score_strong(inference_mod, marker, expected_min_score):
    """Each marker must score at least the minimum paired with it."""
    score = inference_mod._mandat_checkbox_score(marker)
    assert score >= expected_min_score
@pytest.mark.parametrize(
    "marker",
    [
        "",  # empty
        "[]",  # canonical empty box
        "()",
        "D",  # single letter (Tesseract often reads [] as D)
        "O",
        "Q",
        "!",  # single punctuation: was the PF0442 bug, must score 0
        "si",  # OCR noise: was the PF0442 bug, must score 0
        "DA",  # two random letters
    ],
)
def test_mandat_score_weak_or_empty(inference_mod, marker):
    """Ambiguous OCR garble is not evidence of an X-mark and must score 0."""
    score = inference_mod._mandat_checkbox_score(marker)
    assert score == 0
# ──────────────────────────────────────────────────────────────────────────
# _detect_mandat_checkbox — full pipeline on synthetic OCR strings
# ──────────────────────────────────────────────────────────────────────────
def test_detect_mandat_oui_clear(inference_mod):
    """A clear `[X]` after OUI and an empty `[]` after NON must yield "OUI"."""
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI [X] / NON [] si oui fournir le mandat"
    )
    detected = inference_mod._detect_mandat_checkbox(ocr)
    assert detected == "OUI"
def test_detect_mandat_non_clear(inference_mod):
    """An empty `[]` after OUI and a clear `[X]` after NON must yield "NON"."""
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI [] / NON [X] si oui fournir le mandat"
    )
    detected = inference_mod._detect_mandat_checkbox(ocr)
    assert detected == "NON"
def test_detect_mandat_oui_garbled(inference_mod):
    """Real OCR pattern from PF0090002500001: '[X]' is rendered as 'C1]'.

    The garbled marker after OUI must still be detected as a tick.
    """
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI C1] / NON [] si oui"
    )
    detected = inference_mod._detect_mandat_checkbox(ocr)
    assert detected == "OUI"
def test_detect_mandat_ambiguous_returns_none(inference_mod):
    """The PF0442 case: both markers are weak (`!` vs `si`).

    The detector must return None rather than commit on a coin flip.
    """
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI ! / NON si oui fournir le mandat"
    )
    detected = inference_mod._detect_mandat_checkbox(ocr)
    assert detected is None
def test_detect_mandat_no_anchor(inference_mod):
    """Without 'mandat' / 'ouvrage' / 'dispose' nearby, return None.

    An unrelated OUI/NON pair (here the urbanism-authorisation question)
    must not be mistaken for the mandat checkbox.
    """
    ocr = (
        "Autorisation d'urbanisme requise : "
        "OUI [X] / NON [] indiquer la référence"
    )
    detected = inference_mod._detect_mandat_checkbox(ocr)
    assert detected is None
def test_detect_mandat_picks_right_pair(inference_mod):
    """Real form layout: the AU OUI/NON pair comes BEFORE the mandat pair.

    The detector must skip the AU pair (which is ticked NON) and report the
    mandat pair (ticked OUI via the garbled marker 'C1]').
    """
    au_question = "Autorisation d'Urbanisme OUI [] / NON [X] indiquer la référence ..."
    filler = " Coordonnées du futur syndic ..."
    mandat_question = (
        " Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI C1] / NON [] si oui"
    )
    ocr = au_question + filler + mandat_question
    detected = inference_mod._detect_mandat_checkbox(ocr)
    assert detected == "OUI"
# ──────────────────────────────────────────────────────────────────────────
# _clean_field_extractions — end-to-end cleaner behaviour
# ──────────────────────────────────────────────────────────────────────────
def _ext(inference_mod, value, conf=0.9):
    """Build a `FieldExtraction` from `inference_mod` for use as test input.

    `value` is passed as the extraction's `value` and `conf` (default 0.9)
    as its `confidence`.
    """
    extraction_cls = inference_mod.FieldExtraction
    return extraction_cls(value=value, confidence=conf)
def test_clean_strips_trailing_noise_from_name(inference_mod):
    """Trailing role keywords must be stripped from a representative's name.

    The model output bundles the name with 'Conseiller Neuf Mobile'; the
    cleaner should keep the field, keep the first name, and drop the
    'Conseiller' and 'Mobile' keywords.
    """
    field = "Representant_Nom_Complet"
    raw = {
        field: _ext(
            inference_mod,
            "GUE Sébastien Conseiller Neuf Mobile",
            conf=0.62,
        )
    }
    cleaned = inference_mod._clean_field_extractions(raw, ocr_text="")
    assert field in cleaned
    val = cleaned[field].value
    assert "Conseiller" not in val
    assert "Mobile" not in val
    assert "Sébastien" in val
def test_clean_extracts_phone_from_noisy_span(inference_mod):
    """A phone number followed by the stray word 'Mail' must be cleaned.

    The cleaned value has to start with the phone number and must no longer
    contain 'Mail'.
    """
    model_output = {
        "Representant_Telephone": _ext(inference_mod, "06 85 46 87 86 Mail"),
    }
    result = inference_mod._clean_field_extractions(model_output, ocr_text="")
    assert result["Representant_Telephone"].value.startswith("06 85 46 87 86")
    assert "Mail" not in result["Representant_Telephone"].value
def test_clean_extracts_pc_code_from_bundled_text(inference_mod):
    """The PC code must be pulled out of a span bundled with other tokens.

    Given 'Vv01092025 OPERATION PC0651002500019', the cleaned value must
    contain the PC code and must not contain the leading 'Vv' token.
    """
    model_output = {
        "Reference_Urbanisme": _ext(
            inference_mod, "Vv01092025 OPERATION PC0651002500019"
        ),
    }
    result = inference_mod._clean_field_extractions(model_output, ocr_text="")
    assert "PC0651002500019" in result["Reference_Urbanisme"].value
    assert "Vv" not in result["Reference_Urbanisme"].value
def test_clean_drops_low_confidence_freetext_fields(inference_mod):
    """A low-confidence free-text field must be dropped entirely.

    Free-text fields (cabinet_conseil, Batiment_Adresse,
    Representant_Nom_Complet) with confidence < 0.40 are typically the model
    hallucinating on uncertain inputs. Only `cabinet_conseil` (confidence
    0.22) is exercised here.
    """
    low_confidence = _ext(inference_mod, "pour Vu la demande", conf=0.22)
    model_output = {"cabinet_conseil": low_confidence}
    result = inference_mod._clean_field_extractions(model_output, ocr_text="")
    assert "cabinet_conseil" not in result
def test_clean_email_backstop_from_ocr_text(inference_mod):
    """With no model output, a valid email in the OCR text fills the field."""
    ocr = "Email: test.user@orange.com Tel: 0670934655"
    result = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert "Representant_Email" in result
    assert result["Representant_Email"].value == "test.user@orange.com"
def test_clean_logement_total_backstop_from_ocr(inference_mod):
    """`nb_log_totale` is recovered from the OCR text when the model omits it.

    The backstop is expected to read the value from the form text
    'logements/locaux/lots : 1'.
    """
    form_text = "".join(
        [
            "Nb total de Nb total de lots : Nb total de macrolots : ",
            "logements/locaux/lots : 1 Nb total de macrolots <= 3 logements : Dont",
        ]
    )
    result = inference_mod._clean_field_extractions({}, ocr_text=form_text)
    assert result.get("nb_log_totale") is not None
    assert result["nb_log_totale"].value == "1"
def test_clean_disposition_mandat_uses_checkbox_detector(inference_mod):
    """With no model output, `Disposition_Mandat` is filled from the OCR text.

    The OCR text carries a clear `OUI [X] / NON []` mandat checkbox, so the
    cleaned result must contain `Disposition_Mandat` with value "OUI".

    NOTE(review): the cleaner is also meant to prefer the checkbox result
    over a model-supplied value, but this test passes an empty model output
    and therefore does not exercise that precedence.
    """
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI [X] / NON [] si oui"
    )
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert cleaned.get("Disposition_Mandat") is not None
    assert cleaned["Disposition_Mandat"].value == "OUI"
def test_clean_disposition_mandat_dropped_when_ambiguous(inference_mod):
    """The PF0442 case: both markers are ambiguous, so the field is dropped.

    The consultant then flags it via manual_review at engine level.
    """
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI ! / NON si oui fournir le mandat"
    )
    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert "Disposition_Mandat" not in cleaned
# ──────────────────────────────────────────────────────────────────────────
# Batiment_Adresse — stopword stripping + OCR backstop
# ──────────────────────────────────────────────────────────────────────────
def test_address_regex_matches_typical_french_addresses(inference_mod):
    """`_RE_ADDR_FR.search` must find a match in every typical address."""
    pattern = inference_mod._RE_ADDR_FR
    samples = (
        "10 rue de Cotalard, 44240 La Chapelle-sur-Erdre",
        "Adresse 1 rue Abbé Guinard 44100",
        "350 BIS AVENUE J R G GAUTIER, 13290 AIX EN PROVENCE",
        "Sis à 5 avenue de la Gare 31000 Toulouse",
    )
    for sample in samples:
        # The sample is the failure message so the offending input is visible.
        assert pattern.search(sample), sample
def test_address_regex_rejects_non_addresses(inference_mod):
    """`_RE_ADDR_FR.search` must return None for text that is not an address."""
    address_re = inference_mod._RE_ADDR_FR
    non_addresses = (
        "PC0440352500035",  # urbanism ref
        "FICHE DE RENSEIGNEMENT",  # form header
        "Tel mobile 0670123456",  # phone
    )
    for text in non_addresses:
        assert address_re.search(text) is None
def test_clean_address_strips_form_header_noise(inference_mod):
    """Header text bundled with an address is stripped, not the whole field.

    A real model output prefixes the address with "MAITRE D'OUVRAGE"; the
    cleaner should keep the field, remove the header, and keep the street.
    """
    bundled = "MAITRE D'OUVRAGE / PROPRIETAIRE 10 rue de Cotalard, 44240 La Chapelle"
    model_output = {"Batiment_Adresse": _ext(inference_mod, bundled, conf=0.8)}
    result = inference_mod._clean_field_extractions(model_output, ocr_text="")
    assert "Batiment_Adresse" in result
    address = result["Batiment_Adresse"].value
    # Compare upper-cased with apostrophes removed, as the original check does.
    normalised = address.upper().replace("'", "")
    assert "MAITRE" not in normalised
    assert "Cotalard" in address
def test_clean_address_dropped_when_only_headers(inference_mod):
    """A span made only of header noise must still be dropped.

    The drop is expected to come from the length check after stopword
    stripping, not from a blanket rejection of any span containing a
    stopword.
    """
    headers_only = (
        "FICHE DESCRIPTION MAITRE D'OUVRAGE / MAITRE D'OEUVRE / CABINET CONSEIL BUREAU"
    )
    model_output = {"Batiment_Adresse": _ext(inference_mod, headers_only, conf=0.4)}
    result = inference_mod._clean_field_extractions(model_output, ocr_text="")
    # Once the stopwords are stripped only "/" separators remain, so the
    # field is dropped.
    assert "Batiment_Adresse" not in result
def test_clean_address_backstop_from_ocr(inference_mod):
    """With no model output, an address found in the OCR text fills the field."""
    ocr_parts = [
        "DESCRIPTION DE L'OPERATION ... ",
        "Adresse: 10 rue de Cotalard, 44240 La Chapelle-sur-Erdre ... ",
        "DLPI: 01/09/2026",
    ]
    result = inference_mod._clean_field_extractions({}, ocr_text="".join(ocr_parts))
    assert "Batiment_Adresse" in result
    assert "Cotalard" in result["Batiment_Adresse"].value
def test_clean_address_backstop_no_match_leaves_empty(inference_mod):
    """No recognisable address in the OCR text means no address is fabricated."""
    ocr = "Reference PC1234 DLPI 01/09/2026 random text"
    result = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert "Batiment_Adresse" not in result