Spaces:

AzizMiladi
/

FiberGate

Sleeping

App Files Files

FiberGate / tests /test_inference_postprocess.py

AzizMiladi

fix(ci): make ruff + mypy green on the new src/ layout

dc73111 about 1 month ago

Raw

History Blame

15.9 kB

	"""
	Unit tests for the post-processing layer in `4_inference.py`:
	- the regex constants (_RE_REFURB, _RE_PHONE_FR, _RE_EMAIL, _RE_INTEGER,
	  _RE_ADDR_FR)
	- `_mandat_checkbox_score` + `_detect_mandat_checkbox`
	- `_clean_field_extractions` on synthetic raw model outputs, including the
	  OCR-text backstops (email, `nb_log_totale`, `Batiment_Adresse`)

	These tests don't load the model — we exercise the pure functions directly.
	"""
	from __future__ import annotations

	import pytest


	# ──────────────────────────────────────────────────────────────────────────
	# _RE_REFURB — urbanism reference detection
	# ──────────────────────────────────────────────────────────────────────────
	@pytest.mark.parametrize(
	    ("text", "expected_match"),
	    [
	        # Positive cases: a PC / PA / DP prefix followed by a digit body.
	        ("PC 044 035 25 00035", True),
	        ("PC0440352500035", True),
	        ("Pc0440352500035", True),  # mixed-case prefix must still match
	        ("PA 022 360 22 00027", True),
	        ("DP 044 035", True),
	        # Negative: the French word "rue" must not be read as an RU prefix.
	        ("rue Abbé Guinard", False),
	        # Negative: words starting with "Pa" must not be read as a PA prefix.
	        ("Parcelle", False),
	        ("Paysagiste Bureau de contrôle", False),
	        # Negative: empty input.
	        ("", False),
	    ],
	)
	def test_re_refurb_strict_prefix(inference_mod, text, expected_match):
	    """`_RE_REFURB.search(text)` finds a match exactly when one is expected."""
	    found = inference_mod._RE_REFURB.search(text) is not None
	    assert found is expected_match


	# ──────────────────────────────────────────────────────────────────────────
	# _RE_PHONE_FR — French phone number patterns
	# ──────────────────────────────────────────────────────────────────────────
	@pytest.mark.parametrize(
	    ("text", "has_match"),
	    [
	        # Positives: bare digits, space-, dot- and dash-separated pairs.
	        ("Tel : 0670934655 disponible", True),
	        ("06 85 46 87 86 Mail", True),
	        ("06.85.46.87.86", True),
	        ("07-85-62-03-00", True),
	        # Negatives
	        ("Code postal 44240", False),  # 5-digit postal code, not a 10-digit phone
	        ("1234", False),
	        ("01 02", False),  # too short
	    ],
	)
	def test_re_phone_fr(inference_mod, text, has_match):
	    """`_RE_PHONE_FR.search(text)` finds a match exactly when `has_match` is True."""
	    found = inference_mod._RE_PHONE_FR.search(text) is not None
	    assert found is has_match


	# ──────────────────────────────────────────────────────────────────────────
	# _RE_EMAIL — email validation
	# ──────────────────────────────────────────────────────────────────────────
	@pytest.mark.parametrize(
	    ("text", "has_match"),
	    [
	        # Positives: dotted, hyphenated and "+tag" local parts.
	        ("sebastien.gue@orange.com", True),
	        ("immobilier.be-orange@orange.com", True),
	        ("marine.pascalin+test@orange.com", True),
	        # Negatives
	        ("Pas un email", False),
	        ("@orange.com sans prefix", False),
	        ("user@", False),
	    ],
	)
	def test_re_email(inference_mod, text, has_match):
	    """`_RE_EMAIL.search(text)` finds a match exactly when `has_match` is True."""
	    found = inference_mod._RE_EMAIL.search(text) is not None
	    assert found is has_match


	# ──────────────────────────────────────────────────────────────────────────
	# _mandat_checkbox_score — strict scorer for OCR-rendered checkbox markers
	# ──────────────────────────────────────────────────────────────────────────
	@pytest.mark.parametrize(
	    ("marker", "expected_min_score"),
	    [
	        # Strong: explicit X
	        ("[X]", 5),
	        ("X", 5),
	        ("PX", 5),  # OCR misread of [X]
	        ("FX", 5),
	        # Strong: digit (Tesseract often reads X as 1 or 9)
	        ("C1]", 3),
	        ("[1]", 3),
	        ("9", 3),
	        # Mark-like multi-chars
	        ("**[]", 3),
	        # Orphan bracket
	        ("C]", 2),
	    ],
	)
	def test_mandat_score_strong(inference_mod, marker, expected_min_score):
	    """Each marker must score at least the minimum listed next to it."""
	    score = inference_mod._mandat_checkbox_score(marker)
	    assert score >= expected_min_score


	@pytest.mark.parametrize(
	    "marker",
	    [
	        "",  # empty
	        "[]",  # canonical empty box
	        "()",
	        "D",  # single letter (Tesseract often reads [] as D)
	        "O",
	        "Q",
	        "!",  # single punctuation — was the PF0442 bug, must score 0
	        "si",  # OCR noise — was the PF0442 bug, must score 0
	        "DA",  # two random letters
	    ],
	)
	def test_mandat_score_weak_or_empty(inference_mod, marker):
	    """Ambiguous OCR garble is not evidence of an X-mark: every marker
	    listed here is expected to score exactly 0."""
	    score = inference_mod._mandat_checkbox_score(marker)
	    assert score == 0


	# ──────────────────────────────────────────────────────────────────────────
	# _detect_mandat_checkbox — full pipeline on synthetic OCR strings
	# ──────────────────────────────────────────────────────────────────────────
	def test_detect_mandat_oui_clear(inference_mod):
	    """A clean '[X]' after OUI and an empty box after NON yields "OUI"."""
	    ocr = (
	        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
	        "OUI [X] / NON [] si oui fournir le mandat"
	    )
	    verdict = inference_mod._detect_mandat_checkbox(ocr)
	    assert verdict == "OUI"


	def test_detect_mandat_non_clear(inference_mod):
	    """An empty box after OUI and a clean '[X]' after NON yields "NON"."""
	    ocr = (
	        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
	        "OUI [] / NON [X] si oui fournir le mandat"
	    )
	    verdict = inference_mod._detect_mandat_checkbox(ocr)
	    assert verdict == "NON"


	def test_detect_mandat_oui_garbled(inference_mod):
	    """OCR pattern seen on PF0090002500001, where '[X]' is rendered as
	    'C1]': the detector must still answer "OUI"."""
	    ocr = (
	        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
	        "OUI C1] / NON [] si oui"
	    )
	    verdict = inference_mod._detect_mandat_checkbox(ocr)
	    assert verdict == "OUI"


	def test_detect_mandat_ambiguous_returns_none(inference_mod):
	    """The PF0442 case: both markers are weak (`!` after OUI, `si` after
	    NON), so the detector must return None instead of guessing."""
	    ocr = (
	        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
	        "OUI ! / NON si oui fournir le mandat"
	    )
	    verdict = inference_mod._detect_mandat_checkbox(ocr)
	    assert verdict is None


	def test_detect_mandat_no_anchor(inference_mod):
	    """Without any 'mandat' / 'ouvrage' / 'dispose' keyword nearby, an
	    unrelated OUI/NON pair (here the AU question) must not be picked up:
	    the detector is expected to return None."""
	    unrelated_question = (
	        "Autorisation d'urbanisme requise : OUI [X] / NON [] indiquer la référence"
	    )
	    verdict = inference_mod._detect_mandat_checkbox(unrelated_question)
	    assert verdict is None


	def test_detect_mandat_picks_right_pair(inference_mod):
	    """On the real form the AU question (OUI/NON) comes BEFORE the mandat
	    question (OUI/NON). The detector must ignore the AU pair, which is
	    ticked NON here, and answer from the mandat pair, which is ticked OUI."""
	    segments = (
	        "Autorisation d'Urbanisme OUI [] / NON [X] indiquer la référence ...",
	        "Coordonnées du futur syndic ...",
	        "Je dispose d'un mandat de représentation du Maître d'ouvrage : OUI C1] / NON [] si oui",
	    )
	    # Joining with a single space rebuilds the same one-line OCR string.
	    ocr = " ".join(segments)
	    verdict = inference_mod._detect_mandat_checkbox(ocr)
	    assert verdict == "OUI"


	# ──────────────────────────────────────────────────────────────────────────
	# _clean_field_extractions — end-to-end cleaner behaviour
	# ──────────────────────────────────────────────────────────────────────────
	def _ext(inference_mod, value, conf=0.9):
	    """Build an `inference_mod.FieldExtraction` carrying `value`, with
	    `conf` passed as its `confidence` (0.9 unless overridden)."""
	    field_cls = inference_mod.FieldExtraction
	    return field_cls(value=value, confidence=conf)


	def test_clean_strips_trailing_noise_from_name(inference_mod):
	    """The model output 'GUE Sébastien Conseiller Neuf Mobile' bundles role
	    keywords after the name: the cleaned value must keep 'Sébastien' and
	    no longer contain 'Conseiller' or 'Mobile'."""
	    field = "Representant_Nom_Complet"
	    noisy = _ext(inference_mod, "GUE Sébastien Conseiller Neuf Mobile", conf=0.62)
	    cleaned = inference_mod._clean_field_extractions({field: noisy}, ocr_text="")
	    assert field in cleaned
	    name = cleaned[field].value
	    assert "Conseiller" not in name
	    assert "Mobile" not in name
	    assert "Sébastien" in name


	def test_clean_extracts_phone_from_noisy_span(inference_mod):
	    """The model output is a phone number followed by the stray word
	    'Mail': the cleaned value must start with the number and must not
	    contain 'Mail'."""
	    field = "Representant_Telephone"
	    raw = {field: _ext(inference_mod, "06 85 46 87 86 Mail")}
	    cleaned = inference_mod._clean_field_extractions(raw, ocr_text="")
	    phone = cleaned[field].value
	    assert phone.startswith("06 85 46 87 86")
	    assert "Mail" not in phone


	def test_clean_extracts_pc_code_from_bundled_text(inference_mod):
	    """The model output is 'Vv01092025 OPERATION PC0651002500019': the
	    cleaned value must contain the PC code and none of the leading 'Vv'
	    noise."""
	    field = "Reference_Urbanisme"
	    raw = {field: _ext(inference_mod, "Vv01092025 OPERATION PC0651002500019")}
	    cleaned = inference_mod._clean_field_extractions(raw, ocr_text="")
	    reference = cleaned[field].value
	    assert "PC0651002500019" in reference
	    assert "Vv" not in reference


	def test_clean_drops_low_confidence_freetext_fields(inference_mod):
	    """A `cabinet_conseil` value with confidence 0.22 must be absent from
	    the cleaned output.

	    Intent, per the test's author: free-text fields (cabinet_conseil,
	    Batiment_Adresse, Representant_Nom_Complet) below 0.40 confidence are
	    typically hallucinations and are dropped entirely. Only the
	    cabinet_conseil case is exercised here.
	    """
	    hallucinated = _ext(inference_mod, "pour Vu la demande", conf=0.22)
	    cleaned = inference_mod._clean_field_extractions(
	        {"cabinet_conseil": hallucinated}, ocr_text=""
	    )
	    assert "cabinet_conseil" not in cleaned


	def test_clean_email_backstop_from_ocr_text(inference_mod):
	    """With no model extraction at all, a valid email present in the OCR
	    text must be filled in as `Representant_Email`."""
	    ocr = "Email: test.user@orange.com Tel: 0670934655"
	    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
	    assert "Representant_Email" in cleaned
	    email = cleaned["Representant_Email"].value
	    assert email == "test.user@orange.com"


	def test_clean_logement_total_backstop_from_ocr(inference_mod):
	    """With no model extraction, `nb_log_totale` must be read from the
	    form text 'logements/locaux/lots : 1' and come back as "1"."""
	    # Joining with a single space rebuilds the same one-line OCR string.
	    ocr = " ".join(
	        (
	            "Nb total de Nb total de lots : Nb total de macrolots :",
	            "logements/locaux/lots : 1 Nb total de macrolots <= 3 logements : Dont",
	        )
	    )
	    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
	    assert cleaned.get("nb_log_totale") is not None
	    total = cleaned["nb_log_totale"].value
	    assert total == "1"


	def test_clean_disposition_mandat_uses_checkbox_detector(inference_mod):
	    """With no model extraction and an OCR line ticking OUI, the cleaner
	    must emit `Disposition_Mandat` == "OUI".

	    NOTE(review): the stated intent is that the checkbox detector's result
	    is preferred over any model-supplied value, but this test passes an
	    empty extraction dict, so that override path is not exercised here.
	    """
	    ocr = (
	        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
	        "OUI [X] / NON [] si oui"
	    )
	    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
	    assert cleaned.get("Disposition_Mandat") is not None
	    answer = cleaned["Disposition_Mandat"].value
	    assert answer == "OUI"


	def test_clean_disposition_mandat_dropped_when_ambiguous(inference_mod):
	    """The PF0442 case: both checkbox markers are ambiguous, so the cleaned
	    output must not contain `Disposition_Mandat` at all (per the author,
	    the consultant flags it via manual_review at engine level)."""
	    ocr = (
	        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
	        "OUI ! / NON si oui fournir le mandat"
	    )
	    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
	    assert "Disposition_Mandat" not in cleaned


	# ──────────────────────────────────────────────────────────────────────────
	# Batiment_Adresse — stopword stripping + OCR backstop
	# ──────────────────────────────────────────────────────────────────────────
	def test_address_regex_matches_typical_french_addresses(inference_mod):
	    """`_RE_ADDR_FR.search` must find a match in each sample address."""
	    addr_re = inference_mod._RE_ADDR_FR
	    samples = (
	        "10 rue de Cotalard, 44240 La Chapelle-sur-Erdre",
	        "Adresse 1 rue Abbé Guinard 44100",
	        "350 BIS AVENUE J R G GAUTIER, 13290 AIX EN PROVENCE",
	        "Sis à 5 avenue de la Gare 31000 Toulouse",
	    )
	    # Checked in order; the first sample without a match fails the test.
	    for sample in samples:
	        assert addr_re.search(sample)


	def test_address_regex_rejects_non_addresses(inference_mod):
	    """`_RE_ADDR_FR.search` must return None for text that is not an address."""
	    addr_re = inference_mod._RE_ADDR_FR
	    non_addresses = (
	        "PC0440352500035",  # urbanism ref
	        "FICHE DE RENSEIGNEMENT",  # form header
	        "Tel mobile 0670123456",  # phone
	    )
	    # Checked in order; the first sample that matches fails the test.
	    for sample in non_addresses:
	        assert addr_re.search(sample) is None


	def test_clean_address_strips_form_header_noise(inference_mod):
	    """A real model output bundles the MAITRE D'OUVRAGE header with the
	    address: the field must survive with the header removed, not be
	    rejected as a whole."""
	    bundled = _ext(
	        inference_mod,
	        "MAITRE D'OUVRAGE / PROPRIETAIRE 10 rue de Cotalard, 44240 La Chapelle",
	        conf=0.8,
	    )
	    cleaned = inference_mod._clean_field_extractions(
	        {"Batiment_Adresse": bundled}, ocr_text=""
	    )
	    assert "Batiment_Adresse" in cleaned
	    address = cleaned["Batiment_Adresse"].value
	    # Compare upper-cased and with apostrophes removed, as the original check does.
	    normalized = address.upper().replace("'", "")
	    assert "MAITRE" not in normalized
	    assert "Cotalard" in address


	def test_clean_address_dropped_when_only_headers(inference_mod):
	    """A span made only of form-header words, with no real address content,
	    must be absent from the cleaned output.

	    Intent, per the test's author: the drop should come from a length check
	    after stopword stripping, not from blanket rejection of every span that
	    contains a stopword.
	    """
	    headers_only = _ext(
	        inference_mod,
	        "FICHE DESCRIPTION MAITRE D'OUVRAGE / MAITRE D'OEUVRE / CABINET CONSEIL BUREAU",
	        conf=0.4,
	    )
	    cleaned = inference_mod._clean_field_extractions(
	        {"Batiment_Adresse": headers_only}, ocr_text=""
	    )
	    # Expected: once the stopwords are stripped only "/" separators remain,
	    # so the field is dropped.
	    assert "Batiment_Adresse" not in cleaned


	def test_clean_address_backstop_from_ocr(inference_mod):
	    """With no model extraction for `Batiment_Adresse`, an address present
	    in the OCR text must be filled in by the backstop."""
	    # Joining with a single space rebuilds the same one-line OCR string.
	    ocr = " ".join(
	        (
	            "DESCRIPTION DE L'OPERATION ...",
	            "Adresse: 10 rue de Cotalard, 44240 La Chapelle-sur-Erdre ...",
	            "DLPI: 01/09/2026",
	        )
	    )
	    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
	    assert "Batiment_Adresse" in cleaned
	    address = cleaned["Batiment_Adresse"].value
	    assert "Cotalard" in address


	def test_clean_address_backstop_no_match_leaves_empty(inference_mod):
	    """OCR text with no recognisable address pattern must not produce a
	    fabricated `Batiment_Adresse`."""
	    ocr = "Reference PC1234 DLPI 01/09/2026 random text"
	    cleaned = inference_mod._clean_field_extractions({}, ocr_text=ocr)
	    assert "Batiment_Adresse" not in cleaned