Spaces:

AzizMiladi
/

FiberGate

Configuration error

File size: 15,925 Bytes

33ddb61

"""
Unit tests for the post-processing layer in `4_inference.py`:
  - the regex constants (_RE_REFURB, _RE_PHONE_FR, _RE_EMAIL, _RE_INTEGER, _RE_ADDR_FR)
  - `_mandat_checkbox_score` + `_detect_mandat_checkbox`
  - `_clean_field_extractions` on synthetic raw model outputs

These tests don't load the model — we exercise the pure functions directly.
"""
from __future__ import annotations

import pytest


# ──────────────────────────────────────────────────────────────────────────
# _RE_REFURB — urbanism reference detection
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize("text, expected_match", [
    # Positive cases: an urbanism prefix followed by a digit body.
    ("PC 044 035 25 00035",             True),
    ("PC0440352500035",                 True),
    ("Pc0440352500035",                 True),    # mixed-case prefix is accepted
    ("PA 022 360 22 00027",             True),
    ("DP 044 035",                      True),
    # Negative: the French word "rue" must not be read as an "RU" prefix.
    ("rue Abbé Guinard",                False),
    # Negative: ordinary words starting with "Pa" must not be read as "PA".
    ("Parcelle",                        False),
    ("Paysagiste Bureau de contrôle",   False),
    # Negative: empty input.
    ("",                                False),
])
def test_re_refurb_strict_prefix(inference_mod, text, expected_match):
    """`_RE_REFURB.search(text)` must find a match iff `expected_match`."""
    found = inference_mod._RE_REFURB.search(text) is not None
    assert found is expected_match


# ──────────────────────────────────────────────────────────────────────────
# _RE_PHONE_FR — French phone number patterns
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize("text, has_match", [
    # Positive cases: ten digits, bare or separated by spaces / dots / dashes.
    ("Tel : 0670934655 disponible",        True),
    ("06 85 46 87 86 Mail",                True),
    ("06.85.46.87.86",                     True),
    ("07-85-62-03-00",                     True),
    # Negative cases: digit runs that are not phone numbers.
    ("Code postal 44240",                  False),   # 5-digit postcode, not 10 digits
    ("1234",                               False),
    ("01 02",                              False),   # too few digit groups
])
def test_re_phone_fr(inference_mod, text, has_match):
    """`_RE_PHONE_FR.search(text)` must find a match iff `has_match`."""
    found = inference_mod._RE_PHONE_FR.search(text) is not None
    assert found is has_match


# ──────────────────────────────────────────────────────────────────────────
# _RE_EMAIL — email validation
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize("text, has_match", [
    # Positive cases: dotted, hyphenated and plus-tagged local parts.
    ("sebastien.gue@orange.com",                       True),
    ("immobilier.be-orange@orange.com",                True),
    ("marine.pascalin+test@orange.com",                True),
    # Negative cases: no address at all, missing local part, missing domain.
    ("Pas un email",                                    False),
    ("@orange.com sans prefix",                         False),
    ("user@",                                           False),
])
def test_re_email(inference_mod, text, has_match):
    """`_RE_EMAIL.search(text)` must find a match iff `has_match`."""
    found = inference_mod._RE_EMAIL.search(text) is not None
    assert found is has_match


# ──────────────────────────────────────────────────────────────────────────
# _mandat_checkbox_score — strict scorer for OCR-rendered checkbox markers
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize("marker, expected_min_score", [
    # Explicit X, including OCR misreads of "[X]".
    ("[X]",   5),
    ("X",     5),
    ("PX",    5),
    ("FX",    5),
    # Digit markers (Tesseract often renders X as 1 or 9).
    ("C1]",   3),
    ("[1]",   3),
    ("9",     3),
    # Multi-character mark-like garble.
    ("**[]",  3),
    # Orphan bracket.
    ("C]",    2),
])
def test_mandat_score_strong(inference_mod, marker, expected_min_score):
    """Each marker must score at least the floor listed next to it."""
    score = inference_mod._mandat_checkbox_score(marker)
    assert score >= expected_min_score


@pytest.mark.parametrize("marker", [
    "",        # nothing at all
    "[]",      # canonical unticked box
    "()",
    "D",       # lone letter (Tesseract often renders [] as D)
    "O",
    "Q",
    "!",       # lone punctuation — seen in the PF0442 bug, must score 0
    "si",      # OCR noise — seen in the PF0442 bug, must score 0
    "DA",      # two arbitrary letters
])
def test_mandat_score_weak_or_empty(inference_mod, marker):
    """Ambiguous OCR garble is not evidence of an X-mark: the score must be
    exactly 0 for every marker above."""
    score = inference_mod._mandat_checkbox_score(marker)
    assert score == 0


# ──────────────────────────────────────────────────────────────────────────
# _detect_mandat_checkbox — full pipeline on synthetic OCR strings
# ──────────────────────────────────────────────────────────────────────────
def test_detect_mandat_oui_clear(inference_mod):
    """A clean '[X]' next to OUI and an empty box next to NON yields 'OUI'."""
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI [X] / NON [] si oui fournir le mandat"
    )
    detected = inference_mod._detect_mandat_checkbox(ocr)
    assert detected == "OUI"


def test_detect_mandat_non_clear(inference_mod):
    """A clean '[X]' next to NON and an empty box next to OUI yields 'NON'."""
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI [] / NON [X] si oui fournir le mandat"
    )
    detected = inference_mod._detect_mandat_checkbox(ocr)
    assert detected == "NON"


def test_detect_mandat_oui_garbled(inference_mod):
    """OCR pattern taken from PF0090002500001, where '[X]' is rendered as
    'C1]': the detector must still answer 'OUI'."""
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI C1] / NON [] si oui"
    )
    detected = inference_mod._detect_mandat_checkbox(ocr)
    assert detected == "OUI"


def test_detect_mandat_ambiguous_returns_none(inference_mod):
    """PF0442 case: both markers are weak (`!` vs `si`), so the detector must
    return None instead of guessing."""
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI ! / NON si oui fournir le mandat"
    )
    detected = inference_mod._detect_mandat_checkbox(ocr)
    assert detected is None


def test_detect_mandat_no_anchor(inference_mod):
    """Without any 'mandat' / 'ouvrage' / 'dispose' keyword in the text, an
    unrelated OUI/NON pair (here the urbanism-authorisation question) must
    not be picked up: the detector returns None."""
    ocr = (
        "Autorisation d'urbanisme requise : "
        "OUI [X] / NON [] indiquer la référence"
    )
    detected = inference_mod._detect_mandat_checkbox(ocr)
    assert detected is None


def test_detect_mandat_picks_right_pair(inference_mod):
    """Form layout where the urbanism-authorisation OUI/NON pair precedes the
    mandat OUI/NON pair: the detector must ignore the first pair and report
    the mandat answer."""
    segments = [
        "Autorisation d'Urbanisme OUI [] / NON [X] indiquer la référence ...",
        "Coordonnées du futur syndic ...",
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : OUI C1] / NON [] si oui",
    ]
    ocr = " ".join(segments)
    detected = inference_mod._detect_mandat_checkbox(ocr)
    assert detected == "OUI"


# ──────────────────────────────────────────────────────────────────────────
# _clean_field_extractions — end-to-end cleaner behaviour
# ──────────────────────────────────────────────────────────────────────────
def _ext(inference_mod, value, conf=0.9):
    """Build an `inference_mod.FieldExtraction` carrying `value` with
    confidence `conf` (0.9 unless overridden)."""
    extraction_cls = inference_mod.FieldExtraction
    return extraction_cls(value=value, confidence=conf)


def test_clean_strips_trailing_noise_from_name(inference_mod):
    """Given the model span 'GUE Sébastien Conseiller Neuf Mobile', the
    cleaned name must keep 'Sébastien' and lose the trailing role words."""
    field = "Representant_Nom_Complet"
    model_output = {
        field: _ext(inference_mod, "GUE Sébastien Conseiller Neuf Mobile", conf=0.62),
    }
    result = inference_mod._clean_field_extractions(model_output, ocr_text="")
    assert field in result
    name = result[field].value
    assert "Conseiller" not in name
    assert "Mobile" not in name
    assert "Sébastien" in name


def test_clean_extracts_phone_from_noisy_span(inference_mod):
    """Model returns a phone number followed by the stray word 'Mail'.

    The cleaned value must start with the phone digits and must no longer
    contain 'Mail'.
    """
    field = "Representant_Telephone"
    raw = {field: _ext(inference_mod, "06 85 46 87 86 Mail")}
    cleaned = inference_mod._clean_field_extractions(raw, ocr_text="")
    # Check presence first (as the sibling tests do) so a dropped field fails
    # with a readable assertion rather than a bare KeyError on the lookup.
    assert field in cleaned
    phone = cleaned[field].value
    assert phone.startswith("06 85 46 87 86")
    assert "Mail" not in phone


def test_clean_extracts_pc_code_from_bundled_text(inference_mod):
    """Model returns 'Vv01092025 OPERATION PC0651002500019'.

    The cleaned value must contain the PC code and must no longer contain
    the leading 'Vv' token.
    """
    field = "Reference_Urbanisme"
    raw = {field: _ext(inference_mod, "Vv01092025 OPERATION PC0651002500019")}
    cleaned = inference_mod._clean_field_extractions(raw, ocr_text="")
    # Check presence first (as the sibling tests do) so a dropped field fails
    # with a readable assertion rather than a bare KeyError on the lookup.
    assert field in cleaned
    reference = cleaned[field].value
    assert "PC0651002500019" in reference
    assert "Vv" not in reference


def test_clean_drops_low_confidence_freetext_fields(inference_mod):
    """Free-text fields (cabinet_conseil, Batiment_Adresse,
    Representant_Nom_Complet) are expected to be removed outright when their
    confidence is below 0.40; exercised here with cabinet_conseil at 0.22."""
    field = "cabinet_conseil"
    model_output = {field: _ext(inference_mod, "pour Vu la demande", conf=0.22)}
    result = inference_mod._clean_field_extractions(model_output, ocr_text="")
    assert field not in result


def test_clean_email_backstop_from_ocr_text(inference_mod):
    """With no model output at all, a valid email present in the OCR text
    must be surfaced as Representant_Email."""
    field = "Representant_Email"
    ocr = "Email: test.user@orange.com Tel: 0670934655"
    result = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert field in result
    assert result[field].value == "test.user@orange.com"


def test_clean_logement_total_backstop_from_ocr(inference_mod):
    """With no model output, `nb_log_totale` must be recovered from the form
    text 'logements/locaux/lots : 1' and carry the value "1"."""
    field = "nb_log_totale"
    ocr = (
        "Nb total de Nb total de lots : Nb total de macrolots : "
        "logements/locaux/lots : 1 Nb total de macrolots <= 3 logements : Dont"
    )
    result = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    extraction = result.get(field)
    assert extraction is not None
    assert extraction.value == "1"


def test_clean_disposition_mandat_uses_checkbox_detector(inference_mod):
    """With no model output, a clearly ticked OUI box in the OCR text must
    produce Disposition_Mandat == "OUI".

    NOTE(review): the intent is that the checkbox result also wins over a
    model-supplied value; this test passes an empty model output, so that
    precedence is not exercised here.
    """
    field = "Disposition_Mandat"
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI [X] / NON [] si oui"
    )
    result = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    extraction = result.get(field)
    assert extraction is not None
    assert extraction.value == "OUI"


def test_clean_disposition_mandat_dropped_when_ambiguous(inference_mod):
    """PF0442 case: with both markers ambiguous the field must be absent from
    the cleaned output (it is then flagged for manual review at engine
    level)."""
    ocr = (
        "Je dispose d'un mandat de représentation du Maître d'ouvrage : "
        "OUI ! / NON si oui fournir le mandat"
    )
    result = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert "Disposition_Mandat" not in result


# ──────────────────────────────────────────────────────────────────────────
# Batiment_Adresse — stopword stripping + OCR backstop
# ──────────────────────────────────────────────────────────────────────────
def test_address_regex_matches_typical_french_addresses(inference_mod):
    """`_RE_ADDR_FR` must find a match in each sample street address."""
    pattern = inference_mod._RE_ADDR_FR
    samples = (
        "10 rue de Cotalard, 44240 La Chapelle-sur-Erdre",
        "Adresse 1 rue Abbé Guinard 44100",
        "350 BIS AVENUE J R G GAUTIER, 13290 AIX EN PROVENCE",
        "Sis à 5 avenue de la Gare 31000 Toulouse",
    )
    for sample in samples:
        assert pattern.search(sample)


def test_address_regex_rejects_non_addresses(inference_mod):
    """`_RE_ADDR_FR` must find nothing in text that is not an address."""
    pattern = inference_mod._RE_ADDR_FR
    non_addresses = (
        "PC0440352500035",            # urbanism reference
        "FICHE DE RENSEIGNEMENT",     # form header
        "Tel mobile 0670123456",      # phone number
    )
    for sample in non_addresses:
        assert pattern.search(sample) is None


def test_clean_address_strips_form_header_noise(inference_mod):
    """When the model span glues the MAITRE D'OUVRAGE header onto the address,
    the header must be removed while the address itself is kept — the field
    must not be rejected wholesale."""
    field = "Batiment_Adresse"
    noisy_span = "MAITRE D'OUVRAGE / PROPRIETAIRE 10 rue de Cotalard, 44240 La Chapelle"
    model_output = {field: _ext(inference_mod, noisy_span, conf=0.8)}
    result = inference_mod._clean_field_extractions(model_output, ocr_text="")
    assert field in result
    address = result[field].value
    normalised = address.upper().replace("'", "")
    assert "MAITRE" not in normalised
    assert "Cotalard" in address


def test_clean_address_dropped_when_only_headers(inference_mod):
    """A span made only of header words carries no address, so the field must
    be absent from the output. The intended mechanism is a length check after
    stopword stripping, not blanket rejection of any span with a stopword."""
    field = "Batiment_Adresse"
    header_only = "FICHE DESCRIPTION MAITRE D'OUVRAGE / MAITRE D'OEUVRE / CABINET CONSEIL BUREAU"
    model_output = {field: _ext(inference_mod, header_only, conf=0.4)}
    result = inference_mod._clean_field_extractions(model_output, ocr_text="")
    # Once the stopwords are gone only "/" separators should be left.
    assert field not in result


def test_clean_address_backstop_from_ocr(inference_mod):
    """With no model output for Batiment_Adresse, an address present in the
    OCR text must be filled in by the backstop."""
    field = "Batiment_Adresse"
    ocr = (
        "DESCRIPTION DE L'OPERATION ... "
        "Adresse: 10 rue de Cotalard, 44240 La Chapelle-sur-Erdre ... "
        "DLPI: 01/09/2026"
    )
    result = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert field in result
    address = result[field].value
    assert "Cotalard" in address


def test_clean_address_backstop_no_match_leaves_empty(inference_mod):
    """OCR text without any recognisable address must not yield a fabricated
    Batiment_Adresse."""
    ocr = "Reference PC1234 DLPI 01/09/2026 random text"
    result = inference_mod._clean_field_extractions({}, ocr_text=ocr)
    assert "Batiment_Adresse" not in result