FiberGate / tests /test_recommendation_engine.py
AzizMiladi's picture
Add v3 extractor, recommendation engine, CMS generator, Streamlit demo, and tests
33ddb61
Raw
History Blame
15.1 kB
"""
Unit tests for `6_recommendation_engine.py` — the rule engine that decides
whether a "demande de localisation PAR" dossier is complete.
The tests bypass the LayoutLMv3 pipeline entirely: we build `DocumentSummary`
instances by hand (with synthetic field extractions) and call the rule
methods directly. Fast (~1 s once the module is loaded).
"""
from __future__ import annotations
import pytest
# ──────────────────────────────────────────────────────────────────────────
# _norm_ref — separator strip + diacritic / digit-glyph folding
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        # Space, dash and slash separators are all removed.
        ("PC 044 035 25 00035", "PC0440352500035"),
        ("PC-044-035-25-00035", "PC0440352500035"),
        ("PC/044/035", "PC044035"),
        # Letter "O" is folded to digit "0" (and the space is dropped).
        ("PC YOO65", "PCY0065"),
        # An already-normalised reference comes back unchanged.
        ("PCY0065", "PCY0065"),
        # Empty and missing inputs both normalise to the empty string.
        ("", ""),
        (None, ""),
    ],
)
def test_norm_ref(reco_mod, raw, expected):
    """Check that `_norm_ref` maps each raw reference to its expected form."""
    normalised = reco_mod._norm_ref(raw)
    assert normalised == expected
# ──────────────────────────────────────────────────────────────────────────
# _edit_distance — pure Levenshtein
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize(
    ("a", "b", "expected"),
    [
        ("abc", "abc", 0),
        ("abc", "abd", 1),
        ("abc", "ab", 1),
        ("", "abc", 3),
        # Second reference is missing one digit.
        ("PC03306323Z0475", "PC0330632Z0475", 1),
        # Identical references.
        ("PC03306323Z0475", "PC03306323Z0475", 0),
    ],
)
def test_edit_distance(reco_mod, a, b, expected):
    """Check `_edit_distance` against hand-computed distances."""
    distance = reco_mod._edit_distance(a, b)
    assert distance == expected
# ──────────────────────────────────────────────────────────────────────────
# _autorisation_matches — tri-state (True / False / None)
# ──────────────────────────────────────────────────────────────────────────
def _doc(reco_mod, doc_class="Autorisation", ref=None):
    """Build a `DocumentSummary` for the `_autorisation_matches` tests.

    When `ref` is given, it is stored as the `Reference_Urbanisme` field with
    a confidence of 0.99; otherwise the summary carries no fields at all.
    """
    if ref is None:
        extracted = {}
    else:
        extracted = {"Reference_Urbanisme": {"value": ref, "confidence": 0.99}}
    summary = reco_mod.DocumentSummary(
        file=f"file_{doc_class}.pdf",
        doc_class=doc_class,
        doc_confidence=0.95,
        fields=extracted,
        flags=[],
    )
    return summary
def test_autorisation_matches_exact(reco_mod, engine_no_pipeline):
    """A space-separated autorisation reference matches the compact fiche one."""
    candidates = [_doc(reco_mod, ref="PC 044 035 25 00035")]
    outcome = engine_no_pipeline._autorisation_matches("PC0440352500035", candidates)
    assert outcome is True
def test_autorisation_matches_with_ocr_drift(reco_mod, engine_no_pipeline):
    """A reference that lost a single digit (PC0330632 vs PC03306323) still matches."""
    candidates = [_doc(reco_mod, ref="PC0330632Z0475")]
    outcome = engine_no_pipeline._autorisation_matches("PC03306323Z0475", candidates)
    assert outcome is True
def test_autorisation_matches_with_glyph_fold(reco_mod, engine_no_pipeline):
    """Digit `0` read as letter `O` on the autorisation must still match the fiche."""
    candidates = [_doc(reco_mod, ref="PC 056 260 22 YOO65")]
    outcome = engine_no_pipeline._autorisation_matches("PC05626022Y0065", candidates)
    assert outcome is True
def test_autorisation_matches_false_when_clearly_different(reco_mod, engine_no_pipeline):
    """Two clearly different references yield an explicit False."""
    candidates = [_doc(reco_mod, ref="PC 999 999 99 99999")]
    outcome = engine_no_pipeline._autorisation_matches("PC0440352500035", candidates)
    assert outcome is False
def test_autorisation_matches_none_when_no_readable_ref(reco_mod, engine_no_pipeline):
    """An autorisation without any extracted reference yields None, not False.

    None lets the engine route the dossier to manual review instead of
    reporting the references as inconsistent.
    """
    # `_doc` without `ref` produces a summary with an empty `fields` dict.
    candidates = [_doc(reco_mod)]
    outcome = engine_no_pipeline._autorisation_matches("PC0440352500035", candidates)
    assert outcome is None
def test_autorisation_matches_empty_fiche_ref(reco_mod, engine_no_pipeline):
    """An empty fiche reference cannot be compared, so nothing is flagged (True)."""
    candidates = [_doc(reco_mod, ref="PC0440352500035")]
    outcome = engine_no_pipeline._autorisation_matches("", candidates)
    assert outcome is True
# ──────────────────────────────────────────────────────────────────────────
# _filename_class_hint
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize(
    ("fname", "expected"),
    [
        # "PFxxxx_<Type>_..." naming.
        ("PF0442_Plan-de-situation_PAR-1-1.pdf", "PlanSituation"),
        ("PF0442_Plan-de-masse_PAR-1-1.pdf", "PlanMasse"),
        ("PF0442_Fiche-de-renseignement_1.pdf", "fiche"),
        ("PF0442_Autorisation-d-urbanisme_1.pdf", "Autorisation"),
        ("PF0442_Certificat-d-adressage_1.pdf", "Certificat"),
        ("PF0442_Mandat_PAR-1-1.pdf", "Mandat"),
        # Alternate, upper-case naming.
        ("0335502500011 ARRETE PC.jpg", "Autorisation"),
        ("0335502500011 CERTIFICAT ADRESSAGE.jpg", "Certificat"),
        ("0335502500011 PLAN DE MASSE.jpg", "PlanMasse"),
        ("0335502500011 PLAN DE SITUATION.jpg", "PlanSituation"),
        ("0821212500015 ATTESTATION CONFORMITE.pdf", "Autorisation"),
        ("ADRESSAGE.jpg", "Certificat"),
        # Names that carry no class hint.
        ("random_doc.pdf", None),
        ("20260202_1232_MONTPELLIER.pdf", None),
    ],
)
def test_filename_hint(engine_no_pipeline, fname, expected):
    """Check the document class that `_filename_class_hint` derives from a file name."""
    hint = engine_no_pipeline._filename_class_hint(fname)
    assert hint == expected
# ──────────────────────────────────────────────────────────────────────────
# _is_out_of_scope_file
# ──────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize(
    ("fname", "expected"),
    [
        # Files expected to be reported as out of scope.
        ("PF0442_PV-Loc-PAR_PAR-2-1_1.pdf", True),
        ("PF0850_Plan-et-ou-photo-du-PAR-souhaite_PAR-2-1_1.pdf", True),
        ("PF0442_Autre_1.pdf", True),
        # Regression case noted in the original as "the \b fix".
        ("PF0442_Autre_PAR-1-1_1.png", True),
        ("PF0335_Autre_3 (1).pdf", True),
        # Regular dossier documents must not be reported.
        ("PF0442_Autorisation-d-urbanisme.pdf", False),
        ("PF0442_Plan-de-masse_PAR-1-1.pdf", False),
        ("PF0442_Fiche-de-renseignement.pdf", False),
    ],
)
def test_is_out_of_scope_file(engine_no_pipeline, fname, expected):
    """Check that `_is_out_of_scope_file` returns the exact expected boolean."""
    verdict = engine_no_pipeline._is_out_of_scope_file(fname)
    assert verdict is expected
# ──────────────────────────────────────────────────────────────────────────
# _is_recolement_dossier — short-circuit for post-installation packages
# ──────────────────────────────────────────────────────────────────────────
def test_recolement_detected(engine_no_pipeline):
    """A dossier containing `RECOLLEMENT.pdf` is recognised as a récolement dossier."""
    file_names = ["RECOLLEMENT.pdf", "0821 ATTESTATION CONFORMITE.pdf"]
    detected = engine_no_pipeline._is_recolement_dossier(file_names)
    assert detected is True
def test_recolement_accent(engine_no_pipeline):
    """The accented spelling ("récolement") in a file name is detected too.

    The file name must contain a genuine ``é``: a mis-encoded (mojibake)
    literal would not exercise the accented spelling this test is about.
    """
    names = ["dossier_de_récolement.pdf"]
    assert engine_no_pipeline._is_recolement_dossier(names) is True
def test_recolement_not_detected_for_normal_demande(engine_no_pipeline):
    """A regular fiche / autorisation / plan-de-masse set is not a récolement dossier."""
    file_names = [
        "PF0442_Fiche-de-renseignement.pdf",
        "PF0442_Autorisation-d-urbanisme.pdf",
        "PF0442_Plan-de-masse.pdf",
    ]
    detected = engine_no_pipeline._is_recolement_dossier(file_names)
    assert detected is False
# ──────────────────────────────────────────────────────────────────────────
# Build verdict from synthetic Documents — the core rule engine logic
# ──────────────────────────────────────────────────────────────────────────
def _make_doc(reco_mod, file, cls, conf=0.95, fields=None, flags=None):
    """Build a `DocumentSummary` for the `_build_verdict` tests.

    Falsy `fields` / `flags` (including the `None` defaults) are replaced by a
    fresh empty dict / list, so no mutable object is shared between documents.
    """
    doc_fields = fields if fields else {}
    doc_flags = flags if flags else []
    return reco_mod.DocumentSummary(
        file=file,
        doc_class=cls,
        doc_confidence=conf,
        fields=doc_fields,
        flags=doc_flags,
    )
def test_build_verdict_complete(reco_mod, engine_no_pipeline):
    """Fiche, matching autorisation, both plans and a mandat give a complete verdict."""
    fiche_fields = {
        "Reference_Urbanisme": {"value": "PC0440352500035", "confidence": 0.99},
        "DLPI": {"value": "01/09/2026", "confidence": 0.98},
        "Disposition_Mandat": {"value": "OUI", "confidence": 0.99},
        "nb_log_totale": {"value": "5", "confidence": 0.70},
    }
    autorisation_fields = {
        "Reference_Urbanisme": {"value": "PC 044 035 25 00035", "confidence": 0.99},
    }
    dossier = [
        _make_doc(reco_mod, "fiche.pdf", "fiche", fields=fiche_fields),
        _make_doc(reco_mod, "auto.pdf", "Autorisation", fields=autorisation_fields),
        _make_doc(reco_mod, "plan_masse.pdf", "PlanMasse"),
        _make_doc(reco_mod, "plan_situation.pdf", "PlanSituation"),
        _make_doc(reco_mod, "mandat.pdf", "Mandat"),
    ]

    verdict = engine_no_pipeline._build_verdict(dossier)

    assert verdict.status == "complète"
    assert verdict.missing_documents == []
    assert verdict.incomplete_documents == []
def test_build_verdict_missing_fiche(reco_mod, engine_no_pipeline):
    """Without a fiche the verdict is incomplete and the fiche is listed as missing."""
    dossier = [
        _make_doc(reco_mod, "auto.pdf", "Autorisation"),
        _make_doc(reco_mod, "plan_masse.pdf", "PlanMasse"),
        _make_doc(reco_mod, "plan_sit.pdf", "PlanSituation"),
    ]

    verdict = engine_no_pipeline._build_verdict(dossier)

    assert verdict.status == "incomplète"
    missing_lowered = [message.lower() for message in verdict.missing_documents]
    assert any("fiche" in message for message in missing_lowered)
def test_build_verdict_unreadable_auto_routes_to_manual_review(reco_mod, engine_no_pipeline):
    """Fiche has a ref, autorisation present but no readable ref → manual review.

    The asserted substrings use the real accented characters ("incohérent",
    "être"). With mis-encoded literals the negative assertion would pass
    vacuously and the positive one could never match.
    """
    docs = [
        _make_doc(reco_mod, "fiche.pdf", "fiche", fields={
            "Reference_Urbanisme": {"value": "PC2221525Q0037", "confidence": 0.99},
            "DLPI": {"value": "01/09/2026", "confidence": 0.98},
            "nb_log_totale": {"value": "1", "confidence": 0.70},
        }),
        # Autorisation is present but no Reference_Urbanisme was extracted.
        _make_doc(reco_mod, "auto.jpg", "Autorisation"),
        _make_doc(reco_mod, "plan_masse.pdf", "PlanMasse"),
        _make_doc(reco_mod, "plan_situation.pdf", "PlanSituation"),
    ]
    v = engine_no_pipeline._build_verdict(docs)
    # Must NOT be flagged as "incohérent" ...
    assert not any("incohérent" in m.lower() for m in v.incomplete_documents)
    # ... but must appear in manual review with the "n'a pas pu être lu" phrasing.
    assert any("n'a pas pu être lu" in m for m in v.manual_review_documents)
def test_build_verdict_recolement_short_circuit(reco_mod, engine_no_pipeline):
    """A récolement dossier is reported out of scope and bypasses the regular rules.

    The manual-review substring uses the real accented word "récolement";
    a mis-encoded literal could never match the engine's message.
    """
    docs = [
        _make_doc(reco_mod, "ATTESTATION CONFORMITE.pdf", "Autorisation"),
        _make_doc(reco_mod, "TRANCHEE FERMEE.jpg", "PlanSituation"),
        _make_doc(reco_mod, "RECOLLEMENT.pdf", "Certificat"),
    ]
    v = engine_no_pipeline._build_verdict(docs)
    assert v.status == "hors-périmètre"
    assert any("récolement" in m.lower() for m in v.manual_review_documents)
    # The regular rules are bypassed: no "missing fiche" style entries.
    assert v.missing_documents == []
    assert v.incomplete_documents == []
def test_build_verdict_out_of_scope_excluded_from_class_count(reco_mod, engine_no_pipeline):
    """An out-of-scope file classified as PlanMasse does not count as a plan de masse.

    The only PlanMasse document carries the `out_of_scope_document` flag, so
    the verdict must still report the plan de masse as missing.
    """
    fiche_fields = {
        "Reference_Urbanisme": {"value": "PC0440352500035", "confidence": 0.99},
        "DLPI": {"value": "01/09/2026", "confidence": 0.98},
        "nb_log_totale": {"value": "1", "confidence": 0.70},
    }
    autorisation_fields = {
        "Reference_Urbanisme": {"value": "PC0440352500035", "confidence": 0.99},
    }
    dossier = [
        _make_doc(reco_mod, "fiche.pdf", "fiche", fields=fiche_fields),
        _make_doc(reco_mod, "auto.pdf", "Autorisation", fields=autorisation_fields),
        # The sole "plan de masse" candidate, flagged as out of scope.
        _make_doc(
            reco_mod,
            "PV-Loc-PAR.pdf",
            "PlanMasse",
            flags=["out_of_scope_document"],
        ),
        _make_doc(reco_mod, "plan_situation.pdf", "PlanSituation"),
    ]

    verdict = engine_no_pipeline._build_verdict(dossier)

    assert verdict.status == "incomplète"
    missing_lowered = [message.lower() for message in verdict.missing_documents]
    assert any("plan de masse" in message for message in missing_lowered)
def test_build_verdict_disposition_mandat_undetermined_to_manual_review(reco_mod, engine_no_pipeline):
    """Unread Disposition_Mandat with no Mandat document goes to manual review.

    The mandat must show up in `manual_review_documents` and must not be
    reported in `missing_documents`.
    """
    # The fiche deliberately has no "Disposition_Mandat" key: undetermined.
    fiche_fields = {
        "Reference_Urbanisme": {"value": "PC0440352500035", "confidence": 0.99},
        "DLPI": {"value": "01/09/2026", "confidence": 0.98},
        "nb_log_totale": {"value": "1", "confidence": 0.70},
    }
    autorisation_fields = {
        "Reference_Urbanisme": {"value": "PC0440352500035", "confidence": 0.99},
    }
    dossier = [
        _make_doc(reco_mod, "fiche.pdf", "fiche", fields=fiche_fields),
        _make_doc(reco_mod, "auto.pdf", "Autorisation", fields=autorisation_fields),
        _make_doc(reco_mod, "plan_masse.pdf", "PlanMasse"),
        _make_doc(reco_mod, "plan_situation.pdf", "PlanSituation"),
    ]

    verdict = engine_no_pipeline._build_verdict(dossier)

    missing_lowered = [message.lower() for message in verdict.missing_documents]
    assert not any("mandat" in message for message in missing_lowered)
    assert any("Mandat" in message for message in verdict.manual_review_documents)