redaction / api /tests /test_suite.py
gni
UI/Core: Add large document examples and silence MISC warnings.
2639468
import sys
import os
import pytest
import time
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer, Pattern
from presidio_analyzer.predefined_recognizers import SpacyRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
def get_test_engines():
    """Build the Presidio engines used by this test module.

    The NLP engine is configured for spaCy with an English
    (``en_core_web_lg``) and a French (``fr_core_news_lg``) model.  The
    registry receives the predefined recognizers for both languages, a
    French ``SpacyRecognizer`` with explicit label groups, and four French
    regex-based recognizers.

    Returns:
        A ``(analyzer, anonymizer)`` tuple: an ``AnalyzerEngine`` created
        with a default score threshold of 0.25 and a fresh
        ``AnonymizerEngine``.
    """
    # (entity, pattern name, regex, score) for each custom French recognizer.
    fr_pattern_specs = (
        # 1-4 digits followed by a street-type keyword and the street name.
        (
            "LOCATION",
            "address",
            r"(?i)\b\d{1,4}[\s,]+(?:rue|av|ave|avenue|bd|boulevard|impasse|place|square|quai|cours|passage|route|chemin)[\s\w\-\'àâäéèêëîïôöùûüç,]{2,100}\b",
            0.85,
        ),
        # 14 digits grouped 3-3-3-5 with optional whitespace between groups.
        (
            "SIRET",
            "siret",
            r"\b\d{3}\s*\d{3}\s*\d{3}\s*\d{5}\b",
            1.0,
        ),
        (
            "FR_NIR",
            "nir",
            r"\b[12]\s*\d{2}\s*\d{2}\s*(?:\d{2}|2[AB])\s*\d{3}\s*\d{3}\s*\d{2}\b",
            1.0,
        ),
        (
            "IBAN_CODE",
            "iban_fr",
            r"\b[A-Z]{2}\d{2}(?:\s*[A-Z0-9]{4}){4,7}\s*[A-Z0-9]{1,4}\b",
            1.0,
        ),
    )

    english_model = {"lang_code": "en", "model_name": "en_core_web_lg"}
    french_model = {"lang_code": "fr", "model_name": "fr_core_news_lg"}
    nlp_settings = {
        "nlp_engine_name": "spacy",
        "models": [english_model, french_model],
    }
    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_settings).create_engine()

    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(languages=["en", "fr"])

    # French spaCy recognizer: each entity is paired with the model labels
    # listed for it (MISC is grouped under ORGANIZATION).
    label_groups = [
        ("PERSON", ["PER", "PERSON"]),
        ("LOCATION", ["LOC", "GPE", "LOCATION"]),
        ("ORGANIZATION", ["ORG", "ORGANIZATION", "MISC"]),
    ]
    registry.add_recognizer(
        SpacyRecognizer(supported_language="fr", check_label_groups=label_groups)
    )

    # Custom FR recognizers, registered in table order.
    for entity, pattern_name, regex, score in fr_pattern_specs:
        pattern = Pattern(name=pattern_name, regex=regex, score=score)
        registry.add_recognizer(
            PatternRecognizer(
                supported_entity=entity,
                supported_language="fr",
                patterns=[pattern],
            )
        )

    analyzer = AnalyzerEngine(
        nlp_engine=nlp_engine,
        registry=registry,
        default_score_threshold=0.25,
    )
    return analyzer, AnonymizerEngine()
class TestPrivacyGateway:
def test_pv_reunion_long(self, pack):
"""Test exact du Procès-Verbal de réunion fourni par l'utilisateur."""
analyzer, anonymizer = pack
text = (
"PROCÈS-VERBAL DE LA RÉUNION DU COMITÉ DE DIRECTION - AZUR LOGISTIQUE\n"
"Lieu : Siège social, 15 Boulevard de la Libération, 13001 Marseille.\n"
"Monsieur Jean-Pierre Moulin et Madame Sophie Berthier.\n"
"SIRET 456 789 123 00015. IBAN FR76 3000 1000 2000 3000 4000 500.\n"
"Email: jp.moulin@azur-logistique.fr. IP 192.168.1.45."
)
results = analyzer.analyze(text=text, language="fr")
redacted = anonymizer.anonymize(text=text, analyzer_results=results).text
assert "Jean-Pierre Moulin" not in redacted
assert "Sophie Berthier" not in redacted
assert "456 789 123 00015" not in redacted
assert "FR76" not in redacted
assert "Marseille" not in redacted
assert "192.168.1.45" not in redacted
if __name__ == "__main__":
    # Script mode: build the engines once and run the long-document test
    # directly, without pytest.  Exits with status 1 when an assertion fails.
    import traceback

    engines = get_test_engines()
    suite = TestPrivacyGateway()
    try:
        suite.test_pv_reunion_long(engines)
    except AssertionError:
        print("❌ Test Failed")
        # The bare asserts carry no message, so the traceback is the only
        # way to see which substring survived redaction.
        traceback.print_exc()
        sys.exit(1)
    else:
        print("✅ Long PV Content Test: OK")