import os
import sys
import time

import pytest
from presidio_analyzer import (
    AnalyzerEngine,
    Pattern,
    PatternRecognizer,
    RecognizerRegistry,
)
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer.predefined_recognizers import SpacyRecognizer
from presidio_anonymizer import AnonymizerEngine


def get_test_engines():
    """Build the analyzer/anonymizer pair used by the privacy gateway tests.

    A spaCy NLP engine is created with an English and a French model, the
    predefined recognizers are loaded for both languages, and French-specific
    recognizers (a spaCy label mapping plus regex patterns for street
    addresses, SIRET, NIR and IBAN) are registered on top.

    Returns:
        A ``(analyzer, anonymizer)`` tuple: an ``AnalyzerEngine`` with a
        default score threshold of 0.25 and a default ``AnonymizerEngine``.
    """
    configuration = {
        "nlp_engine_name": "spacy",
        "models": [
            {"lang_code": "en", "model_name": "en_core_web_lg"},
            {"lang_code": "fr", "model_name": "fr_core_news_lg"},
        ],
    }
    provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = provider.create_engine()

    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(languages=["en", "fr"])

    # Map the labels listed below onto the PERSON / LOCATION / ORGANIZATION
    # entities for French text.
    fr_spacy = SpacyRecognizer(
        supported_language="fr",
        check_label_groups=[
            ("PERSON", ["PER", "PERSON"]),
            ("LOCATION", ["LOC", "GPE", "LOCATION"]),
            ("ORGANIZATION", ["ORG", "ORGANIZATION", "MISC"]),
        ],
    )
    registry.add_recognizer(fr_spacy)

    # Custom FR recognizers: (entity, pattern name, regex, score).
    # Registration order is kept identical to the original sequence.
    custom_fr_patterns = (
        (
            "LOCATION",
            "address",
            r"(?i)\b\d{1,4}[\s,]+(?:rue|av|ave|avenue|bd|boulevard|impasse|place|square|quai|cours|passage|route|chemin)[\s\w\-\'àâäéèêëîïôöùûüç,]{2,100}\b",
            0.85,
        ),
        (
            "SIRET",
            "siret",
            r"\b\d{3}\s*\d{3}\s*\d{3}\s*\d{5}\b",
            1.0,
        ),
        (
            "FR_NIR",
            "nir",
            r"\b[12]\s*\d{2}\s*\d{2}\s*(?:\d{2}|2[AB])\s*\d{3}\s*\d{3}\s*\d{2}\b",
            1.0,
        ),
        (
            "IBAN_CODE",
            "iban_fr",
            r"\b[A-Z]{2}\d{2}(?:\s*[A-Z0-9]{4}){4,7}\s*[A-Z0-9]{1,4}\b",
            1.0,
        ),
    )
    for entity, pattern_name, regex, score in custom_fr_patterns:
        registry.add_recognizer(
            PatternRecognizer(
                supported_entity=entity,
                supported_language="fr",
                patterns=[Pattern(name=pattern_name, regex=regex, score=score)],
            )
        )

    # NOTE(review): neither the registry nor the analyzer is given an explicit
    # ``supported_languages`` list although "fr" is analyzed below -- confirm
    # this is accepted by the installed Presidio version.
    analyzer = AnalyzerEngine(
        nlp_engine=nlp_engine,
        registry=registry,
        default_score_threshold=0.25,
    )
    anonymizer = AnonymizerEngine()
    return analyzer, anonymizer


class TestPrivacyGateway:
    """End-to-end redaction checks on French business documents."""

    def test_pv_reunion_long(self, pack):
        """Exact test of the meeting minutes supplied by the user.

        Args:
            pack: ``(analyzer, anonymizer)`` tuple such as the one returned by
                ``get_test_engines()``.
                NOTE(review): under pytest this must be supplied by a fixture
                named ``pack``; none is defined in the visible source --
                presumably it lives in a conftest, confirm.

        Raises:
            AssertionError: if any sensitive value survives anonymization.
        """
        analyzer, anonymizer = pack
        text = (
            "PROCÈS-VERBAL DE LA RÉUNION DU COMITÉ DE DIRECTION - AZUR LOGISTIQUE\n"
            "Lieu : Siège social, 15 Boulevard de la Libération, 13001 Marseille.\n"
            "Monsieur Jean-Pierre Moulin et Madame Sophie Berthier.\n"
            "SIRET 456 789 123 00015. IBAN FR76 3000 1000 2000 3000 4000 500.\n"
            "Email: jp.moulin@azur-logistique.fr. IP 192.168.1.45."
        )

        results = analyzer.analyze(text=text, language="fr")
        redacted = anonymizer.anonymize(text=text, analyzer_results=results).text

        # Checked in the same order as before; the message identifies the
        # leaked value when the test is run as a plain script (no pytest
        # assertion rewriting).
        must_be_redacted = (
            "Jean-Pierre Moulin",
            "Sophie Berthier",
            "456 789 123 00015",
            "FR76",
            "Marseille",
            "192.168.1.45",
        )
        for sensitive in must_be_redacted:
            assert sensitive not in redacted, (
                f"{sensitive!r} still present after anonymization"
            )


if __name__ == "__main__":
    e = get_test_engines()
    t = TestPrivacyGateway()
    try:
        t.test_pv_reunion_long(e)
    except AssertionError as err:
        # Report the failing check instead of discarding the exception.
        print(f"❌ Test Failed: {err}")
        sys.exit(1)
    else:
        print("✅ Long PV Content Test: OK")