| import sys |
| import os |
| import pytest |
| import time |
| from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer, Pattern |
| from presidio_analyzer.predefined_recognizers import SpacyRecognizer |
| from presidio_analyzer.nlp_engine import NlpEngineProvider |
| from presidio_anonymizer import AnonymizerEngine |
|
|
def get_test_engines():
    """Build the Presidio analyzer/anonymizer pair used by the privacy tests.

    Creates a spaCy NLP engine configured with an English and a French model,
    loads the predefined recognizers for both languages, then registers a
    French ``SpacyRecognizer`` with explicit label groups followed by four
    French regex recognizers (LOCATION address, SIRET, FR_NIR, IBAN_CODE).

    Returns:
        An ``(analyzer, anonymizer)`` tuple: an ``AnalyzerEngine`` bound to the
        NLP engine and registry built here (default score threshold 0.25) and
        a default-constructed ``AnonymizerEngine``.
    """
    spacy_models = [
        {"lang_code": "en", "model_name": "en_core_web_lg"},
        {"lang_code": "fr", "model_name": "fr_core_news_lg"},
    ]
    nlp_engine = NlpEngineProvider(
        nlp_configuration={"nlp_engine_name": "spacy", "models": spacy_models}
    ).create_engine()

    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(languages=["en", "fr"])

    # French NER recognizer with an explicit entity -> spaCy label mapping.
    registry.add_recognizer(
        SpacyRecognizer(
            supported_language="fr",
            check_label_groups=[
                ("PERSON", ["PER", "PERSON"]),
                ("LOCATION", ["LOC", "GPE", "LOCATION"]),
                ("ORGANIZATION", ["ORG", "ORGANIZATION", "MISC"]),
            ],
        )
    )

    # (entity, pattern name, regex, score) for each French regex recognizer,
    # listed in the order they are registered.
    french_patterns = (
        (
            "LOCATION",
            "address",
            r"(?i)\b\d{1,4}[\s,]+"
            r"(?:rue|av|ave|avenue|bd|boulevard|impasse|place|square|quai|cours|passage|route|chemin)"
            r"[\s\w\-\'àâäéèêëîïôöùûüç,]{2,100}\b",
            0.85,
        ),
        (
            "SIRET",
            "siret",
            r"\b\d{3}\s*\d{3}\s*\d{3}\s*\d{5}\b",
            1.0,
        ),
        (
            "FR_NIR",
            "nir",
            r"\b[12]\s*\d{2}\s*\d{2}\s*(?:\d{2}|2[AB])\s*\d{3}\s*\d{3}\s*\d{2}\b",
            1.0,
        ),
        (
            "IBAN_CODE",
            "iban_fr",
            r"\b[A-Z]{2}\d{2}(?:\s*[A-Z0-9]{4}){4,7}\s*[A-Z0-9]{1,4}\b",
            1.0,
        ),
    )
    for entity, pattern_name, regex, score in french_patterns:
        registry.add_recognizer(
            PatternRecognizer(
                supported_entity=entity,
                supported_language="fr",
                patterns=[Pattern(name=pattern_name, regex=regex, score=score)],
            )
        )

    return (
        AnalyzerEngine(
            nlp_engine=nlp_engine,
            registry=registry,
            default_score_threshold=0.25,
        ),
        AnonymizerEngine(),
    )
|
|
class TestPrivacyGateway:
    """Redaction checks run through an analyzer/anonymizer pair."""

    def test_pv_reunion_long(self, pack):
        """Exact test of the meeting minutes (procès-verbal) supplied by the user.

        ``pack`` is unpacked as ``(analyzer, anonymizer)``. The sample text is
        analyzed with ``language="fr"``, anonymized, and the redacted output
        must no longer contain any of the sensitive fragments listed below.
        """
        analyzer, anonymizer = pack

        text = "\n".join(
            [
                "PROCÈS-VERBAL DE LA RÉUNION DU COMITÉ DE DIRECTION - AZUR LOGISTIQUE",
                "Lieu : Siège social, 15 Boulevard de la Libération, 13001 Marseille.",
                "Monsieur Jean-Pierre Moulin et Madame Sophie Berthier.",
                "SIRET 456 789 123 00015. IBAN FR76 3000 1000 2000 3000 4000 500.",
                "Email: jp.moulin@azur-logistique.fr. IP 192.168.1.45.",
            ]
        )

        findings = analyzer.analyze(text=text, language="fr")
        anonymized = anonymizer.anonymize(text=text, analyzer_results=findings)
        redacted = anonymized.text

        # Checked in this order: person names, SIRET, IBAN prefix, city, IP.
        must_be_redacted = (
            "Jean-Pierre Moulin",
            "Sophie Berthier",
            "456 789 123 00015",
            "FR76",
            "Marseille",
            "192.168.1.45",
        )
        for fragment in must_be_redacted:
            assert fragment not in redacted
|
|
if __name__ == "__main__":
    # Standalone runner: build the engines once and run the long-PV test
    # without pytest, exiting with status 1 if an assertion fails.
    import traceback

    engines = get_test_engines()
    gateway_tests = TestPrivacyGateway()
    try:
        gateway_tests.test_pv_reunion_long(engines)
    except AssertionError:
        print("❌ Test Failed")
        # Show which assertion failed; a bare assert carries no message, so
        # the traceback is the only way to see the failing line.
        traceback.print_exc()
        sys.exit(1)
    else:
        print("✅ Long PV Content Test: OK")
|
|