File size: 3,500 Bytes
0e45313
 
 
2639468
0e45313
 
 
 
 
0e7e965
0e45313
 
2639468
0e45313
 
 
 
 
 
 
 
 
 
 
2639468
0e45313
 
 
 
2639468
 
 
 
 
0e45313
2639468
0e45313
 
 
0e7e965
0e45313
2639468
 
 
0e7e965
2639468
 
 
 
 
0e7e965
 
 
0e45313
0e7e965
2639468
0e7e965
 
2639468
 
0e45313
 
2639468
 
0e45313
2639468
 
 
 
0e45313
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import sys
import os
import pytest
import time
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer, Pattern
from presidio_analyzer.predefined_recognizers import SpacyRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine

def get_test_engines():
    """Build the analyzer/anonymizer pair used by the privacy gateway tests.

    The analyzer is backed by a spaCy NLP engine configured with the
    ``en_core_web_lg`` and ``fr_core_news_lg`` models, and by a registry that
    holds the predefined recognizers for English and French, one extra French
    ``SpacyRecognizer`` and four French regex recognizers.

    Returns:
        A ``(analyzer, anonymizer)`` tuple.
    """
    spacy_models = [
        {"lang_code": "en", "model_name": "en_core_web_lg"},
        {"lang_code": "fr", "model_name": "fr_core_news_lg"},
    ]
    nlp_engine = NlpEngineProvider(
        nlp_configuration={"nlp_engine_name": "spacy", "models": spacy_models}
    ).create_engine()

    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(languages=["en", "fr"])

    # French NER recognizer. Each group pairs entity names with model labels;
    # NOTE(review): presumably (entities, spaCy labels) - confirm against the
    # SpacyRecognizer documentation.
    registry.add_recognizer(
        SpacyRecognizer(
            supported_language="fr",
            check_label_groups=[
                ("PERSON", ["PER", "PERSON"]),
                ("LOCATION", ["LOC", "GPE", "LOCATION"]),
                ("ORGANIZATION", ["ORG", "ORGANIZATION", "MISC"]),
            ],
        )
    )

    # Custom French regex recognizers: (entity, pattern name, regex, score).
    # Registered in this order.
    french_patterns = (
        # Street address: 1-4 digits followed by a street-type keyword.
        (
            "LOCATION",
            "address",
            r"(?i)\b\d{1,4}[\s,]+(?:rue|av|ave|avenue|bd|boulevard|impasse|place|square|quai|cours|passage|route|chemin)[\s\w\-\'àâäéèêëîïôöùûüç,]{2,100}\b",
            0.85,
        ),
        # SIRET: 3 + 3 + 3 + 5 digits, optionally separated by whitespace.
        ("SIRET", "siret", r"\b\d{3}\s*\d{3}\s*\d{3}\s*\d{5}\b", 1.0),
        # NIR: leading 1 or 2, then digit groups ("2A"/"2B" allowed in one group).
        (
            "FR_NIR",
            "nir",
            r"\b[12]\s*\d{2}\s*\d{2}\s*(?:\d{2}|2[AB])\s*\d{3}\s*\d{3}\s*\d{2}\b",
            1.0,
        ),
        # IBAN: two letters, two digits, then 4-character groups.
        (
            "IBAN_CODE",
            "iban_fr",
            r"\b[A-Z]{2}\d{2}(?:\s*[A-Z0-9]{4}){4,7}\s*[A-Z0-9]{1,4}\b",
            1.0,
        ),
    )
    for entity, pattern_name, regex, score in french_patterns:
        registry.add_recognizer(
            PatternRecognizer(
                supported_entity=entity,
                supported_language="fr",
                patterns=[Pattern(name=pattern_name, regex=regex, score=score)],
            )
        )

    return (
        AnalyzerEngine(
            nlp_engine=nlp_engine,
            registry=registry,
            default_score_threshold=0.25,
        ),
        AnonymizerEngine(),
    )

class TestPrivacyGateway:
    """Redaction checks that run French text through the analyzer and anonymizer."""

    def test_pv_reunion_long(self, pack):
        """Exact test of the meeting minutes (procès-verbal) supplied by the user.

        Args:
            pack: ``(analyzer, anonymizer)`` pair; the script entry point
                passes the result of ``get_test_engines()``.
        """
        analyzer, anonymizer = pack

        # The document is analysed as a single newline-separated string.
        document_lines = [
            "PROCÈS-VERBAL DE LA RÉUNION DU COMITÉ DE DIRECTION - AZUR LOGISTIQUE",
            "Lieu : Siège social, 15 Boulevard de la Libération, 13001 Marseille.",
            "Monsieur Jean-Pierre Moulin et Madame Sophie Berthier.",
            "SIRET 456 789 123 00015. IBAN FR76 3000 1000 2000 3000 4000 500.",
            "Email: jp.moulin@azur-logistique.fr. IP 192.168.1.45.",
        ]
        text = "\n".join(document_lines)

        findings = analyzer.analyze(text=text, language="fr")
        anonymized = anonymizer.anonymize(text=text, analyzer_results=findings)
        redacted = anonymized.text

        # None of these sensitive fragments may survive anonymization.
        sensitive_fragments = (
            "Jean-Pierre Moulin",
            "Sophie Berthier",
            "456 789 123 00015",
            "FR76",
            "Marseille",
            "192.168.1.45",
        )
        for fragment in sensitive_fragments:
            assert fragment not in redacted

if __name__ == "__main__":
    # Script entry point: run the long meeting-minutes test without pytest.
    # Exits with status 1 when an assertion fails.
    import traceback  # local import: only this script path needs it

    engines = get_test_engines()
    suite = TestPrivacyGateway()
    try:
        suite.test_pv_reunion_long(engines)
    except AssertionError:
        print("❌ Test Failed")
        # The test uses bare asserts (no message), so the traceback is the
        # only way to see which check failed; it goes to stderr.
        traceback.print_exc()
        sys.exit(1)
    else:
        print("✅ Long PV Content Test: OK")