"""Streamlit demo: detect PII with Presidio and show a redacted copy of the text.

The page renders two columns: the spaCy entity visualisation of the input
text, and the same text after Presidio has replaced every detected entity
with a placeholder.
"""
import html

# Importing as module.
import en_pipeline
import spacy
import streamlit as st
from presidio_analyzer import (
    AnalysisExplanation,
    AnalyzerEngine,
    EntityRecognizer,
    Pattern,
    PatternRecognizer,
    RecognizerResult,
)
from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
from presidio_analyzer.predefined_recognizers import SpacyRecognizer
from presidio_analyzer.recognizer_registry import RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine
from spacy import displacy


@st.cache_resource
def get_nlp():
    """Load the packaged spaCy pipeline once instead of on every Streamlit rerun."""
    return en_pipeline.load()


nlp = get_nlp()

# Entity types requested from Presidio; see
# https://microsoft.github.io/presidio/supported_entities/
supported_entities = [
    "CREDIT_CARD",
    "DATE_TIME",
    "EMAIL_ADDRESS",
    "IBAN_CODE",
    "IP_ADDRESS",
    "NRP",
    "LOCATION",
    "PERSON",
    "PHONE_NUMBER",
    "URL",
    "US_BANK_NUMBER",
    "US_DRIVER_LICENSE",
    "US_PASSPORT",
    "US_SSN",
    "US_ITIN",
]


@st.cache_resource
def get_analyzer() -> AnalyzerEngine:
    """Build the Presidio analyzer used by the page.

    Returns:
        An ``AnalyzerEngine`` with Presidio's default recognizers plus a
        ``SpacyRecognizer`` limited to ``supported_entities``. The result is
        cached by Streamlit, so the engine is built once per server process.
    """
    # Using presidio default recognizer rules.
    analyzer = AnalyzerEngine()

    # Uncomment below to build the analyzer on an explicitly configured spaCy
    # NLP engine instead of the default engine.
    # config = {
    #     'nlp_engine_name': 'spacy',
    #     'models': [
    #         {
    #             'lang_code': 'en',
    #             'model_name': 'en_core_web_sm'
    #         },
    #     ],
    #     'ner_model_configuration': {
    #         'labels_to_ignore': ['O'],
    #         'model_to_presidio_entity_mapping': {
    #             'PER': 'PERSON',
    #             'LOC': 'LOCATION',
    #             'DATE': 'DATE_TIME',
    #             'GPE': 'LOCATION',
    #             'PERSON': 'PERSON',
    #             'TIME': 'DATE_TIME',
    #         },
    #         # 'low_confidence_score_multiplier': 0.4,
    #         # 'low_score_entity_names': ['ID', 'ORG']
    #     }
    # }
    # # Initialize the NLP engine from the configuration
    # provider = NlpEngineProvider(nlp_configuration=config)
    # nlp_engine = provider.create_engine()
    # # Create the recognizer registry
    # registry = RecognizerRegistry()
    # registry.load_predefined_recognizers()
    # # Pass the created NLP engine and supported_languages to the AnalyzerEngine
    # analyzer = AnalyzerEngine(
    #     nlp_engine=nlp_engine,
    #     supported_languages="en",
    #     registry=registry
    # )

    # The block below is useful when the model labels and the Presidio entity
    # names are the same.
    # Integrate spaCy recognizer with Presidio. The first positional parameter
    # of SpacyRecognizer is `supported_language`, so the language is passed by
    # keyword; handing it a spaCy pipeline object there leaves the recognizer
    # registered under a language that never equals "en", and it is then
    # skipped on every analyze() call.
    # NOTE(review): SpacyRecognizer reads entities from the analyzer's NLP
    # engine, not from a pipeline of its own. To make `en_pipeline` drive
    # detection, configure it through NlpEngineProvider as sketched above
    # -- confirm the model's labels map to Presidio entity names first.
    spacy_recognizer = SpacyRecognizer(
        supported_language="en",
        supported_entities=supported_entities,
    )
    analyzer.registry.add_recognizer(spacy_recognizer)
    return analyzer


analyzer = get_analyzer()

# Display a section header:
st.header("PII-Redaction")

# Adding the text that will show in the text box as default.
default_value = '''While traveling through New York, Emily received an email from her bank at emily.brown@example.com informing her about a suspicious activity on her credit card number 4532-8291-1283-9427. She quickly noted down the IP address 192.168.1.1 from which the transaction was attempted. Concerned, she called her bank at (800) 555-1234 and provided her US bank account number 1234567890123456 to verify her identity. Earlier that day, on July 18, 2024, at 3 PM, Emily had also received an important document via email from her financial advisor, john.doe@finance.com, regarding her upcoming trip to Paris, France. The document included her US passport number 123456789 and instructions for her to keep a copy of her US driver license number A1234567 for identification purposes during her travels. 
she enabled two-factor authentication on all her accounts and noted down her backup email, emily.backup@example.org, in case she needed to recover any information.'''

input_text = st.text_input("Enter your text...", default_value)
st.divider()

analyzer_results = analyzer.analyze(
    text=input_text,
    entities=supported_entities,
    language="en",
    return_decision_process=True,
)

# Text Anonymizer
engine = AnonymizerEngine()
result = engine.anonymize(text=input_text, analyzer_results=analyzer_results)

# Restructuring anonymizer results
anonymization_results = {
    "anonymized": result.text,
    "found": [entity.to_dict() for entity in analyzer_results],
}
anonym = anonymization_results["anonymized"]

doc = nlp(input_text)

# # Display a section header:
# st.header("Dependency visualizer")
# # style="dep" indicates dependencies should be generated.
# dep_svg = displacy.render(doc, style="dep", jupyter=False)
# st.image(dep_svg, width=400, use_column_width="never")

col1, col2 = st.columns(2)

with col1:
    # Add a section header:
    st.header("Entity visualizer")
    # Take the text from the input field and render the entity html.
    # Note that style="ent" indicates entities.
    ent_html = displacy.render(doc, style="ent", jupyter=False)
    # Display the entity visualization in the browser; displaCy returns HTML
    # markup, so raw HTML has to be allowed for this call.
    st.markdown(ent_html, unsafe_allow_html=True)

with col2:
    # Add a section header:
    st.header("Entity Anonymizer")
    # The anonymized text contains placeholders such as <PERSON>. Rendered as
    # raw HTML they are parsed as unknown tags and vanish from the page, and
    # any markup typed by the user would be injected verbatim. Escape the text
    # and keep raw HTML disabled so the placeholders are shown literally.
    st.markdown(html.escape(anonym))