Spaces:

abhiii
/

PII-Redaction

Build error

File size: 5,473 Bytes

456745c
919d160
 
456745c
cf7c1ab
 
 
 
 
 
 
 
937b11d
 
919d160
639b844
5c6a8c9
 
cf7c1ab
 
 
 
 
833cc0b
cf7c1ab
833cc0b
 
 
 
 
 
 
 
cf7c1ab
833cc0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf7c1ab
833cc0b
 
 
cf7c1ab
833cc0b
 
 
 
 
 
cf7c1ab
 
 
 
 
 
 
 
 
 
 
 
919d160
683a289
919d160
 
 
 
 
 
e4732ee
919d160
683a289
8005bb7
 
 
683a289
 
 
cf7c1ab
 
 
ba21b98
cf7c1ab
 
 
 
8005bb7
 
 
 
 
 
 
 
833cc0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf7c1ab

# Importing as module.

import streamlit as st
import en_pipeline
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult, Pattern, PatternRecognizer, AnalysisExplanation
from presidio_analyzer.nlp_engine import NlpArtifacts,NlpEngineProvider
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer.recognizer_registry import RecognizerRegistry
from presidio_analyzer.predefined_recognizers import SpacyRecognizer

import spacy
from spacy import displacy
@st.cache_resource
def _load_display_pipeline():
    """Load the packaged ``en_pipeline`` spaCy model once per server process.

    Streamlit re-executes this script from top to bottom on every widget
    interaction. Without caching, the model would be loaded again on each
    rerun; ``st.cache_resource`` keeps a single shared instance instead.

    Returns:
        The object returned by ``en_pipeline.load()`` (a spaCy pipeline).
    """
    return en_pipeline.load()


# spaCy pipeline used further down to build the Doc for the displaCy view.
nlp = _load_display_pipeline()

# Presidio entity types requested from the analyzer.
# See https://microsoft.github.io/presidio/supported_entities/
supported_entities = [
    "CREDIT_CARD",
    "DATE_TIME",
    "EMAIL_ADDRESS",
    "IBAN_CODE",
    "IP_ADDRESS",
    "NRP",
    "LOCATION",
    "PERSON",
    "PHONE_NUMBER",
    "URL",
    "US_BANK_NUMBER",
    "US_DRIVER_LICENSE",
    "US_PASSPORT",
    "US_SSN",
    "US_ITIN",
]

@st.cache_resource
def get_analyzer():
    """Build the Presidio analyzer that runs on the ``en_pipeline`` spaCy model.

    The engine is created once and cached by Streamlit, so script reruns
    reuse the same instance.

    ``SpacyRecognizer`` does not take a loaded pipeline: its first positional
    parameter is ``supported_language``. Passing the pipeline there registers
    a recognizer whose language never equals ``"en"``, so it is never
    selected. The model is therefore supplied through the NLP engine
    configuration, and the recognizer is registered with keyword arguments.

    Returns:
        AnalyzerEngine: analyzer for English text with Presidio's predefined
        recognizers plus a ``SpacyRecognizer`` covering every entry of the
        module-level ``supported_entities`` list.
    """
    # Route Presidio's NLP step through the installed ``en_pipeline`` package
    # instead of the library's default spaCy model.
    # NOTE(review): assumes the labels emitted by en_pipeline either match
    # Presidio entity names or are covered by the default label mapping;
    # add a ``ner_model_configuration`` section here if they are not.
    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_pipeline"}],
    }
    nlp_engine = NlpEngineProvider(
        nlp_configuration=nlp_configuration
    ).create_engine()

    # The default registry (predefined pattern/checksum recognizers) is kept.
    analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])

    # Expose the model's NER output for the full entity list, not only the
    # entity types the predefined spaCy recognizer declares.
    spacy_recognizer = SpacyRecognizer(
        supported_language="en",
        supported_entities=supported_entities,
    )
    analyzer.registry.add_recognizer(spacy_recognizer)

    return analyzer

# Cached analyzer shared across Streamlit reruns.
analyzer = get_analyzer()

# Display a section header:
st.header("PII-Redaction")

# Sample text shown in the input box by default.
default_value = '''While traveling through New York, Emily received an email from her bank at emily.brown@example.com informing her about a suspicious activity on her credit card number 4532-8291-1283-9427. She quickly noted down the IP address 192.168.1.1 from which the transaction was attempted. Concerned, she called her bank at (800) 555-1234 and provided her US bank account number 1234567890123456 to verify her identity.

Earlier that day, on July 18, 2024, at 3 PM, Emily had also received an important document via email from her financial advisor, john.doe@finance.com, regarding her upcoming trip to Paris, France. The document included her US passport number 123456789 and instructions for her to keep a copy of her US driver license number A1234567 for identification purposes during her travels.

she enabled two-factor authentication on all her accounts and noted down her backup email, emily.backup@example.org, in case she needed to recover any information.'''

# The sample spans several paragraphs, so a multi-line widget is used;
# a single-line text input cannot hold the line breaks.
input_text = st.text_area("Enter your text...", default_value)

st.divider()

# Detect PII spans in the submitted text.
analyzer_results = analyzer.analyze(
    text=input_text,
    entities=supported_entities,
    language="en",
    return_decision_process=True,
)

# Text Anonymizer: replace every detected span using the default operator.
engine = AnonymizerEngine()
result = engine.anonymize(text=input_text, analyzer_results=analyzer_results)
anonym = result.text

# spaCy Doc used by the displaCy entity view below.
doc = nlp(input_text)

# # Display a section header:
# st.header("Dependency visualizer")
# # style="dep" indicates dependencies should be generated.
# dep_svg = displacy.render(doc, style="dep", jupyter=False)
# st.image(dep_svg, width=400, use_column_width="never")
col1, col2 = st.columns(2)

with col1:
    # Add a section header:
    st.header("Entity visualizer")
    # Take the text from the input field and render the entity html.
    # Note that style="ent" indicates entities.
    ent_html = displacy.render(doc, style="ent", jupyter=False)
    # displaCy returns HTML markup, so raw HTML rendering is required here.
    st.markdown(ent_html, unsafe_allow_html=True)

with col2:
    # Add a section header:
    st.header("Entity Anonymizer")
    # The anonymized text is plain text derived from user input, and its
    # placeholders (by default of the form <ENTITY_TYPE>) look like HTML tags.
    # Rendering it as raw HTML would swallow the placeholders and inject
    # user-supplied markup, so HTML rendering stays disabled.
    st.markdown(anonym)