# PII-Redaction / app.py
# NOTE(review): the lines below were Hugging Face Spaces page chrome
# accidentally captured into the source file; kept here as comments so the
# file is valid Python: uploader "abhiii", commit "Update app.py",
# revision 5c6a8c9 (verified).
# Importing as module.
import streamlit as st
import en_pipeline
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult, Pattern, PatternRecognizer, AnalysisExplanation
from presidio_analyzer.nlp_engine import NlpArtifacts,NlpEngineProvider
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer.recognizer_registry import RecognizerRegistry
from presidio_analyzer.predefined_recognizers import SpacyRecognizer
import spacy
from spacy import displacy
# Load the custom spaCy pipeline once at module import; reused later for
# the displacy entity visualisation.
nlp = en_pipeline.load()

# PII entity types this app detects (Presidio entity names); also passed as
# the `entities` filter to analyzer.analyze() further down.
supported_entities = [
    "CREDIT_CARD", "DATE_TIME", "EMAIL_ADDRESS", "IBAN_CODE", "IP_ADDRESS",
    "NRP", "LOCATION", "PERSON", "PHONE_NUMBER", "URL", "US_BANK_NUMBER",
    "US_DRIVER_LICENSE", "US_PASSPORT", "US_SSN", "US_ITIN",
]
@st.cache_resource
def get_analyzer():
    """Build and cache the Presidio analyzer used by this app.

    Returns:
        AnalyzerEngine: Presidio's default pattern/context recognizers plus
        a SpacyRecognizer restricted to the PII entity types this app
        redacts. Cached by Streamlit so the engine is constructed once per
        process, not on every rerun.
    """
    # Entity list kept local so the cached function is self-contained; see
    # https://microsoft.github.io/presidio/supported_entities/
    supported_entities = [
        "CREDIT_CARD", "DATE_TIME", "EMAIL_ADDRESS", "IBAN_CODE",
        "IP_ADDRESS", "NRP", "LOCATION", "PERSON", "PHONE_NUMBER", "URL",
        "US_BANK_NUMBER", "US_DRIVER_LICENSE", "US_PASSPORT", "US_SSN",
        "US_ITIN",
    ]

    # Default Presidio engine (regex/pattern recognizers + its own NLP
    # engine for NlpArtifacts).
    analyzer = AnalyzerEngine()

    # BUG FIX: SpacyRecognizer's first positional parameter is
    # `supported_language` (a str such as "en"), NOT an nlp pipeline.
    # The previous code did `SpacyRecognizer(nlp, supported_entities=...)`,
    # which set supported_language to a spacy Language object, so the
    # recognizer could never match calls with analyze(language="en").
    # SpacyRecognizer consumes NER results from the engine's NlpArtifacts
    # rather than holding a pipeline, so the redundant
    # spacy.load("en_pipeline") here was dropped as well.
    # NOTE(review): to make Presidio's NlpArtifacts come from the custom
    # "en_pipeline" model (instead of Presidio's default spaCy model),
    # configure an NlpEngineProvider with
    # {'nlp_engine_name': 'spacy', 'models': [{'lang_code': 'en',
    # 'model_name': 'en_pipeline'}]} and pass its engine to
    # AnalyzerEngine(nlp_engine=...) — confirm model availability first.
    spacy_recognizer = SpacyRecognizer(
        supported_language="en",
        supported_entities=supported_entities,
    )
    analyzer.registry.add_recognizer(spacy_recognizer)
    return analyzer
analyzer = get_analyzer()

# Page title.
st.header("PII-Redaction")

# Sample text pre-filled in the input widget so the demo works immediately.
default_value = '''While traveling through New York, Emily received an email from her bank at emily.brown@example.com informing her about a suspicious activity on her credit card number 4532-8291-1283-9427. She quickly noted down the IP address 192.168.1.1 from which the transaction was attempted. Concerned, she called her bank at (800) 555-1234 and provided her US bank account number 1234567890123456 to verify her identity.
Earlier that day, on July 18, 2024, at 3 PM, Emily had also received an important document via email from her financial advisor, john.doe@finance.com, regarding her upcoming trip to Paris, France. The document included her US passport number 123456789 and instructions for her to keep a copy of her US driver license number A1234567 for identification purposes during her travels.
she enabled two-factor authentication on all her accounts and noted down her backup email, emily.backup@example.org, in case she needed to recover any information.'''

input_text = st.text_input("Enter your text...", default_value)
st.divider()

# Detect PII spans in the user text, keeping Presidio's decision trace.
analyzer_results = analyzer.analyze(
    text=input_text,
    entities=supported_entities,
    language="en",
    return_decision_process=True,
)

# Replace every detected span with its entity-type placeholder.
engine = AnonymizerEngine()
result = engine.anonymize(text=input_text, analyzer_results=analyzer_results)

# Bundle the redacted text with the raw findings for display.
anonymization_results = {
    "anonymized": result.text,
    "found": [entity.to_dict() for entity in analyzer_results],
}
anonym = anonymization_results["anonymized"]

# Parse the original text with the custom spaCy pipeline for visualisation.
doc = nlp(input_text)
# Dependency-parse visualisation, currently disabled:
# st.header("Dependency visualizer")
# dep_svg = displacy.render(doc, style="dep", jupyter=False)
# st.image(dep_svg, width=400, use_column_width="never")

col1, col2 = st.columns(2)

with col1:
    # Left column: highlight the entities spaCy found in the raw text.
    st.header("Entity visualizer")
    # style="ent" renders named-entity highlighting as HTML.
    ent_html = displacy.render(doc, style="ent", jupyter=False)
    st.markdown(ent_html, unsafe_allow_html=True)

with col2:
    # Right column: the Presidio-anonymized text, side by side.
    st.header("Entity Anonymizer")
    st.markdown(anonym, unsafe_allow_html=True)