# PII-Redaction / app.py
# NOTE(review): the lines below were Hugging Face Spaces page chrome
# accidentally captured into the source file; kept here as comments so the
# file is valid Python: uploader "abhiii", commit "Update app.py",
# revision 5c6a8c9 (verified).
# Importing as module.
import streamlit as st
import en_pipeline
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult, Pattern, PatternRecognizer, AnalysisExplanation
from presidio_analyzer.nlp_engine import NlpArtifacts,NlpEngineProvider
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer.recognizer_registry import RecognizerRegistry
from presidio_analyzer.predefined_recognizers import SpacyRecognizer
import spacy
from spacy import displacy
# Load the custom spaCy pipeline once at module import; reused later for
# the displacy entity visualisation.
nlp = en_pipeline.load()

# PII entity types this app detects (Presidio entity names); also passed as
# the `entities` filter to analyzer.analyze() further down.
supported_entities = [
    "CREDIT_CARD", "DATE_TIME", "EMAIL_ADDRESS", "IBAN_CODE", "IP_ADDRESS",
    "NRP", "LOCATION", "PERSON", "PHONE_NUMBER", "URL", "US_BANK_NUMBER",
    "US_DRIVER_LICENSE", "US_PASSPORT", "US_SSN", "US_ITIN",
]
@st.cache_resource
def get_analyzer():
    """Build and cache the Presidio analyzer used by this app.

    Returns:
        AnalyzerEngine: Presidio's default pattern/context recognizers plus
        a SpacyRecognizer restricted to the PII entity types this app
        redacts. Cached by Streamlit so the engine is constructed once per
        process, not on every rerun.
    """
    # Entity list kept local so the cached function is self-contained; see
    # https://microsoft.github.io/presidio/supported_entities/
    supported_entities = [
        "CREDIT_CARD", "DATE_TIME", "EMAIL_ADDRESS", "IBAN_CODE",
        "IP_ADDRESS", "NRP", "LOCATION", "PERSON", "PHONE_NUMBER", "URL",
        "US_BANK_NUMBER", "US_DRIVER_LICENSE", "US_PASSPORT", "US_SSN",
        "US_ITIN",
    ]

    # Default Presidio engine (regex/pattern recognizers + its own NLP
    # engine for NlpArtifacts).
    analyzer = AnalyzerEngine()

    # BUG FIX: SpacyRecognizer's first positional parameter is
    # `supported_language` (a str such as "en"), NOT an nlp pipeline.
    # The previous code did `SpacyRecognizer(nlp, supported_entities=...)`,
    # which set supported_language to a spacy Language object, so the
    # recognizer could never match calls with analyze(language="en").
    # SpacyRecognizer consumes NER results from the engine's NlpArtifacts
    # rather than holding a pipeline, so the redundant
    # spacy.load("en_pipeline") here was dropped as well.
    # NOTE(review): to make Presidio's NlpArtifacts come from the custom
    # "en_pipeline" model (instead of Presidio's default spaCy model),
    # configure an NlpEngineProvider with
    # {'nlp_engine_name': 'spacy', 'models': [{'lang_code': 'en',
    # 'model_name': 'en_pipeline'}]} and pass its engine to
    # AnalyzerEngine(nlp_engine=...) — confirm model availability first.
    spacy_recognizer = SpacyRecognizer(
        supported_language="en",
        supported_entities=supported_entities,
    )
    analyzer.registry.add_recognizer(spacy_recognizer)
    return analyzer
analyzer = get_analyzer()

# Page title.
st.header("PII-Redaction")

# Sample text pre-filled in the input widget so the demo works immediately.
default_value = '''While traveling through New York, Emily received an email from her bank at emily.brown@example.com informing her about a suspicious activity on her credit card number 4532-8291-1283-9427. She quickly noted down the IP address 192.168.1.1 from which the transaction was attempted. Concerned, she called her bank at (800) 555-1234 and provided her US bank account number 1234567890123456 to verify her identity.
Earlier that day, on July 18, 2024, at 3 PM, Emily had also received an important document via email from her financial advisor, john.doe@finance.com, regarding her upcoming trip to Paris, France. The document included her US passport number 123456789 and instructions for her to keep a copy of her US driver license number A1234567 for identification purposes during her travels.
she enabled two-factor authentication on all her accounts and noted down her backup email, emily.backup@example.org, in case she needed to recover any information.'''

input_text = st.text_input("Enter your text...", default_value)
st.divider()

# Detect PII spans in the user text, keeping Presidio's decision trace.
analyzer_results = analyzer.analyze(
    text=input_text,
    entities=supported_entities,
    language="en",
    return_decision_process=True,
)

# Replace every detected span with its entity-type placeholder.
engine = AnonymizerEngine()
result = engine.anonymize(text=input_text, analyzer_results=analyzer_results)

# Bundle the redacted text with the raw findings for display.
anonymization_results = {
    "anonymized": result.text,
    "found": [entity.to_dict() for entity in analyzer_results],
}
anonym = anonymization_results["anonymized"]

# Parse the original text with the custom spaCy pipeline for visualisation.
doc = nlp(input_text)
# Dependency-parse visualisation, currently disabled:
# st.header("Dependency visualizer")
# dep_svg = displacy.render(doc, style="dep", jupyter=False)
# st.image(dep_svg, width=400, use_column_width="never")

col1, col2 = st.columns(2)

with col1:
    # Left column: highlight the entities spaCy found in the raw text.
    st.header("Entity visualizer")
    # style="ent" renders named-entity highlighting as HTML.
    ent_html = displacy.render(doc, style="ent", jupyter=False)
    st.markdown(ent_html, unsafe_allow_html=True)

with col2:
    # Right column: the Presidio-anonymized text, side by side.
    st.header("Entity Anonymizer")
    st.markdown(anonym, unsafe_allow_html=True)