Spaces:

abhiii
/

PII-Redaction

Build error

App Files Files Community

abhiii committed on Jul 19, 2024

Commit

cf7c1ab

verified ·

1 Parent(s): 8005bb7

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -0

app.py CHANGED Viewed

@@ -2,10 +2,76 @@
 import streamlit as st
 import en_pipeline
 import spacy
 from spacy import displacy
 nlp = en_pipeline.load()
 # Display a section header:
 st.header("PII-Redaction")
@@ -35,6 +101,15 @@ input_text = st.text_input("Enter your text...", default_value)
 st.divider()
 doc = nlp(input_text)
@@ -51,3 +126,12 @@ st.header("Entity visualizer")
 ent_html = displacy.render(doc, style="ent", jupyter=False)
 # Display the entity visualization in the browser:
 st.markdown(ent_html, unsafe_allow_html=True)

 import streamlit as st
 import en_pipeline
+from presidio_anonymizer import AnonymizerEngine
+from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult, Pattern, PatternRecognizer, AnalysisExplanation
+from presidio_analyzer.nlp_engine import NlpArtifacts,NlpEngineProvider
+from presidio_analyzer import AnalyzerEngine
+from presidio_analyzer.nlp_engine import NlpEngineProvider
+from presidio_analyzer.recognizer_registry import RecognizerRegistry
+from presidio_analyzer.predefined_recognizers import SpacyRecognizer
 import spacy
 from spacy import displacy
 nlp = en_pipeline.load()
+def get_analyzer():
+    # https://microsoft.github.io/presidio/supported_entities/#list-of-supported-entities%20DEFAULT_ANOYNM_ENTITIES%20=%20[
+    supported_entities = ["CREDIT_CARD","DATE_TIME","EMAIL_ADDRESS","IBAN_CODE","IP_ADDRESS","NRP","LOCATION","PERSON","PHONE_NUMBER","URL","US_BANK_NUMBER","US_DRIVER_LICENSE","US_PASSPORT","US_SSN","US_ITIN"]
+    # using presidio default recognizer rules
+    # analyzer = AnalyzerEngine()
+    #uncomment below to add spacy predefined engines instead of default engine
+    config = {
+        'nlp_engine_name': 'spacy',
+        'models': [
+            {
+                'lang_code': 'en',
+                'model_name': 'en_core_web_sm'
+            },
+        ],
+        'ner_model_configuration': {
+            'labels_to_ignore': ['O'],
+            'model_to_presidio_entity_mapping': {
+                'PER': 'PERSON',
+                'LOC': 'LOCATION',
+                'DATE': 'DATE_TIME',
+                'GPE': 'LOCATION',
+              'PERSON': 'PERSON',
+              'TIME': 'DATE_TIME',
+            },
+            # 'low_confidence_score_multiplier': 0.4,
+            # 'low_score_entity_names': ['ID', 'ORG']
+        }
+    }
+    # Initialize the NLP engine with the recognizer registry
+    provider = NlpEngineProvider(nlp_configuration=config)
+    nlp_engine = provider.create_engine()
+    # Create the recognizer registry
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers()
+    # Pass the created NLP engine and supported_languages to the AnalyzerEngine
+    analyzer = AnalyzerEngine(
+        nlp_engine=nlp_engine,
+        supported_languages = "en",
+        registry=registry
+    )
+    # below mis useful when model to presidio mapping are same.
+    # Load spaCy model with transformers
+    nlp = spacy.load("en_pipeline")
+    # Integrate spaCy recognizer with Presidio
+    spacy_recognizer = SpacyRecognizer(nlp, supported_entities=supported_entities)
+    analyzer.registry.add_recognizer(spacy_recognizer)
+    return analyzer
+analyzer = get_analyzer()
 # Display a section header:
 st.header("PII-Redaction")
 # Insert a horizontal rule to separate page sections:
 st.divider()
+analyzer_results = analyzer.analyze(text=input_text, entities = supported_entities, language="en",return_decision_process=True,)
+# Text Anonymizer
+engine = AnonymizerEngine()
+result = engine.anonymize(text=text_fr, analyzer_results=analyzer_results)
+# Restructuring anonymizer results
+anonymization_results =  {"anonymized": result.text,"found": [entity.to_dict() for entity in analyzer_results]}
+words = [{'word': text_fr[obj['start']:obj['end']], 'entity_type':obj['entity_type'], 'start':obj['start'], 'end':obj['end']} for obj in anonymization_results['found']]
+anonym = anonymization_results['anonymized']
 # Run the spaCy pipeline on the entered text:
 doc = nlp(input_text)
 # Render the detected entities as markup; with jupyter=False the markup is
 # returned as a string:
 ent_html = displacy.render(doc, style="ent", jupyter=False)
 # Display the entity visualization in the browser:
 st.markdown(ent_html, unsafe_allow_html=True)
+st.divider()
+# Add a section header:
+st.header("Entity Anonymizer")
+# Display the entity visualization in the browser:
+st.markdown(anonym, unsafe_allow_html=True)