abhiii committed on
Commit
cf7c1ab
·
verified ·
1 Parent(s): 8005bb7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -0
app.py CHANGED
@@ -2,10 +2,76 @@
2
 
3
  import streamlit as st
4
  import en_pipeline
 
 
 
 
 
 
 
 
5
  import spacy
6
  from spacy import displacy
7
  nlp = en_pipeline.load()
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  # Display a section header:
10
  st.header("PII-Redaction")
11
 
@@ -35,6 +101,15 @@ input_text = st.text_input("Enter your text...", default_value)
35
 
36
  st.divider()
37
 
 
 
 
 
 
 
 
 
 
38
 
39
  doc = nlp(input_text)
40
 
@@ -51,3 +126,12 @@ st.header("Entity visualizer")
51
  ent_html = displacy.render(doc, style="ent", jupyter=False)
52
  # Display the entity visualization in the browser:
53
  st.markdown(ent_html, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
2
 
3
  import streamlit as st
4
  import en_pipeline
5
+ from presidio_anonymizer import AnonymizerEngine
6
+ from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult, Pattern, PatternRecognizer, AnalysisExplanation
7
+ from presidio_analyzer.nlp_engine import NlpArtifacts,NlpEngineProvider
8
+ from presidio_analyzer import AnalyzerEngine
9
+ from presidio_analyzer.nlp_engine import NlpEngineProvider
10
+ from presidio_analyzer.recognizer_registry import RecognizerRegistry
11
+ from presidio_analyzer.predefined_recognizers import SpacyRecognizer
12
+
13
  import spacy
14
  from spacy import displacy
15
  nlp = en_pipeline.load()
16
 
17
def get_analyzer() -> "AnalyzerEngine":
    """Build the Presidio analyzer used to detect PII in the input text.

    The analyzer is backed by a spaCy NLP engine, loads Presidio's predefined
    recognizers, and registers one extra ``SpacyRecognizer`` restricted to the
    entity types this app redacts.

    Returns:
        A configured ``AnalyzerEngine`` for English ("en") text.
    """
    # Entity types to report; see
    # https://microsoft.github.io/presidio/supported_entities/#list-of-supported-entities
    supported_entities = [
        "CREDIT_CARD",
        "DATE_TIME",
        "EMAIL_ADDRESS",
        "IBAN_CODE",
        "IP_ADDRESS",
        "NRP",
        "LOCATION",
        "PERSON",
        "PHONE_NUMBER",
        "URL",
        "US_BANK_NUMBER",
        "US_DRIVER_LICENSE",
        "US_PASSPORT",
        "US_SSN",
        "US_ITIN",
    ]

    # Alternative: use Presidio's default engine and recognizer rules with
    # ``AnalyzerEngine()`` instead of the explicit spaCy configuration below.
    config = {
        'nlp_engine_name': 'spacy',
        'models': [
            {
                'lang_code': 'en',
                'model_name': 'en_core_web_sm',
            },
        ],
        'ner_model_configuration': {
            'labels_to_ignore': ['O'],
            # Map the spaCy model's NER labels onto Presidio entity names.
            'model_to_presidio_entity_mapping': {
                'PER': 'PERSON',
                'LOC': 'LOCATION',
                'DATE': 'DATE_TIME',
                'GPE': 'LOCATION',
                'PERSON': 'PERSON',
                'TIME': 'DATE_TIME',
            },
            # Optional tuning knobs, currently disabled:
            # 'low_confidence_score_multiplier': 0.4,
            # 'low_score_entity_names': ['ID', 'ORG']
        },
    }

    # Create the NLP engine from the configuration above.
    provider = NlpEngineProvider(nlp_configuration=config)
    nlp_engine = provider.create_engine()

    # Create the recognizer registry with Presidio's predefined recognizers.
    registry = RecognizerRegistry()
    registry.load_predefined_recognizers()

    # supported_languages must be a list of language codes; a bare "en"
    # string is not the documented type and does not compare equal to the
    # registry's language list.
    analyzer = AnalyzerEngine(
        nlp_engine=nlp_engine,
        supported_languages=["en"],
        registry=registry,
    )

    # SpacyRecognizer's first parameter is the language code, not a spaCy
    # pipeline: passing a pipeline object there yields a recognizer that never
    # matches language "en" and is silently skipped. The recognizer reads the
    # entities produced by the analyzer's own NLP engine.
    # NOTE(review): if the custom "en_pipeline" model is meant to drive
    # detection, set it as 'model_name' in the config above - TODO confirm.
    spacy_recognizer = SpacyRecognizer(
        supported_language="en",
        supported_entities=supported_entities,
    )
    analyzer.registry.add_recognizer(spacy_recognizer)

    return analyzer
73
+
74
+ analyzer = get_analyzer()
75
  # Display a section header:
76
  st.header("PII-Redaction")
77
 
 
101
 
102
  st.divider()
103
 
104
# Entity types to redact. Defined here because the list inside get_analyzer()
# is local to that function and is not visible at module scope.
supported_entities = [
    "CREDIT_CARD",
    "DATE_TIME",
    "EMAIL_ADDRESS",
    "IBAN_CODE",
    "IP_ADDRESS",
    "NRP",
    "LOCATION",
    "PERSON",
    "PHONE_NUMBER",
    "URL",
    "US_BANK_NUMBER",
    "US_DRIVER_LICENSE",
    "US_PASSPORT",
    "US_SSN",
    "US_ITIN",
]

# Detect PII spans in the text the user entered.
analyzer_results = analyzer.analyze(
    text=input_text,
    entities=supported_entities,
    language="en",
    return_decision_process=True,
)

# Text Anonymizer: must receive the same text that was analyzed so that the
# start/end offsets in analyzer_results line up.
engine = AnonymizerEngine()
result = engine.anonymize(text=input_text, analyzer_results=analyzer_results)

# Restructuring anonymizer results
anonymization_results = {
    "anonymized": result.text,
    "found": [entity.to_dict() for entity in analyzer_results],
}
# One record per detected entity, including the matched substring.
words = [
    {
        'word': input_text[obj['start']:obj['end']],
        'entity_type': obj['entity_type'],
        'start': obj['start'],
        'end': obj['end'],
    }
    for obj in anonymization_results['found']
]
anonym = anonymization_results['anonymized']
113
 
114
  doc = nlp(input_text)
115
 
 
126
  ent_html = displacy.render(doc, style="ent", jupyter=False)
127
  # Display the entity visualization in the browser:
128
  st.markdown(ent_html, unsafe_allow_html=True)
129
+
130
st.divider()

# Add a section header:
st.header("Entity Anonymizer")
# Display the anonymized text. It is derived from user input and contains
# Presidio placeholders such as <PERSON>, so HTML is left escaped: rendering
# it as raw HTML would hide the placeholders (unknown tags) and would let
# markup typed by the user reach the page.
st.markdown(anonym)
136
+
137
+