Spaces:
Runtime error
Runtime error
update app
Browse files
app.py
CHANGED
|
@@ -3,7 +3,7 @@ import streamlit as st
|
|
| 3 |
import re
|
| 4 |
import logging
|
| 5 |
from presidio_anonymizer import AnonymizerEngine
|
| 6 |
-
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, RecognizerResult
|
| 7 |
|
| 8 |
from annotated_text import annotated_text
|
| 9 |
from flair_recognizer import FlairRecognizer
|
|
@@ -41,21 +41,21 @@ def analyze(**kwargs):
|
|
| 41 |
if "entities" not in kwargs or "All" in kwargs["entities"]:
|
| 42 |
kwargs["entities"] = None
|
| 43 |
|
| 44 |
-
if st.session_state.excluded_words:
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
| 55 |
|
| 56 |
results = analyzer_engine().analyze(**kwargs)
|
| 57 |
-
|
| 58 |
-
return results
|
| 59 |
|
| 60 |
def annotate():
|
| 61 |
text = st.session_state.text
|
|
@@ -98,64 +98,76 @@ def analyze_text():
|
|
| 98 |
logging.info(f"This is the text being analysed: {st.session_state.text}")
|
| 99 |
st.session_state.text_error = ""
|
| 100 |
st.session_state.n_requests += 1
|
| 101 |
-
|
| 102 |
text=st.session_state.text,
|
| 103 |
entities=st_entities,
|
| 104 |
language="en",
|
| 105 |
return_decision_process=False,
|
| 106 |
)
|
| 107 |
|
| 108 |
-
|
| 109 |
-
|
| 110 |
|
| 111 |
if st.session_state.allowed_words:
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
st.session_state.analyze_results = analyze_results
|
| 115 |
|
| 116 |
logging.info(
|
| 117 |
f"analyse results: {st.session_state.analyze_results}\n"
|
| 118 |
)
|
| 119 |
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
#
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
logging.info(
|
| 149 |
-
f"analyse results
|
| 150 |
)
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
if st.session_state.text[token.start:token.end] not in st.session_state.allowed_words:
|
| 153 |
analyze_results_fltered.append(token)
|
| 154 |
logging.info(
|
| 155 |
f"analyse results after removing allowed words: {analyze_results_fltered}\n"
|
| 156 |
)
|
| 157 |
-
|
| 158 |
-
|
| 159 |
|
| 160 |
@st.cache(allow_output_mutation=True)
|
| 161 |
def anonymizer_engine():
|
|
@@ -190,8 +202,7 @@ def anonymise_text():
|
|
| 190 |
def clear_results():
|
| 191 |
st.session_state.anon_results=""
|
| 192 |
st.session_state.analyze_results=""
|
| 193 |
-
#
|
| 194 |
-
analyzer_engine().registry.remove_recognizer("Excluded words recognizer")
|
| 195 |
|
| 196 |
#######################################
|
| 197 |
#### Initialize "global" variables ####
|
|
|
|
| 3 |
import re
|
| 4 |
import logging
|
| 5 |
from presidio_anonymizer import AnonymizerEngine
|
| 6 |
+
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, RecognizerResult, EntityRecognizer
|
| 7 |
|
| 8 |
from annotated_text import annotated_text
|
| 9 |
from flair_recognizer import FlairRecognizer
|
|
|
|
| 41 |
if "entities" not in kwargs or "All" in kwargs["entities"]:
|
| 42 |
kwargs["entities"] = None
|
| 43 |
|
| 44 |
+
# if st.session_state.excluded_words:
|
| 45 |
+
|
| 46 |
+
# deny_list = [i.strip() for i in st.session_state.excluded_words.split(',')]
|
| 47 |
+
|
| 48 |
+
# logging.info(
|
| 49 |
+
# f"words excluded : {deny_list}\n"
|
| 50 |
+
# )
|
| 51 |
+
|
| 52 |
+
# excluded_words_recognizer = PatternRecognizer(supported_entity="MANUAL ADD",
|
| 53 |
+
# name="Excluded words recognizer",
|
| 54 |
+
# deny_list=deny_list)
|
| 55 |
+
# analyzer_engine().registry.add_recognizer(excluded_words_recognizer)
|
| 56 |
|
| 57 |
results = analyzer_engine().analyze(**kwargs)
|
| 58 |
+
st.session_state.analyze_results = results
|
|
|
|
| 59 |
|
| 60 |
def annotate():
|
| 61 |
text = st.session_state.text
|
|
|
|
| 98 |
logging.info(f"This is the text being analysed: {st.session_state.text}")
|
| 99 |
st.session_state.text_error = ""
|
| 100 |
st.session_state.n_requests += 1
|
| 101 |
+
analyze(
|
| 102 |
text=st.session_state.text,
|
| 103 |
entities=st_entities,
|
| 104 |
language="en",
|
| 105 |
return_decision_process=False,
|
| 106 |
)
|
| 107 |
|
| 108 |
+
if st.session_state.excluded_words:
|
| 109 |
+
include_manual_input()
|
| 110 |
|
| 111 |
if st.session_state.allowed_words:
|
| 112 |
+
exclude_manual_input()
|
|
|
|
|
|
|
| 113 |
|
| 114 |
logging.info(
|
| 115 |
f"analyse results: {st.session_state.analyze_results}\n"
|
| 116 |
)
|
| 117 |
|
| 118 |
|
| 119 |
+
def include_manual_input():
    """Add user-supplied words to the analysis results as 'MANUALLY ADDED' entities.

    Reads the comma-separated words from ``st.session_state.excluded_words``,
    finds every whole-word occurrence of them in ``st.session_state.text`` and
    appends one ``RecognizerResult`` (score 1.0) per occurrence to
    ``st.session_state.analyze_results``.
    """

    def _deny_list_to_regex(deny_list):
        """
        Convert a list of words to a matching regex.
        To be analyzed by the analyze method as any other regex patterns.
        :param deny_list: the list of words to detect
        :return:the regex of the words for detection
        """
        # Escape deny list elements as preparation for regex
        escaped_deny_list = [re.escape(element) for element in deny_list]
        regex = r"(?:^|(?<=\W))(" + "|".join(escaped_deny_list) + r")(?:(?=\W)|$)"
        return regex

    text = st.session_state.text

    # Drop blank entries (e.g. from a trailing comma): an empty alternative
    # in the regex would only ever produce zero-length matches.
    deny_list = [
        word.strip() for word in st.session_state.excluded_words.split(',')
    ]
    deny_list = [word for word in deny_list if word]

    results = []
    if deny_list:
        deny_list_pattern = _deny_list_to_regex(deny_list)
        for match in re.finditer(deny_list_pattern, text):
            start, end = match.span()

            # Skip empty results
            if start == end:
                continue

            results.append(
                RecognizerResult(
                    entity_type='MANUALLY ADDED',
                    start=start,
                    end=end,
                    score=1.0,
                )
            )

        results = EntityRecognizer.remove_duplicates(results)

    # list.extend() mutates in place and returns None, so assigning its return
    # value would replace the stored results with None. Build a new list
    # instead; a falsy stored value (e.g. the "" set by clear_results) is
    # treated as "no results yet".
    existing_results = st.session_state.analyze_results or []
    st.session_state.analyze_results = list(existing_results) + results

    logging.info(
        f"analyse results after adding excluded words: {results}\n"
    )
|
| 160 |
+
|
| 161 |
+
def exclude_manual_input():
    """Remove detected entities whose text is one of the user's allowed words.

    Every result in ``st.session_state.analyze_results`` whose covered text
    (``text[start:end]``) matches an allowed word is dropped; the filtered
    list is written back to ``st.session_state.analyze_results``.
    """
    text = st.session_state.text
    allowed_words = st.session_state.allowed_words

    # NOTE(review): allowed_words is presumably a comma-separated string like
    # excluded_words - confirm against the widget that sets it. Using `in` on
    # the raw string is a substring test, so it is split into exact words
    # first. Any other iterable of words is used as given.
    if isinstance(allowed_words, str):
        allowed = {word.strip() for word in allowed_words.split(',')}
        allowed.discard("")
    else:
        allowed = set(allowed_words)

    analyze_results_fltered = [
        token
        for token in st.session_state.analyze_results
        if text[token.start:token.end] not in allowed
    ]

    logging.info(
        f"analyse results after removing allowed words: {analyze_results_fltered}\n"
    )
    st.session_state.analyze_results = analyze_results_fltered
|
|
|
|
| 171 |
|
| 172 |
@st.cache(allow_output_mutation=True)
|
| 173 |
def anonymizer_engine():
|
|
|
|
| 202 |
def clear_results():
    """Reset the stored anonymisation and analysis results to empty strings."""
    session = st.session_state
    session.anon_results = ""
    session.analyze_results = ""
    # Registry cleanup is currently disabled:
    # analyzer_engine().registry.remove_recognizer("Excluded words recognizer")
|
|
|
|
| 206 |
|
| 207 |
#######################################
|
| 208 |
#### Initialize "global" variables ####
|