feat(deidentification): #1 add French cities
Browse files
clinphen_src/data/hpo_synonym_filter.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
clinphen_src/get_phenotypes_lf.py
CHANGED
|
@@ -3,7 +3,7 @@ from nltk.stem import WordNetLemmatizer
|
|
| 3 |
import pandas as pd
|
| 4 |
import re
|
| 5 |
|
| 6 |
-
HPO_SYN_MAP_FILE = "clinphen_src/data/
|
| 7 |
|
| 8 |
def getNames():
|
| 9 |
returnMap = {}
|
|
@@ -23,10 +23,10 @@ def end_of_point(word):
|
|
| 23 |
if word == "though": return True
|
| 24 |
return False
|
| 25 |
|
| 26 |
-
subpoint_enders = [":"] #","
|
| 27 |
def end_of_subpoint(word):
|
| 28 |
if word[-1] in subpoint_enders: return True
|
| 29 |
-
|
| 30 |
return False
|
| 31 |
|
| 32 |
def string_to_record_linewise(medical_record):
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
import re
|
| 5 |
|
| 6 |
+
HPO_SYN_MAP_FILE = "clinphen_src/data/hpo_synonym_filter.txt"
|
| 7 |
|
| 8 |
def getNames():
|
| 9 |
returnMap = {}
|
|
|
|
| 23 |
if word == "though": return True
|
| 24 |
return False
|
| 25 |
|
| 26 |
+
# Trailing characters that close a sub-point inside a sentence.
subpoint_enders = [":", ',']


def end_of_subpoint(word):
    """Return True when *word* marks the end of a sub-point.

    A word ends a sub-point when it finishes with one of the characters
    listed in ``subpoint_enders`` or when it is exactly the conjunction
    "and" (case-sensitive). An empty string never ends a sub-point.
    """
    # str.endswith with a tuple is simply False for "", whereas indexing
    # word[-1] raised IndexError on an empty token.
    if word.endswith(tuple(subpoint_enders)):
        return True
    return word == "and"
|
| 31 |
|
| 32 |
def string_to_record_linewise(medical_record):
|
data/fr_abbreviations.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{
|
|
|
|
| 1 |
+
{}
|
data/fr_abbreviations_translation.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{
|
|
|
|
| 1 |
+
{}
|
data/hp_fr_en_translated_marian_review_lwg.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/proper_noun_location_sort.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
lf_app.py
CHANGED
|
@@ -12,7 +12,7 @@ from typing import List
|
|
| 12 |
import transformers
|
| 13 |
from typing import Sequence
|
| 14 |
import spacy
|
| 15 |
-
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
|
| 16 |
from presidio_analyzer.nlp_engine import NlpEngineProvider
|
| 17 |
from presidio_anonymizer import AnonymizerEngine
|
| 18 |
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
|
|
@@ -66,6 +66,18 @@ def get_models():
|
|
| 66 |
print(spacy_model_name + " already downloaded")
|
| 67 |
return "Done"
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
@st.cache_data(max_entries=30)
|
| 71 |
def get_list_not_deidentify():
|
|
@@ -116,12 +128,15 @@ def config_deidentify():
|
|
| 116 |
# Create NLP engine based on configuration
|
| 117 |
provider = NlpEngineProvider(nlp_configuration=configuration)
|
| 118 |
nlp_engine = provider.create_engine()
|
|
|
|
| 119 |
|
| 120 |
analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
|
|
|
|
| 121 |
engine = AnonymizerEngine()
|
| 122 |
del configuration
|
| 123 |
del provider
|
| 124 |
del nlp_engine
|
|
|
|
| 125 |
return analyzer, engine
|
| 126 |
|
| 127 |
|
|
@@ -248,16 +263,21 @@ def anonymize_analyzer(MarianText_letter, _analyzer, nom_propre, nom, prenom):
|
|
| 248 |
analyzer_results_keep = []
|
| 249 |
analyzer_results_return = []
|
| 250 |
analyzer_results_saved = []
|
| 251 |
-
analyzer_results = _analyzer.analyze(text=MarianText_letter, language="en", entities=["DATE_TIME", "PERSON"], allow_list=['evening', 'day', 'the day', 'the age of', 'age', 'years', 'week', 'years old', 'months', 'hours', 'night', 'noon', 'nights', 'tomorrow', 'today', 'yesterday'])
|
| 252 |
len_to_add = 0
|
| 253 |
analyser_results_to_sort = {}
|
| 254 |
i = 0
|
|
|
|
| 255 |
for element in analyzer_results:
|
| 256 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
i = i + 1
|
| 258 |
sorted_tuples = sorted(analyser_results_to_sort.items(), key=lambda x: x[1])
|
| 259 |
sorted_dict = {k: v for k, v in sorted_tuples}
|
| 260 |
-
|
| 261 |
exception_list_presidio = ['age', 'year', 'month', 'day', 'hour', 'week']
|
| 262 |
|
| 263 |
for element_raw in sorted_dict:
|
|
@@ -337,6 +357,7 @@ def anonymize_engine(MarianText_letter, _analyzer_results_return, _engine, _nlp)
|
|
| 337 |
operators={
|
| 338 |
"PERSON": OperatorConfig("replace", {"new_value": ""}),
|
| 339 |
"LOCATION": OperatorConfig("replace", {"new_value": ""}),
|
|
|
|
| 340 |
},
|
| 341 |
)
|
| 342 |
return reformat_to_letter(result.text, _nlp)
|
|
@@ -729,6 +750,8 @@ def main_function(inputStr):
|
|
| 729 |
return returnDf, returnDfUnsafe
|
| 730 |
|
| 731 |
models_status = get_models()
|
|
|
|
|
|
|
| 732 |
nlp_fr, marian_fr_en = get_nlp_marian()
|
| 733 |
#nlp_en = get_nlp_en()
|
| 734 |
dict_correction = get_translation_dict_correction()
|
|
@@ -736,6 +759,7 @@ dict_abbreviation_correction = get_abbreviation_dict_correction()
|
|
| 736 |
nom_propre = get_list_not_deidentify()
|
| 737 |
analyzer, engine = config_deidentify()
|
| 738 |
|
|
|
|
| 739 |
if "load_state" not in st.session_state:
|
| 740 |
st.session_state.load_state = False
|
| 741 |
|
|
@@ -928,7 +952,7 @@ if submit_button or st.session_state.load_state:
|
|
| 928 |
st.download_button(
|
| 929 |
"Download summarized letter in PhenoGenius list of HPO format",
|
| 930 |
convert_list_phenogenius(clinphen_df),
|
| 931 |
-
nom + "_" + prenom + "
|
| 932 |
"text",
|
| 933 |
key="download-summarization-phenogenius",
|
| 934 |
)
|
|
|
|
| 12 |
import transformers
|
| 13 |
from typing import Sequence
|
| 14 |
import spacy
|
| 15 |
+
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer
|
| 16 |
from presidio_analyzer.nlp_engine import NlpEngineProvider
|
| 17 |
from presidio_anonymizer import AnonymizerEngine
|
| 18 |
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
|
|
|
|
| 66 |
print(spacy_model_name + " already downloaded")
|
| 67 |
return "Done"
|
| 68 |
|
| 69 |
+
@st.cache_data(max_entries=30)
|
| 70 |
+
def get_cities_list():
    """Build the list of French city spellings used as a de-identification deny-list.

    Reads the single-column CSV ``data/proper_noun_location_sort.csv`` and
    returns, for every city name in it, the spelling found in the file
    followed by its lower-cased-then-capitalised form (e.g. "PARIS" and
    "Paris").

    Returns:
        list[str]: city spellings in file order, without duplicates.

    Raises:
        FileNotFoundError: if the CSV file is missing.
        ValueError: if the CSV does not contain exactly one column.
    """
    # NOTE(review): read_csv consumes the first row as a header, so that row
    # is never returned as a city -- confirm the file really has a header.
    # dtype=str keeps purely numeric cells as text so .lower() is always valid.
    cities = pd.read_csv('data/proper_noun_location_sort.csv', dtype=str)
    cities.columns = ['ville']

    # Empty / NA cells are parsed as NaN (a float) and would crash on
    # .lower(); drop them before deriving the spelling variants.
    names = cities['ville'].dropna().to_list()

    # dict.fromkeys de-duplicates while preserving first-seen order: a name
    # that is already capitalised would otherwise be emitted twice.
    spellings = dict.fromkeys(
        variant
        for name in names
        for variant in (name, name.lower().capitalize())
    )
    return list(spellings)
|
| 81 |
|
| 82 |
@st.cache_data(max_entries=30)
|
| 83 |
def get_list_not_deidentify():
|
|
|
|
| 128 |
# Create NLP engine based on configuration
|
| 129 |
provider = NlpEngineProvider(nlp_configuration=configuration)
|
| 130 |
nlp_engine = provider.create_engine()
|
| 131 |
+
frcity_recognizer = PatternRecognizer(supported_entity="FRENCH_CITY", deny_list=cities_list)
|
| 132 |
|
| 133 |
analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
|
| 134 |
+
analyzer.registry.add_recognizer(frcity_recognizer)
|
| 135 |
engine = AnonymizerEngine()
|
| 136 |
del configuration
|
| 137 |
del provider
|
| 138 |
del nlp_engine
|
| 139 |
+
del frcity_recognizer
|
| 140 |
return analyzer, engine
|
| 141 |
|
| 142 |
|
|
|
|
| 263 |
analyzer_results_keep = []
|
| 264 |
analyzer_results_return = []
|
| 265 |
analyzer_results_saved = []
|
| 266 |
+
analyzer_results = _analyzer.analyze(text=MarianText_letter, language="en", entities=["DATE_TIME", "PERSON", "FRENCH_CITY"], allow_list=['evening', 'day', 'the day', 'the age of', 'age', 'years', 'week', 'years old', 'months', 'hours', 'night', 'noon', 'nights', 'tomorrow', 'today', 'yesterday'])
|
| 267 |
len_to_add = 0
|
| 268 |
analyser_results_to_sort = {}
|
| 269 |
i = 0
|
| 270 |
+
detect_duplicated = []
|
| 271 |
for element in analyzer_results:
|
| 272 |
+
if element.start not in detect_duplicated:
|
| 273 |
+
analyser_results_to_sort[i] = element.start
|
| 274 |
+
detect_duplicated.append(element.start)
|
| 275 |
+
else:
|
| 276 |
+
pass
|
| 277 |
i = i + 1
|
| 278 |
sorted_tuples = sorted(analyser_results_to_sort.items(), key=lambda x: x[1])
|
| 279 |
sorted_dict = {k: v for k, v in sorted_tuples}
|
| 280 |
+
print(sorted_dict)
|
| 281 |
exception_list_presidio = ['age', 'year', 'month', 'day', 'hour', 'week']
|
| 282 |
|
| 283 |
for element_raw in sorted_dict:
|
|
|
|
| 357 |
operators={
|
| 358 |
"PERSON": OperatorConfig("replace", {"new_value": ""}),
|
| 359 |
"LOCATION": OperatorConfig("replace", {"new_value": ""}),
|
| 360 |
+
"FRENCH_CITY": OperatorConfig("replace", {"new_value": ""}),
|
| 361 |
},
|
| 362 |
)
|
| 363 |
return reformat_to_letter(result.text, _nlp)
|
|
|
|
| 750 |
return returnDf, returnDfUnsafe
|
| 751 |
|
| 752 |
models_status = get_models()
|
| 753 |
+
cities_list = get_cities_list()
|
| 754 |
+
#print(cities_list)
|
| 755 |
nlp_fr, marian_fr_en = get_nlp_marian()
|
| 756 |
#nlp_en = get_nlp_en()
|
| 757 |
dict_correction = get_translation_dict_correction()
|
|
|
|
| 759 |
nom_propre = get_list_not_deidentify()
|
| 760 |
analyzer, engine = config_deidentify()
|
| 761 |
|
| 762 |
+
|
| 763 |
if "load_state" not in st.session_state:
|
| 764 |
st.session_state.load_state = False
|
| 765 |
|
|
|
|
| 952 |
st.download_button(
|
| 953 |
"Download summarized letter in PhenoGenius list of HPO format",
|
| 954 |
convert_list_phenogenius(clinphen_df),
|
| 955 |
+
nom + "_" + prenom + "_summarized_letter.txt",
|
| 956 |
"text",
|
| 957 |
key="download-summarization-phenogenius",
|
| 958 |
)
|