GERNET Enody committed on
Add files via upload
Browse files- utilities/anonymize.py +394 -0
- utilities/convert.py +45 -0
- utilities/extract_hpo.py +120 -0
- utilities/get_model.py +48 -0
- utilities/translate.py +220 -0
- utilities/web_utilities.py +32 -1
utilities/anonymize.py
ADDED
|
@@ -0,0 +1,394 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import json
|
| 3 |
+
from unidecode import unidecode
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from presidio_anonymizer.entities import OperatorConfig
|
| 6 |
+
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
|
| 7 |
+
from presidio_analyzer.nlp_engine import NlpEngineProvider
|
| 8 |
+
from presidio_anonymizer import AnonymizerEngine
|
| 9 |
+
import streamlit as st
|
| 10 |
+
from .web_utilities import st_cache_data_if, st_cache_resource_if, supported_cache
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def anonymize_analyzer(MarianText_letter, _analyzer, proper_noun, Last_name, First_name):
    """Run the presidio analyzer over the translated letter and mark hits inline.

    Each detected entity is wrapped in streamlit-flavoured markdown:
    green for terms present in the ``proper_noun`` exception list (kept as-is),
    red for genuine identifying entities (queued for anonymization).

    Args:
        MarianText_letter: letter text (already translated to English).
        _analyzer: presidio ``AnalyzerEngine`` (underscore prefix keeps it out
            of the streamlit cache key).
        proper_noun: lower-cased list of terms that must never be anonymized.
        Last_name: patient last name, stored in each result record.
        First_name: patient first name, stored in each result record.

    Returns:
        tuple: (marked-up letter, entities to anonymize, kept-entity records,
        ignored-entity records).
    """
    MarianText_anonymize_letter = MarianText_letter
    # st.write(MarianText_anonymize_letter)
    analyzer_results_keep = []
    analyzer_results_return = []
    analyzer_results_saved = []
    analyzer_results = _analyzer.analyze(
        text=MarianText_letter,
        language="en",
        entities=["DATE_TIME", "PERSON", "FRENCH_CITY"],
        # Generic time/age words presidio tags as DATE_TIME but that carry no
        # identifying information.
        allow_list=[
            "evening",
            "day",
            "the day",
            "the age of",
            "age",
            "years",
            "week",
            "years old",
            "months",
            "hours",
            "night",
            "noon",
            "nights",
            "tomorrow",
            "today",
            "yesterday",
        ],
    )
    # Running offset: every inline markup insertion lengthens the text, so
    # later slice positions must be shifted by len_to_add.
    len_to_add = 0
    # Map analyzer-result index -> start offset, skipping results that share a
    # start position with an earlier one (duplicate detections).
    analyser_results_to_sort = {}
    i = 0
    detect_duplicated = []
    for element in analyzer_results:
        if element.start not in detect_duplicated:
            analyser_results_to_sort[i] = element.start
            detect_duplicated.append(element.start)
        else:
            pass
        i = i + 1
    # Process matches left-to-right so the running offset stays valid.
    sorted_tuples = sorted(analyser_results_to_sort.items(), key=lambda x: x[1])
    sorted_dict = {k: v for k, v in sorted_tuples}
    print(sorted_dict)
    # Substrings that mark a match as a harmless duration/age expression.
    exception_list_presidio = ["age", "year", "month", "day", "hour", "week"]

    for element_raw in sorted_dict:
        element = analyzer_results[element_raw]
        word = MarianText_letter[element.start : element.end]
        exception_detected = [e for e in exception_list_presidio if e in word.lower()]
        # Words containing exactly one "/" or more than two are also skipped —
        # presumably fractions/lab values vs. dd/mm/yyyy dates; TODO confirm.
        if word.count("/") == 1 or word.count("/") > 2:
            exception_detected.append("/ or ///")
        if len(exception_detected) == 0:
            if word.lower().strip() in proper_noun:
                # Known safe term (drug/gene/exception list): shown in green,
                # recorded but NOT queued for anonymization.
                word_to_replace = (
                    "**:green[" + word + "]** `[" + element.entity_type + "]`"
                )
                MarianText_anonymize_letter = (
                    MarianText_anonymize_letter[: element.start + len_to_add]
                    + word_to_replace
                    + MarianText_anonymize_letter[element.end + len_to_add :]
                )
                analyzer_results_saved.append(
                    {
                        "name": Last_name,
                        "surname": First_name,
                        "type": "deidentification",
                        "value": word,
                        "correction": element.entity_type,
                        "lf_detected": False,
                        "manual_validation": False,
                    }
                )
                # analyzer_results_saved.append(str(element) + ", word:" + word)
            else:
                # Genuine identifying entity: shown in red and queued for the
                # anonymizer engine via analyzer_results_return.
                word_to_replace = (
                    "**:red[" + word + "]** `[" + element.entity_type + "]`"
                )
                MarianText_anonymize_letter = (
                    MarianText_anonymize_letter[: element.start + len_to_add]
                    + word_to_replace
                    + MarianText_anonymize_letter[element.end + len_to_add :]
                )
                analyzer_results_keep.append(
                    {
                        "name": Last_name,
                        "surname": First_name,
                        "type": "deidentification",
                        "value": word,
                        "correction": element.entity_type,
                        "lf_detected": True,
                        "manual_validation": True,
                    }
                )
                # analyzer_results_keep.append(str(element) + ", word:" + word)
                analyzer_results_return.append(element)
            # Both branches inserted markup, so advance the offset.
            len_to_add = len_to_add + len(word_to_replace) - len(word)
        else:
            # Duration/age style match: logged as ignored, text untouched.
            analyzer_results_saved.append(
                {
                    "name": Last_name,
                    "surname": First_name,
                    "type": "deidentification",
                    "value": word,
                    "correction": element.entity_type,
                    "lf_detected": False,
                    "manual_validation": False,
                }
            )
            # analyzer_results_saved.append(str(element) + ", word:" + word)
    del analyzer_results
    del len_to_add
    del exception_list_presidio
    del analyser_results_to_sort
    del sorted_tuples
    del sorted_dict

    return (
        MarianText_anonymize_letter,
        analyzer_results_return,
        analyzer_results_keep,
        analyzer_results_saved,
    )
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def anonymize_engine(MarianText_letter, _analyzer_results_return, _engine, _nlp):
    """Erase the detected identifying entities and reflow the text.

    Args:
        MarianText_letter: letter text to anonymize.
        _analyzer_results_return: presidio analyzer results to act on.
        _engine: presidio ``AnonymizerEngine``.
        _nlp: stanza pipeline forwarded to ``reformat_to_report``.

    Returns:
        str: anonymized text, re-formatted as a report.
    """
    # Every target entity type is simply replaced by an empty string.
    operators = {
        entity: OperatorConfig("replace", {"new_value": ""})
        for entity in ("PERSON", "LOCATION", "FRENCH_CITY")
    }
    anonymized = _engine.anonymize(
        text=MarianText_letter,
        analyzer_results=_analyzer_results_return,
        operators=operators,
    )
    return reformat_to_report(anonymized.text, _nlp)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_comma(texte, _nlp):
    """Pad the last non-numeric comma of each sentence with spaces.

    The lookbehind/lookahead on ``\\d`` keep numeric separators intact, and
    ``(?!.*\\1)`` restricts the substitution to the sentence's final comma.

    Args:
        texte: text to process.
        _nlp: stanza pipeline used to split the text into sentences
            (underscore prefix keeps it out of the streamlit cache key).

    Returns:
        str: processed sentences re-joined with single spaces.
    """
    # Raw string: "\," and "\d" in a plain string are invalid escape
    # sequences (DeprecationWarning today, a SyntaxError in future Python).
    regex = r"(?<!\d)(,)(?!\d)(?!.*\1)"
    text_list = []
    for sentence in _nlp.process(texte).sentences:
        text_space = re.sub(regex, " , ", sentence.text.replace("\n", " "))
        # Collapse double spaces introduced by the substitution; the original
        # replace(" ", " ") was a no-op (the intended two-space pattern was
        # lost), defeating the "_no_db" (no double) intent.
        text_space_no_db = text_space.replace("  ", " ")
        text_list.append(text_space_no_db)
    return " ".join(text_list)
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_endpoint(texte, _nlp):
    """Pad the last non-numeric period of each sentence with spaces.

    The lookbehind/lookahead on ``\\d`` keep decimal points intact, and
    ``(?!.*\\1)`` restricts the substitution to the sentence's final period.

    Args:
        texte: text to process.
        _nlp: stanza pipeline used to split the text into sentences.

    Returns:
        str: processed sentences re-joined with single spaces.
    """
    # Raw string avoids the invalid "\d" escape of the original.
    regex = r"(?<!\d)(\.)(?!\d)(?!.*\1)"
    text_list = []
    for sentence in _nlp.process(texte).sentences:
        text_space = re.sub(regex, " . ", sentence.text.replace("\n", " "))
        # Collapse double spaces; the original single-space replace was a no-op.
        text_space_no_db = text_space.replace("  ", " ")
        text_list.append(text_space_no_db)
    return " ".join(text_list)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_leftp(texte, _nlp):
    """Pad the last non-numeric opening parenthesis of each sentence with spaces.

    Args:
        texte: text to process.
        _nlp: stanza pipeline used to split the text into sentences.

    Returns:
        str: processed sentences re-joined with single spaces.
    """
    # Raw string avoids the invalid "\d" escape of the original.
    regex = r"(?<!\d)(\()(?!\d)(?!.*\1)"
    text_list = []
    for sentence in _nlp.process(texte).sentences:
        text_space = re.sub(regex, " ( ", sentence.text.replace("\n", " "))
        # Collapse double spaces; the original single-space replace was a no-op.
        text_space_no_db = text_space.replace("  ", " ")
        text_list.append(text_space_no_db)
    return " ".join(text_list)
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_rightp(texte, _nlp):
    """Pad the last non-numeric closing parenthesis of each sentence with spaces.

    Args:
        texte: text to process.
        _nlp: stanza pipeline used to split the text into sentences.

    Returns:
        str: processed sentences re-joined with single spaces.
    """
    # Raw string avoids the invalid "\d" escape of the original.
    regex = r"(?<!\d)(\))(?!\d)(?!.*\1)"
    text_list = []
    for sentence in _nlp.process(texte).sentences:
        text_space = re.sub(regex, " ) ", sentence.text.replace("\n", " "))
        # Collapse double spaces; the original single-space replace was a no-op.
        text_space_no_db = text_space.replace("  ", " ")
        text_list.append(text_space_no_db)
    return " ".join(text_list)
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_stroph(texte, _nlp):
    """Pad the last non-numeric apostrophe of each sentence with spaces.

    Args:
        texte: text to process.
        _nlp: stanza pipeline used to split the text into sentences.

    Returns:
        str: processed sentences re-joined with single spaces.
    """
    # Raw string avoids the invalid "\d" escape of the original.
    regex = r"(?<!\d)(')(?!\d)(?!.*\1)"
    text_list = []
    for sentence in _nlp.process(texte).sentences:
        text_space = re.sub(regex, " ' ", sentence.text.replace("\n", " "))
        # Collapse double spaces; the original single-space replace was a no-op.
        text_space_no_db = text_space.replace("  ", " ")
        text_list.append(text_space_no_db)
    return " ".join(text_list)
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_comma_endpoint(texte, _nlp):
    """Apply the punctuation-padding passes (comma, period, parentheses) in order.

    Args:
        texte: text to process.
        _nlp: stanza pipeline forwarded to every pass.

    Returns:
        str: text with the targeted punctuation padded with spaces.
    """
    # Each pass feeds the next; the apostrophe pass is intentionally not
    # part of this pipeline.
    result = add_space_to_comma(texte, _nlp)
    result = add_space_to_endpoint(result, _nlp)
    result = add_space_to_leftp(result, _nlp)
    result = add_space_to_rightp(result, _nlp)
    return result
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_abbreviation_dict_correction():
    """Load the French abbreviation -> expansion mapping from disk.

    Returns:
        dict: abbreviation to expansion mapping parsed from
        ``data/fr_abbreviations.json``.
    """
    # Explicit encoding: the file holds accented French text and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open("data/fr_abbreviations.json", "r", encoding="utf-8") as infile:
        return json.load(infile)
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def reformat_to_report(text, _nlp):
    """Re-join sentences into report form, removing spaces before punctuation.

    Undoes the padding previously inserted around commas, periods,
    parentheses and apostrophes, then joins sentences with " \\n".

    Args:
        text: text to reformat.
        _nlp: stanza pipeline used to split the text into sentences.

    Returns:
        str: reformatted report text.
    """
    fixes = ((" ,", ","), (" .", "."), (" )", ")"), (" (", "("), (" '", "'"))
    cutsentence = []
    for sentence in _nlp.process(text).sentences:
        cleaned = sentence.text
        for spaced, tight in fixes:
            cleaned = cleaned.replace(spaced, tight)
        cutsentence.append(cleaned)
    return " \n".join(cutsentence)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_cities_list():
    """Build the deny-list of French city spellings for the FRENCH_CITY recognizer.

    For every city in ``data/proper_noun_location_sort.csv``, include the
    original spelling plus capitalized, upper-case and accent-stripped
    variants so matching works regardless of case or diacritics.

    Returns:
        list[str]: city spelling variants, six per source city.
    """
    cities = pd.read_csv("data/proper_noun_location_sort.csv")
    cities.columns = ["ville"]
    whole_cities_patterns = []
    for element in cities["ville"].to_list():
        # Hoisted: the original called unidecode(element) three times per city.
        ascii_element = unidecode(element)
        whole_cities_patterns.extend(
            [
                element,
                element.lower().capitalize(),
                element.upper(),
                ascii_element,
                ascii_element.lower().capitalize(),
                ascii_element.upper(),
            ]
        )
    return whole_cities_patterns
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_list_not_deidentify():
    """Assemble the lower-cased list of terms that must never be de-identified.

    Combines the manual exception list, drug names and gene names from the
    ``data/`` TSV files with a handful of hard-coded report tokens.

    Returns:
        list[str]: lower-cased terms to exclude from anonymization.
    """
    hardcoded_terms = [
        "PN",
        "TN",
        "SD",
        "PCN",
        "cher",
        "chere",
        "CAS",
        "INDEX",
        "APGAR",
        "M",
        "Ms",
        "Mr",
        "Behçet",
        "hypoacousia",
    ]
    exceptions = pd.read_csv(
        "data/exception_list_anonymization.tsv", sep="\t", header=None
    ).astype(str)
    drugs = pd.read_csv("data/drug_name.tsv", sep="\t", header=None).astype(str)
    genes = pd.read_csv("data/gene_name.tsv", sep="\t", header=None).astype(str)

    combined = (
        exceptions[0].to_list()
        + drugs[0].to_list()
        + genes[0].to_list()
        + hardcoded_terms
    )
    return [term.lower() for term in combined]
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def change_name_patient_abbreviations(Report, Last_name, First_name, abbreviations_dict):
    """Replace patient names and known abbreviations in a report.

    First names become "CAS", last names become "INDEX", and every key of
    ``abbreviations_dict`` (plus a few title abbreviations) is expanded.
    Each replacement is logged as a record for later manual review.

    Args:
        Report: report text to process.
        Last_name: patient last name (may contain several words).
        First_name: patient first name (may contain several words).
        abbreviations_dict: abbreviation -> expansion mapping.

    Returns:
        tuple: (corrected report text, list of replacement records).
    """
    dict_correction_name_abbreviations = {
        "M.": "M",
        "Mme.": "Mme",
        "Mlle.": "Mlle",
        "Dr.": "Docteur",
        "Dr": "Docteur",
        "Pr.": "Professeur",
        "Pr": "Professeur",
    }

    for firstname in First_name.split():
        dict_correction_name_abbreviations[firstname] = "CAS"
    for lastname in Last_name.split():
        dict_correction_name_abbreviations[lastname] = "INDEX"
    for key, value in abbreviations_dict.items():
        dict_correction_name_abbreviations[key] = value

    # Bug fix: i_check below is lower-cased, but the original compared it to
    # the raw Last_name/First_name, so whenever the names were capitalized the
    # record type was always "abbreviations" instead of "index_case".
    last_name_lower = Last_name.lower()
    first_name_lower = First_name.lower()

    list_replaced = []
    replaced_Report = []
    for i in Report.replace("\n", " ").split(" "):
        append_word = i
        replace_word = None
        to_replace = None
        for key, value in dict_correction_name_abbreviations.items():
            # Compare the word stripped of punctuation, case-insensitively.
            i_check = i.lower().strip().replace(",", "").replace(".", "")
            if i_check == key.lower().strip():
                to_replace = i.strip().replace(",", "").replace(".", "")
                replace_word = value
                record_type = (
                    "index_case"
                    if i_check == last_name_lower or i_check == first_name_lower
                    else "abbreviations"
                )
                list_replaced.append(
                    {
                        "name": Last_name,
                        "surname": First_name,
                        "type": record_type,
                        "value": to_replace,
                        "correction": value,
                        "lf_detected": True,
                        "manual_validation": True,
                    }
                )
        if replace_word:
            # Preserve surrounding punctuation by substituting inside the word.
            append_word = append_word.replace(to_replace, replace_word)
        replaced_Report.append(append_word)
    return " ".join(replaced_Report), list_replaced
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def config_deidentify(cities_list):
    """Build the presidio analyzer/anonymizer pair used for de-identification.

    Args:
        cities_list: deny-list of French city spellings for the custom
            FRENCH_CITY recognizer.

    Returns:
        tuple: (configured ``AnalyzerEngine``, ``AnonymizerEngine``).
    """
    # spaCy-backed NLP engine, English large model.
    nlp_engine = NlpEngineProvider(
        nlp_configuration={
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
        }
    ).create_engine()

    analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
    # Custom recognizer: flag any token present in the city deny-list.
    analyzer.registry.add_recognizer(
        PatternRecognizer(supported_entity="FRENCH_CITY", deny_list=cities_list)
    )
    return analyzer, AnonymizerEngine()
|
utilities/convert.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import streamlit as st
|
| 3 |
+
from .web_utilities import st_cache_data_if, supported_cache
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_df(df):
    """Serialize a dataframe to UTF-8 TSV bytes, dropping all-empty rows."""
    cleaned = df.dropna(how="all")
    tsv_text = cleaned.to_csv(sep="\t", index=False)
    return tsv_text.encode("utf-8")
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_df_no_header(df):
    """Serialize a dataframe to UTF-8 TSV bytes without a header row."""
    cleaned = df.dropna(how="all")
    tsv_text = cleaned.to_csv(sep="\t", index=False, header=None)
    return tsv_text.encode("utf-8")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_json(df):
    """Convert a phenotype dataframe to a JSON ``features`` payload.

    Args:
        df: dataframe with at least "HPO ID" and "Phenotype name" columns.

    Returns:
        str: JSON string ``{"features": [...]}``; the list is empty when no
        row has both an HPO ID and a phenotype name.
    """
    dict_return = {"features": []}
    df_check = df.dropna(subset=["HPO ID", "Phenotype name"])
    if len(df_check) > 0:
        # Bug fix: iterate the NaN-filtered frame (df_check), not the raw df —
        # the original built the payload from df, so rows missing an HPO ID or
        # name were still emitted as null features.
        df_dict_list = df_check[["HPO ID", "Phenotype name"]].to_dict(orient="index")
        for value in df_dict_list.values():
            dict_return["features"].append(
                {
                    "id": value["HPO ID"],
                    "observed": "yes",
                    "label": value["Phenotype name"],
                    "type": "phenotype",
                }
            )
    return json.dumps(dict_return)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_list_phenogenius(df):
    """Return a comma-separated string of HPO IDs, or a placeholder message."""
    valid_rows = df.dropna(subset=["HPO ID", "Phenotype name"])
    if len(valid_rows) == 0:
        return "No HPO in letters."
    return ",".join(valid_rows["HPO ID"].to_list())
|
| 45 |
+
|
utilities/extract_hpo.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from clinphen_src import get_phenotypes_lf
|
| 3 |
+
import streamlit as st
|
| 4 |
+
from .web_utilities import st_cache_data_if, supported_cache
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def add_biometrics(text, _nlp):
    """Append interpreted biometric findings to sentences containing measurements.

    Sentences mentioning cm/kg/IQ are parsed with ad-hoc regexes for weight,
    height and head-circumference standard deviations and for IQ scores; the
    corresponding HPO-style phrases (e.g. "Tall stature", "Microcephaly") are
    appended as "This means ..." so the downstream extractor can pick them up.

    Args:
        text: report text.
        _nlp: stanza pipeline used to split the text into sentences
            (underscore prefix keeps it out of the streamlit cache key).

    Returns:
        tuple: (augmented text, list of interpreted terms).
    """
    cutsentence_with_biometrics = []
    cutsentence = []
    additional_terms = []
    for sentence in _nlp.process(text).sentences:
        cutsentence.append(sentence.text)
    # Only sentences containing one of these markers are inspected.
    keep_element = ["cm", "kg", "qit", "qi"]
    for sentence in cutsentence:
        if any(ext in sentence.lower() for ext in keep_element):
            if "SD" in sentence or "DS" in sentence:
                # Normalize the French "DS" to "SD" before parsing.
                sentence = sentence.replace("DS", "SD")
                # --- weight: number in parentheses between "kg" and "sd" ---
                try:
                    kg_sd = re.findall("kg(.*?)sd", sentence.lower())[0]
                    num_kg_sd = re.findall("\(\s*([-+].?\d+(?:\.\d+)?)\s*", kg_sd)[0]
                    # print(kg_sd)
                    kg_sd = float(num_kg_sd)
                    print(kg_sd)
                    # +/- 2 SD thresholds for abnormal weight.
                    if kg_sd >= 2:
                        additional_terms.append("Increased body weight")
                    if kg_sd <= -2:
                        additional_terms.append("Decreased body weight")
                except:
                    # Best-effort parsing: any failure just skips the metric.
                    print("Incorrect weight recognition pattern")
                    print(sentence)
                # --- height: SD value after "is"/"measure" ---
                try:
                    if "is" in sentence.lower():
                        height_sd_alpha = re.findall("\ is(.*?)d", sentence.lower())[0]
                        if "cm" not in height_sd_alpha:
                            height_sd_raw = height_sd_alpha
                    if "easure" in sentence.lower():
                        # "easure" matches both "measure" and "measured".
                        height_sd_raw = re.findall("easure(.*?)d", sentence.lower())[0]
                    print(height_sd_raw)
                    height_sd = re.findall("m(.*?)s", height_sd_raw)[0]
                    print(height_sd)
                    num_height_sd = re.findall(
                        "\(\s*([-+].?\d+(?:\.\d+)?)\s*", height_sd
                    )[0]
                    height_sd = float(num_height_sd)
                    print(height_sd)
                    if height_sd >= 2:
                        additional_terms.append("Tall stature")
                    if height_sd <= -2:
                        additional_terms.append("Short stature")
                except:
                    # NOTE(review): height_sd_raw may be unbound when neither
                    # "is" nor "easure" matched; the bare except hides that.
                    print("Incorrect height recognition pattern")
                    print(sentence)
                # --- head circumference: SD value after "head" ---
                try:
                    pc_sd_raw = (
                        re.findall("head(.*?)d", sentence.lower())[0]
                        .replace("(", "")
                        .replace(")", "")
                        .replace(" ", "")
                    )
                    pc_sd = re.findall("cm(.*?)s", pc_sd_raw)[0]
                    num_pc_sd = re.findall("\(\s*([-+].?\d+(?:\.\d+)?)\s*", pc_sd)[0]
                    pc_sd = float(num_pc_sd)
                    print(pc_sd)
                    if pc_sd >= 2:
                        additional_terms.append("Macrocephaly")
                    elif pc_sd <= -2:
                        additional_terms.append("Microcephaly")
                except:
                    print("Incorrect head circumference recognition pattern")
                    print(sentence)
                print(additional_terms)
            # --- IQ: first number after "iq", mapped to ICD severity bands ---
            if "FSIQ" in sentence or "IQ" in sentence:
                try:
                    iq_score = re.findall("iq.*?(\d.*?)\D", sentence.lower())[0]
                    iq_score = float(iq_score)
                    print(iq_score)
                    if iq_score >= 70 and iq_score < 84:
                        additional_terms.append("Intellectual disability, borderline")
                    elif iq_score >= 50 and iq_score < 69:
                        additional_terms.append("Intellectual disability, mild")
                    elif iq_score >= 35 and iq_score < 49:
                        additional_terms.append("Intellectual disability, moderate")
                    elif iq_score >= 20 and iq_score < 34:
                        additional_terms.append("Intellectual disability, severe")
                    elif iq_score < 20:
                        additional_terms.append("Intellectual disability, profound")
                    print(additional_terms)
                except:
                    print("Incorrect IQ recognition pattern")
                    print(sentence)
            # NOTE(review): additional_terms accumulates across sentences, so
            # earlier findings are repeated in every later "This means" suffix
            # — presumably tolerated downstream; confirm.
            cutsentence_with_biometrics.append(
                sentence + " This means " + ", ".join(additional_terms) + "."
            )
        else:
            cutsentence_with_biometrics.append(sentence)
    print(cutsentence_with_biometrics)
    # Drop the "This means ." artifacts produced when no term was found.
    cutsentence_with_biometrics_return = [
        i for i in cutsentence_with_biometrics if i != "."
    ]
    del cutsentence_with_biometrics
    del cutsentence
    del keep_element
    return " ".join(cutsentence_with_biometrics_return), additional_terms
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def extract_hpo(inputStr):
    """Run ClinPhen phenotype extraction on a text.

    Args:
        inputStr: English clinical text.

    Returns:
        tuple: (dataframe of safe phenotype hits, dataframe of unsafe hits).
    """
    hpo_to_name = get_phenotypes_lf.getNames()
    safe_str, unsafe_str = get_phenotypes_lf.extract_phenotypes(
        inputStr, hpo_to_name
    )
    safe_df = get_phenotypes_lf.get_dataframe_from_clinphen(safe_str)
    unsafe_df = get_phenotypes_lf.get_dataframe_from_clinphen(unsafe_str)
    return safe_df, unsafe_df
|
utilities/get_model.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import stanza
|
| 2 |
+
import nltk
|
| 3 |
+
import os
|
| 4 |
+
import spacy
|
| 5 |
+
import streamlit as st
|
| 6 |
+
from .web_utilities import st_cache_resource_if, supported_cache
|
| 7 |
+
from .translate import Translator
|
| 8 |
+
|
| 9 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_models(langue, output=os.path.expanduser("~")):
    """Download and cache every model needed for language *langue*.

    Fetches the stanza resources and the Marian translation model for the
    language, the NLTK wordnet corpora, and the spaCy English model.

    Args:
        langue: source language code (e.g. "fr", "de").
        output: base directory for the downloaded resources
            (default: the user's home directory).
    """
    # The original if/elif/else executed identical statements for every
    # language, so the branches are collapsed into one unconditional path.
    stanza.download(langue, dir=os.path.join(output, "stanza_resources"))
    Translator(langue, "en")

    nltk_dir = os.path.join(output, "nltk_data")
    if nltk_dir not in nltk.data.path:
        nltk.data.path.append(nltk_dir)
    for corpus in ("omw-1.4", "wordnet"):
        try:
            # Bug fix: nltk.data.find expects a "corpora/<name>" resource URL;
            # the bare name always raised LookupError, re-downloading each run.
            nltk.data.find("corpora/" + corpus)
        except LookupError:
            nltk.download(corpus, download_dir=nltk_dir)

    spacy_model_name = "en_core_web_lg"
    try:
        nlp = spacy.load(os.path.join(output, spacy_model_name))
        print(spacy_model_name + " already downloaded")
    except OSError:
        # Not cached locally yet: download, then persist under `output`.
        spacy.cli.download(spacy_model_name)
        nlp = spacy.load(spacy_model_name)
        nlp.to_disk(os.path.join(output, spacy_model_name))
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_nlp_marian(source_lang):
    """Build the sentence tokenizer and Marian translator for *source_lang*.

    Returns:
        tuple: (stanza tokenize pipeline, Translator to English).
    """
    sentence_pipeline = stanza.Pipeline(source_lang, processors="tokenize")
    translator = Translator(source_lang, "en")
    return sentence_pipeline, translator
|
| 47 |
+
|
| 48 |
+
|
utilities/translate.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from typing import Dict, List, Sequence
|
| 3 |
+
import stanza
|
| 4 |
+
import transformers
|
| 5 |
+
import json
|
| 6 |
+
import streamlit as st
|
| 7 |
+
from .web_utilities import st_cache_data_if, st_cache_resource_if, supported_cache
|
| 8 |
+
from .anonymize import add_space_to_comma_endpoint, change_name_patient_abbreviations
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass(frozen=True)
class SentenceBoundary:
    """A sentence plus the inter-sentence text that precedes it."""

    # Sentence text; empty for the trailing boundary of a document.
    text: str
    # Raw text (typically whitespace) between the previous sentence and this one.
    prefix: str

    def __str__(self):
        """Reconstruct the original span: prefix immediately followed by text."""
        return f"{self.prefix}{self.text}"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
class SentenceBoundaries:
    """Ordered collection of SentenceBoundary items covering a whole document.

    NOTE(review): @dataclass on a class with a hand-written __init__ and no
    field declarations still generates __eq__ over zero fields, so ALL
    instances compare equal — confirm this is intended.
    """

    def __init__(self) -> None:
        # Backing list; the trailing item holds the text after the last sentence.
        self._sentence_boundaries: List[SentenceBoundary] = []

    @property
    def sentence_boundaries(self):
        """Read-only view of the backing list (mutated in place by from_doc)."""
        return self._sentence_boundaries

    def update_sentence_boundaries(
        self, sentence_boundaries_list: List[SentenceBoundary]
    ):
        """Replace the backing list wholesale; returns self for chaining."""
        self._sentence_boundaries = sentence_boundaries_list
        return self

    def from_doc(self, doc: stanza.Document):
        """Populate from a stanza document, preserving inter-sentence text.

        Each boundary stores the sentence plus whatever raw text (whitespace)
        separated it from the previous one, so str(self) rebuilds doc.text.
        Returns self for chaining.
        """
        start_idx = 0
        for sent in doc.sentences:
            self.sentence_boundaries.append(
                SentenceBoundary(
                    text=sent.text,
                    prefix=doc.text[start_idx : sent.tokens[0].start_char],
                )
            )
            start_idx = sent.tokens[-1].end_char
        # Trailing boundary: text after the final sentence (e.g. newline).
        self.sentence_boundaries.append(
            SentenceBoundary(text="", prefix=doc.text[start_idx:])
        )
        return self

    @property
    def nonempty_sentences(self) -> List[str]:
        """Sentence texts only, skipping the empty trailing boundary."""
        return [item.text for item in self.sentence_boundaries if item.text]

    def map_sentence_boundaries(self, d: Dict[str, str]) -> List:
        """Return a new SentenceBoundaries with each sentence mapped through *d*.

        Sentences absent from *d* (e.g. untranslated) are kept unchanged;
        prefixes are preserved so the document layout survives translation.
        """
        return SentenceBoundaries().update_sentence_boundaries(
            [
                SentenceBoundary(text=d.get(sb.text, sb.text), prefix=sb.prefix)
                for sb in self.sentence_boundaries
            ]
        )

    def __str__(self) -> str:
        """Reassemble the full document text from prefixes and sentences."""
        return "".join(map(str, self.sentence_boundaries))
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def minibatch(seq, size):
    """Yield successive chunks of *seq*, each holding at most *size* items.

    Fix: the st_cache_resource_if decorator was removed. Caching a
    generator function is incorrect — the cached generator object is
    exhausted after its first consumption, so every later cache hit
    would silently yield nothing.
    """
    items = []
    for x in seq:
        items.append(x)
        if len(items) >= size:
            yield items
            items = []
    if items:  # flush the final, possibly short, chunk
        yield items
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# @dataclass(frozen=True)
class Translator:
    """Sentence-level machine translator built on a MarianMT checkpoint.

    Splits input texts into sentences with a stanza pipeline, translates
    the unique sentences in batches, then splices the translations back
    into each document's original layout via SentenceBoundaries.
    """

    def __init__(self, source_lang: str, dest_lang: str, use_gpu: bool = False) -> None:
        """Load the Helsinki-NLP Marian model/tokenizer pair and a sentencizer.

        source_lang / dest_lang: language codes used to select the
        "Helsinki-NLP/opus-mt-<src>-<dst>" checkpoint.
        use_gpu: forwarded to the stanza pipeline only; the Marian model
        stays on CPU (the GPU path below is commented out).
        """
        # self.use_gpu = use_gpu
        self.model_name = "Helsinki-NLP/opus-mt-" + source_lang + "-" + dest_lang
        self.model = transformers.MarianMTModel.from_pretrained(self.model_name)
        # if use_gpu:
        #     self.model = self.model.cuda()
        self.tokenizer = transformers.MarianTokenizer.from_pretrained(self.model_name)
        self.sentencizer = stanza.Pipeline(
            source_lang, processors="tokenize", verbose=False, use_gpu=use_gpu
        )

    def sentencize(self, texts: Sequence[str]) -> List[SentenceBoundaries]:
        """Split each text into SentenceBoundaries via the stanza sentencizer."""
        return [
            SentenceBoundaries().from_doc(doc=self.sentencizer.process(text))
            for text in texts
        ]

    def translate(
        self, texts: Sequence[str], batch_size: int = 10, truncation=True
    ) -> Sequence[str]:
        """Translate a sequence of texts, preserving each text's layout.

        Raises ValueError when a bare string is passed: a string would
        otherwise be iterated character by character.
        """
        if isinstance(texts, str):
            raise ValueError("Expected a sequence of texts")
        text_sentences = self.sentencize(texts)
        # Deduplicate sentences across all texts: each unique sentence is
        # translated once and reused everywhere it occurs.
        translations = {
            sent: None for text in text_sentences for sent in text.nonempty_sentences
        }

        # Sorting by length groups similar-length sentences into the same
        # batch, which reduces padding waste in the tokenizer.
        for text_batch in minibatch(
            sorted(translations, key=len, reverse=True), batch_size
        ):
            tokens = self.tokenizer(
                text_batch, return_tensors="pt", padding=True, truncation=truncation
            )
            # if self.use_gpu:
            #     tokens = {k:v.cuda() for k, v in tokens.items()}
            translate_tokens = self.model.generate(**tokens)
            translate_batch = [
                self.tokenizer.decode(t, skip_special_tokens=True)
                for t in translate_tokens
            ]
            for text, translated in zip(text_batch, translate_batch):
                translations[text] = translated

        # Map each original sentence to its translation and re-join with
        # the original inter-sentence prefixes.
        return [
            str(text.map_sentence_boundaries(translations)) for text in text_sentences
        ]
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def translate_report(
    Report, Last_name, First_name, _nlp, _marian_fr_en, dict_correction, abbreviation_dict
):
    """Anonymize, translate, and post-correct a clinical report.

    Pipeline: replace patient name/abbreviations, translate with Marian,
    normalize spacing around punctuation, then apply dictionary-based
    corrections.

    Returns (corrected_text, correction_log, abbreviation_name_log).
    """
    renamed_report, abb_name_log = change_name_patient_abbreviations(
        Report, Last_name, First_name, abbreviation_dict
    )
    spaced_translation = add_space_to_comma_endpoint(
        translate_marian(renamed_report, _nlp, _marian_fr_en), _nlp
    )
    corrected_text, correction_log = correct_marian(
        spaced_translation, dict_correction, Last_name, First_name
    )
    return corrected_text, correction_log, abb_name_log
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def translate_marian(Report_name, _nlp, _marian_fr_en):
    """Translate a report sentence-by-sentence and join with newlines.

    The stanza pipeline (_nlp) segments the text; the Marian translator
    (_marian_fr_en) translates the resulting sentence list in one call.
    """
    sentences = [sentence.text for sentence in _nlp.process(Report_name).sentences]
    return "\n".join(_marian_fr_en.translate(sentences))
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
|
| 158 |
+
def correct_marian(MarianText_space, dict_correction, Last_name, First_name):
    """Apply dictionary-based post-corrections to the translated text.

    Each key of *dict_correction* found in the text is replaced by its
    value; every applied replacement is logged with the patient identity
    for downstream review.

    Returns (corrected_text, list_of_replacement_records).
    """
    corrected = MarianText_space
    replacements = []
    for wrong, right in dict_correction.items():
        if wrong not in corrected:
            continue
        replacements.append(
            {
                "name": Last_name,
                "surname": First_name,
                "type": "marian_correction",
                "value": wrong,
                "correction": right,
                "lf_detected": True,
                "manual_validation": True,
            }
        )
        corrected = corrected.replace(wrong, right)
    return corrected, replacements
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_translation_dict_correction():
    """Build the post-translation correction dictionary for correct_marian.

    Merges three sources, later ones overriding earlier ones on key clash:
    1. the hand-curated French-specific fixes below,
    2. data/hp_fr_en_translated_marian_review_lwg.json,
    3. data/fr_abbreviations_translation.json.

    Every key/value is space-padded so correct_marian's substring
    replacement only matches whole, space-delimited tokens.
    """
    dict_correction_FRspec = {
        "PC": "head circumference",
        "palatine slot": "cleft palate",
        "ASD": "autism",
        "ADHD": "attention deficit hyperactivity disorder",
        "IUGR": "intrauterin growth retardation",
        "QI": "IQ ",
        "QIT": "FSIQ ",
        "ITQ": "FSIQ ",
        "DS": "SD",
        "FOP": "patent foramen ovale",
        "PFO": "patent foramen ovale",
        "ARCF": "fetal distress",
        "\n": " ",
        "associated": "with",
        "Mr.": "Mr",
        "Mrs.": "Mrs",
    }

    dict_correction = {}

    def _add_space_padded(mapping):
        # Pad keys/values so only whole tokens get replaced downstream.
        for key, value in mapping.items():
            dict_correction[" " + key + " "] = " " + value + " "

    _add_space_padded(dict_correction_FRspec)

    # fix: request UTF-8 explicitly — these files hold French text and the
    # platform-default encoding (e.g. cp1252 on Windows) would mis-decode it.
    with open(
        "data/hp_fr_en_translated_marian_review_lwg.json", "r", encoding="utf-8"
    ) as infile:
        _add_space_padded(json.load(infile))

    with open(
        "data/fr_abbreviations_translation.json", "r", encoding="utf-8"
    ) as infile:
        _add_space_padded(json.load(infile))

    return dict_correction
|
| 219 |
+
|
| 220 |
+
|
utilities/web_utilities.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
|
| 3 |
from PIL import Image
|
|
|
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
def display_page_title(title: str):
|
|
@@ -47,3 +48,33 @@ def display_sidebar():
|
|
| 47 |
# file_name="Mentions_legales_lf.pdf",
|
| 48 |
# mime='application/octet-stream')
|
| 49 |
# st.sidebar.markdown("[Mentions légales](data/Mentions_legales_lf.pdf)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
|
|
|
| 2 |
from PIL import Image
|
| 3 |
+
import inspect
|
| 4 |
+
import os
|
| 5 |
|
| 6 |
|
| 7 |
def display_page_title(title: str):
|
|
|
|
| 48 |
# file_name="Mentions_legales_lf.pdf",
|
| 49 |
# mime='application/octet-stream')
|
| 50 |
# st.sidebar.markdown("[Mentions légales](data/Mentions_legales_lf.pdf)")
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def st_cache_data_if(condition, *args, **kwargs):
    """Conditionally wrap a function with st.cache_data.

    When *condition* is falsy the function is returned untouched, which
    lets the same code run outside a Streamlit caching context. Extra
    *args/**kwargs are forwarded to st.cache_data.
    """

    def decorator(func):
        return st.cache_data(*args, **kwargs)(func) if condition else func

    return decorator
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def st_cache_resource_if(condition, *args, **kwargs):
    """Conditionally wrap a function with st.cache_resource.

    When *condition* is falsy the function is returned untouched, which
    lets the same code run outside a Streamlit caching context. Extra
    *args/**kwargs are forwarded to st.cache_resource.
    """

    def decorator(func):
        return st.cache_resource(*args, **kwargs)(func) if condition else func

    return decorator
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# Flag read by the st_cache_*_if decorators at import time: caching is only
# enabled when this module is imported from the Streamlit entry point.
supported_cache = False


def stack_checker():
    """Enable Streamlit caching when imported from clinfly_app_st.py.

    Walks the current call stack; if any frame originates from the
    Streamlit entry-point script, flips the module-level supported_cache
    flag. Fix: stop at the first match instead of scanning the whole
    stack (setting the flag more than once was redundant).
    """
    global supported_cache
    for frame_info in inspect.stack():
        if os.path.basename(frame_info.filename) == "clinfly_app_st.py":
            supported_cache = True
            break


stack_checker()
|