GERNET Enody committed on
Commit
ccc4c35
·
unverified ·
1 Parent(s): e96fe28

Add files via upload

Browse files
utilities/anonymize.py ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ from unidecode import unidecode
4
+ import pandas as pd
5
+ from presidio_anonymizer.entities import OperatorConfig
6
+ from presidio_analyzer import AnalyzerEngine, PatternRecognizer
7
+ from presidio_analyzer.nlp_engine import NlpEngineProvider
8
+ from presidio_anonymizer import AnonymizerEngine
9
+ import streamlit as st
10
+ from .web_utilities import st_cache_data_if, st_cache_resource_if, supported_cache
11
+
12
+
13
+
14
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def anonymize_analyzer(MarianText_letter, _analyzer, proper_noun, Last_name, First_name):
    """Detect PII spans in the translated letter and mark them up for review.

    Runs the Presidio analyzer for DATE_TIME / PERSON / FRENCH_CITY entities,
    then rewrites the text with Streamlit-markdown highlights: spans whose
    lowercased text is in ``proper_noun`` (known-safe words) are shown in
    green and only logged, while all other spans are shown in red, logged,
    and returned for actual anonymization.

    Args:
        MarianText_letter: letter text, already translated to English.
        _analyzer: presidio AnalyzerEngine (underscore keeps Streamlit from
            trying to hash it for caching).
        proper_noun: lowercased words that must NOT be de-identified.
        Last_name, First_name: patient identity, copied into audit records.

    Returns:
        tuple of (annotated markdown text, entities to anonymize, audit
        records for red/kept detections, audit records for green/skipped
        detections).
    """
    MarianText_anonymize_letter = MarianText_letter
    # st.write(MarianText_anonymize_letter)
    analyzer_results_keep = []
    analyzer_results_return = []
    analyzer_results_saved = []
    analyzer_results = _analyzer.analyze(
        text=MarianText_letter,
        language="en",
        entities=["DATE_TIME", "PERSON", "FRENCH_CITY"],
        # Generic time/duration words that must never be flagged as dates.
        allow_list=[
            "evening",
            "day",
            "the day",
            "the age of",
            "age",
            "years",
            "week",
            "years old",
            "months",
            "hours",
            "night",
            "noon",
            "nights",
            "tomorrow",
            "today",
            "yesterday",
        ],
    )
    # Running offset: markdown markup inserted so far shifts every later span.
    len_to_add = 0
    analyser_results_to_sort = {}
    i = 0
    detect_duplicated = []
    # Keep only the first detection per start offset (several recognizers can
    # report the same span); map original result index -> start offset.
    for element in analyzer_results:
        if element.start not in detect_duplicated:
            analyser_results_to_sort[i] = element.start
            detect_duplicated.append(element.start)
        else:
            pass
        i = i + 1
    # Process detections left-to-right so len_to_add stays consistent.
    sorted_tuples = sorted(analyser_results_to_sort.items(), key=lambda x: x[1])
    sorted_dict = {k: v for k, v in sorted_tuples}
    print(sorted_dict)
    # Substrings marking ages/durations, not identifying information.
    exception_list_presidio = ["age", "year", "month", "day", "hour", "week"]

    for element_raw in sorted_dict:
        element = analyzer_results[element_raw]
        word = MarianText_letter[element.start : element.end]
        exception_detected = [e for e in exception_list_presidio if e in word.lower()]
        # Exactly one or 3+ slashes cannot be a dd/mm/yyyy date -> skip.
        if word.count("/") == 1 or word.count("/") > 2:
            exception_detected.append("/ or ///")
        if len(exception_detected) == 0:
            if word.lower().strip() in proper_noun:
                # Known-safe word: green highlight, logged but NOT anonymized.
                word_to_replace = (
                    "**:green[" + word + "]** `[" + element.entity_type + "]`"
                )
                MarianText_anonymize_letter = (
                    MarianText_anonymize_letter[: element.start + len_to_add]
                    + word_to_replace
                    + MarianText_anonymize_letter[element.end + len_to_add :]
                )
                analyzer_results_saved.append(
                    {
                        "name": Last_name,
                        "surname": First_name,
                        "type": "deidentification",
                        "value": word,
                        "correction": element.entity_type,
                        "lf_detected": False,
                        "manual_validation": False,
                    }
                )
                # analyzer_results_saved.append(str(element) + ", word:" + word)
            else:
                # PII to remove: red highlight and queued for the anonymizer.
                word_to_replace = (
                    "**:red[" + word + "]** `[" + element.entity_type + "]`"
                )
                MarianText_anonymize_letter = (
                    MarianText_anonymize_letter[: element.start + len_to_add]
                    + word_to_replace
                    + MarianText_anonymize_letter[element.end + len_to_add :]
                )
                analyzer_results_keep.append(
                    {
                        "name": Last_name,
                        "surname": First_name,
                        "type": "deidentification",
                        "value": word,
                        "correction": element.entity_type,
                        "lf_detected": True,
                        "manual_validation": True,
                    }
                )
                # analyzer_results_keep.append(str(element) + ", word:" + word)
                analyzer_results_return.append(element)
            # Both branches inserted markup; keep later offsets aligned.
            len_to_add = len_to_add + len(word_to_replace) - len(word)
        else:
            # Age/duration false positive: log it, leave the text untouched.
            analyzer_results_saved.append(
                {
                    "name": Last_name,
                    "surname": First_name,
                    "type": "deidentification",
                    "value": word,
                    "correction": element.entity_type,
                    "lf_detected": False,
                    "manual_validation": False,
                }
            )
            # analyzer_results_saved.append(str(element) + ", word:" + word)
    del analyzer_results
    del len_to_add
    del exception_list_presidio
    del analyser_results_to_sort
    del sorted_tuples
    del sorted_dict

    return (
        MarianText_anonymize_letter,
        analyzer_results_return,
        analyzer_results_keep,
        analyzer_results_saved,
    )
137
+
138
+
139
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def anonymize_engine(MarianText_letter, _analyzer_results_return, _engine, _nlp):
    """Erase the detected PII spans from the letter and reformat the result.

    Every PERSON / LOCATION / FRENCH_CITY entity in
    ``_analyzer_results_return`` is replaced with an empty string, then the
    text is re-segmented into report layout via ``reformat_to_report``.
    """
    # Same "replace with nothing" operator for every entity type we scrub.
    blank_out = {
        entity: OperatorConfig("replace", {"new_value": ""})
        for entity in ("PERSON", "LOCATION", "FRENCH_CITY")
    }
    anonymized = _engine.anonymize(
        text=MarianText_letter,
        analyzer_results=_analyzer_results_return,
        operators=blank_out,
    )
    return reformat_to_report(anonymized.text, _nlp)
151
+
152
+
153
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_comma(texte, _nlp):
    """Pad the last non-numeric comma of each sentence with spaces.

    Sentences are produced by the stanza pipeline ``_nlp``; newlines are
    flattened to spaces and doubled spaces collapsed before the sentences
    are re-joined with a single space.

    Args:
        texte: input text.
        _nlp: stanza pipeline with a tokenize processor.

    Returns:
        The text with " , " substituted for the matched comma.
    """
    # Raw string: "\," in a plain literal is an invalid escape sequence
    # (SyntaxWarning from Python 3.12).  (?!.*\1) restricts the match to the
    # LAST qualifying comma of the sentence; lookarounds skip numeric commas.
    regex = r"(?<!\d)(\,)(?!\d)(?!.*\1)"
    text_list = []
    for sentence in _nlp.process(texte).sentences:
        text_space = re.sub(regex, " , ", sentence.text.replace("\n", " "))
        text_space_no_db = text_space.replace("  ", " ")
        text_list.append(text_space_no_db)
    return " ".join(text_list)
163
+
164
+
165
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_endpoint(texte, _nlp):
    """Pad the last non-numeric period of each sentence with spaces.

    Mirrors ``add_space_to_comma`` for the "." character; see that helper
    for the sentence-splitting and whitespace rules.
    """
    # Raw string avoids the invalid "\." escape warning; (?!.*\1) limits the
    # substitution to the last qualifying period in the sentence.
    regex = r"(?<!\d)(\.)(?!\d)(?!.*\1)"
    text_list = []
    for sentence in _nlp.process(texte).sentences:
        text_space = re.sub(regex, " . ", sentence.text.replace("\n", " "))
        text_space_no_db = text_space.replace("  ", " ")
        text_list.append(text_space_no_db)
    return " ".join(text_list)
175
+
176
+
177
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_leftp(texte, _nlp):
    """Pad the last non-numeric "(" of each sentence with spaces.

    Mirrors ``add_space_to_comma`` for the left parenthesis; see that helper
    for the sentence-splitting and whitespace rules.
    """
    # Raw string avoids the invalid "\(" escape warning (SyntaxWarning on
    # Python >= 3.12); (?!.*\1) keeps only the last qualifying "(".
    regex = r"(?<!\d)(\()(?!\d)(?!.*\1)"
    text_list = []
    for sentence in _nlp.process(texte).sentences:
        text_space = re.sub(regex, " ( ", sentence.text.replace("\n", " "))
        text_space_no_db = text_space.replace("  ", " ")
        text_list.append(text_space_no_db)
    return " ".join(text_list)
187
+
188
+
189
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_rightp(texte, _nlp):
    """Pad the last non-numeric ")" of each sentence with spaces.

    Mirrors ``add_space_to_comma`` for the right parenthesis; see that helper
    for the sentence-splitting and whitespace rules.
    """
    # Raw string avoids the invalid "\)" escape warning; (?!.*\1) keeps only
    # the last qualifying ")".
    regex = r"(?<!\d)(\))(?!\d)(?!.*\1)"
    text_list = []
    for sentence in _nlp.process(texte).sentences:
        text_space = re.sub(regex, " ) ", sentence.text.replace("\n", " "))
        text_space_no_db = text_space.replace("  ", " ")
        text_list.append(text_space_no_db)
    return " ".join(text_list)
199
+
200
+
201
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_stroph(texte, _nlp):
    """Pad the last non-numeric apostrophe of each sentence with spaces.

    Mirrors ``add_space_to_comma`` for the "'" character; see that helper
    for the sentence-splitting and whitespace rules.
    """
    # Raw string for consistency with the sibling helpers; (?!.*\1) keeps
    # only the last qualifying apostrophe.
    regex = r"(?<!\d)(')(?!\d)(?!.*\1)"
    text_list = []
    for sentence in _nlp.process(texte).sentences:
        text_space = re.sub(regex, " ' ", sentence.text.replace("\n", " "))
        text_space_no_db = text_space.replace("  ", " ")
        text_list.append(text_space_no_db)
    return " ".join(text_list)
211
+
212
+
213
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_comma_endpoint(texte, _nlp):
    """Pad commas, periods and both parentheses with spaces, in that order.

    Chains the four single-character padding helpers; each pass re-runs the
    stanza sentence splitter on the previous pass's output.
    """
    padded = add_space_to_comma(texte, _nlp)
    padded = add_space_to_endpoint(padded, _nlp)
    padded = add_space_to_leftp(padded, _nlp)
    padded = add_space_to_rightp(padded, _nlp)
    return padded
225
+
226
+
227
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_abbreviation_dict_correction():
    """Load the French abbreviation -> expansion dictionary.

    Returns:
        dict mapping each abbreviation to its expanded form, read from
        ``data/fr_abbreviations.json``.
    """
    # Explicit UTF-8: the file holds accented French text, and the platform
    # default encoding (e.g. cp1252 on Windows) would mangle or reject it.
    with open("data/fr_abbreviations.json", "r", encoding="utf-8") as infile:
        hpo_abbreviations = json.load(infile)
    return hpo_abbreviations
233
+
234
+
235
+
236
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def reformat_to_report(text, _nlp):
    """Re-segment *text* one sentence per line, tightening punctuation.

    Undoes the padding added by the ``add_space_to_*`` helpers: removes the
    space before "," "." ")" "'" and after "(" in every sentence, then joins
    the sentences with " \\n".
    """
    substitutions = ((" ,", ","), (" .", "."), (" )", ")"), (" (", "("), (" '", "'"))
    cutsentence = []
    for sentence in _nlp.process(text).sentences:
        tightened = sentence.text
        for padded, tight in substitutions:
            tightened = tightened.replace(padded, tight)
        cutsentence.append(tightened)
    return " \n".join(cutsentence)
248
+
249
+
250
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_cities_list():
    """Build the deny-list of French city-name spelling variants.

    For each city in ``data/proper_noun_location_sort.csv`` six variants are
    emitted: the name as written, Capitalized, UPPERCASE, and the same three
    forms with accents stripped via ``unidecode``.
    """
    cities = pd.read_csv("data/proper_noun_location_sort.csv")
    cities.columns = ["ville"]
    whole_cities_patterns = []
    for city in cities["ville"].to_list():
        unaccented = unidecode(city)
        whole_cities_patterns.extend(
            [
                city,
                city.lower().capitalize(),
                city.upper(),
                unaccented,
                unaccented.lower().capitalize(),
                unaccented.upper(),
            ]
        )
    return whole_cities_patterns
266
+
267
+
268
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_list_not_deidentify():
    """Build the lowercased list of words exempt from de-identification.

    Combines the manual exception list, drug names and gene names (first
    column of each header-less TSV, coerced to str) with a few hard-coded
    report tokens, and lowercases everything for case-insensitive lookup.
    """

    def _first_column(path):
        # Single-column TSVs without a header row; compared as strings.
        return pd.read_csv(path, sep="\t", header=None).astype(str)[0].to_list()

    extra_tokens = [
        "PN",
        "TN",
        "SD",
        "PCN",
        "cher",
        "chere",
        "CAS",
        "INDEX",
        "APGAR",
        "M",
        "Ms",
        "Mr",
        "Behçet",
        "hypoacousia",
    ]
    keep_words = (
        _first_column("data/exception_list_anonymization.tsv")
        + _first_column("data/drug_name.tsv")
        + _first_column("data/gene_name.tsv")
        + extra_tokens
    )
    return [word.lower() for word in keep_words]
306
+
307
+
308
+
309
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def change_name_patient_abbreviations(Report, Last_name, First_name, abbreviations_dict):
    """Replace patient names and known abbreviations word-by-word in a report.

    Builds a substitution table (titles, each token of the patient's first
    name -> "CAS", last name -> "INDEX", plus ``abbreviations_dict``), then
    walks the whitespace-split report applying it; every hit is logged as an
    audit record.

    Returns:
        tuple of (rewritten report text, list of audit-record dicts).
    """
    Report_name = Report

    # French honorifics normalized before translation; "Dr"/"Pr" expanded.
    dict_correction_name_abbreviations = {
        "M.": "M",
        "Mme.": "Mme",
        "Mlle.": "Mlle",
        "Dr.": "Docteur",
        "Dr": "Docteur",
        "Pr.": "Professeur",
        "Pr": "Professeur",
    }

    # Each whitespace-separated token of the names gets its own entry.
    for firstname in First_name.split():
        dict_correction_name_abbreviations[firstname] = "CAS"
    for lastname in Last_name.split():
        dict_correction_name_abbreviations[lastname] = "INDEX"
    for key, value in abbreviations_dict.items():
        dict_correction_name_abbreviations[key] = value  # + " [" + key + "]"

    list_replaced = []
    splitted_Report = Report_name.replace("\n", " ").split(" ")
    replaced_Report = []
    for i in splitted_Report:
        append_word = i
        replace_word = None
        # NOTE(review): the inner loop keeps scanning after a hit, so when a
        # token matches several keys the LAST matching entry wins, and one
        # audit record is appended per matching key — confirm intended.
        for key, value in dict_correction_name_abbreviations.items():
            # Compare stripped of commas/periods, case-insensitively.
            i_check = i.lower().strip().replace(",", "").replace(".", "")
            if i_check == key.lower().strip():
                to_replace = i.strip().replace(",", "").replace(".", "")
                replace_word = value
                # NOTE(review): i_check is lowercased but Last_name/First_name
                # keep their original case, so this branch only fires when the
                # stored names are already lowercase — verify against callers.
                if i_check == Last_name or i_check == First_name:
                    list_replaced.append(
                        {
                            "name": Last_name,
                            "surname": First_name,
                            "type": "index_case",
                            "value": i.strip().replace(",", "").replace(".", ""),
                            "correction": value,
                            "lf_detected": True,
                            "manual_validation": True,
                        }
                    )
                else:
                    list_replaced.append(
                        {
                            "name": Last_name,
                            "surname": First_name,
                            "type": "abbreviations",
                            "value": i.strip().replace(",", "").replace(".", ""),
                            "correction": value,
                            "lf_detected": True,
                            "manual_validation": True,
                        }
                    )
        if replace_word:
            # Substitute inside the token so attached punctuation survives.
            append_word = append_word.replace(to_replace, replace_word)
        replaced_Report.append(append_word)
    del dict_correction_name_abbreviations
    del splitted_Report
    return " ".join(replaced_Report), list_replaced
371
+
372
+
373
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def config_deidentify(cities_list):
    """Build the Presidio analyzer/anonymizer pair for de-identification.

    Creates a spaCy (en_core_web_lg) NLP engine, registers a custom
    FRENCH_CITY deny-list recognizer built from *cities_list*, and returns
    the configured ``AnalyzerEngine`` together with a fresh
    ``AnonymizerEngine``.
    """
    # Presidio drives spaCy through its NLP-engine abstraction.
    nlp_engine = NlpEngineProvider(
        nlp_configuration={
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
        }
    ).create_engine()

    analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
    analyzer.registry.add_recognizer(
        PatternRecognizer(supported_entity="FRENCH_CITY", deny_list=cities_list)
    )
    return analyzer, AnonymizerEngine()
utilities/convert.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import streamlit as st
3
+ from .web_utilities import st_cache_data_if, supported_cache
4
+
5
+
6
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_df(df):
    """Serialize *df* (minus all-NaN rows) as UTF-8 TSV bytes, with header."""
    tsv_text = df.dropna(how="all").to_csv(sep="\t", index=False)
    return tsv_text.encode("utf-8")
9
+
10
+
11
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_df_no_header(df):
    """Serialize *df* (minus all-NaN rows) as UTF-8 TSV bytes, no header row."""
    tsv_text = df.dropna(how="all").to_csv(sep="\t", index=False, header=None)
    return tsv_text.encode("utf-8")
16
+
17
+
18
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_json(df):
    """Serialize the detected phenotypes as a Phenotips-style JSON string.

    Rows missing "HPO ID" or "Phenotype name" are dropped.  Returns
    ``'{"features": []}'`` when no usable row remains.

    Args:
        df: DataFrame with at least "HPO ID" and "Phenotype name" columns.
    """
    dict_return = {"features": []}
    df_check = df.dropna(subset=["HPO ID", "Phenotype name"])
    if len(df_check) > 0:
        # BUG FIX: iterate the filtered frame (df_check), not the raw df —
        # the original exported rows whose HPO ID / name were NaN despite
        # having just computed the filtered view.
        df_dict_list = df_check[["HPO ID", "Phenotype name"]].to_dict(orient="index")
        for key, value in df_dict_list.items():
            dict_return["features"].append(
                {
                    "id": value["HPO ID"],
                    "observed": "yes",
                    "label": value["Phenotype name"],
                    "type": "phenotype",
                }
            )
    return json.dumps(dict_return)
36
+
37
+
38
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_list_phenogenius(df):
    """Return the HPO IDs as a comma-separated string for PhenoGenius.

    Rows lacking an HPO ID or phenotype name are ignored; when nothing
    usable remains, a fixed placeholder message is returned instead.
    """
    usable = df.dropna(subset=["HPO ID", "Phenotype name"])
    if usable.empty:
        return "No HPO in letters."
    return ",".join(usable["HPO ID"].to_list())
45
+
utilities/extract_hpo.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from clinphen_src import get_phenotypes_lf
3
+ import streamlit as st
4
+ from .web_utilities import st_cache_data_if, supported_cache
5
+
6
+
7
+
8
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def add_biometrics(text, _nlp):
    """Append HPO-friendly interpretations of biometric values to sentences.

    Scans each sentence for weight/height/head-circumference standard
    deviations ("kg ... SD", "measures ... SD", "head ... SD") and IQ/FSIQ
    scores, and appends a " This means <terms>." clause naming the matching
    phenotypes (e.g. "Short stature", "Macrocephaly", ID severity levels).

    Args:
        text: report text (English, punctuation already space-padded).
        _nlp: stanza pipeline with a tokenize processor.

    Returns:
        tuple of (augmented text, list of all phenotype terms added).
    """
    cutsentence_with_biometrics = []
    cutsentence = []
    additional_terms = []
    for sentence in _nlp.process(text).sentences:
        cutsentence.append(sentence.text)
    # Only sentences mentioning these units/acronyms can carry biometrics.
    keep_element = ["cm", "kg", "qit", "qi"]
    for sentence in cutsentence:
        if any(ext in sentence.lower() for ext in keep_element):
            if "SD" in sentence or "DS" in sentence:
                # Normalize the French "DS" (déviation standard) to "SD".
                sentence = sentence.replace("DS", "SD")
                # --- weight: signed SD between "kg" and "sd" -------------
                try:
                    kg_sd = re.findall("kg(.*?)sd", sentence.lower())[0]
                    # Signed number after "(" e.g. "(+2.5" or "(-3".
                    num_kg_sd = re.findall("\(\s*([-+].?\d+(?:\.\d+)?)\s*", kg_sd)[0]
                    # print(kg_sd)
                    kg_sd = float(num_kg_sd)
                    print(kg_sd)
                    if kg_sd >= 2:
                        additional_terms.append("Increased body weight")
                    if kg_sd <= -2:
                        additional_terms.append("Decreased body weight")
                except:
                    # NOTE(review): bare except — also hides real errors, not
                    # just pattern misses; consider except (IndexError, ValueError).
                    print("Incorrect weight recognition pattern")
                    print(sentence)
                # --- height: SD after "is"/"measures" --------------------
                try:
                    if "is" in sentence.lower():
                        height_sd_alpha = re.findall("\ is(.*?)d", sentence.lower())[0]
                        if "cm" not in height_sd_alpha:
                            height_sd_raw = height_sd_alpha
                    if "easure" in sentence.lower():
                        # "measure(s)/measured" phrasing takes precedence.
                        height_sd_raw = re.findall("easure(.*?)d", sentence.lower())[0]
                    print(height_sd_raw)
                    height_sd = re.findall("m(.*?)s", height_sd_raw)[0]
                    print(height_sd)
                    num_height_sd = re.findall(
                        "\(\s*([-+].?\d+(?:\.\d+)?)\s*", height_sd
                    )[0]
                    height_sd = float(num_height_sd)
                    print(height_sd)
                    if height_sd >= 2:
                        additional_terms.append("Tall stature")
                    if height_sd <= -2:
                        additional_terms.append("Short stature")
                except:
                    # NOTE(review): height_sd_raw may be unbound when neither
                    # phrasing matches — the bare except also swallows that
                    # NameError.
                    print("Incorrect height recognition pattern")
                    print(sentence)
                # --- head circumference: SD after "head ... cm" ----------
                try:
                    pc_sd_raw = (
                        re.findall("head(.*?)d", sentence.lower())[0]
                        .replace("(", "")
                        .replace(")", "")
                        .replace(" ", "")
                    )
                    pc_sd = re.findall("cm(.*?)s", pc_sd_raw)[0]
                    num_pc_sd = re.findall("\(\s*([-+].?\d+(?:\.\d+)?)\s*", pc_sd)[0]
                    pc_sd = float(num_pc_sd)
                    print(pc_sd)
                    if pc_sd >= 2:
                        additional_terms.append("Macrocephaly")
                    elif pc_sd <= -2:
                        additional_terms.append("Microcephaly")
                except:
                    print("Incorrect head circumference recognition pattern")
                    print(sentence)
                print(additional_terms)
            # --- IQ / FSIQ score mapped to ID severity (HPO wording) -----
            if "FSIQ" in sentence or "IQ" in sentence:
                try:
                    iq_score = re.findall("iq.*?(\d.*?)\D", sentence.lower())[0]
                    iq_score = float(iq_score)
                    print(iq_score)
                    if iq_score >= 70 and iq_score < 84:
                        additional_terms.append("Intellectual disability, borderline")
                    elif iq_score >= 50 and iq_score < 69:
                        additional_terms.append("Intellectual disability, mild")
                    elif iq_score >= 35 and iq_score < 49:
                        additional_terms.append("Intellectual disability, moderate")
                    elif iq_score >= 20 and iq_score < 34:
                        additional_terms.append("Intellectual disability, severe")
                    elif iq_score < 20:
                        additional_terms.append("Intellectual disability, profound")
                    print(additional_terms)
                except:
                    print("Incorrect IQ recognition pattern")
                    print(sentence)
            # NOTE(review): additional_terms accumulates across sentences, so
            # earlier findings are repeated in every later clause — confirm.
            cutsentence_with_biometrics.append(
                sentence + " This means " + ", ".join(additional_terms) + "."
            )
        else:
            cutsentence_with_biometrics.append(sentence)
    print(cutsentence_with_biometrics)
    # Drop clauses that ended up as a bare period (no sentence, no terms).
    cutsentence_with_biometrics_return = [
        i for i in cutsentence_with_biometrics if i != "."
    ]
    del cutsentence_with_biometrics
    del cutsentence
    del keep_element
    return " ".join(cutsentence_with_biometrics_return), additional_terms
106
+
107
+
108
+
109
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def extract_hpo(inputStr):
    """Run ClinPhen over *inputStr* and return (safe, unsafe) phenotype tables.

    Uses the bundled clinphen_src fork: loads the HPO id->name map, extracts
    the high-confidence and lower-confidence phenotype strings, and converts
    each to a DataFrame.
    """
    hpo_names = get_phenotypes_lf.getNames()
    safe_str, unsafe_str = get_phenotypes_lf.extract_phenotypes(inputStr, hpo_names)
    safe_df = get_phenotypes_lf.get_dataframe_from_clinphen(safe_str)
    unsafe_df = get_phenotypes_lf.get_dataframe_from_clinphen(unsafe_str)
    return safe_df, unsafe_df
utilities/get_model.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import stanza
2
+ import nltk
3
+ import os
4
+ import spacy
5
+ import streamlit as st
6
+ from .web_utilities import st_cache_resource_if, supported_cache
7
+ from .translate import Translator
8
+
9
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_models(langue, output=os.path.expanduser("~")):
    """Download and prepare every NLP model needed for language *langue*.

    Fetches the stanza model for *langue*, warms the MarianMT
    ``langue``->en translation model, ensures the nltk ``omw-1.4`` and
    ``wordnet`` corpora are present under *output*, and installs the spaCy
    ``en_core_web_lg`` model to *output* if missing.

    Args:
        langue: source-language code (e.g. "fr", "de").
        output: base directory for model storage (default: home directory).
    """
    # All languages follow the same steps — the original if/elif/else
    # branches were byte-identical, so they are collapsed here.
    stanza.download(langue, dir=os.path.join(output, "stanza_resources"))
    Translator(langue, "en")  # constructing it downloads/caches the MT model

    nltk_dir = os.path.join(output, "nltk_data")
    if nltk_dir not in nltk.data.path:
        nltk.data.path.append(nltk_dir)
    for corpus in ("omw-1.4", "wordnet"):
        try:
            # nltk resources are addressed by category path; the original
            # bare names ("omw-1.4") always raised LookupError and forced a
            # download check on every call.
            nltk.data.find("corpora/" + corpus)
        except LookupError:
            nltk.download(corpus, download_dir=nltk_dir)

    spacy_model_name = "en_core_web_lg"
    try:
        nlp = spacy.load(os.path.join(output, spacy_model_name))
        print(spacy_model_name + " already downloaded")
    except OSError:
        # Not on disk yet: fetch it, then persist a copy under *output*.
        spacy.cli.download(spacy_model_name)
        nlp = spacy.load(spacy_model_name)
        nlp.to_disk(os.path.join(output, spacy_model_name))
39
+
40
+
41
+
42
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_nlp_marian(source_lang):
    """Return (stanza tokenizer pipeline, Marian *source_lang*->en translator)."""
    tokenizer_pipeline = stanza.Pipeline(source_lang, processors="tokenize")
    translator = Translator(source_lang, "en")
    return tokenizer_pipeline, translator
47
+
48
+
utilities/translate.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Dict, List, Sequence
3
+ import stanza
4
+ import transformers
5
+ import json
6
+ import streamlit as st
7
+ from .web_utilities import st_cache_data_if, st_cache_resource_if, supported_cache
8
+ from .anonymize import add_space_to_comma_endpoint, change_name_patient_abbreviations
9
+
10
+
11
@dataclass(frozen=True)
class SentenceBoundary:
    """A sentence plus the exact characters that preceded it in the source."""

    text: str
    prefix: str

    def __str__(self):
        # Reassemble the fragment exactly as it appeared in the original text.
        return f"{self.prefix}{self.text}"
18
+
19
+
20
@dataclass
class SentenceBoundaries:
    """Ordered collection of SentenceBoundary items reconstructing a text."""

    def __init__(self) -> None:
        # Explicit __init__ (overriding the dataclass-generated one) so every
        # instance starts with its own empty list.
        self._sentence_boundaries: List[SentenceBoundary] = []

    @property
    def sentence_boundaries(self):
        """The underlying (mutable) list of SentenceBoundary items."""
        return self._sentence_boundaries

    def update_sentence_boundaries(
        self, sentence_boundaries_list: List[SentenceBoundary]
    ):
        """Replace the stored boundaries wholesale; returns self (fluent)."""
        self._sentence_boundaries = sentence_boundaries_list
        return self

    def from_doc(self, doc: stanza.Document):
        """Populate from a stanza Document, preserving inter-sentence text.

        Each sentence is stored together with the exact characters that
        preceded it (whitespace, newlines); a final empty-text boundary
        carries whatever trails the last sentence, so str(self) round-trips
        to doc.text.  Returns self (fluent).
        """
        start_idx = 0
        for sent in doc.sentences:
            self.sentence_boundaries.append(
                SentenceBoundary(
                    text=sent.text,
                    prefix=doc.text[start_idx : sent.tokens[0].start_char],
                )
            )
            start_idx = sent.tokens[-1].end_char
        self.sentence_boundaries.append(
            SentenceBoundary(text="", prefix=doc.text[start_idx:])
        )
        return self

    @property
    def nonempty_sentences(self) -> List[str]:
        """Sentence texts only, skipping the trailing empty boundary."""
        return [item.text for item in self.sentence_boundaries if item.text]

    def map_sentence_boundaries(self, d: Dict[str, str]) -> List:
        """Return a NEW SentenceBoundaries with each sentence mapped through *d*.

        Sentences absent from *d* are kept unchanged; prefixes are preserved
        so a translated text keeps the original spacing/layout.
        """
        return SentenceBoundaries().update_sentence_boundaries(
            [
                SentenceBoundary(text=d.get(sb.text, sb.text), prefix=sb.prefix)
                for sb in self.sentence_boundaries
            ]
        )

    def __str__(self) -> str:
        # Concatenating prefix+text of every boundary restores the full text.
        return "".join(map(str, self.sentence_boundaries))
64
+
65
+
66
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def minibatch(seq, size):
    """Yield successive lists of at most *size* items from *seq*.

    NOTE(review): this is a generator function; when ``supported_cache`` is
    enabled, ``st.cache_resource`` would cache the generator object itself,
    which is exhausted after a single iteration — confirm this path is only
    exercised with caching disabled, or drop the decorator here.
    """
    items = []
    for x in seq:
        items.append(x)
        if len(items) >= size:
            yield items
            items = []
    # Flush the final partial batch, if any.
    if items:
        yield items
76
+
77
+
78
+ # @dataclass(frozen=True)
79
class Translator:
    """Sentence-level MarianMT translator from *source_lang* to *dest_lang*."""

    def __init__(self, source_lang: str, dest_lang: str, use_gpu: bool = False) -> None:
        # self.use_gpu = use_gpu
        # Helsinki-NLP publishes one opus-mt checkpoint per language pair.
        self.model_name = "Helsinki-NLP/opus-mt-" + source_lang + "-" + dest_lang
        self.model = transformers.MarianMTModel.from_pretrained(self.model_name)
        # if use_gpu:
        #     self.model = self.model.cuda()
        self.tokenizer = transformers.MarianTokenizer.from_pretrained(self.model_name)
        # Sentence splitter for the SOURCE language.
        self.sentencizer = stanza.Pipeline(
            source_lang, processors="tokenize", verbose=False, use_gpu=use_gpu
        )

    def sentencize(self, texts: Sequence[str]) -> List[SentenceBoundaries]:
        """Split each text into sentences, preserving inter-sentence spacing."""
        return [
            SentenceBoundaries().from_doc(doc=self.sentencizer.process(text))
            for text in texts
        ]

    def translate(
        self, texts: Sequence[str], batch_size: int = 10, truncation=True
    ) -> Sequence[str]:
        """Translate *texts*, batching unique sentences through the model.

        Raises:
            ValueError: if a bare string is passed instead of a sequence.
        """
        if isinstance(texts, str):
            raise ValueError("Expected a sequence of texts")
        text_sentences = self.sentencize(texts)
        # Deduplicate sentences across all texts: each unique sentence is
        # translated exactly once.
        translations = {
            sent: None for text in text_sentences for sent in text.nonempty_sentences
        }

        # Longest-first ordering groups similar lengths per batch, which
        # reduces padding waste in the tokenizer.
        for text_batch in minibatch(
            sorted(translations, key=len, reverse=True), batch_size
        ):
            tokens = self.tokenizer(
                text_batch, return_tensors="pt", padding=True, truncation=truncation
            )
            # if self.use_gpu:
            #     tokens = {k:v.cuda() for k, v in tokens.items()}
            translate_tokens = self.model.generate(**tokens)
            translate_batch = [
                self.tokenizer.decode(t, skip_special_tokens=True)
                for t in translate_tokens
            ]
            for text, translated in zip(text_batch, translate_batch):
                translations[text] = translated

        # Reassemble each input text with translated sentences and the
        # original spacing/prefixes.
        return [
            str(text.map_sentence_boundaries(translations)) for text in text_sentences
        ]
126
+
127
+
128
+
129
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def translate_report(
    Report, Last_name, First_name, _nlp, _marian_fr_en, dict_correction, abbreviation_dict
):
    """Translate a French report to English, handling names and abbreviations.

    Pipeline: (1) replace the patient's names with CAS/INDEX and expand
    known abbreviations, (2) translate sentence-by-sentence with MarianMT,
    (3) pad punctuation with spaces, (4) apply the post-translation
    correction dictionary.

    Returns:
        tuple of (corrected English text, correction audit records,
        name/abbreviation replacement audit records).
    """
    deidentified, list_replaced_abb_name = change_name_patient_abbreviations(
        Report, Last_name, First_name, abbreviation_dict
    )
    translated = translate_marian(deidentified, _nlp, _marian_fr_en)
    spaced = add_space_to_comma_endpoint(translated, _nlp)
    corrected, list_replaced = correct_marian(
        spaced, dict_correction, Last_name, First_name
    )
    return corrected, list_replaced, list_replaced_abb_name
144
+
145
+
146
+
147
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def translate_marian(Report_name, _nlp, _marian_fr_en):
    """Translate *Report_name* sentence by sentence, joining with newlines.

    The stanza pipeline splits the report; the Marian translator handles the
    resulting list, and the translations are re-joined one per line.
    """
    sentences = [sent.text for sent in _nlp.process(Report_name).sentences]
    return "\n".join(_marian_fr_en.translate(sentences))
155
+
156
+
157
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def correct_marian(MarianText_space, dict_correction, Last_name, First_name):
    """Apply the post-translation substitution dictionary to the text.

    Every key of *dict_correction* found in the text is replaced by its
    value; each substitution is recorded as an audit dict carrying the
    patient identity.

    Returns:
        tuple of (corrected text, list of audit-record dicts).
    """
    corrected = MarianText_space
    audit_trail = []
    for wrong, right in dict_correction.items():
        if wrong not in corrected:
            continue
        audit_trail.append(
            {
                "name": Last_name,
                "surname": First_name,
                "type": "marian_correction",
                "value": wrong,
                "correction": right,
                "lf_detected": True,
                "manual_validation": True,
            }
        )
        corrected = corrected.replace(wrong, right)
    return corrected, audit_trail
176
+
177
+
178
+
179
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_translation_dict_correction():
    """Build the English post-translation correction dictionary.

    Merges three sources, each key/value padded with surrounding spaces so
    only whole words are replaced: hand-written French-specific medical
    fixes, the reviewed HPO FR->EN translations, and translated French
    abbreviations.

    Returns:
        dict mapping " wrong " -> " right " substitution strings.
    """
    dict_correction_FRspec = {
        "PC": "head circumference",
        "palatine slot": "cleft palate",
        "ASD": "autism",
        "ADHD": "attention deficit hyperactivity disorder",
        "IUGR": "intrauterin growth retardation",
        "QI": "IQ ",
        "QIT": "FSIQ ",
        "ITQ": "FSIQ ",
        "DS": "SD",
        "FOP": "patent foramen ovale",
        "PFO": "patent foramen ovale",
        "ARCF": "fetal distress",
        "\n": " ",
        "associated": "with",
        "Mr.": "Mr",
        "Mrs.": "Mrs",
    }

    dict_correction = {}
    for key, value in dict_correction_FRspec.items():
        dict_correction[" " + key + " "] = " " + value + " "

    # Explicit UTF-8: these JSON files contain accented French text and must
    # not depend on the platform default encoding (e.g. cp1252 on Windows).
    with open(
        "data/hp_fr_en_translated_marian_review_lwg.json", "r", encoding="utf-8"
    ) as infile:
        hpo_translated = json.load(infile)
    for key, value in hpo_translated.items():
        dict_correction[" " + key + " "] = " " + value + " "

    with open("data/fr_abbreviations_translation.json", "r", encoding="utf-8") as infile:
        hpo_translated_abbreviations = json.load(infile)
    for key, value in hpo_translated_abbreviations.items():
        dict_correction[" " + key + " "] = " " + value + " "

    return dict_correction
219
+
220
+
utilities/web_utilities.py CHANGED
@@ -1,6 +1,7 @@
1
  import streamlit as st
2
-
3
  from PIL import Image
 
 
4
 
5
 
6
  def display_page_title(title: str):
@@ -47,3 +48,33 @@ def display_sidebar():
47
  # file_name="Mentions_legales_lf.pdf",
48
  # mime='application/octet-stream')
49
  # st.sidebar.markdown("[Mentions légales](data/Mentions_legales_lf.pdf)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
 
2
  from PIL import Image
3
+ import inspect
4
+ import os
5
 
6
 
7
  def display_page_title(title: str):
 
48
  # file_name="Mentions_legales_lf.pdf",
49
  # mime='application/octet-stream')
50
  # st.sidebar.markdown("[Mentions légales](data/Mentions_legales_lf.pdf)")
51
+
52
+
53
def st_cache_data_if(condition, *args, **kwargs):
    """Apply ``st.cache_data(*args, **kwargs)`` only when *condition* is true.

    When *condition* is false the decorated function is returned untouched,
    making the decorator a no-op outside the Streamlit app.
    """
    def decorator(func):
        if not condition:
            return func
        return st.cache_data(*args, **kwargs)(func)
    return decorator
60
+
61
+
62
def st_cache_resource_if(condition, *args, **kwargs):
    """Apply ``st.cache_resource(*args, **kwargs)`` only when *condition* is true.

    When *condition* is false the decorated function is returned untouched,
    making the decorator a no-op outside the Streamlit app.
    """
    def decorator(func):
        if not condition:
            return func
        return st.cache_resource(*args, **kwargs)(func)
    return decorator
69
+
70
+
71
# Whether Streamlit caching should be enabled.  Off by default; flipped on
# only when this module is imported from the Streamlit app entry point.
supported_cache = False


def stack_checker():
    """Enable caching when any caller frame belongs to clinfly_app_st.py."""
    global supported_cache
    for frame_info in inspect.stack():
        if os.path.basename(frame_info.filename) == "clinfly_app_st.py":
            supported_cache = True


stack_checker()