kyauy committed on
Commit
e82dc50
·
1 Parent(s): 34602ae

feat(streamlit): #1 Add ClinPhen

Browse files
clinphen_src/__pycache__/get_phenotypes_lf.cpython-38.pyc ADDED
Binary file (8.07 kB). View file
 
clinphen_src/data/hpo_synonyms.txt ADDED
The diff for this file is too large to render. See raw diff
 
clinphen_src/data/hpo_term_names.txt ADDED
The diff for this file is too large to render. See raw diff
 
clinphen_src/get_phenotypes_lf.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ from nltk.stem import WordNetLemmatizer
3
+ import re
4
+
5
# Default path (relative to the app working directory) of the bundled
# two-column TSV mapping HPO IDs to synonym strings.
HPO_SYN_MAP_FILE = "clinphen_src/data/hpo_synonyms.txt"
6
+
7
def getNames(filename="clinphen_src/data/hpo_term_names.txt"):
    """Load the HPO ID -> human-readable term-name map.

    Args:
        filename: tab-separated file with the HPO ID in column 0 and the
            term name in column 1. Defaults to the bundled data file
            (generalized from the original hard-coded path).

    Returns:
        dict mapping HPO ID strings to term-name strings.
    """
    returnMap = {}
    # Context manager closes the handle deterministically; the original
    # left the open() result to the garbage collector.
    with open(filename) as f:
        for line in f:
            lineData = line.strip().split("\t")
            returnMap[lineData[0]] = lineData[1]
    return returnMap
13
+
14
# Characters that terminate a clinical "point" (sentence-like unit) when
# they end a word. NOTE: the two bullet entries are the same character in
# Python 3; kept for byte-compatibility with the original list.
point_enders = [".", u'•', '•', ";", "\t"]

# Clause-breaking conjunctions that also end a point without punctuation.
_point_ending_words = frozenset(["but", "except", "however", "though"])

def end_of_point(word):
    """Return True if *word* ends the current point/sentence.

    A word ends a point when its last character is in ``point_enders`` or
    when it is a clause-breaking conjunction. Fix: an empty string now
    returns False instead of raising IndexError (callers filter empties,
    but the guard makes the helper safe on its own).
    """
    if not word:
        return False
    if word[-1] in point_enders:
        return True
    return word in _point_ending_words
24
+
25
# Characters that close a sub-point (comma-level clause) when ending a word.
subpoint_enders = [",", ":"]

def end_of_subpoint(word):
    """Return True when *word* closes a sub-clause (trailing ','/':' or the word 'and')."""
    closes_clause = word[-1] in subpoint_enders
    return True if (closes_clause or word == "and") else False
30
+
31
def string_to_record_linewise(medical_record):
    """Split the raw record text into its individual lines (on '\\n' only)."""
    lines = medical_record.split("\n")
    return lines
33
+
34
def load_medical_record_linewise(medical_record):
    """Segment the record line-by-line into sub-sentence groups.

    Only lines containing ':' are considered (typical "Header: text" EHR
    lines). Each such line is first cut into sentences (end_of_point),
    then each sentence is cut into sub-sentences (end_of_subpoint).

    Returns:
        A list with one entry per sentence; each entry is the list of that
        sentence's sub-sentences.
    """
    sentences = []
    for raw_line in string_to_record_linewise(medical_record):
        if ":" not in raw_line:
            continue
        buffered = []
        for token in raw_line.strip().split(" "):
            token = token.lower()
            if not token:
                continue
            buffered.append(token)
            if end_of_point(token):
                sentences.append(" ".join(buffered))
                buffered = []
        if buffered:
            sentences.append(" ".join(buffered))
    subsentence_sets = []
    for sentence in sentences:
        groups = []
        buffered = []
        for token in sentence.split(" "):
            token = token.lower()
            buffered.append(token)
            if end_of_subpoint(token):
                groups.append(" ".join(buffered))
                buffered = []
        if buffered:
            groups.append(" ".join(buffered))
        subsentence_sets.append(groups)
    return subsentence_sets
61
+
62
def string_to_record_nonlinewise(medical_record):
    """Flatten the record into one word list, ignoring empty lines."""
    nonempty_lines = [ln for ln in medical_record.split("\n") if len(ln) >= 1]
    return " ".join(nonempty_lines).split(" ")
68
+
69
def load_medical_record_subsentences(medical_record):
    """Segment the whole record (line structure ignored) into sub-sentence
    groups, then concatenate the line-wise segmentation on the end.

    Returns:
        A list of sub-sentence lists: first from the flattened record,
        followed by the output of load_medical_record_linewise.
    """
    sentences = []
    current = []
    for token in string_to_record_nonlinewise(medical_record):
        token = token.lower()
        if not token:
            continue
        current.append(token)
        if end_of_point(token):
            sentences.append(" ".join(current))
            current = []
    if current:
        sentences.append(" ".join(current))
    subsentence_sets = []
    for sentence in sentences:
        groups = []
        chunk = []
        for token in sentence.split(" "):
            token = token.lower()
            chunk.append(token)
            if end_of_subpoint(token):
                groups.append(" ".join(chunk))
                chunk = []
        if chunk:
            groups.append(" ".join(chunk))
        subsentence_sets.append(groups)
    return subsentence_sets + load_medical_record_linewise(medical_record)
94
+
95
+ #Checks the given sentence for any flags from the lists you indicate.
96
# Flag word lists: if any of these words (after lemma expansion) appears in
# the same sentence as a phenotype synonym, the hit is discarded (see the
# filtering loop in extract_phenotypes). Each list captures one reason to
# distrust a mention. The space-separated variants ("haven t", "don t")
# match contractions after punctuation is replaced with spaces.
negative_flags = ["no", "not", "none", "negative", "non", "never", "without", "denies", "haven't", "don't", "doesn't", "haven t", "don t", "doesn t", 'didn t']
# Mentions about relatives, not the patient. "<person>"/"<person" match
# placeholder tokens presumably inserted by the de-identification step —
# TODO confirm against the anonymizer's output format.
family_flags = ["<person>","<person","cousin", "parent", "mom", "mother", "dad", "father", "grandmother", "grandfather", "grandparent", "family", "brother", "sister", "sibling", "uncle", "aunt", "nephew", "niece", "son", "daughter", "grandchild"]
# Findings explicitly reported as normal.
healthy_flags = ["normal"]
# Sentences discussing a disease/gene in general rather than this patient.
disease_flags = ["associated", "gene", "recessive", "dominant", "variant", "cause", "literature", "individuals"]
# The following categories are currently empty placeholders, kept so the
# get_flags call sites stay uniform.
treatment_flags = []
history_flags = []
mild_flags = []
uncertain_flags = []
104
+
105
+
106
# Interchangeable wordings: every member of a group is treated as a synonym
# of every other member of the same group.
low_synonyms = {"low", "decreased", "decrease", "deficient", "deficiency", "deficit", "deficits", "reduce", "reduced", "lack", "lacking", "insufficient", "impairment", "impaired", "impair", "difficulty", "difficulties", "trouble"}
high_synonyms = {"high", "increased", "increase", "elevated", "elevate", "elevation"}
abnormal_synonyms = {"abnormal", "unusual", "atypical", "abnormality", "anomaly", "anomalies", "problem"}
common_synonyms = [
    low_synonyms,
    high_synonyms,
    abnormal_synonyms,
]

def synonym_lemmas(word):
    """Return every synonym-group member for *word* (empty set if none)."""
    matched = set()
    for group in common_synonyms:
        if word in group:
            matched.update(group)
    return matched
120
+
121
def custom_lemmas(word):
    """Generate heuristic singular/adjective/noun variants of *word*.

    Rule-of-thumb suffix rewrites for medical English and Latin/Greek
    plurals. Longer suffix rules only fire once the word is long enough
    (the early returns reproduce the original length gates exactly).
    """
    variants = set()
    if len(word) < 2:
        return variants
    last = word[-1]
    if last == "s":
        variants.add(word[:-1])            # cats -> cat
    if last == "i":
        variants.add(word[:-1] + "us")     # fungi -> fungus
    if last == "a":
        variants.add(word[:-1] + "um")     # septa -> septum
        variants.add(word[:-1] + "on")     # ganglia -> ganglion
    if len(word) < 3:
        return variants
    suffix2 = word[-2:]
    if suffix2 == "es":
        variants.add(word[:-2])            # boxes -> box
        variants.add(word[:-2] + "is")     # testes -> testis
    if suffix2 == "ic":
        variants.add(word[:-2] + "ia")     # microcephalic -> microcephalia
        variants.add(word[:-2] + "y")      # microcephalic -> microcephaly
    if suffix2 == "ly":
        variants.add(word[:-2])            # severely -> severe
    if suffix2 == "ed":
        variants.add(word[:-2])            # delayed -> delay
    if len(word) < 4:
        return variants
    suffix3 = word[-3:]
    if suffix3 == "ata":
        variants.add(word[:-2])            # stomata -> stoma
    if suffix3 == "ies":
        variants.add(word[:-3] + "y")      # anomalies -> anomaly
    if suffix3 == "ble":
        variants.add(word[:-2] + "ility")  # visible -> visibility
    if len(word) < 7:
        return variants
    if word[-6:] == "bility":
        variants.add(word[:-5] + "le")     # ability -> able
    if len(word) < 8:
        return variants
    if word[-7:] == "ication":
        variants.add(word[:-7] + "y")      # classification -> classify
        variants.add(word[:-7] + "ied")    # classification -> classified
    return variants
149
+
150
+
151
# Shared lemmatizer: the original constructed WordNetLemmatizer() on every
# call, which repeats setup work inside tight loops (lemmatize is called for
# every word of every subsentence and every synonym).
_wordnet_lemmatizer = None

def lemmatize(word):
    """Strip non-alphanumerics, lowercase, and WordNet-lemmatize *word*."""
    global _wordnet_lemmatizer
    if _wordnet_lemmatizer is None:
        _wordnet_lemmatizer = WordNetLemmatizer()
    word = re.sub('[^0-9a-zA-Z]+', '', word)
    word = word.lower()
    return _wordnet_lemmatizer.lemmatize(word)
155
+
156
def add_lemmas(wordSet):
    """Return *wordSet* plus WordNet, synonym-group, and heuristic lemmas of its words."""
    extra = set()
    for token in wordSet:
        wn_form = lemmatize(token)
        if len(wn_form) > 0:
            extra.add(wn_form)
        extra |= synonym_lemmas(token)
        extra |= custom_lemmas(token)
    return wordSet | extra
164
+
165
+
166
def get_flags(line, *flagsets):
    """Return every flag word that occurs in *line*.

    Both *line* (an iterable of words) and each flag list are expanded
    with add_lemmas before the membership test, so inflected forms match.
    """
    line_words = add_lemmas(set(line))
    hits = set()
    for flagset in flagsets:
        # Set intersection replaces the original per-word membership loop.
        hits |= add_lemmas(set(flagset)) & line_words
    return hits
174
+
175
def alphanum_only(wordSet):
    """Split each string on runs of non-alphanumerics and return all tokens.

    Fix: drop the empty strings the original produced whenever an input had
    a leading/trailing separator (e.g. "big heart," -> {"", "big", "heart"}).
    Such an empty token could become a required synonym token and cause
    spurious match misses on subsentences that happened not to contain "".
    """
    tokens = set()
    for word in wordSet:
        for token in re.sub('[^0-9a-zA-Z]+', ' ', word).split(" "):
            if token:
                tokens.add(token)
    return tokens
181
+
182
def load_mr_map(parsed_record):
    """Build an inverted index: word -> set of subsentence indices containing it.

    Args:
        parsed_record: sequence of word collections, one per subsentence.

    Returns:
        defaultdict(set) mapping each word to the indices where it occurs
        (missing words yield an empty set).
    """
    returnMap = defaultdict(set)
    # enumerate replaces the original's index-based range(len(...)) loop.
    for i, words in enumerate(parsed_record):
        for word in set(words):
            returnMap[word].add(i)
    return returnMap
188
+
189
def load_all_hpo_synonyms(filename=None):
    """Load HPO ID -> set of synonym strings from a two-column TSV file.

    Args:
        filename: path to the synonym TSV; defaults to HPO_SYN_MAP_FILE
            (resolved at call time via a None sentinel — behavior for
            existing callers is unchanged).

    Returns:
        defaultdict(set) mapping HPO ID -> synonym strings.
    """
    if filename is None:
        filename = HPO_SYN_MAP_FILE
    returnMap = defaultdict(set)
    # Context manager closes the handle promptly (original relied on GC).
    with open(filename) as f:
        for line in f:
            lineData = line.strip().split("\t")
            returnMap[lineData[0]].add(lineData[1])
    return returnMap
197
+
198
+
199
def sort_ids_by_occurrences_then_earliness(id_to_lines):
    """Order HPO IDs: most occurrences first, then earliest line, then ID.

    Args:
        id_to_lines: mapping HPO ID -> non-empty set of line indices.

    Returns:
        List of HPO IDs in ranked order.
    """
    return sorted(
        id_to_lines,
        key=lambda hpo: (-len(id_to_lines[hpo]), min(id_to_lines[hpo]), hpo),
    )
206
+
207
def extract_phenotypes(record, names, hpo_syn_file=HPO_SYN_MAP_FILE):
    """Extract HPO phenotypes mentioned in free-text *record*.

    Args:
        record: the medical record as plain text.
        names: dict of HPO ID -> phenotype name (see getNames).
        hpo_syn_file: path to the HPO synonym TSV.

    Returns:
        A TSV-formatted string (with header row) listing, for each matched
        HPO ID: the ID, its name, occurrence count, earliest match index,
        and one example sentence.
    """
    safe_ID_to_lines = defaultdict(set)
    medical_record = load_medical_record_subsentences(record)
    medical_record_subsentences = []
    medical_record_words = []
    medical_record_flags = []
    subsent_to_sentence = []
    for subsents in medical_record:
        # Flags are computed once per full sentence and shared by all of
        # its subsentences. join replaces the original += concatenation.
        whole_sentence = " ".join(subsents).strip()
        whole_sentence = re.sub('[^0-9a-zA-Z]+', ' ', whole_sentence)
        flags = get_flags(whole_sentence.split(" "), negative_flags, family_flags, healthy_flags, disease_flags, treatment_flags, history_flags, uncertain_flags, mild_flags)
        for subsent in subsents:
            medical_record_subsentences.append(subsent)
            subsent_to_sentence.append(whole_sentence)
            medical_record_words.append(add_lemmas(alphanum_only(set([subsent]))))
            medical_record_flags.append(flags)
    mr_map = load_mr_map(medical_record_words)
    syns = load_all_hpo_synonyms(hpo_syn_file)
    for hpoID in syns.keys():
        for syn in syns[hpoID]:
            syn = re.sub('[^0-9a-zA-Z]+', ' ', syn.lower())
            synTokens = alphanum_only(set([syn]))
            if len(synTokens) < 1: continue
            # A subsentence matches when it contains EVERY token of the
            # synonym: intersect the inverted-index entries token by token.
            lines = set(mr_map[list(synTokens)[0]])
            for token in synTokens:
                lines &= set(mr_map[token])
                if len(lines) < 1: break
            if len(lines) < 1: continue
            for i in lines:
                # Reject the hit if its sentence carries any flag word that
                # is not itself part of the synonym. (Removed the original's
                # unused `line = " ".join(...)` local.)
                flagged = False
                for flag in medical_record_flags[i]:
                    if flag not in synTokens:
                        flagged = True
                        break
                if flagged: continue
                safe_ID_to_lines[hpoID].add(i)
    safe_IDs = sort_ids_by_occurrences_then_earliness(safe_ID_to_lines)
    returnString = ["HPO ID\tPhenotype name\tNo. occurrences\tEarliness (lower = earlier)\tExample sentence"]
    for ID in safe_IDs:
        id_lines = safe_ID_to_lines[ID]
        # next(iter(...)) picks an arbitrary example line like the original
        # set.pop(), but without mutating the occurrence set mid-report.
        returnString.append("\t".join([ID, names[ID], str(len(id_lines)), str(min(id_lines)), subsent_to_sentence[next(iter(id_lines))]]))
    return "\n".join(returnString)
lf_app.py CHANGED
@@ -16,6 +16,8 @@ from presidio_analyzer.nlp_engine import NlpEngineProvider
16
  from presidio_anonymizer import AnonymizerEngine
17
  from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
18
  import subprocess
 
 
19
 
20
  # -- Set page config
21
  apptitle = "Linguo Franca"
@@ -33,6 +35,8 @@ st.sidebar.header(
33
 
34
  st.sidebar.markdown(
35
  """
 
 
36
  If any questions or suggestions, please contact: [kevin.yauy@chu-montpellier.fr](kevin.yauy@chu-montpellier.fr) and [lucas.gauthier@chu-lyon.fr](lucas.gauthier@chu-lyon.fr)
37
 
38
  Code source is available in GitHub:
@@ -51,6 +55,7 @@ st.sidebar.image(image_chu, caption=None, width=95)
51
  @st.cache_resource()
52
  def get_models():
53
  nltk.download("omw-1.4")
 
54
  stanza.download("fr")
55
  spacy_model_name = "en_core_web_lg"
56
  if not spacy.util.is_package(spacy_model_name):
@@ -450,19 +455,19 @@ def reformat_to_letter(text, _nlp):
450
 
451
  @st.cache_data()
452
  def convert_df(df):
453
- return df.to_csv(sep="\t").encode("utf-8")
454
 
455
 
456
  @st.cache_data()
457
  def add_biometrics(text, _nlp):
458
  cutsentence_with_biometrics = []
459
  cutsentence = []
 
460
  for sentence in _nlp.process(text).sentences:
461
  cutsentence.append(sentence.text)
462
  keep_element = ["cm", "kg", "qit", "qi"]
463
  for sentence in cutsentence:
464
  if any(ext in sentence.lower() for ext in keep_element):
465
- additional_terms = []
466
  if "SD" in sentence or "DS" in sentence:
467
  sentence = sentence.replace("DS", "SD")
468
  try:
@@ -546,7 +551,28 @@ def add_biometrics(text, _nlp):
546
  i for i in cutsentence_with_biometrics if i != "."
547
  ]
548
  return " ".join(cutsentence_with_biometrics_return), additional_terms
549
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
550
 
551
  models_status = get_models()
552
  nlp, marian_fr_en = get_nlp_marian()
@@ -635,19 +661,7 @@ if submit_button or st.session_state.load_state:
635
  with st.expander("See additional terms extracted with biometrics analysis"):
636
  st.write(additional_terms)
637
 
638
- with open("sample_translated_deindentified_biometrics.txt", "w") as f:
639
- f.write(MarianText_anonymized_reformat_biometrics)
640
-
641
- with open("extract_clinphen_patient.tsv", "w") as outfile:
642
- subprocess.run(
643
- [
644
- "clinphen",
645
- "sample_translated_deindentified_biometrics.txt",
646
- ],
647
- stdout=outfile,
648
- )
649
-
650
- clinphen = pd.read_csv("extract_clinphen_patient.tsv", sep="\t")
651
 
652
  clinphen_df = st.experimental_data_editor(
653
  clinphen, num_rows="dynamic", key="data_editor"
 
16
  from presidio_anonymizer import AnonymizerEngine
17
  from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
18
  import subprocess
19
+ from clinphen_src import get_phenotypes_lf
20
+
21
 
22
  # -- Set page config
23
  apptitle = "Linguo Franca"
 
35
 
36
  st.sidebar.markdown(
37
  """
38
+ Currently only working from :fr: to :gb:.
39
+
40
  If any questions or suggestions, please contact: [kevin.yauy@chu-montpellier.fr](kevin.yauy@chu-montpellier.fr) and [lucas.gauthier@chu-lyon.fr](lucas.gauthier@chu-lyon.fr)
41
 
42
  Code source is available in GitHub:
 
55
  @st.cache_resource()
56
  def get_models():
57
  nltk.download("omw-1.4")
58
+ nltk.download('wordnet')
59
  stanza.download("fr")
60
  spacy_model_name = "en_core_web_lg"
61
  if not spacy.util.is_package(spacy_model_name):
 
455
 
456
@st.cache_data()
def convert_df(df):
    """Serialize *df* as tab-separated UTF-8 bytes, without index or header row."""
    tsv = df.to_csv(sep="\t", index=False, header=None)
    return tsv.encode("utf-8")
459
 
460
 
461
  @st.cache_data()
462
  def add_biometrics(text, _nlp):
463
  cutsentence_with_biometrics = []
464
  cutsentence = []
465
+ additional_terms = []
466
  for sentence in _nlp.process(text).sentences:
467
  cutsentence.append(sentence.text)
468
  keep_element = ["cm", "kg", "qit", "qi"]
469
  for sentence in cutsentence:
470
  if any(ext in sentence.lower() for ext in keep_element):
 
471
  if "SD" in sentence or "DS" in sentence:
472
  sentence = sentence.replace("DS", "SD")
473
  try:
 
551
  i for i in cutsentence_with_biometrics if i != "."
552
  ]
553
  return " ".join(cutsentence_with_biometrics_return), additional_terms
554
@st.cache_data()
def main_function(inputStr):
    """Run ClinPhen extraction on *inputStr* and return the results as a DataFrame.

    Args:
        inputStr: translated, de-identified medical record text.

    Returns:
        pandas DataFrame with the ClinPhen columns (empty, but with the
        right columns, when nothing was extracted).

    Fixes vs. original: the loop variable `i` no longer shadows the
    header-skip flag (the old code worked only by accident), and the
    duplicated `return returnDf` is gone — the header row is simply
    sliced off instead of being skipped with a manual flag.
    """
    hpo_to_name = get_phenotypes_lf.getNames()
    returnString = get_phenotypes_lf.extract_phenotypes(inputStr, hpo_to_name)
    columns = ['HPO ID', 'Phenotype name', 'No. occurrences', 'Earliness (lower = earlier)', 'Example sentence']
    # First line of extract_phenotypes output is the TSV header: drop it.
    rows = [line.split('\t') for line in returnString.split('\n')[1:]]
    if len(rows) > 0:
        returnDf = pd.DataFrame(rows, columns=columns)
    else:
        returnDf = pd.DataFrame(columns=columns)
    return returnDf
576
 
577
  models_status = get_models()
578
  nlp, marian_fr_en = get_nlp_marian()
 
661
  with st.expander("See additional terms extracted with biometrics analysis"):
662
  st.write(additional_terms)
663
 
664
+ clinphen = main_function(MarianText_anonymized_reformat_biometrics)
 
 
 
 
 
 
 
 
 
 
 
 
665
 
666
  clinphen_df = st.experimental_data_editor(
667
  clinphen, num_rows="dynamic", key="data_editor"
pyproject.toml CHANGED
@@ -7,10 +7,12 @@ authors = ["kyauy <kevin.yauy@gmail.com>"]
7
  [tool.poetry.dependencies]
8
  python = ">=3.8.0,<3.12"
9
  pyhpo = "^3.1.3"
10
- clinphen = "^1.28"
11
  argostranslate = "^1.8.0"
12
  transformers = "^4.26.1"
 
13
  nltk = "^3.8.1"
 
 
14
 
15
  [tool.poetry.dev-dependencies]
16
  pytest = "^5.2"
 
7
  [tool.poetry.dependencies]
8
  python = ">=3.8.0,<3.12"
9
  pyhpo = "^3.1.3"
 
10
  argostranslate = "^1.8.0"
11
  transformers = "^4.26.1"
12
+ protobuf = "3.20.*"
13
  nltk = "^3.8.1"
14
+ six = "^1.16.0"
15
+ pandas = "^1.5.3"
16
 
17
  [tool.poetry.dev-dependencies]
18
  pytest = "^5.2"