feat(streamlit): #1 Add ClinPhen
Browse files
clinphen_src/__pycache__/get_phenotypes_lf.cpython-38.pyc
ADDED
|
Binary file (8.07 kB). View file
|
|
|
clinphen_src/data/hpo_synonyms.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
clinphen_src/data/hpo_term_names.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
clinphen_src/get_phenotypes_lf.py
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
from collections import defaultdict
from functools import lru_cache

from nltk.stem import WordNetLemmatizer
|
| 4 |
+
|
| 5 |
+
# Default location of the HPO ID -> synonym TSV shipped with the app;
# path is relative to the app's working directory.
HPO_SYN_MAP_FILE = "clinphen_src/data/hpo_synonyms.txt"
|
| 6 |
+
|
| 7 |
+
def getNames(filename="clinphen_src/data/hpo_term_names.txt"):
    """Load the HPO ID -> human-readable term-name map.

    :param filename: TSV file with one "HPO_ID<TAB>name" pair per line.
        Parameterized (backward-compatibly) to mirror
        load_all_hpo_synonyms's signature and allow testing.
    :return: dict mapping HPO ID strings to term names.
    """
    returnMap = {}
    # "with" guarantees the handle is closed (the original leaked it).
    with open(filename) as f:
        for line in f:
            lineData = line.strip().split("\t")
            returnMap[lineData[0]] = lineData[1]
    return returnMap
|
| 13 |
+
|
| 14 |
+
# Characters/words that terminate a sentence-level "point".
point_enders = [".", u'•', '•', ";", "\t"]

def end_of_point(word):
    """Return True when *word* closes a sentence-level point.

    A point ends on terminal punctuation (see point_enders) or on a
    contrasting conjunction, which starts a semantically separate clause.
    """
    if word[-1] in point_enders:
        return True
    return word in ("but", "except", "however", "though")
|
| 24 |
+
|
| 25 |
+
# Characters that terminate a clause-level "sub-point" within a sentence.
subpoint_enders = [",", ":"]

def end_of_subpoint(word):
    """Return True when *word* closes a sub-point (clause) of a sentence."""
    if word[-1] in subpoint_enders:
        return True
    return word == "and"
|
| 30 |
+
|
| 31 |
+
def string_to_record_linewise(medical_record):
    """Split the free-text record into its raw newline-delimited lines."""
    lines = medical_record.split("\n")
    return lines
|
| 33 |
+
|
| 34 |
+
def load_medical_record_linewise(medical_record):
    """Segment the record line-by-line into sets of sub-sentences.

    Only lines containing ":" (i.e. "Header: content" style lines) are kept.
    Each kept line is cut into sentences at point boundaries, then each
    sentence into sub-sentences at sub-point boundaries.

    :return: list of lists; each inner list holds one sentence's sub-sentences.
    """
    sentences = []
    for raw_line in string_to_record_linewise(medical_record):
        # Skip lines that do not look like "Header: content".
        if ":" not in raw_line:
            continue
        current = []
        for token in raw_line.strip().split(" "):
            token = token.lower()
            if not token:
                continue
            current.append(token)
            if end_of_point(token):
                sentences.append(" ".join(current))
                current = []
        # Flush any trailing words of the line as a final sentence.
        if current:
            sentences.append(" ".join(current))
    subsentence_sets = []
    for sentence in sentences:
        subsents = []
        chunk = []
        for token in sentence.split(" "):
            token = token.lower()
            chunk.append(token)
            if end_of_subpoint(token):
                subsents.append(" ".join(chunk))
                chunk = []
        if chunk:
            subsents.append(" ".join(chunk))
        subsentence_sets.append(subsents)
    return subsentence_sets
|
| 61 |
+
|
| 62 |
+
def string_to_record_nonlinewise(medical_record):
    """Flatten the record into one word list, ignoring blank lines entirely."""
    kept = [ln for ln in medical_record.split("\n") if len(ln) >= 1]
    return " ".join(kept).split(" ")
|
| 68 |
+
|
| 69 |
+
def load_medical_record_subsentences(medical_record):
    """Segment the whole record (ignoring line breaks) into sub-sentence sets.

    The record is treated as one stream of words, cut into sentences at point
    boundaries and into sub-sentences at sub-point boundaries. The line-wise
    segmentation of the same record is appended, so both views are searched.

    :return: list of lists; each inner list holds one sentence's sub-sentences.
    """
    sentences = []
    current = []
    for token in string_to_record_nonlinewise(medical_record):
        token = token.lower()
        if not token:
            continue
        current.append(token)
        if end_of_point(token):
            sentences.append(" ".join(current))
            current = []
    # Flush whatever trails after the last point boundary.
    if current:
        sentences.append(" ".join(current))
    subsentence_sets = []
    for sentence in sentences:
        subsents = []
        chunk = []
        for token in sentence.split(" "):
            token = token.lower()
            chunk.append(token)
            if end_of_subpoint(token):
                subsents.append(" ".join(chunk))
                chunk = []
        if chunk:
            subsents.append(" ".join(chunk))
        subsentence_sets.append(subsents)
    return subsentence_sets + load_medical_record_linewise(medical_record)
|
| 94 |
+
|
| 95 |
+
# Flag vocabularies: a phenotype hit is rejected when its sentence contains
# any of these words (unless the word is part of the matched synonym itself).
negative_flags = ["no", "not", "none", "negative", "non", "never", "without", "denies", "haven't", "don't", "doesn't", "haven t", "don t", "doesn t", 'didn t']
family_flags = ["<person>","<person","cousin", "parent", "mom", "mother", "dad", "father", "grandmother", "grandfather", "grandparent", "family", "brother", "sister", "sibling", "uncle", "aunt", "nephew", "niece", "son", "daughter", "grandchild"]
healthy_flags = ["normal"]
disease_flags = ["associated", "gene", "recessive", "dominant", "variant", "cause", "literature", "individuals"]
treatment_flags = []
history_flags = []
mild_flags = []
uncertain_flags = []


# Interchangeable word groups: any member stands in for any other member.
low_synonyms = set(["low", "decreased", "decrease", "deficient", "deficiency", "deficit", "deficits", "reduce", "reduced", "lack", "lacking", "insufficient", "impairment", "impaired", "impair", "difficulty", "difficulties", "trouble"])
high_synonyms = set(["high", "increased", "increase", "elevated", "elevate", "elevation"])
abnormal_synonyms = set(["abnormal", "unusual", "atypical", "abnormality", "anomaly", "anomalies", "problem"])
common_synonyms = [
    low_synonyms,
    high_synonyms,
    abnormal_synonyms
]

def synonym_lemmas(word):
    """Return every synonym of *word* from each synonym group it belongs to."""
    hits = [group for group in common_synonyms if word in group]
    return set().union(*hits)
|
| 120 |
+
|
| 121 |
+
def custom_lemmas(word):
    """Generate hand-rolled lemma variants for common English/Latin suffixes.

    The rules are ordered by minimum word length; each length guard returns
    early so short words are not over-stemmed. Note some rules intentionally
    trim a fixed number of characters (e.g. "ata" drops only two, turning
    "stomata" into "stoma").

    :return: set of candidate lemma strings (possibly empty).
    """
    lemmas = set()
    if len(word) < 2:
        return lemmas
    if word.endswith("s"):
        lemmas.add(word[:-1])
    if word.endswith("i"):
        lemmas.add(word[:-1] + "us")
    if word.endswith("a"):
        lemmas.update({word[:-1] + "um", word[:-1] + "on"})
    if len(word) < 3:
        return lemmas
    if word.endswith("es"):
        lemmas.update({word[:-2], word[:-2] + "is"})
    if word.endswith("ic"):
        lemmas.update({word[:-2] + "ia", word[:-2] + "y"})
    if word.endswith("ly"):
        lemmas.add(word[:-2])
    if word.endswith("ed"):
        lemmas.add(word[:-2])
    if len(word) < 4:
        return lemmas
    if word.endswith("ata"):
        lemmas.add(word[:-2])
    if word.endswith("ies"):
        lemmas.add(word[:-3] + "y")
    if word.endswith("ble"):
        lemmas.add(word[:-2] + "ility")
    if len(word) < 7:
        return lemmas
    if word.endswith("bility"):
        lemmas.add(word[:-5] + "le")
    if len(word) < 8:
        return lemmas
    if word.endswith("ication"):
        lemmas.update({word[:-7] + "y", word[:-7] + "ied"})
    return lemmas
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
@lru_cache(maxsize=1)
def _wordnet_lemmatizer():
    """Build the shared WordNetLemmatizer once.

    The original constructed a fresh WordNetLemmatizer on every call, which
    is wasteful inside the per-word loops that call lemmatize().
    """
    return WordNetLemmatizer()


def lemmatize(word):
    """Strip non-alphanumerics, lowercase, and WordNet-lemmatize *word*."""
    word = re.sub('[^0-9a-zA-Z]+', '', word)
    word = word.lower()
    return _wordnet_lemmatizer().lemmatize(word)
|
| 155 |
+
|
| 156 |
+
def add_lemmas(wordSet):
    """Expand *wordSet* with WordNet, synonym-group, and suffix-rule lemmas.

    :return: new set containing the original words plus all derived lemmas.
    """
    extra = set()
    for word in wordSet:
        stem = lemmatize(word)
        # lemmatize can yield "" for purely non-alphanumeric input; skip it.
        if stem:
            extra.add(stem)
        extra |= synonym_lemmas(word) | custom_lemmas(word)
    return wordSet | extra
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def get_flags(line, *flagsets):
    """Return every flag word (after lemma expansion) present in *line*.

    :param line: iterable of words from one sentence.
    :param flagsets: any number of flag-word lists to check against.
    :return: set of flag words (from the expanded flag sets) found in the line.
    """
    words = add_lemmas(set(line))
    found = set()
    for flagset in flagsets:
        # Intersection collects exactly the expanded flag words in the line.
        found |= add_lemmas(set(flagset)) & words
    return found
|
| 174 |
+
|
| 175 |
+
def alphanum_only(wordSet):
    """Tokenize each entry on runs of non-alphanumeric characters.

    Note: entries with leading/trailing punctuation produce an empty-string
    token (split(" ") keeps it), matching the original behavior.
    """
    tokens = set()
    for entry in wordSet:
        cleaned = re.sub('[^0-9a-zA-Z]+', ' ', entry)
        tokens.update(cleaned.split(" "))
    return tokens
|
| 181 |
+
|
| 182 |
+
def load_mr_map(parsed_record):
    """Invert the parsed record: word -> set of subsentence indices containing it.

    :param parsed_record: sequence of word collections, one per subsentence.
    :return: defaultdict(set) so unknown words yield an empty set.
    """
    word_to_lines = defaultdict(set)
    for idx, entry in enumerate(parsed_record):
        for w in set(entry):
            word_to_lines[w].add(idx)
    return word_to_lines
|
| 188 |
+
|
| 189 |
+
def load_all_hpo_synonyms(filename=HPO_SYN_MAP_FILE):
    """Load the HPO ID -> synonym-set map from a two-column TSV.

    :param filename: TSV file with one "HPO_ID<TAB>synonym" pair per line.
    :return: defaultdict(set) mapping each HPO ID to all of its synonyms.
    """
    returnMap = defaultdict(set)
    # "with" guarantees the file handle is closed (the original leaked it).
    with open(filename) as f:
        for line in f:
            lineData = line.strip().split("\t")
            returnMap[lineData[0]].add(lineData[1])
    return returnMap
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def sort_ids_by_occurrences_then_earliness(id_to_lines):
    """Order HPO IDs: most occurrences first, then earliest line, then ID.

    :param id_to_lines: dict mapping HPO ID -> set of line indices.
    :return: list of HPO IDs in ranked order.
    """
    def rank(hpoid):
        lines = id_to_lines[hpoid]
        # Negate the count so higher frequency sorts first.
        return (-len(lines), min(lines), hpoid)

    return sorted(id_to_lines.keys(), key=rank)
|
| 206 |
+
|
| 207 |
+
def extract_phenotypes(record, names, hpo_syn_file=HPO_SYN_MAP_FILE):
    """Extract HPO phenotypes from free-text *record*.

    :param record: medical-record text.
    :param names: dict mapping HPO IDs to term names (see getNames()).
    :param hpo_syn_file: TSV of HPO ID / synonym pairs.
    :return: TSV string — a header row, then one row per extracted phenotype:
        ID, name, occurrence count, earliness, example sentence.
    """
    # hpoID -> set of subsentence indices where it matched with no vetoing flag
    safe_ID_to_lines = defaultdict(set)
    medical_record = load_medical_record_subsentences(record)
    medical_record_subsentences = []
    medical_record_words = []
    medical_record_flags = []
    subsent_to_sentence = []
    for subsents in medical_record:
        # Rebuild the enclosing sentence so flags are detected sentence-wide.
        whole_sentence = ""
        for subsent in subsents: whole_sentence += subsent + " "
        whole_sentence = whole_sentence.strip()
        whole_sentence = re.sub('[^0-9a-zA-Z]+', ' ', whole_sentence)
        flags = get_flags(whole_sentence.split(" "), negative_flags, family_flags, healthy_flags, disease_flags, treatment_flags, history_flags, uncertain_flags, mild_flags)
        for subsent in subsents:
            medical_record_subsentences.append(subsent)
            subsent_to_sentence.append(whole_sentence)
            # Lemma-expanded token set for this subsentence.
            medical_record_words.append(add_lemmas(alphanum_only(set([subsent]))))
            # Every subsentence inherits its whole sentence's flags.
            medical_record_flags.append(flags)
    mr_map = load_mr_map(medical_record_words)
    syns = load_all_hpo_synonyms(hpo_syn_file)
    for hpoID in syns.keys():
        for syn in syns[hpoID]:
            syn = re.sub('[^0-9a-zA-Z]+', ' ', syn.lower())
            synTokens = alphanum_only(set([syn]))
            if len(synTokens) < 1: continue
            # Seed with one token's line set, then intersect with the rest:
            # a subsentence matches only if it contains ALL synonym tokens.
            firstToken = list(synTokens)[0]
            lines = set(mr_map[firstToken])
            for token in synTokens:
                lines &= set(mr_map[token])
                if len(lines) < 1: break
            if len(lines) < 1: continue
            for i in lines:
                line = " ".join(medical_record_words[i])  # NOTE(review): appears unused below
                flagged = False
                # Veto the hit if the sentence carries any flag word that is
                # not itself one of the matched synonym's tokens.
                for flag in medical_record_flags[i]:
                    if flag not in synTokens:
                        flagged = True
                        break
                if flagged: continue
                safe_ID_to_lines[hpoID].add(i)
    safe_IDs = sort_ids_by_occurrences_then_earliness(safe_ID_to_lines)
    returnString = ["HPO ID\tPhenotype name\tNo. occurrences\tEarliness (lower = earlier)\tExample sentence"]
    # NOTE(review): list items evaluate left-to-right, so the count and min are
    # computed before .pop() removes an arbitrary index to pick the example.
    for ID in safe_IDs: returnString.append("\t".join([ID, names[ID], str(len(safe_ID_to_lines[ID])), str(min(safe_ID_to_lines[ID])), subsent_to_sentence[safe_ID_to_lines[ID].pop()]]))
    return "\n".join(returnString)
|
lf_app.py
CHANGED
|
@@ -16,6 +16,8 @@ from presidio_analyzer.nlp_engine import NlpEngineProvider
|
|
| 16 |
from presidio_anonymizer import AnonymizerEngine
|
| 17 |
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
|
| 18 |
import subprocess
|
|
|
|
|
|
|
| 19 |
|
| 20 |
# -- Set page config
|
| 21 |
apptitle = "Linguo Franca"
|
|
@@ -33,6 +35,8 @@ st.sidebar.header(
|
|
| 33 |
|
| 34 |
st.sidebar.markdown(
|
| 35 |
"""
|
|
|
|
|
|
|
| 36 |
If any questions or suggestions, please contact: [kevin.yauy@chu-montpellier.fr](kevin.yauy@chu-montpellier.fr) and [lucas.gauthier@chu-lyon.fr](lucas.gauthier@chu-lyon.fr)
|
| 37 |
|
| 38 |
Code source is available in GitHub:
|
|
@@ -51,6 +55,7 @@ st.sidebar.image(image_chu, caption=None, width=95)
|
|
| 51 |
@st.cache_resource()
|
| 52 |
def get_models():
|
| 53 |
nltk.download("omw-1.4")
|
|
|
|
| 54 |
stanza.download("fr")
|
| 55 |
spacy_model_name = "en_core_web_lg"
|
| 56 |
if not spacy.util.is_package(spacy_model_name):
|
|
@@ -450,19 +455,19 @@ def reformat_to_letter(text, _nlp):
|
|
| 450 |
|
| 451 |
@st.cache_data()
|
| 452 |
def convert_df(df):
|
| 453 |
-
return df.to_csv(sep="\t").encode("utf-8")
|
| 454 |
|
| 455 |
|
| 456 |
@st.cache_data()
|
| 457 |
def add_biometrics(text, _nlp):
|
| 458 |
cutsentence_with_biometrics = []
|
| 459 |
cutsentence = []
|
|
|
|
| 460 |
for sentence in _nlp.process(text).sentences:
|
| 461 |
cutsentence.append(sentence.text)
|
| 462 |
keep_element = ["cm", "kg", "qit", "qi"]
|
| 463 |
for sentence in cutsentence:
|
| 464 |
if any(ext in sentence.lower() for ext in keep_element):
|
| 465 |
-
additional_terms = []
|
| 466 |
if "SD" in sentence or "DS" in sentence:
|
| 467 |
sentence = sentence.replace("DS", "SD")
|
| 468 |
try:
|
|
@@ -546,7 +551,28 @@ def add_biometrics(text, _nlp):
|
|
| 546 |
i for i in cutsentence_with_biometrics if i != "."
|
| 547 |
]
|
| 548 |
return " ".join(cutsentence_with_biometrics_return), additional_terms
|
| 549 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 550 |
|
| 551 |
models_status = get_models()
|
| 552 |
nlp, marian_fr_en = get_nlp_marian()
|
|
@@ -635,19 +661,7 @@ if submit_button or st.session_state.load_state:
|
|
| 635 |
with st.expander("See additional terms extracted with biometrics analysis"):
|
| 636 |
st.write(additional_terms)
|
| 637 |
|
| 638 |
-
|
| 639 |
-
f.write(MarianText_anonymized_reformat_biometrics)
|
| 640 |
-
|
| 641 |
-
with open("extract_clinphen_patient.tsv", "w") as outfile:
|
| 642 |
-
subprocess.run(
|
| 643 |
-
[
|
| 644 |
-
"clinphen",
|
| 645 |
-
"sample_translated_deindentified_biometrics.txt",
|
| 646 |
-
],
|
| 647 |
-
stdout=outfile,
|
| 648 |
-
)
|
| 649 |
-
|
| 650 |
-
clinphen = pd.read_csv("extract_clinphen_patient.tsv", sep="\t")
|
| 651 |
|
| 652 |
clinphen_df = st.experimental_data_editor(
|
| 653 |
clinphen, num_rows="dynamic", key="data_editor"
|
|
|
|
| 16 |
from presidio_anonymizer import AnonymizerEngine
|
| 17 |
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
|
| 18 |
import subprocess
|
| 19 |
+
from clinphen_src import get_phenotypes_lf
|
| 20 |
+
|
| 21 |
|
| 22 |
# -- Set page config
|
| 23 |
apptitle = "Linguo Franca"
|
|
|
|
| 35 |
|
| 36 |
st.sidebar.markdown(
|
| 37 |
"""
|
| 38 |
+
Currently only working from :fr: to :gb:.
|
| 39 |
+
|
| 40 |
If any questions or suggestions, please contact: [kevin.yauy@chu-montpellier.fr](kevin.yauy@chu-montpellier.fr) and [lucas.gauthier@chu-lyon.fr](lucas.gauthier@chu-lyon.fr)
|
| 41 |
|
| 42 |
Code source is available in GitHub:
|
|
|
|
| 55 |
@st.cache_resource()
|
| 56 |
def get_models():
|
| 57 |
nltk.download("omw-1.4")
|
| 58 |
+
nltk.download('wordnet')
|
| 59 |
stanza.download("fr")
|
| 60 |
spacy_model_name = "en_core_web_lg"
|
| 61 |
if not spacy.util.is_package(spacy_model_name):
|
|
|
|
| 455 |
|
| 456 |
@st.cache_data()
|
| 457 |
def convert_df(df):
|
| 458 |
+
return df.to_csv(sep="\t", index=False, header=None).encode("utf-8")
|
| 459 |
|
| 460 |
|
| 461 |
@st.cache_data()
|
| 462 |
def add_biometrics(text, _nlp):
|
| 463 |
cutsentence_with_biometrics = []
|
| 464 |
cutsentence = []
|
| 465 |
+
additional_terms = []
|
| 466 |
for sentence in _nlp.process(text).sentences:
|
| 467 |
cutsentence.append(sentence.text)
|
| 468 |
keep_element = ["cm", "kg", "qit", "qi"]
|
| 469 |
for sentence in cutsentence:
|
| 470 |
if any(ext in sentence.lower() for ext in keep_element):
|
|
|
|
| 471 |
if "SD" in sentence or "DS" in sentence:
|
| 472 |
sentence = sentence.replace("DS", "SD")
|
| 473 |
try:
|
|
|
|
| 551 |
i for i in cutsentence_with_biometrics if i != "."
|
| 552 |
]
|
| 553 |
return " ".join(cutsentence_with_biometrics_return), additional_terms
|
| 554 |
+
@st.cache_data()
def main_function(inputStr):
    """Run ClinPhen phenotype extraction on *inputStr* and return a DataFrame.

    :param inputStr: translated, de-identified medical-record text.
    :return: DataFrame with columns HPO ID / Phenotype name / No. occurrences /
        Earliness (lower = earlier) / Example sentence (empty if no hits).
    """
    columns = ['HPO ID', 'Phenotype name', 'No. occurrences',
               'Earliness (lower = earlier)', 'Example sentence']
    hpo_to_name = get_phenotypes_lf.getNames()
    returnString = get_phenotypes_lf.extract_phenotypes(inputStr, hpo_to_name)
    # extract_phenotypes returns a TSV string whose first line is the header:
    # skip it and split the remaining rows into cells. (The original tracked
    # the header with a flag `i` that was also reused as the inner loop
    # variable, and ended with a duplicated, unreachable `return`.)
    rows = [row.split('\t') for row in returnString.split('\n')[1:]]
    if len(rows) > 0:
        returnDf = pd.DataFrame(rows, columns=columns)
    else:
        returnDf = pd.DataFrame(columns=columns)
    return returnDf
|
| 576 |
|
| 577 |
models_status = get_models()
|
| 578 |
nlp, marian_fr_en = get_nlp_marian()
|
|
|
|
| 661 |
with st.expander("See additional terms extracted with biometrics analysis"):
|
| 662 |
st.write(additional_terms)
|
| 663 |
|
| 664 |
+
clinphen = main_function(MarianText_anonymized_reformat_biometrics)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 665 |
|
| 666 |
clinphen_df = st.experimental_data_editor(
|
| 667 |
clinphen, num_rows="dynamic", key="data_editor"
|
pyproject.toml
CHANGED
|
@@ -7,10 +7,12 @@ authors = ["kyauy <kevin.yauy@gmail.com>"]
|
|
| 7 |
[tool.poetry.dependencies]
|
| 8 |
python = ">=3.8.0,<3.12"
|
| 9 |
pyhpo = "^3.1.3"
|
| 10 |
-
clinphen = "^1.28"
|
| 11 |
argostranslate = "^1.8.0"
|
| 12 |
transformers = "^4.26.1"
|
|
|
|
| 13 |
nltk = "^3.8.1"
|
|
|
|
|
|
|
| 14 |
|
| 15 |
[tool.poetry.dev-dependencies]
|
| 16 |
pytest = "^5.2"
|
|
|
|
| 7 |
[tool.poetry.dependencies]
|
| 8 |
python = ">=3.8.0,<3.12"
|
| 9 |
pyhpo = "^3.1.3"
|
|
|
|
| 10 |
argostranslate = "^1.8.0"
|
| 11 |
transformers = "^4.26.1"
|
| 12 |
+
protobuf = "3.20.*"
|
| 13 |
nltk = "^3.8.1"
|
| 14 |
+
six = "^1.16.0"
|
| 15 |
+
pandas = "^1.5.3"
|
| 16 |
|
| 17 |
[tool.poetry.dev-dependencies]
|
| 18 |
pytest = "^5.2"
|