kyauy committed on
Commit
e82dc50
·
1 Parent(s): 34602ae

feat(streamlit): #1 Add ClinPhen

Browse files
clinphen_src/__pycache__/get_phenotypes_lf.cpython-38.pyc ADDED
Binary file (8.07 kB). View file
 
clinphen_src/data/hpo_synonyms.txt ADDED
The diff for this file is too large to render. See raw diff
 
clinphen_src/data/hpo_term_names.txt ADDED
The diff for this file is too large to render. See raw diff
 
clinphen_src/get_phenotypes_lf.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ from nltk.stem import WordNetLemmatizer
3
+ import re
4
+
5
# Default path (relative to the app working directory) of the bundled
# two-column TSV mapping HPO IDs to synonym strings.
HPO_SYN_MAP_FILE = "clinphen_src/data/hpo_synonyms.txt"
6
+
7
def getNames(filename="clinphen_src/data/hpo_term_names.txt"):
    """Load the HPO ID -> human-readable term-name map.

    Args:
        filename: tab-separated file with the HPO ID in column 0 and the
            term name in column 1. Defaults to the bundled data file
            (generalized from the original hard-coded path).

    Returns:
        dict mapping HPO ID strings to term-name strings.
    """
    returnMap = {}
    # Context manager closes the handle deterministically; the original
    # left the open() result to the garbage collector.
    with open(filename) as f:
        for line in f:
            lineData = line.strip().split("\t")
            returnMap[lineData[0]] = lineData[1]
    return returnMap
13
+
14
# Characters that terminate a clinical "point" (sentence-like unit) when
# they end a word. NOTE: the two bullet entries are the same character in
# Python 3; kept for byte-compatibility with the original list.
point_enders = [".", u'•', '•', ";", "\t"]

# Clause-breaking conjunctions that also end a point without punctuation.
_point_ending_words = frozenset(["but", "except", "however", "though"])

def end_of_point(word):
    """Return True if *word* ends the current point/sentence.

    A word ends a point when its last character is in ``point_enders`` or
    when it is a clause-breaking conjunction. Fix: an empty string now
    returns False instead of raising IndexError (callers filter empties,
    but the guard makes the helper safe on its own).
    """
    if not word:
        return False
    if word[-1] in point_enders:
        return True
    return word in _point_ending_words
24
+
25
# Characters that close a sub-point (comma-level clause) when ending a word.
subpoint_enders = [",", ":"]

def end_of_subpoint(word):
    """Return True when *word* closes a sub-clause (trailing ','/':' or the word 'and')."""
    closes_clause = word[-1] in subpoint_enders
    return True if (closes_clause or word == "and") else False
30
+
31
def string_to_record_linewise(medical_record):
    """Split the raw record text into its individual lines (on '\\n' only)."""
    lines = medical_record.split("\n")
    return lines
33
+
34
def load_medical_record_linewise(medical_record):
    """Segment the record line-by-line into sub-sentence groups.

    Only lines containing ':' are considered (typical "Header: text" EHR
    lines). Each such line is first cut into sentences (end_of_point),
    then each sentence is cut into sub-sentences (end_of_subpoint).

    Returns:
        A list with one entry per sentence; each entry is the list of that
        sentence's sub-sentences.
    """
    sentences = []
    for raw_line in string_to_record_linewise(medical_record):
        if ":" not in raw_line:
            continue
        buffered = []
        for token in raw_line.strip().split(" "):
            token = token.lower()
            if not token:
                continue
            buffered.append(token)
            if end_of_point(token):
                sentences.append(" ".join(buffered))
                buffered = []
        if buffered:
            sentences.append(" ".join(buffered))
    subsentence_sets = []
    for sentence in sentences:
        groups = []
        buffered = []
        for token in sentence.split(" "):
            token = token.lower()
            buffered.append(token)
            if end_of_subpoint(token):
                groups.append(" ".join(buffered))
                buffered = []
        if buffered:
            groups.append(" ".join(buffered))
        subsentence_sets.append(groups)
    return subsentence_sets
61
+
62
def string_to_record_nonlinewise(medical_record):
    """Flatten the record into one word list, ignoring empty lines."""
    nonempty_lines = [ln for ln in medical_record.split("\n") if len(ln) >= 1]
    return " ".join(nonempty_lines).split(" ")
68
+
69
def load_medical_record_subsentences(medical_record):
    """Segment the whole record (line structure ignored) into sub-sentence
    groups, then concatenate the line-wise segmentation on the end.

    Returns:
        A list of sub-sentence lists: first from the flattened record,
        followed by the output of load_medical_record_linewise.
    """
    sentences = []
    current = []
    for token in string_to_record_nonlinewise(medical_record):
        token = token.lower()
        if not token:
            continue
        current.append(token)
        if end_of_point(token):
            sentences.append(" ".join(current))
            current = []
    if current:
        sentences.append(" ".join(current))
    subsentence_sets = []
    for sentence in sentences:
        groups = []
        chunk = []
        for token in sentence.split(" "):
            token = token.lower()
            chunk.append(token)
            if end_of_subpoint(token):
                groups.append(" ".join(chunk))
                chunk = []
        if chunk:
            groups.append(" ".join(chunk))
        subsentence_sets.append(groups)
    return subsentence_sets + load_medical_record_linewise(medical_record)
94
+
95
+ #Checks the given sentence for any flags from the lists you indicate.
96
# Flag word lists: if any of these words (after lemma expansion) appears in
# the same sentence as a phenotype synonym, the hit is discarded (see the
# filtering loop in extract_phenotypes). Each list captures one reason to
# distrust a mention. The space-separated variants ("haven t", "don t")
# match contractions after punctuation is replaced with spaces.
negative_flags = ["no", "not", "none", "negative", "non", "never", "without", "denies", "haven't", "don't", "doesn't", "haven t", "don t", "doesn t", 'didn t']
# Mentions about relatives, not the patient. "<person>"/"<person" match
# placeholder tokens presumably inserted by the de-identification step —
# TODO confirm against the anonymizer's output format.
family_flags = ["<person>","<person","cousin", "parent", "mom", "mother", "dad", "father", "grandmother", "grandfather", "grandparent", "family", "brother", "sister", "sibling", "uncle", "aunt", "nephew", "niece", "son", "daughter", "grandchild"]
# Findings explicitly reported as normal.
healthy_flags = ["normal"]
# Sentences discussing a disease/gene in general rather than this patient.
disease_flags = ["associated", "gene", "recessive", "dominant", "variant", "cause", "literature", "individuals"]
# The following categories are currently empty placeholders, kept so the
# get_flags call sites stay uniform.
treatment_flags = []
history_flags = []
mild_flags = []
uncertain_flags = []
104
+
105
+
106
# Interchangeable wordings: every member of a group is treated as a synonym
# of every other member of the same group.
low_synonyms = {"low", "decreased", "decrease", "deficient", "deficiency", "deficit", "deficits", "reduce", "reduced", "lack", "lacking", "insufficient", "impairment", "impaired", "impair", "difficulty", "difficulties", "trouble"}
high_synonyms = {"high", "increased", "increase", "elevated", "elevate", "elevation"}
abnormal_synonyms = {"abnormal", "unusual", "atypical", "abnormality", "anomaly", "anomalies", "problem"}
common_synonyms = [
    low_synonyms,
    high_synonyms,
    abnormal_synonyms,
]

def synonym_lemmas(word):
    """Return every synonym-group member for *word* (empty set if none)."""
    matched = set()
    for group in common_synonyms:
        if word in group:
            matched.update(group)
    return matched
120
+
121
def custom_lemmas(word):
    """Generate heuristic singular/adjective/noun variants of *word*.

    Rule-of-thumb suffix rewrites for medical English and Latin/Greek
    plurals. Longer suffix rules only fire once the word is long enough
    (the early returns reproduce the original length gates exactly).
    """
    variants = set()
    if len(word) < 2:
        return variants
    last = word[-1]
    if last == "s":
        variants.add(word[:-1])            # cats -> cat
    if last == "i":
        variants.add(word[:-1] + "us")     # fungi -> fungus
    if last == "a":
        variants.add(word[:-1] + "um")     # septa -> septum
        variants.add(word[:-1] + "on")     # ganglia -> ganglion
    if len(word) < 3:
        return variants
    suffix2 = word[-2:]
    if suffix2 == "es":
        variants.add(word[:-2])            # boxes -> box
        variants.add(word[:-2] + "is")     # testes -> testis
    if suffix2 == "ic":
        variants.add(word[:-2] + "ia")     # microcephalic -> microcephalia
        variants.add(word[:-2] + "y")      # microcephalic -> microcephaly
    if suffix2 == "ly":
        variants.add(word[:-2])            # severely -> severe
    if suffix2 == "ed":
        variants.add(word[:-2])            # delayed -> delay
    if len(word) < 4:
        return variants
    suffix3 = word[-3:]
    if suffix3 == "ata":
        variants.add(word[:-2])            # stomata -> stoma
    if suffix3 == "ies":
        variants.add(word[:-3] + "y")      # anomalies -> anomaly
    if suffix3 == "ble":
        variants.add(word[:-2] + "ility")  # visible -> visibility
    if len(word) < 7:
        return variants
    if word[-6:] == "bility":
        variants.add(word[:-5] + "le")     # ability -> able
    if len(word) < 8:
        return variants
    if word[-7:] == "ication":
        variants.add(word[:-7] + "y")      # classification -> classify
        variants.add(word[:-7] + "ied")    # classification -> classified
    return variants
149
+
150
+
151
# Shared lemmatizer: the original constructed WordNetLemmatizer() on every
# call, which repeats setup work inside tight loops (lemmatize is called for
# every word of every subsentence and every synonym).
_wordnet_lemmatizer = None

def lemmatize(word):
    """Strip non-alphanumerics, lowercase, and WordNet-lemmatize *word*."""
    global _wordnet_lemmatizer
    if _wordnet_lemmatizer is None:
        _wordnet_lemmatizer = WordNetLemmatizer()
    word = re.sub('[^0-9a-zA-Z]+', '', word)
    word = word.lower()
    return _wordnet_lemmatizer.lemmatize(word)
155
+
156
def add_lemmas(wordSet):
    """Return *wordSet* plus WordNet, synonym-group, and heuristic lemmas of its words."""
    extra = set()
    for token in wordSet:
        wn_form = lemmatize(token)
        if len(wn_form) > 0:
            extra.add(wn_form)
        extra |= synonym_lemmas(token)
        extra |= custom_lemmas(token)
    return wordSet | extra
164
+
165
+
166
def get_flags(line, *flagsets):
    """Return every flag word that occurs in *line*.

    Both *line* (an iterable of words) and each flag list are expanded
    with add_lemmas before the membership test, so inflected forms match.
    """
    line_words = add_lemmas(set(line))
    hits = set()
    for flagset in flagsets:
        # Set intersection replaces the original per-word membership loop.
        hits |= add_lemmas(set(flagset)) & line_words
    return hits
174
+
175
def alphanum_only(wordSet):
    """Split each string on runs of non-alphanumerics and return all tokens.

    Fix: drop the empty strings the original produced whenever an input had
    a leading/trailing separator (e.g. "big heart," -> {"", "big", "heart"}).
    Such an empty token could become a required synonym token and cause
    spurious match misses on subsentences that happened not to contain "".
    """
    tokens = set()
    for word in wordSet:
        for token in re.sub('[^0-9a-zA-Z]+', ' ', word).split(" "):
            if token:
                tokens.add(token)
    return tokens
181
+
182
def load_mr_map(parsed_record):
    """Build an inverted index: word -> set of subsentence indices containing it.

    Args:
        parsed_record: sequence of word collections, one per subsentence.

    Returns:
        defaultdict(set) mapping each word to the indices where it occurs
        (missing words yield an empty set).
    """
    returnMap = defaultdict(set)
    # enumerate replaces the original's index-based range(len(...)) loop.
    for i, words in enumerate(parsed_record):
        for word in set(words):
            returnMap[word].add(i)
    return returnMap
188
+
189
def load_all_hpo_synonyms(filename=None):
    """Load HPO ID -> set of synonym strings from a two-column TSV file.

    Args:
        filename: path to the synonym TSV; defaults to HPO_SYN_MAP_FILE
            (resolved at call time via a None sentinel — behavior for
            existing callers is unchanged).

    Returns:
        defaultdict(set) mapping HPO ID -> synonym strings.
    """
    if filename is None:
        filename = HPO_SYN_MAP_FILE
    returnMap = defaultdict(set)
    # Context manager closes the handle promptly (original relied on GC).
    with open(filename) as f:
        for line in f:
            lineData = line.strip().split("\t")
            returnMap[lineData[0]].add(lineData[1])
    return returnMap
197
+
198
+
199
def sort_ids_by_occurrences_then_earliness(id_to_lines):
    """Order HPO IDs: most occurrences first, then earliest line, then ID.

    Args:
        id_to_lines: mapping HPO ID -> non-empty set of line indices.

    Returns:
        List of HPO IDs in ranked order.
    """
    return sorted(
        id_to_lines,
        key=lambda hpo: (-len(id_to_lines[hpo]), min(id_to_lines[hpo]), hpo),
    )
206
+
207
def extract_phenotypes(record, names, hpo_syn_file=HPO_SYN_MAP_FILE):
    """Extract HPO phenotypes mentioned in free-text *record*.

    Args:
        record: the medical record as plain text.
        names: dict of HPO ID -> phenotype name (see getNames).
        hpo_syn_file: path to the HPO synonym TSV.

    Returns:
        A TSV-formatted string (with header row) listing, for each matched
        HPO ID: the ID, its name, occurrence count, earliest match index,
        and one example sentence.
    """
    safe_ID_to_lines = defaultdict(set)
    medical_record = load_medical_record_subsentences(record)
    medical_record_subsentences = []
    medical_record_words = []
    medical_record_flags = []
    subsent_to_sentence = []
    for subsents in medical_record:
        # Flags are computed once per full sentence and shared by all of
        # its subsentences. join replaces the original += concatenation.
        whole_sentence = " ".join(subsents).strip()
        whole_sentence = re.sub('[^0-9a-zA-Z]+', ' ', whole_sentence)
        flags = get_flags(whole_sentence.split(" "), negative_flags, family_flags, healthy_flags, disease_flags, treatment_flags, history_flags, uncertain_flags, mild_flags)
        for subsent in subsents:
            medical_record_subsentences.append(subsent)
            subsent_to_sentence.append(whole_sentence)
            medical_record_words.append(add_lemmas(alphanum_only(set([subsent]))))
            medical_record_flags.append(flags)
    mr_map = load_mr_map(medical_record_words)
    syns = load_all_hpo_synonyms(hpo_syn_file)
    for hpoID in syns.keys():
        for syn in syns[hpoID]:
            syn = re.sub('[^0-9a-zA-Z]+', ' ', syn.lower())
            synTokens = alphanum_only(set([syn]))
            if len(synTokens) < 1: continue
            # A subsentence matches when it contains EVERY token of the
            # synonym: intersect the inverted-index entries token by token.
            lines = set(mr_map[list(synTokens)[0]])
            for token in synTokens:
                lines &= set(mr_map[token])
                if len(lines) < 1: break
            if len(lines) < 1: continue
            for i in lines:
                # Reject the hit if its sentence carries any flag word that
                # is not itself part of the synonym. (Removed the original's
                # unused `line = " ".join(...)` local.)
                flagged = False
                for flag in medical_record_flags[i]:
                    if flag not in synTokens:
                        flagged = True
                        break
                if flagged: continue
                safe_ID_to_lines[hpoID].add(i)
    safe_IDs = sort_ids_by_occurrences_then_earliness(safe_ID_to_lines)
    returnString = ["HPO ID\tPhenotype name\tNo. occurrences\tEarliness (lower = earlier)\tExample sentence"]
    for ID in safe_IDs:
        id_lines = safe_ID_to_lines[ID]
        # next(iter(...)) picks an arbitrary example line like the original
        # set.pop(), but without mutating the occurrence set mid-report.
        returnString.append("\t".join([ID, names[ID], str(len(id_lines)), str(min(id_lines)), subsent_to_sentence[next(iter(id_lines))]]))
    return "\n".join(returnString)
lf_app.py CHANGED
@@ -16,6 +16,8 @@ from presidio_analyzer.nlp_engine import NlpEngineProvider
16
  from presidio_anonymizer import AnonymizerEngine
17
  from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
18
  import subprocess
 
 
19
 
20
  # -- Set page config
21
  apptitle = "Linguo Franca"
@@ -33,6 +35,8 @@ st.sidebar.header(
33
 
34
  st.sidebar.markdown(
35
  """
 
 
36
  If any questions or suggestions, please contact: [kevin.yauy@chu-montpellier.fr](kevin.yauy@chu-montpellier.fr) and [lucas.gauthier@chu-lyon.fr](lucas.gauthier@chu-lyon.fr)
37
 
38
  Code source is available in GitHub:
@@ -51,6 +55,7 @@ st.sidebar.image(image_chu, caption=None, width=95)
51
  @st.cache_resource()
52
  def get_models():
53
  nltk.download("omw-1.4")
 
54
  stanza.download("fr")
55
  spacy_model_name = "en_core_web_lg"
56
  if not spacy.util.is_package(spacy_model_name):
@@ -450,19 +455,19 @@ def reformat_to_letter(text, _nlp):
450
 
451
  @st.cache_data()
452
  def convert_df(df):
453
- return df.to_csv(sep="\t").encode("utf-8")
454
 
455
 
456
  @st.cache_data()
457
  def add_biometrics(text, _nlp):
458
  cutsentence_with_biometrics = []
459
  cutsentence = []
 
460
  for sentence in _nlp.process(text).sentences:
461
  cutsentence.append(sentence.text)
462
  keep_element = ["cm", "kg", "qit", "qi"]
463
  for sentence in cutsentence:
464
  if any(ext in sentence.lower() for ext in keep_element):
465
- additional_terms = []
466
  if "SD" in sentence or "DS" in sentence:
467
  sentence = sentence.replace("DS", "SD")
468
  try:
@@ -546,7 +551,28 @@ def add_biometrics(text, _nlp):
546
  i for i in cutsentence_with_biometrics if i != "."
547
  ]
548
  return " ".join(cutsentence_with_biometrics_return), additional_terms
549
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
550
 
551
  models_status = get_models()
552
  nlp, marian_fr_en = get_nlp_marian()
@@ -635,19 +661,7 @@ if submit_button or st.session_state.load_state:
635
  with st.expander("See additional terms extracted with biometrics analysis"):
636
  st.write(additional_terms)
637
 
638
- with open("sample_translated_deindentified_biometrics.txt", "w") as f:
639
- f.write(MarianText_anonymized_reformat_biometrics)
640
-
641
- with open("extract_clinphen_patient.tsv", "w") as outfile:
642
- subprocess.run(
643
- [
644
- "clinphen",
645
- "sample_translated_deindentified_biometrics.txt",
646
- ],
647
- stdout=outfile,
648
- )
649
-
650
- clinphen = pd.read_csv("extract_clinphen_patient.tsv", sep="\t")
651
 
652
  clinphen_df = st.experimental_data_editor(
653
  clinphen, num_rows="dynamic", key="data_editor"
 
16
  from presidio_anonymizer import AnonymizerEngine
17
  from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
18
  import subprocess
19
+ from clinphen_src import get_phenotypes_lf
20
+
21
 
22
  # -- Set page config
23
  apptitle = "Linguo Franca"
 
35
 
36
  st.sidebar.markdown(
37
  """
38
+ Currently only working from :fr: to :gb:.
39
+
40
  If any questions or suggestions, please contact: [kevin.yauy@chu-montpellier.fr](kevin.yauy@chu-montpellier.fr) and [lucas.gauthier@chu-lyon.fr](lucas.gauthier@chu-lyon.fr)
41
 
42
  Code source is available in GitHub:
 
55
  @st.cache_resource()
56
  def get_models():
57
  nltk.download("omw-1.4")
58
+ nltk.download('wordnet')
59
  stanza.download("fr")
60
  spacy_model_name = "en_core_web_lg"
61
  if not spacy.util.is_package(spacy_model_name):
 
455
 
456
@st.cache_data()
def convert_df(df):
    """Serialize *df* as tab-separated UTF-8 bytes, without index or header row."""
    tsv = df.to_csv(sep="\t", index=False, header=None)
    return tsv.encode("utf-8")
459
 
460
 
461
  @st.cache_data()
462
  def add_biometrics(text, _nlp):
463
  cutsentence_with_biometrics = []
464
  cutsentence = []
465
+ additional_terms = []
466
  for sentence in _nlp.process(text).sentences:
467
  cutsentence.append(sentence.text)
468
  keep_element = ["cm", "kg", "qit", "qi"]
469
  for sentence in cutsentence:
470
  if any(ext in sentence.lower() for ext in keep_element):
 
471
  if "SD" in sentence or "DS" in sentence:
472
  sentence = sentence.replace("DS", "SD")
473
  try:
 
551
  i for i in cutsentence_with_biometrics if i != "."
552
  ]
553
  return " ".join(cutsentence_with_biometrics_return), additional_terms
554
@st.cache_data()
def main_function(inputStr):
    """Run ClinPhen extraction on *inputStr* and return the results as a DataFrame.

    Args:
        inputStr: translated, de-identified medical record text.

    Returns:
        pandas DataFrame with the ClinPhen columns (empty, but with the
        right columns, when nothing was extracted).

    Fixes vs. original: the loop variable `i` no longer shadows the
    header-skip flag (the old code worked only by accident), and the
    duplicated `return returnDf` is gone — the header row is simply
    sliced off instead of being skipped with a manual flag.
    """
    hpo_to_name = get_phenotypes_lf.getNames()
    returnString = get_phenotypes_lf.extract_phenotypes(inputStr, hpo_to_name)
    columns = ['HPO ID', 'Phenotype name', 'No. occurrences', 'Earliness (lower = earlier)', 'Example sentence']
    # First line of extract_phenotypes output is the TSV header: drop it.
    rows = [line.split('\t') for line in returnString.split('\n')[1:]]
    if len(rows) > 0:
        returnDf = pd.DataFrame(rows, columns=columns)
    else:
        returnDf = pd.DataFrame(columns=columns)
    return returnDf
576
 
577
  models_status = get_models()
578
  nlp, marian_fr_en = get_nlp_marian()
 
661
  with st.expander("See additional terms extracted with biometrics analysis"):
662
  st.write(additional_terms)
663
 
664
+ clinphen = main_function(MarianText_anonymized_reformat_biometrics)
 
 
 
 
 
 
 
 
 
 
 
 
665
 
666
  clinphen_df = st.experimental_data_editor(
667
  clinphen, num_rows="dynamic", key="data_editor"
pyproject.toml CHANGED
@@ -7,10 +7,12 @@ authors = ["kyauy <kevin.yauy@gmail.com>"]
7
  [tool.poetry.dependencies]
8
  python = ">=3.8.0,<3.12"
9
  pyhpo = "^3.1.3"
10
- clinphen = "^1.28"
11
  argostranslate = "^1.8.0"
12
  transformers = "^4.26.1"
 
13
  nltk = "^3.8.1"
 
 
14
 
15
  [tool.poetry.dev-dependencies]
16
  pytest = "^5.2"
 
7
  [tool.poetry.dependencies]
8
  python = ">=3.8.0,<3.12"
9
  pyhpo = "^3.1.3"
 
10
  argostranslate = "^1.8.0"
11
  transformers = "^4.26.1"
12
+ protobuf = "3.20.*"
13
  nltk = "^3.8.1"
14
+ six = "^1.16.0"
15
+ pandas = "^1.5.3"
16
 
17
  [tool.poetry.dev-dependencies]
18
  pytest = "^5.2"