GERNET Enody committed on
Add files via upload
Browse files- utilities/anonymize.py +394 -0
- utilities/convert.py +45 -0
- utilities/extract_hpo.py +120 -0
- utilities/get_model.py +48 -0
- utilities/translate.py +220 -0
- utilities/web_utilities.py +32 -1
utilities/anonymize.py
ADDED
|
@@ -0,0 +1,394 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import json
|
| 3 |
+
from unidecode import unidecode
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from presidio_anonymizer.entities import OperatorConfig
|
| 6 |
+
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
|
| 7 |
+
from presidio_analyzer.nlp_engine import NlpEngineProvider
|
| 8 |
+
from presidio_anonymizer import AnonymizerEngine
|
| 9 |
+
import streamlit as st
|
| 10 |
+
from .web_utilities import st_cache_data_if, st_cache_resource_if, supported_cache
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def anonymize_analyzer(MarianText_letter, _analyzer, proper_noun, Last_name, First_name):
    """Run the presidio analyzer over the translated letter and mark hits inline.

    Each detected entity is wrapped in streamlit-flavoured markdown:
    green for terms present in the ``proper_noun`` exception list (kept as-is),
    red for genuine identifying entities (queued for anonymization).

    Args:
        MarianText_letter: letter text (already translated to English).
        _analyzer: presidio ``AnalyzerEngine`` (underscore prefix keeps it out
            of the streamlit cache key).
        proper_noun: lower-cased list of terms that must never be anonymized.
        Last_name: patient last name, stored in each result record.
        First_name: patient first name, stored in each result record.

    Returns:
        tuple: (marked-up letter, entities to anonymize, kept-entity records,
        ignored-entity records).
    """
    MarianText_anonymize_letter = MarianText_letter
    # st.write(MarianText_anonymize_letter)
    analyzer_results_keep = []
    analyzer_results_return = []
    analyzer_results_saved = []
    analyzer_results = _analyzer.analyze(
        text=MarianText_letter,
        language="en",
        entities=["DATE_TIME", "PERSON", "FRENCH_CITY"],
        # Generic time/age words presidio tags as DATE_TIME but that carry no
        # identifying information.
        allow_list=[
            "evening",
            "day",
            "the day",
            "the age of",
            "age",
            "years",
            "week",
            "years old",
            "months",
            "hours",
            "night",
            "noon",
            "nights",
            "tomorrow",
            "today",
            "yesterday",
        ],
    )
    # Running offset: every inline markup insertion lengthens the text, so
    # later slice positions must be shifted by len_to_add.
    len_to_add = 0
    # Map analyzer-result index -> start offset, skipping results that share a
    # start position with an earlier one (duplicate detections).
    analyser_results_to_sort = {}
    i = 0
    detect_duplicated = []
    for element in analyzer_results:
        if element.start not in detect_duplicated:
            analyser_results_to_sort[i] = element.start
            detect_duplicated.append(element.start)
        else:
            pass
        i = i + 1
    # Process matches left-to-right so the running offset stays valid.
    sorted_tuples = sorted(analyser_results_to_sort.items(), key=lambda x: x[1])
    sorted_dict = {k: v for k, v in sorted_tuples}
    print(sorted_dict)
    # Substrings that mark a match as a harmless duration/age expression.
    exception_list_presidio = ["age", "year", "month", "day", "hour", "week"]

    for element_raw in sorted_dict:
        element = analyzer_results[element_raw]
        word = MarianText_letter[element.start : element.end]
        exception_detected = [e for e in exception_list_presidio if e in word.lower()]
        # Words containing exactly one "/" or more than two are also skipped —
        # presumably fractions/lab values vs. dd/mm/yyyy dates; TODO confirm.
        if word.count("/") == 1 or word.count("/") > 2:
            exception_detected.append("/ or ///")
        if len(exception_detected) == 0:
            if word.lower().strip() in proper_noun:
                # Known safe term (drug/gene/exception list): shown in green,
                # recorded but NOT queued for anonymization.
                word_to_replace = (
                    "**:green[" + word + "]** `[" + element.entity_type + "]`"
                )
                MarianText_anonymize_letter = (
                    MarianText_anonymize_letter[: element.start + len_to_add]
                    + word_to_replace
                    + MarianText_anonymize_letter[element.end + len_to_add :]
                )
                analyzer_results_saved.append(
                    {
                        "name": Last_name,
                        "surname": First_name,
                        "type": "deidentification",
                        "value": word,
                        "correction": element.entity_type,
                        "lf_detected": False,
                        "manual_validation": False,
                    }
                )
                # analyzer_results_saved.append(str(element) + ", word:" + word)
            else:
                # Genuine identifying entity: shown in red and queued for the
                # anonymizer engine via analyzer_results_return.
                word_to_replace = (
                    "**:red[" + word + "]** `[" + element.entity_type + "]`"
                )
                MarianText_anonymize_letter = (
                    MarianText_anonymize_letter[: element.start + len_to_add]
                    + word_to_replace
                    + MarianText_anonymize_letter[element.end + len_to_add :]
                )
                analyzer_results_keep.append(
                    {
                        "name": Last_name,
                        "surname": First_name,
                        "type": "deidentification",
                        "value": word,
                        "correction": element.entity_type,
                        "lf_detected": True,
                        "manual_validation": True,
                    }
                )
                # analyzer_results_keep.append(str(element) + ", word:" + word)
                analyzer_results_return.append(element)
            # Both branches inserted markup, so advance the offset.
            len_to_add = len_to_add + len(word_to_replace) - len(word)
        else:
            # Duration/age style match: logged as ignored, text untouched.
            analyzer_results_saved.append(
                {
                    "name": Last_name,
                    "surname": First_name,
                    "type": "deidentification",
                    "value": word,
                    "correction": element.entity_type,
                    "lf_detected": False,
                    "manual_validation": False,
                }
            )
            # analyzer_results_saved.append(str(element) + ", word:" + word)
    del analyzer_results
    del len_to_add
    del exception_list_presidio
    del analyser_results_to_sort
    del sorted_tuples
    del sorted_dict

    return (
        MarianText_anonymize_letter,
        analyzer_results_return,
        analyzer_results_keep,
        analyzer_results_saved,
    )
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def anonymize_engine(MarianText_letter, _analyzer_results_return, _engine, _nlp):
    """Erase the detected identifying entities and reflow the text.

    Args:
        MarianText_letter: letter text to anonymize.
        _analyzer_results_return: presidio analyzer results to act on.
        _engine: presidio ``AnonymizerEngine``.
        _nlp: stanza pipeline forwarded to ``reformat_to_report``.

    Returns:
        str: anonymized text, re-formatted as a report.
    """
    # Every target entity type is simply replaced by an empty string.
    operators = {
        entity: OperatorConfig("replace", {"new_value": ""})
        for entity in ("PERSON", "LOCATION", "FRENCH_CITY")
    }
    anonymized = _engine.anonymize(
        text=MarianText_letter,
        analyzer_results=_analyzer_results_return,
        operators=operators,
    )
    return reformat_to_report(anonymized.text, _nlp)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_comma(texte, _nlp):
    """Pad the last non-numeric comma of each sentence with spaces.

    The lookbehind/lookahead on ``\\d`` keep numeric separators intact, and
    ``(?!.*\\1)`` restricts the substitution to the sentence's final comma.

    Args:
        texte: text to process.
        _nlp: stanza pipeline used to split the text into sentences
            (underscore prefix keeps it out of the streamlit cache key).

    Returns:
        str: processed sentences re-joined with single spaces.
    """
    # Raw string: "\," and "\d" in a plain string are invalid escape
    # sequences (DeprecationWarning today, a SyntaxError in future Python).
    regex = r"(?<!\d)(,)(?!\d)(?!.*\1)"
    text_list = []
    for sentence in _nlp.process(texte).sentences:
        text_space = re.sub(regex, " , ", sentence.text.replace("\n", " "))
        # Collapse double spaces introduced by the substitution; the original
        # replace(" ", " ") was a no-op (the intended two-space pattern was
        # lost), defeating the "_no_db" (no double) intent.
        text_space_no_db = text_space.replace("  ", " ")
        text_list.append(text_space_no_db)
    return " ".join(text_list)
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_endpoint(texte, _nlp):
    """Pad the last non-numeric period of each sentence with spaces.

    The lookbehind/lookahead on ``\\d`` keep decimal points intact, and
    ``(?!.*\\1)`` restricts the substitution to the sentence's final period.

    Args:
        texte: text to process.
        _nlp: stanza pipeline used to split the text into sentences.

    Returns:
        str: processed sentences re-joined with single spaces.
    """
    # Raw string avoids the invalid "\d" escape of the original.
    regex = r"(?<!\d)(\.)(?!\d)(?!.*\1)"
    text_list = []
    for sentence in _nlp.process(texte).sentences:
        text_space = re.sub(regex, " . ", sentence.text.replace("\n", " "))
        # Collapse double spaces; the original single-space replace was a no-op.
        text_space_no_db = text_space.replace("  ", " ")
        text_list.append(text_space_no_db)
    return " ".join(text_list)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_leftp(texte, _nlp):
    """Pad the last non-numeric opening parenthesis of each sentence with spaces.

    Args:
        texte: text to process.
        _nlp: stanza pipeline used to split the text into sentences.

    Returns:
        str: processed sentences re-joined with single spaces.
    """
    # Raw string avoids the invalid "\d" escape of the original.
    regex = r"(?<!\d)(\()(?!\d)(?!.*\1)"
    text_list = []
    for sentence in _nlp.process(texte).sentences:
        text_space = re.sub(regex, " ( ", sentence.text.replace("\n", " "))
        # Collapse double spaces; the original single-space replace was a no-op.
        text_space_no_db = text_space.replace("  ", " ")
        text_list.append(text_space_no_db)
    return " ".join(text_list)
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_rightp(texte, _nlp):
    """Pad the last non-numeric closing parenthesis of each sentence with spaces.

    Args:
        texte: text to process.
        _nlp: stanza pipeline used to split the text into sentences.

    Returns:
        str: processed sentences re-joined with single spaces.
    """
    # Raw string avoids the invalid "\d" escape of the original.
    regex = r"(?<!\d)(\))(?!\d)(?!.*\1)"
    text_list = []
    for sentence in _nlp.process(texte).sentences:
        text_space = re.sub(regex, " ) ", sentence.text.replace("\n", " "))
        # Collapse double spaces; the original single-space replace was a no-op.
        text_space_no_db = text_space.replace("  ", " ")
        text_list.append(text_space_no_db)
    return " ".join(text_list)
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_stroph(texte, _nlp):
    """Pad the last non-numeric apostrophe of each sentence with spaces.

    Args:
        texte: text to process.
        _nlp: stanza pipeline used to split the text into sentences.

    Returns:
        str: processed sentences re-joined with single spaces.
    """
    # Raw string avoids the invalid "\d" escape of the original.
    regex = r"(?<!\d)(')(?!\d)(?!.*\1)"
    text_list = []
    for sentence in _nlp.process(texte).sentences:
        text_space = re.sub(regex, " ' ", sentence.text.replace("\n", " "))
        # Collapse double spaces; the original single-space replace was a no-op.
        text_space_no_db = text_space.replace("  ", " ")
        text_list.append(text_space_no_db)
    return " ".join(text_list)
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def add_space_to_comma_endpoint(texte, _nlp):
    """Apply the punctuation-padding passes (comma, period, parentheses) in order.

    Args:
        texte: text to process.
        _nlp: stanza pipeline forwarded to every pass.

    Returns:
        str: text with the targeted punctuation padded with spaces.
    """
    # Each pass feeds the next; the apostrophe pass is intentionally not
    # part of this pipeline.
    result = add_space_to_comma(texte, _nlp)
    result = add_space_to_endpoint(result, _nlp)
    result = add_space_to_leftp(result, _nlp)
    result = add_space_to_rightp(result, _nlp)
    return result
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_abbreviation_dict_correction():
    """Load the French abbreviation -> expansion mapping from disk.

    Returns:
        dict: abbreviation to expansion mapping parsed from
        ``data/fr_abbreviations.json``.
    """
    # Explicit encoding: the file holds accented French text and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open("data/fr_abbreviations.json", "r", encoding="utf-8") as infile:
        return json.load(infile)
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def reformat_to_report(text, _nlp):
    """Re-join sentences into report form, removing spaces before punctuation.

    Undoes the padding previously inserted around commas, periods,
    parentheses and apostrophes, then joins sentences with " \\n".

    Args:
        text: text to reformat.
        _nlp: stanza pipeline used to split the text into sentences.

    Returns:
        str: reformatted report text.
    """
    fixes = ((" ,", ","), (" .", "."), (" )", ")"), (" (", "("), (" '", "'"))
    cutsentence = []
    for sentence in _nlp.process(text).sentences:
        cleaned = sentence.text
        for spaced, tight in fixes:
            cleaned = cleaned.replace(spaced, tight)
        cutsentence.append(cleaned)
    return " \n".join(cutsentence)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_cities_list():
    """Build the deny-list of French city spellings for the FRENCH_CITY recognizer.

    For every city in ``data/proper_noun_location_sort.csv``, include the
    original spelling plus capitalized, upper-case and accent-stripped
    variants so matching works regardless of case or diacritics.

    Returns:
        list[str]: city spelling variants, six per source city.
    """
    cities = pd.read_csv("data/proper_noun_location_sort.csv")
    cities.columns = ["ville"]
    whole_cities_patterns = []
    for element in cities["ville"].to_list():
        # Hoisted: the original called unidecode(element) three times per city.
        ascii_element = unidecode(element)
        whole_cities_patterns.extend(
            [
                element,
                element.lower().capitalize(),
                element.upper(),
                ascii_element,
                ascii_element.lower().capitalize(),
                ascii_element.upper(),
            ]
        )
    return whole_cities_patterns
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_list_not_deidentify():
    """Assemble the lower-cased list of terms that must never be de-identified.

    Combines the manual exception list, drug names and gene names from the
    ``data/`` TSV files with a handful of hard-coded report tokens.

    Returns:
        list[str]: lower-cased terms to exclude from anonymization.
    """
    hardcoded_terms = [
        "PN",
        "TN",
        "SD",
        "PCN",
        "cher",
        "chere",
        "CAS",
        "INDEX",
        "APGAR",
        "M",
        "Ms",
        "Mr",
        "Behçet",
        "hypoacousia",
    ]
    exceptions = pd.read_csv(
        "data/exception_list_anonymization.tsv", sep="\t", header=None
    ).astype(str)
    drugs = pd.read_csv("data/drug_name.tsv", sep="\t", header=None).astype(str)
    genes = pd.read_csv("data/gene_name.tsv", sep="\t", header=None).astype(str)

    combined = (
        exceptions[0].to_list()
        + drugs[0].to_list()
        + genes[0].to_list()
        + hardcoded_terms
    )
    return [term.lower() for term in combined]
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def change_name_patient_abbreviations(Report, Last_name, First_name, abbreviations_dict):
    """Replace patient names and known abbreviations in a report.

    First names become "CAS", last names become "INDEX", and every key of
    ``abbreviations_dict`` (plus a few title abbreviations) is expanded.
    Each replacement is logged as a record for later manual review.

    Args:
        Report: report text to process.
        Last_name: patient last name (may contain several words).
        First_name: patient first name (may contain several words).
        abbreviations_dict: abbreviation -> expansion mapping.

    Returns:
        tuple: (corrected report text, list of replacement records).
    """
    dict_correction_name_abbreviations = {
        "M.": "M",
        "Mme.": "Mme",
        "Mlle.": "Mlle",
        "Dr.": "Docteur",
        "Dr": "Docteur",
        "Pr.": "Professeur",
        "Pr": "Professeur",
    }

    for firstname in First_name.split():
        dict_correction_name_abbreviations[firstname] = "CAS"
    for lastname in Last_name.split():
        dict_correction_name_abbreviations[lastname] = "INDEX"
    for key, value in abbreviations_dict.items():
        dict_correction_name_abbreviations[key] = value

    # Bug fix: i_check below is lower-cased, but the original compared it to
    # the raw Last_name/First_name, so whenever the names were capitalized the
    # record type was always "abbreviations" instead of "index_case".
    last_name_lower = Last_name.lower()
    first_name_lower = First_name.lower()

    list_replaced = []
    replaced_Report = []
    for i in Report.replace("\n", " ").split(" "):
        append_word = i
        replace_word = None
        to_replace = None
        for key, value in dict_correction_name_abbreviations.items():
            # Compare the word stripped of punctuation, case-insensitively.
            i_check = i.lower().strip().replace(",", "").replace(".", "")
            if i_check == key.lower().strip():
                to_replace = i.strip().replace(",", "").replace(".", "")
                replace_word = value
                record_type = (
                    "index_case"
                    if i_check == last_name_lower or i_check == first_name_lower
                    else "abbreviations"
                )
                list_replaced.append(
                    {
                        "name": Last_name,
                        "surname": First_name,
                        "type": record_type,
                        "value": to_replace,
                        "correction": value,
                        "lf_detected": True,
                        "manual_validation": True,
                    }
                )
        if replace_word:
            # Preserve surrounding punctuation by substituting inside the word.
            append_word = append_word.replace(to_replace, replace_word)
        replaced_Report.append(append_word)
    return " ".join(replaced_Report), list_replaced
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def config_deidentify(cities_list):
    """Build the presidio analyzer/anonymizer pair used for de-identification.

    Args:
        cities_list: deny-list of French city spellings for the custom
            FRENCH_CITY recognizer.

    Returns:
        tuple: (configured ``AnalyzerEngine``, ``AnonymizerEngine``).
    """
    # spaCy-backed NLP engine, English large model.
    nlp_engine = NlpEngineProvider(
        nlp_configuration={
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
        }
    ).create_engine()

    analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
    # Custom recognizer: flag any token present in the city deny-list.
    analyzer.registry.add_recognizer(
        PatternRecognizer(supported_entity="FRENCH_CITY", deny_list=cities_list)
    )
    return analyzer, AnonymizerEngine()
|
utilities/convert.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import streamlit as st
|
| 3 |
+
from .web_utilities import st_cache_data_if, supported_cache
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_df(df):
    """Serialize a dataframe to UTF-8 TSV bytes, dropping all-empty rows."""
    cleaned = df.dropna(how="all")
    tsv_text = cleaned.to_csv(sep="\t", index=False)
    return tsv_text.encode("utf-8")
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_df_no_header(df):
    """Serialize a dataframe to UTF-8 TSV bytes without a header row."""
    cleaned = df.dropna(how="all")
    tsv_text = cleaned.to_csv(sep="\t", index=False, header=None)
    return tsv_text.encode("utf-8")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_json(df):
    """Convert a phenotype dataframe to a JSON ``features`` payload.

    Args:
        df: dataframe with at least "HPO ID" and "Phenotype name" columns.

    Returns:
        str: JSON string ``{"features": [...]}``; the list is empty when no
        row has both an HPO ID and a phenotype name.
    """
    dict_return = {"features": []}
    df_check = df.dropna(subset=["HPO ID", "Phenotype name"])
    if len(df_check) > 0:
        # Bug fix: iterate the NaN-filtered frame (df_check), not the raw df —
        # the original built the payload from df, so rows missing an HPO ID or
        # name were still emitted as null features.
        df_dict_list = df_check[["HPO ID", "Phenotype name"]].to_dict(orient="index")
        for value in df_dict_list.values():
            dict_return["features"].append(
                {
                    "id": value["HPO ID"],
                    "observed": "yes",
                    "label": value["Phenotype name"],
                    "type": "phenotype",
                }
            )
    return json.dumps(dict_return)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_list_phenogenius(df):
    """Return a comma-separated string of HPO IDs, or a placeholder message."""
    valid_rows = df.dropna(subset=["HPO ID", "Phenotype name"])
    if len(valid_rows) == 0:
        return "No HPO in letters."
    return ",".join(valid_rows["HPO ID"].to_list())
|
| 45 |
+
|
utilities/extract_hpo.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from clinphen_src import get_phenotypes_lf
|
| 3 |
+
import streamlit as st
|
| 4 |
+
from .web_utilities import st_cache_data_if, supported_cache
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def add_biometrics(text, _nlp):
    """Append interpreted biometric findings to sentences containing measurements.

    Sentences mentioning cm/kg/IQ are parsed with ad-hoc regexes for weight,
    height and head-circumference standard deviations and for IQ scores; the
    corresponding HPO-style phrases (e.g. "Tall stature", "Microcephaly") are
    appended as "This means ..." so the downstream extractor can pick them up.

    Args:
        text: report text.
        _nlp: stanza pipeline used to split the text into sentences
            (underscore prefix keeps it out of the streamlit cache key).

    Returns:
        tuple: (augmented text, list of interpreted terms).
    """
    cutsentence_with_biometrics = []
    cutsentence = []
    additional_terms = []
    for sentence in _nlp.process(text).sentences:
        cutsentence.append(sentence.text)
    # Only sentences containing one of these markers are inspected.
    keep_element = ["cm", "kg", "qit", "qi"]
    for sentence in cutsentence:
        if any(ext in sentence.lower() for ext in keep_element):
            if "SD" in sentence or "DS" in sentence:
                # Normalize the French "DS" to "SD" before parsing.
                sentence = sentence.replace("DS", "SD")
                # --- weight: number in parentheses between "kg" and "sd" ---
                try:
                    kg_sd = re.findall("kg(.*?)sd", sentence.lower())[0]
                    num_kg_sd = re.findall("\(\s*([-+].?\d+(?:\.\d+)?)\s*", kg_sd)[0]
                    # print(kg_sd)
                    kg_sd = float(num_kg_sd)
                    print(kg_sd)
                    # +/- 2 SD thresholds for abnormal weight.
                    if kg_sd >= 2:
                        additional_terms.append("Increased body weight")
                    if kg_sd <= -2:
                        additional_terms.append("Decreased body weight")
                except:
                    # Best-effort parsing: any failure just skips the metric.
                    print("Incorrect weight recognition pattern")
                    print(sentence)
                # --- height: SD value after "is"/"measure" ---
                try:
                    if "is" in sentence.lower():
                        height_sd_alpha = re.findall("\ is(.*?)d", sentence.lower())[0]
                        if "cm" not in height_sd_alpha:
                            height_sd_raw = height_sd_alpha
                    if "easure" in sentence.lower():
                        # "easure" matches both "measure" and "measured".
                        height_sd_raw = re.findall("easure(.*?)d", sentence.lower())[0]
                    print(height_sd_raw)
                    height_sd = re.findall("m(.*?)s", height_sd_raw)[0]
                    print(height_sd)
                    num_height_sd = re.findall(
                        "\(\s*([-+].?\d+(?:\.\d+)?)\s*", height_sd
                    )[0]
                    height_sd = float(num_height_sd)
                    print(height_sd)
                    if height_sd >= 2:
                        additional_terms.append("Tall stature")
                    if height_sd <= -2:
                        additional_terms.append("Short stature")
                except:
                    # NOTE(review): height_sd_raw may be unbound when neither
                    # "is" nor "easure" matched; the bare except hides that.
                    print("Incorrect height recognition pattern")
                    print(sentence)
                # --- head circumference: SD value after "head" ---
                try:
                    pc_sd_raw = (
                        re.findall("head(.*?)d", sentence.lower())[0]
                        .replace("(", "")
                        .replace(")", "")
                        .replace(" ", "")
                    )
                    pc_sd = re.findall("cm(.*?)s", pc_sd_raw)[0]
                    num_pc_sd = re.findall("\(\s*([-+].?\d+(?:\.\d+)?)\s*", pc_sd)[0]
                    pc_sd = float(num_pc_sd)
                    print(pc_sd)
                    if pc_sd >= 2:
                        additional_terms.append("Macrocephaly")
                    elif pc_sd <= -2:
                        additional_terms.append("Microcephaly")
                except:
                    print("Incorrect head circumference recognition pattern")
                    print(sentence)
                print(additional_terms)
            # --- IQ: first number after "iq", mapped to ICD severity bands ---
            if "FSIQ" in sentence or "IQ" in sentence:
                try:
                    iq_score = re.findall("iq.*?(\d.*?)\D", sentence.lower())[0]
                    iq_score = float(iq_score)
                    print(iq_score)
                    if iq_score >= 70 and iq_score < 84:
                        additional_terms.append("Intellectual disability, borderline")
                    elif iq_score >= 50 and iq_score < 69:
                        additional_terms.append("Intellectual disability, mild")
                    elif iq_score >= 35 and iq_score < 49:
                        additional_terms.append("Intellectual disability, moderate")
                    elif iq_score >= 20 and iq_score < 34:
                        additional_terms.append("Intellectual disability, severe")
                    elif iq_score < 20:
                        additional_terms.append("Intellectual disability, profound")
                    print(additional_terms)
                except:
                    print("Incorrect IQ recognition pattern")
                    print(sentence)
            # NOTE(review): additional_terms accumulates across sentences, so
            # earlier findings are repeated in every later "This means" suffix
            # — presumably tolerated downstream; confirm.
            cutsentence_with_biometrics.append(
                sentence + " This means " + ", ".join(additional_terms) + "."
            )
        else:
            cutsentence_with_biometrics.append(sentence)
    print(cutsentence_with_biometrics)
    # Drop the "This means ." artifacts produced when no term was found.
    cutsentence_with_biometrics_return = [
        i for i in cutsentence_with_biometrics if i != "."
    ]
    del cutsentence_with_biometrics
    del cutsentence
    del keep_element
    return " ".join(cutsentence_with_biometrics_return), additional_terms
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def extract_hpo(inputStr):
    """Run ClinPhen phenotype extraction on a text.

    Args:
        inputStr: English clinical text.

    Returns:
        tuple: (dataframe of safe phenotype hits, dataframe of unsafe hits).
    """
    hpo_to_name = get_phenotypes_lf.getNames()
    safe_str, unsafe_str = get_phenotypes_lf.extract_phenotypes(
        inputStr, hpo_to_name
    )
    safe_df = get_phenotypes_lf.get_dataframe_from_clinphen(safe_str)
    unsafe_df = get_phenotypes_lf.get_dataframe_from_clinphen(unsafe_str)
    return safe_df, unsafe_df
|
utilities/get_model.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import stanza
|
| 2 |
+
import nltk
|
| 3 |
+
import os
|
| 4 |
+
import spacy
|
| 5 |
+
import streamlit as st
|
| 6 |
+
from .web_utilities import st_cache_resource_if, supported_cache
|
| 7 |
+
from .translate import Translator
|
| 8 |
+
|
| 9 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_models(langue, output=os.path.expanduser("~")):
    """Download and cache every model needed for language *langue*.

    Fetches the stanza resources and the Marian translation model for the
    language, the NLTK wordnet corpora, and the spaCy English model.

    Args:
        langue: source language code (e.g. "fr", "de").
        output: base directory for the downloaded resources
            (default: the user's home directory).
    """
    # The original if/elif/else executed identical statements for every
    # language, so the branches are collapsed into one unconditional path.
    stanza.download(langue, dir=os.path.join(output, "stanza_resources"))
    Translator(langue, "en")

    nltk_dir = os.path.join(output, "nltk_data")
    if nltk_dir not in nltk.data.path:
        nltk.data.path.append(nltk_dir)
    for corpus in ("omw-1.4", "wordnet"):
        try:
            # Bug fix: nltk.data.find expects a "corpora/<name>" resource URL;
            # the bare name always raised LookupError, re-downloading each run.
            nltk.data.find("corpora/" + corpus)
        except LookupError:
            nltk.download(corpus, download_dir=nltk_dir)

    spacy_model_name = "en_core_web_lg"
    try:
        nlp = spacy.load(os.path.join(output, spacy_model_name))
        print(spacy_model_name + " already downloaded")
    except OSError:
        # Not cached locally yet: download, then persist under `output`.
        spacy.cli.download(spacy_model_name)
        nlp = spacy.load(spacy_model_name)
        nlp.to_disk(os.path.join(output, spacy_model_name))
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_nlp_marian(source_lang):
    """Build the sentence tokenizer and Marian translator for *source_lang*.

    Returns:
        tuple: (stanza tokenize pipeline, Translator to English).
    """
    sentence_pipeline = stanza.Pipeline(source_lang, processors="tokenize")
    translator = Translator(source_lang, "en")
    return sentence_pipeline, translator
|
| 47 |
+
|
| 48 |
+
|
utilities/translate.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from typing import Dict, List, Sequence
|
| 3 |
+
import stanza
|
| 4 |
+
import transformers
|
| 5 |
+
import json
|
| 6 |
+
import streamlit as st
|
| 7 |
+
from .web_utilities import st_cache_data_if, st_cache_resource_if, supported_cache
|
| 8 |
+
from .anonymize import add_space_to_comma_endpoint, change_name_patient_abbreviations
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass(frozen=True)
class SentenceBoundary:
    """A sentence plus the inter-sentence text that precedes it."""

    # Sentence text; empty for the trailing boundary of a document.
    text: str
    # Raw text (typically whitespace) between the previous sentence and this one.
    prefix: str

    def __str__(self):
        """Reconstruct the original span: prefix immediately followed by text."""
        return f"{self.prefix}{self.text}"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
class SentenceBoundaries:
    """Ordered collection of SentenceBoundary items covering a whole document.

    NOTE(review): @dataclass on a class with a hand-written __init__ and no
    field declarations still generates __eq__ over zero fields, so ALL
    instances compare equal — confirm this is intended.
    """

    def __init__(self) -> None:
        # Backing list; the trailing item holds the text after the last sentence.
        self._sentence_boundaries: List[SentenceBoundary] = []

    @property
    def sentence_boundaries(self):
        """Read-only view of the backing list (mutated in place by from_doc)."""
        return self._sentence_boundaries

    def update_sentence_boundaries(
        self, sentence_boundaries_list: List[SentenceBoundary]
    ):
        """Replace the backing list wholesale; returns self for chaining."""
        self._sentence_boundaries = sentence_boundaries_list
        return self

    def from_doc(self, doc: stanza.Document):
        """Populate from a stanza document, preserving inter-sentence text.

        Each boundary stores the sentence plus whatever raw text (whitespace)
        separated it from the previous one, so str(self) rebuilds doc.text.
        Returns self for chaining.
        """
        start_idx = 0
        for sent in doc.sentences:
            self.sentence_boundaries.append(
                SentenceBoundary(
                    text=sent.text,
                    prefix=doc.text[start_idx : sent.tokens[0].start_char],
                )
            )
            start_idx = sent.tokens[-1].end_char
        # Trailing boundary: text after the final sentence (e.g. newline).
        self.sentence_boundaries.append(
            SentenceBoundary(text="", prefix=doc.text[start_idx:])
        )
        return self

    @property
    def nonempty_sentences(self) -> List[str]:
        """Sentence texts only, skipping the empty trailing boundary."""
        return [item.text for item in self.sentence_boundaries if item.text]

    def map_sentence_boundaries(self, d: Dict[str, str]) -> List:
        """Return a new SentenceBoundaries with each sentence mapped through *d*.

        Sentences absent from *d* (e.g. untranslated) are kept unchanged;
        prefixes are preserved so the document layout survives translation.
        """
        return SentenceBoundaries().update_sentence_boundaries(
            [
                SentenceBoundary(text=d.get(sb.text, sb.text), prefix=sb.prefix)
                for sb in self.sentence_boundaries
            ]
        )

    def __str__(self) -> str:
        """Reassemble the full document text from prefixes and sentences."""
        return "".join(map(str, self.sentence_boundaries))
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def minibatch(seq, size):
    """Yield successive chunks of *seq*, each holding at most *size* items.

    Fix: the st_cache_resource_if decorator was removed. Caching a
    generator function is incorrect — the cached generator object is
    exhausted after its first consumption, so every later cache hit
    would silently yield nothing.
    """
    items = []
    for x in seq:
        items.append(x)
        if len(items) >= size:
            yield items
            items = []
    if items:  # flush the final, possibly short, chunk
        yield items
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# @dataclass(frozen=True)
class Translator:
    """Sentence-level machine translator built on a MarianMT checkpoint.

    Splits input texts into sentences with a stanza pipeline, translates
    the unique sentences in batches, then splices the translations back
    into each document's original layout via SentenceBoundaries.
    """

    def __init__(self, source_lang: str, dest_lang: str, use_gpu: bool = False) -> None:
        """Load the Helsinki-NLP Marian model/tokenizer pair and a sentencizer.

        source_lang / dest_lang: language codes used to select the
        "Helsinki-NLP/opus-mt-<src>-<dst>" checkpoint.
        use_gpu: forwarded to the stanza pipeline only; the Marian model
        stays on CPU (the GPU path below is commented out).
        """
        # self.use_gpu = use_gpu
        self.model_name = "Helsinki-NLP/opus-mt-" + source_lang + "-" + dest_lang
        self.model = transformers.MarianMTModel.from_pretrained(self.model_name)
        # if use_gpu:
        #     self.model = self.model.cuda()
        self.tokenizer = transformers.MarianTokenizer.from_pretrained(self.model_name)
        self.sentencizer = stanza.Pipeline(
            source_lang, processors="tokenize", verbose=False, use_gpu=use_gpu
        )

    def sentencize(self, texts: Sequence[str]) -> List[SentenceBoundaries]:
        """Split each text into SentenceBoundaries via the stanza sentencizer."""
        return [
            SentenceBoundaries().from_doc(doc=self.sentencizer.process(text))
            for text in texts
        ]

    def translate(
        self, texts: Sequence[str], batch_size: int = 10, truncation=True
    ) -> Sequence[str]:
        """Translate a sequence of texts, preserving each text's layout.

        Raises ValueError when a bare string is passed: a string would
        otherwise be iterated character by character.
        """
        if isinstance(texts, str):
            raise ValueError("Expected a sequence of texts")
        text_sentences = self.sentencize(texts)
        # Deduplicate sentences across all texts: each unique sentence is
        # translated once and reused everywhere it occurs.
        translations = {
            sent: None for text in text_sentences for sent in text.nonempty_sentences
        }

        # Sorting by length groups similar-length sentences into the same
        # batch, which reduces padding waste in the tokenizer.
        for text_batch in minibatch(
            sorted(translations, key=len, reverse=True), batch_size
        ):
            tokens = self.tokenizer(
                text_batch, return_tensors="pt", padding=True, truncation=truncation
            )
            # if self.use_gpu:
            #     tokens = {k:v.cuda() for k, v in tokens.items()}
            translate_tokens = self.model.generate(**tokens)
            translate_batch = [
                self.tokenizer.decode(t, skip_special_tokens=True)
                for t in translate_tokens
            ]
            for text, translated in zip(text_batch, translate_batch):
                translations[text] = translated

        # Map each original sentence to its translation and re-join with
        # the original inter-sentence prefixes.
        return [
            str(text.map_sentence_boundaries(translations)) for text in text_sentences
        ]
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
def translate_report(
    Report, Last_name, First_name, _nlp, _marian_fr_en, dict_correction, abbreviation_dict
):
    """Anonymize, translate, and post-correct a clinical report.

    Pipeline: replace patient name/abbreviations, translate with Marian,
    normalize spacing around punctuation, then apply dictionary-based
    corrections.

    Returns (corrected_text, correction_log, abbreviation_name_log).
    """
    renamed_report, abb_name_log = change_name_patient_abbreviations(
        Report, Last_name, First_name, abbreviation_dict
    )
    spaced_translation = add_space_to_comma_endpoint(
        translate_marian(renamed_report, _nlp, _marian_fr_en), _nlp
    )
    corrected_text, correction_log = correct_marian(
        spaced_translation, dict_correction, Last_name, First_name
    )
    return corrected_text, correction_log, abb_name_log
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def translate_marian(Report_name, _nlp, _marian_fr_en):
    """Translate a report sentence-by-sentence and join with newlines.

    The stanza pipeline (_nlp) segments the text; the Marian translator
    (_marian_fr_en) translates the resulting sentence list in one call.
    """
    sentences = [sentence.text for sentence in _nlp.process(Report_name).sentences]
    return "\n".join(_marian_fr_en.translate(sentences))
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
@st_cache_data_if(supported_cache, max_entries=5, ttl=3600)
|
| 158 |
+
def correct_marian(MarianText_space, dict_correction, Last_name, First_name):
    """Apply dictionary-based post-corrections to the translated text.

    Each key of *dict_correction* found in the text is replaced by its
    value; every applied replacement is logged with the patient identity
    for downstream review.

    Returns (corrected_text, list_of_replacement_records).
    """
    corrected = MarianText_space
    replacements = []
    for wrong, right in dict_correction.items():
        if wrong not in corrected:
            continue
        replacements.append(
            {
                "name": Last_name,
                "surname": First_name,
                "type": "marian_correction",
                "value": wrong,
                "correction": right,
                "lf_detected": True,
                "manual_validation": True,
            }
        )
        corrected = corrected.replace(wrong, right)
    return corrected, replacements
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
@st_cache_resource_if(supported_cache, max_entries=5, ttl=3600)
def get_translation_dict_correction():
    """Build the post-translation correction dictionary for correct_marian.

    Merges three sources, later ones overriding earlier ones on key clash:
    1. the hand-curated French-specific fixes below,
    2. data/hp_fr_en_translated_marian_review_lwg.json,
    3. data/fr_abbreviations_translation.json.

    Every key/value is space-padded so correct_marian's substring
    replacement only matches whole, space-delimited tokens.
    """
    dict_correction_FRspec = {
        "PC": "head circumference",
        "palatine slot": "cleft palate",
        "ASD": "autism",
        "ADHD": "attention deficit hyperactivity disorder",
        "IUGR": "intrauterin growth retardation",
        "QI": "IQ ",
        "QIT": "FSIQ ",
        "ITQ": "FSIQ ",
        "DS": "SD",
        "FOP": "patent foramen ovale",
        "PFO": "patent foramen ovale",
        "ARCF": "fetal distress",
        "\n": " ",
        "associated": "with",
        "Mr.": "Mr",
        "Mrs.": "Mrs",
    }

    dict_correction = {}

    def _add_space_padded(mapping):
        # Pad keys/values so only whole tokens get replaced downstream.
        for key, value in mapping.items():
            dict_correction[" " + key + " "] = " " + value + " "

    _add_space_padded(dict_correction_FRspec)

    # fix: request UTF-8 explicitly — these files hold French text and the
    # platform-default encoding (e.g. cp1252 on Windows) would mis-decode it.
    with open(
        "data/hp_fr_en_translated_marian_review_lwg.json", "r", encoding="utf-8"
    ) as infile:
        _add_space_padded(json.load(infile))

    with open(
        "data/fr_abbreviations_translation.json", "r", encoding="utf-8"
    ) as infile:
        _add_space_padded(json.load(infile))

    return dict_correction
|
| 219 |
+
|
| 220 |
+
|
utilities/web_utilities.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
|
| 3 |
from PIL import Image
|
|
|
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
def display_page_title(title: str):
|
|
@@ -47,3 +48,33 @@ def display_sidebar():
|
|
| 47 |
# file_name="Mentions_legales_lf.pdf",
|
| 48 |
# mime='application/octet-stream')
|
| 49 |
# st.sidebar.markdown("[Mentions légales](data/Mentions_legales_lf.pdf)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
|
|
|
| 2 |
from PIL import Image
|
| 3 |
+
import inspect
|
| 4 |
+
import os
|
| 5 |
|
| 6 |
|
| 7 |
def display_page_title(title: str):
|
|
|
|
| 48 |
# file_name="Mentions_legales_lf.pdf",
|
| 49 |
# mime='application/octet-stream')
|
| 50 |
# st.sidebar.markdown("[Mentions légales](data/Mentions_legales_lf.pdf)")
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def st_cache_data_if(condition, *args, **kwargs):
    """Conditionally wrap a function with st.cache_data.

    When *condition* is falsy the function is returned untouched, which
    lets the same code run outside a Streamlit caching context. Extra
    *args/**kwargs are forwarded to st.cache_data.
    """

    def decorator(func):
        return st.cache_data(*args, **kwargs)(func) if condition else func

    return decorator
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def st_cache_resource_if(condition, *args, **kwargs):
    """Conditionally wrap a function with st.cache_resource.

    When *condition* is falsy the function is returned untouched, which
    lets the same code run outside a Streamlit caching context. Extra
    *args/**kwargs are forwarded to st.cache_resource.
    """

    def decorator(func):
        return st.cache_resource(*args, **kwargs)(func) if condition else func

    return decorator
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# Flag read by the st_cache_*_if decorators at import time: caching is only
# enabled when this module is imported from the Streamlit entry point.
supported_cache = False


def stack_checker():
    """Enable Streamlit caching when imported from clinfly_app_st.py.

    Walks the current call stack; if any frame originates from the
    Streamlit entry-point script, flips the module-level supported_cache
    flag. Fix: stop at the first match instead of scanning the whole
    stack (setting the flag more than once was redundant).
    """
    global supported_cache
    for frame_info in inspect.stack():
        if os.path.basename(frame_info.filename) == "clinfly_app_st.py":
            supported_cache = True
            break


stack_checker()
|