Spaces:

kyauy
/

ClinFly

Running

App Files Files Community

kyauy committed on Apr 3, 2023

Commit

458ae64

1 Parent(s): 4bd4fd8

feat(deidentification): #1 add french cities

Browse files

Files changed (7) hide show

clinphen_src/data/hpo_synonym_filter.txt +0 -0
clinphen_src/get_phenotypes_lf.py +3 -3
data/fr_abbreviations.json +1 -1
data/fr_abbreviations_translation.json +1 -1
data/hp_fr_en_translated_marian_review_lwg.json +0 -0
data/proper_noun_location_sort.csv +0 -0
lf_app.py +29 -5

clinphen_src/data/hpo_synonym_filter.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

clinphen_src/get_phenotypes_lf.py CHANGED Viewed

@@ -3,7 +3,7 @@ from nltk.stem import WordNetLemmatizer
 import pandas as pd
 import re
-HPO_SYN_MAP_FILE = "clinphen_src/data/hpo_synonyms.txt"
 def getNames():
   returnMap = {}
@@ -23,10 +23,10 @@ def end_of_point(word):
   if word == "though": return True
   return False
-subpoint_enders = [":"] #","
 def end_of_subpoint(word):
   if word[-1] in subpoint_enders: return True
-  #if word == "and": return True
   return False
 def string_to_record_linewise(medical_record):

 import pandas as pd
 import re
+HPO_SYN_MAP_FILE = "clinphen_src/data/hpo_synonym_filter.txt"
 def getNames():
   returnMap = {}
   if word == "though": return True
   return False
+subpoint_enders = [":", ','] #","
 def end_of_subpoint(word):
   if word[-1] in subpoint_enders: return True
+  if word == "and": return True
   return False
 def string_to_record_linewise(medical_record):

data/fr_abbreviations.json CHANGED Viewed

@@ -1 +1 @@

- {"AAA": "anévrisme de l'aorte abdominale", "AAG": "asthme aigu grave", "AAS": "Syndrome Aarskog-Scott", "ABCD": "absence bilatérale des canaux déférents", "ACH": "hypoplasie cérébelleuse atrophique", "ACTH": "hormone adrénocorticotrope", "ACPA": "puce d'hybridation génomique comparative", "ADHD": "trouble d'hyperactivité avec déficit de l'attention", "ADK": "carcinome", "AIT": "accident ischémique transitoire", "ALAT": "alanine aminotransférase", "ANPAA": "Association Nationale de Prévention en Alcoologie et Addictologie", "AOMI": "artériopathie oblitérante des membres inférieurs", "ARCF": "anomalie du rythme cardiarque foetal", "ASAT": "aspartate aminotransférase", "ASD": "trouble du spectre de l'autisme", "AVC": "Accident vasculaire cérébral", "AVS": "Aide à la vie scolaire", "BAV": "bloc atrio-ventriculaire", "BPCO": "Bronchopneumopathie chronique obstructive", "CATTP": "Centre d’accueil à temps partiel thérapeutique", "CCR": "cancer colorectal", "CGH": "puce d'hybridation génomique comparative", "CHU": "Centre hospitalier universitaire", "CN": "clarté nuchal", "CNV": "variantion du nombre de copie", "COPD": "bronchopneumopathie chronique obstructive", "CP": "pericmètre crânien", "CPDPN": "Centre Pluridisciplinaire de Diagnostic Prénatal", "CRA": "Centre Ressources Autisme", "CRIU": "retard de croissance intrautérin", "CRP": "protéine C-réactive", "DCP": "dyskinésie ciliaire primitive", "DFT": "démence frontotemporale", "DGCR": "region critique du syndrome de DiGeorge", "DMLA": "dégénérescence maculaire liée à l'âge", "DMO": "densité minérale osseuse", "DNID": "diabète non insulinodépendant", "DPI": "diagnostic pré-implentatoire", "DPN": "diagnostic prénatal", "DPNI": "depistage prénatal non invasif", "DSD": "trouble de la différentiation sexuelle", "DT1": "diabète de type 1", "DT2": "diabète de type 2", "DYS": "Troubles cognitifs spécifiques et les troubles des apprentissages qu'ils induisent", "EAL": "bilan lipidique", "ECG": "électrocardiogramme", "EDTA": 
"Éthylènediaminetétraacétique", "EHPAD": "Etablissement d’Hébergement pour Personnes Agées Dépendantes", "EMI": "Institut Médico-Educatif", "ESAT": "Etablissement et Service d’Aides par le Travail", "ETP": "Education Thérapeutique du Patient", "FAM": "Foyer d'accueil médicalisé", "FCS": "fausse couche spontannée", "FISH": "Hybridation in situ avec epifluorescence", "FIV": "Fécondation invitro", "FOP": "foramen ovale perméable", "FSH": "hormone folliculo-stimulante", "FXPOI": "insuffisance ovarienne précoce liée à l’X fragile", "FXTAS": "syndrome tremblement/ataxie associé à l'X fragile", "GAJ": "glycémie à jeun", "GH": "hormone de croissance", "GNRH": "hormone de libération des gonadotrophines hypophysaires", "HAD": "Hospitalisation à domicile", "HDJ": "Hospitalisation de jour", "HTA": "Hypertension artérielle", "HTAP": "hypertension pulmonaire", "HTIC": "hypertension intracrânienne", "IDM": "Infractus du myocarde", "IME": "Institut Médico-Educatif", "IMG": "Interruption médicale de grossesse", "INR": "International Normalized Ratio", "IRC": "Insuffisance rénale chronique", "IRM": "Imagerie de résonance magnétique", "ITQ": "QIT", "IU": "infection urinaire", "IUGR": "retard de croissance intrautérin", "IV": "intraveineuse", "IVG": "Interruption volontaire de grossesse", "LH": "hormone lutéinisante", "MAS": "Maison d’accueil Spécialisée", "MFIU": "mort fœtale in utero", "MICI": "maladies inflammatoires chroniques de l'intestin", "MMR": "Réparation des mésappariements", "MODY": "Diabète de type MODY", "MPR": "Médecine physique et de réadaptation", "MSI": "instabilité des microsatellites", "NEM": "néoplasie endocrinienne multiple", "NFS": "numeration formule sanguine", "OAP": "œdème aigu du poumon", "OGE": "organes génitaux externes", "OMI": "œdème des membres inférieurs", "ORL": "Oto-rhino-laryngologiste", "PAF": "polypose adénomateuse familiale", "PC": "périmètre crânien", "PCN": "périmètre crânien de naissance", "PCR": "réaction de polymérisation en chaîne", "PKRD": 
"polykystose rénale autosomique dominante", "PMI": "Protection maternelle et infantile", "PN": "poids de naissance", "PO": "per os", "POCS": "Syndrome des pointes ondes continues du sommeil", "POF": "foramen ovale perméable", "PTG": "prothèse totale de genou", "QI": "quotient intellectuel", "QIT": "quotient intellectuel total", "RCIU": "retard de croissance intrautérin", "RCP": "Reunion de Concertation pluridisciplinaire", "RDPM": "retard de developpement psychomoteur", "ROT": "reflexes osteo-tendineux", "RT": "transcription inverse", "SAHS": "syndrome d'apnées/hypopnée du sommeil", "SAMU": "service d'aide médicale urgente", "SDRA": "syndrome de détresse respiratoire aiguë", "SEP": "Sclérose En Plaques", "SESSAD": "Service d’Education Spéciale et de Soins à Domicile", "SIADH": "syndrome de sécrétion inappropriée d'hormone anti-diurétique", "SOPK": "syndrome des ovaires polykystiques", "SPW": "syndrome de Prader-Willi", "SSR": "Soins de Suite et de Réadaptation", "STB": "Sclérose tubéreuse de Bourneville", "TAVI": "Implantation percutanée d'une prothèse valvulaire aortique", "TC": "traumatisme crânien", "TDA": "Trouble de déficit de l'attention avec ou sans hyperactivité", "TDAH": "Trouble de déficit de l'attention avec ou sans hyperactivité", "TDM": "tomodensitométrie", "TED": "Trouble envahissant du développement", "TEP": "tomographie par émission de positon", "TN": "taille de naissance", "TP": "taux de prothrombine", "TSA": "trouble de spectre de l'autisme", "TSH": "thyréostimuline", "TSLA ": "Trouble spécifique des apprentissages et du langage", "TSLO": "trouble spécifique du langage oral", "TTT": "traitement", "UGD": "ulcère gastroduodénal", "UHCD": "unité d'hospitalisation de courte durée", "USI": "unité de soins intensifs", "VD": "ventricule droite", "VG": "ventricule gauche", "WB": "western blot"}


1	+ {}

data/fr_abbreviations_translation.json CHANGED Viewed

@@ -1 +1 @@

- {"abdominal aorta aneurysm": "abdominal aortic aneurysm", "severe acute asthma": "Status asthmaticus", "Aarskog-Scott Syndrome ": "Aarskog-Scott syndrome", "bilateral absence of deferent channels": "Absent vas deferens", "atrophic cerebellar hypoplasia": "Atrophic cerebellar hypoplasia", "adrenocorticotropic hormone": "adrenocorticotropic hormone", "comparative genomic hybridization chip": "comparative genomic hybridization", "Hyperactivity disorder with attention deficit": "attention deficit hyperactivity disorder", "carcinoma": "carcinoma", "transient ischemic accident": "transient ischemic attack", "alanine aminotransferase": "alanine aminotransferase", "Association Nationale de Prevention en Alcoologie et Addictologie": "National association for treating addictive behavior", "obliterating arteriopathy of the lower limbs": "Inflammatory arteriopathy", "anomaly of the fetal cardiorrhythmia": "fetal distress", "aspartate aminotransferase": "aspartate aminotransferase", "autism spectrum disorder": "autism", "Stroke": "Stroke", "Support for school life": "school life assistance", "Atrioventricular block": "Atrioventricular block", "Chronic obstructive bronchopneumopathy": "Chronic pulmonary obstruction", "Part-time therapeutic reception centre": "Part-time Therapeutic Reception Centre", "colorectal cancer": "colorectal cancer", "University Hospital Centre": "University Hospital Center", "nuchal clarity": "nuchal skin", "variation of the number of copies": "copy nulmber variation", "Cranial pericmeter": "head circumference", "Multidisciplinary Centre for Prenatal Diagnostics": "Pluridisciplinary Center for Prenatal Diagnosis", "Autism Resource Centre": "Autism Resource Center for autistic behavior", "retardation of intrauterine growth": "intrauterin growth retardation", "C-reactive protein": "C-reactive protein", "primitive ciliary dyskinesia": "ciliary dyskinesia", "Frontotemporal dementia": "Frontotemporal dementia", "DiGeorge syndrome critical region": "diGeorge 
chromosomal region", "age-related macular degeneration": "age-related macular degeneration", "bone mineral density": "bone mineral density", "non-insulin-dependent diabetes mellitus": "type II diabetes mellitus", "pre-implentatory diagnosis": "pre-implantation diagnosis", "prenatal diagnosis": "prenatal diagnosis", "Non-invasive prenatal screening": "non-invasive prenatal testing", "disorder of sexual differentiation": "disorders of sex development", "type 1 diabetes": "Insulin-resistant diabetes mellitus", "type 2 diabetes": "type II diabetes mellitus", "Specific cognitive disorders and learning disorders they induce": "specific learning disabilities", "lipid balance": "lipid test", "electrocardiogram": "electrocardiogram", "Ethylenediaminetetraacetic": "Ethylenediaminetetraacetic acid", "Housing for Self-employed persons": "Elderly Dependent Care Accommodation", "Medical-Educational Institute": "medical-educational institute for his intellectual disability", "Establishment and Service of Labour Aids": "establishment and service of help by work for mild global developmental delay", "Patient Therapeutic Education": "Therapeutic Education for the Patient", "Medically assisted foster home ": "medical home for neurodevelopmental delay", "Spontaneous miscarriage": "Miscarriage", "In situ hybridization with epifluorescence": "Fluorescent In Situ Hybridization", "Invitro fertilization": "Fécondation invitro", "permeable oval foramen": "patent foramen ovale", "follicle-stimulating hormone": "follicle-stimulating hormone", "Early ovarian failure due to fragile X": "Fragile X-Associated Primary Ovarian Insufficiency", "tremor/ataxia syndrome associated with fragile X": "Fragile X Tremor Ataxia Syndrome", "fasting blood glucose levels": "fasting blood glucose", "growth hormone": "growth hormone", "pituitary gonadotrophin release hormone": "Gonadotropin Releasing Hormone", "Home Hospitalization": "Home hospitalisation", "Day hospitalization": "day hospital admissions", 
"Hypertension": "high blood pressure", "pulmonary hypertension": "Pulmonary arterial hypertension", "intracranial hypertension": "Increased intracranial pressure", "Myocardial infractions": "myocardial infarction", "Medical termination of pregnancy": "Medical Termination Of Pregnancy", "International Normalized Ratio": "International Normalized Ratio", "Chronic renal failure": "chronic kidney disease", "Magnetic resonance imaging": "Magnetic resonance imaging", "QIT": "FSIQ", "urinary tract infection": "urinary tract infection", "intravenous": "intravenous", "Voluntary termination of pregnancy": "Voluntary termination of pregnancy", "luteinizing hormone": "luteinizing hormone", "Specialised Home": "medical home for neurodevelopmental delay", "fetal death in utero": "Stillbirth", "chronic inflammatory diseases of the intestine": "Inflammatory bowel disease", "Repair of mismatches": "mismatch repair", "MODY type diabetes": "maturity onset diabetes of the young", "Physical medicine and rehabilitation": "Physical Medicine and Rehabilitation", "Microsatellite instability": "microsatellite instability", "multiple endocrine neoplasia": "Multiple endocrine neoplasia", "Numbering of blood formula": "numeration formule sanguine", "acute oedema of the lung": "Pulmonary edema", "external genital organs": "external genitals", "oedema of the lower limbs": "Pedal edema", "Oto-rhino-laryngologist": "otolaryngologist", "family adenomatous polyposia": "Familial adenomatous polyposis", "Cranial perimeter": "head circumference", "Cranial perimeter of birth": "head circumference at birth", "polymerization chain reaction": "polymerase chain reaction", "autosomal renal polycystosis dominant": "polycystic kidney dysplasia", "Maternal and child protection": "Maternal and child protection", "birth weight": "birth weight", "per os": "per os", "Continuous Sleep Wave Point Syndrome": "Continuous spike and waves during slow sleep", "total knee prosthesis": "Total knee arthroplasty", 
"intellectual quotient": "IQ", "total intellectual quotient": "FSIQ", "Multidisciplinary Concertation Meeting": "Multidisciplinary Consultation Meeting", "retardation of psychomotor development": "global developmental delay", "osteo-tendinous reflexes": "numeration formule sanguine", "reverse transcription": "reverse transcription", "apnea/sleep hypopnoea syndrome": "Sleep apnea", "emergency medical aid service": "Emergency medical services", "Acute Respiratory Distress Syndrome": "acute respiratory distress syndrome", "Plated Sclerosis": "multiple sclerosis", "Special Education and Home Care Service": "specific education services for his mild global developmental delay", "Inappropriate anti-diuretic hormone secretion syndrome": "syndrome of inappropriate anti-diuretic hormone secretion", "Polycystic Ovarian Syndrome": "Polycystic ovary syndrome", "Prader-Willi syndrome": "Prader-Willi syndrome", "Follow-up and Rehabilitation Care": "Follow-up and Rehabilitation Care", "Tuberous sclerosis of Bourneville": "Tuberous sclerosis", "Percutaneous implantation of aortic valve prosthesis": "Transcatheter Aortic Valve Implantation for abnormal aortic valve morphology", "head injury": "head trauma", "Attention deficit disorder with or without hyperactivity": "attention deficit hyperactivity disorder", "CT-modensitometry": "CT-scan", "Invasive development disorder": "autism", "positon emission tomography": "positron emission tomography", "birth size": "birth heigh", "prothrombin level": "Prothrombin Ratio", "thyreostimulin": "thyroid-stimulating hormone", "Specific learning and language problems": "delayed speech and language development", "speech-specific disorder": "delayed speech and language development", "treatment": "treatment", "Gastroduodenal ulcer": "Peptic ulcer", "Short-term hospitalization unit": "short-term hospitalization unit", "intensive care unit": "intensive care unit", "right ventricle": "right ventricle", "left ventricle": "left ventricle", "Western blot": 
"western blot"}


1	+ {}

data/hp_fr_en_translated_marian_review_lwg.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/proper_noun_location_sort.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

lf_app.py CHANGED Viewed

@@ -12,7 +12,7 @@ from typing import List
 import transformers
 from typing import Sequence
 import spacy
-from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
 from presidio_analyzer.nlp_engine import NlpEngineProvider
 from presidio_anonymizer import AnonymizerEngine
 from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
@@ -66,6 +66,18 @@ def get_models():
         print(spacy_model_name + " already downloaded")
     return "Done"
 @st.cache_data(max_entries=30)
 def get_list_not_deidentify():
@@ -116,12 +128,15 @@ def config_deidentify():
     # Create NLP engine based on configuration
     provider = NlpEngineProvider(nlp_configuration=configuration)
     nlp_engine = provider.create_engine()
     analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
     engine = AnonymizerEngine()
     del configuration
     del provider
     del nlp_engine
     return analyzer, engine
@@ -248,16 +263,21 @@ def anonymize_analyzer(MarianText_letter, _analyzer, nom_propre, nom, prenom):
     analyzer_results_keep = []
     analyzer_results_return = []
     analyzer_results_saved = []
-    analyzer_results = _analyzer.analyze(text=MarianText_letter, language="en", entities=["DATE_TIME", "PERSON"], allow_list=['evening', 'day', 'the day', 'the age of', 'age', 'years', 'week', 'years old', 'months', 'hours', 'night', 'noon', 'nights', 'tomorrow', 'today', 'yesterday'])
     len_to_add = 0
     analyser_results_to_sort = {}
     i = 0
     for element in analyzer_results:
-        analyser_results_to_sort[i] = element.start
         i = i + 1
     sorted_tuples = sorted(analyser_results_to_sort.items(), key=lambda x: x[1])
     sorted_dict = {k: v for k, v in sorted_tuples}
-    # st.write(sorted_dict)
     exception_list_presidio = ['age', 'year', 'month', 'day', 'hour', 'week']
     for element_raw in sorted_dict:
@@ -337,6 +357,7 @@ def anonymize_engine(MarianText_letter, _analyzer_results_return, _engine, _nlp)
         operators={
             "PERSON": OperatorConfig("replace", {"new_value": ""}),
             "LOCATION": OperatorConfig("replace", {"new_value": ""}),
         },
     )
     return reformat_to_letter(result.text, _nlp)
@@ -729,6 +750,8 @@ def main_function(inputStr):
   return returnDf, returnDfUnsafe
 models_status = get_models()
 nlp_fr, marian_fr_en = get_nlp_marian()
 #nlp_en = get_nlp_en()
 dict_correction = get_translation_dict_correction()
@@ -736,6 +759,7 @@ dict_abbreviation_correction = get_abbreviation_dict_correction()
 nom_propre = get_list_not_deidentify()
 analyzer, engine = config_deidentify()
 if "load_state" not in st.session_state:
     st.session_state.load_state = False
@@ -928,7 +952,7 @@ if submit_button or st.session_state.load_state:
     st.download_button(
         "Download summarized letter in PhenoGenius list of HPO format",
         convert_list_phenogenius(clinphen_df),
-        nom + "_" + prenom + "_summarized_letter_list_phenogenius.tsv",
         "text",
         key="download-summarization-phenogenius",
     )

 import transformers
 from typing import Sequence
 import spacy
+from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer
 from presidio_analyzer.nlp_engine import NlpEngineProvider
 from presidio_anonymizer import AnonymizerEngine
 from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
         print(spacy_model_name + " already downloaded")
     return "Done"
+@st.cache_data(max_entries=30)
+def get_cities_list():
+    """Build the list of city names passed as deny list to the FRENCH_CITY recognizer.
+
+    Reads the single-column file ``data/proper_noun_location_sort.csv`` and,
+    for every name, emits the spelling found in the file followed by its
+    capitalised form (first letter upper case, the rest lower case).
+
+    Returns:
+        list[str]: the patterns in file order, each spelling listed once.
+    """
+    # NOTE(review): read_csv uses the first line of the file as a header, so
+    # that line is never returned as a city -- confirm the CSV has a header row.
+    cities = pd.read_csv('data/proper_noun_location_sort.csv')
+    cities.columns = ['ville']
+    # Empty cells and tokens such as "NA" or "null" are parsed as NaN (a float),
+    # which has no string methods; drop them instead of crashing on .lower().
+    names = cities['ville'].dropna().astype(str)
+    # A dict keeps first-seen order while discarding repeats: a name that is
+    # already capitalised in the file would otherwise be listed twice.
+    patterns = {}
+    for name in names:
+        patterns[name] = None
+        patterns[name.lower().capitalize()] = None
+    return list(patterns)
 @st.cache_data(max_entries=30)
 def get_list_not_deidentify():
     # Create NLP engine based on configuration
     provider = NlpEngineProvider(nlp_configuration=configuration)
     nlp_engine = provider.create_engine()
+    frcity_recognizer = PatternRecognizer(supported_entity="FRENCH_CITY", deny_list=cities_list)
     analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
+    analyzer.registry.add_recognizer(frcity_recognizer)
     engine = AnonymizerEngine()
     del configuration
     del provider
     del nlp_engine
+    del frcity_recognizer
     return analyzer, engine
     analyzer_results_keep = []
     analyzer_results_return = []
     analyzer_results_saved = []
+    analyzer_results = _analyzer.analyze(text=MarianText_letter, language="en", entities=["DATE_TIME", "PERSON", "FRENCH_CITY"], allow_list=['evening', 'day', 'the day', 'the age of', 'age', 'years', 'week', 'years old', 'months', 'hours', 'night', 'noon', 'nights', 'tomorrow', 'today', 'yesterday'])
     len_to_add = 0
     analyser_results_to_sort = {}
     i = 0
+    detect_duplicated = []
     for element in analyzer_results:
+        if element.start not in detect_duplicated:
+            analyser_results_to_sort[i] = element.start
+            detect_duplicated.append(element.start)
+        else:
+            pass
         i = i + 1
     sorted_tuples = sorted(analyser_results_to_sort.items(), key=lambda x: x[1])
     sorted_dict = {k: v for k, v in sorted_tuples}
+    print(sorted_dict)
     exception_list_presidio = ['age', 'year', 'month', 'day', 'hour', 'week']
     for element_raw in sorted_dict:
         operators={
             "PERSON": OperatorConfig("replace", {"new_value": ""}),
             "LOCATION": OperatorConfig("replace", {"new_value": ""}),
+            "FRENCH_CITY": OperatorConfig("replace", {"new_value": ""}),
         },
     )
     return reformat_to_letter(result.text, _nlp)
   return returnDf, returnDfUnsafe
 models_status = get_models()
+cities_list = get_cities_list()
+#print(cities_list)
 nlp_fr, marian_fr_en = get_nlp_marian()
 #nlp_en = get_nlp_en()
 dict_correction = get_translation_dict_correction()
 nom_propre = get_list_not_deidentify()
 analyzer, engine = config_deidentify()
 if "load_state" not in st.session_state:
     st.session_state.load_state = False
     st.download_button(
         "Download summarized letter in PhenoGenius list of HPO format",
         convert_list_phenogenius(clinphen_df),
+        nom + "_" + prenom + "_summarized_letter.txt",
         "text",
         key="download-summarization-phenogenius",
     )