kyauy committed on
Commit
458ae64
·
1 Parent(s): 4bd4fd8

feat(deidentification): #1 add french cities

Browse files
clinphen_src/data/hpo_synonym_filter.txt ADDED
The diff for this file is too large to render. See raw diff
 
clinphen_src/get_phenotypes_lf.py CHANGED
@@ -3,7 +3,7 @@ from nltk.stem import WordNetLemmatizer
3
  import pandas as pd
4
  import re
5
 
6
- HPO_SYN_MAP_FILE = "clinphen_src/data/hpo_synonyms.txt"
7
 
8
  def getNames():
9
  returnMap = {}
@@ -23,10 +23,10 @@ def end_of_point(word):
23
  if word == "though": return True
24
  return False
25
 
26
- subpoint_enders = [":"] #","
27
  def end_of_subpoint(word):
28
  if word[-1] in subpoint_enders: return True
29
- #if word == "and": return True
30
  return False
31
 
32
  def string_to_record_linewise(medical_record):
 
3
  import pandas as pd
4
  import re
5
 
6
+ HPO_SYN_MAP_FILE = "clinphen_src/data/hpo_synonym_filter.txt"
7
 
8
  def getNames():
9
  returnMap = {}
 
23
  if word == "though": return True
24
  return False
25
 
26
# Trailing characters that close a sub-point inside a sentence.
subpoint_enders = [":", ","]


def end_of_subpoint(word):
    """Return True when *word* marks the end of a sub-point.

    A word ends a sub-point when its last character is one of
    ``subpoint_enders`` or when the word is exactly ``"and"``.

    Args:
        word: A single token taken from the record text.

    Returns:
        True if the token closes a sub-point, False otherwise. An empty
        token never closes a sub-point.
    """
    # Guard first: indexing word[-1] on an empty token raises IndexError.
    if not word:
        return False
    if word[-1] in subpoint_enders:
        return True
    return word == "and"
31
 
32
  def string_to_record_linewise(medical_record):
data/fr_abbreviations.json CHANGED
@@ -1 +1 @@
1
- {"AAA": "anévrisme de l'aorte abdominale", "AAG": "asthme aigu grave", "AAS": "Syndrome Aarskog-Scott", "ABCD": "absence bilatérale des canaux déférents", "ACH": "hypoplasie cérébelleuse atrophique", "ACTH": "hormone adrénocorticotrope", "ACPA": "puce d'hybridation génomique comparative", "ADHD": "trouble d'hyperactivité avec déficit de l'attention", "ADK": "carcinome", "AIT": "accident ischémique transitoire", "ALAT": "alanine aminotransférase", "ANPAA": "Association Nationale de Prévention en Alcoologie et Addictologie", "AOMI": "artériopathie oblitérante des membres inférieurs", "ARCF": "anomalie du rythme cardiarque foetal", "ASAT": "aspartate aminotransférase", "ASD": "trouble du spectre de l'autisme", "AVC": "Accident vasculaire cérébral", "AVS": "Aide à la vie scolaire", "BAV": "bloc atrio-ventriculaire", "BPCO": "Bronchopneumopathie chronique obstructive", "CATTP": "Centre d’accueil à temps partiel thérapeutique", "CCR": "cancer colorectal", "CGH": "puce d'hybridation génomique comparative", "CHU": "Centre hospitalier universitaire", "CN": "clarté nuchal", "CNV": "variantion du nombre de copie", "COPD": "bronchopneumopathie chronique obstructive", "CP": "pericmètre crânien", "CPDPN": "Centre Pluridisciplinaire de Diagnostic Prénatal", "CRA": "Centre Ressources Autisme", "CRIU": "retard de croissance intrautérin", "CRP": "protéine C-réactive", "DCP": "dyskinésie ciliaire primitive", "DFT": "démence frontotemporale", "DGCR": "region critique du syndrome de DiGeorge", "DMLA": "dégénérescence maculaire liée à l'âge", "DMO": "densité minérale osseuse", "DNID": "diabète non insulinodépendant", "DPI": "diagnostic pré-implentatoire", "DPN": "diagnostic prénatal", "DPNI": "depistage prénatal non invasif", "DSD": "trouble de la différentiation sexuelle", "DT1": "diabète de type 1", "DT2": "diabète de type 2", "DYS": "Troubles cognitifs spécifiques et les troubles des apprentissages qu'ils induisent", "EAL": "bilan lipidique", "ECG": "électrocardiogramme", "EDTA": 
"Éthylènediaminetétraacétique", "EHPAD": "Etablissement d’Hébergement pour Personnes Agées Dépendantes", "EMI": "Institut Médico-Educatif", "ESAT": "Etablissement et Service d’Aides par le Travail", "ETP": "Education Thérapeutique du Patient", "FAM": "Foyer d'accueil médicalisé", "FCS": "fausse couche spontannée", "FISH": "Hybridation in situ avec epifluorescence", "FIV": "Fécondation invitro", "FOP": "foramen ovale perméable", "FSH": "hormone folliculo-stimulante", "FXPOI": "insuffisance ovarienne précoce liée à l’X fragile", "FXTAS": "syndrome tremblement/ataxie associé à l'X fragile", "GAJ": "glycémie à jeun", "GH": "hormone de croissance", "GNRH": "hormone de libération des gonadotrophines hypophysaires", "HAD": "Hospitalisation à domicile", "HDJ": "Hospitalisation de jour", "HTA": "Hypertension artérielle", "HTAP": "hypertension pulmonaire", "HTIC": "hypertension intracrânienne", "IDM": "Infractus du myocarde", "IME": "Institut Médico-Educatif", "IMG": "Interruption médicale de grossesse", "INR": "International Normalized Ratio", "IRC": "Insuffisance rénale chronique", "IRM": "Imagerie de résonance magnétique", "ITQ": "QIT", "IU": "infection urinaire", "IUGR": "retard de croissance intrautérin", "IV": "intraveineuse", "IVG": "Interruption volontaire de grossesse", "LH": "hormone lutéinisante", "MAS": "Maison d’accueil Spécialisée", "MFIU": "mort fœtale in utero", "MICI": "maladies inflammatoires chroniques de l'intestin", "MMR": "Réparation des mésappariements", "MODY": "Diabète de type MODY", "MPR": "Médecine physique et de réadaptation", "MSI": "instabilité des microsatellites", "NEM": "néoplasie endocrinienne multiple", "NFS": "numeration formule sanguine", "OAP": "œdème aigu du poumon", "OGE": "organes génitaux externes", "OMI": "œdème des membres inférieurs", "ORL": "Oto-rhino-laryngologiste", "PAF": "polypose adénomateuse familiale", "PC": "périmètre crânien", "PCN": "périmètre crânien de naissance", "PCR": "réaction de polymérisation en chaîne", "PKRD": 
"polykystose rénale autosomique dominante", "PMI": "Protection maternelle et infantile", "PN": "poids de naissance", "PO": "per os", "POCS": "Syndrome des pointes ondes continues du sommeil", "POF": "foramen ovale perméable", "PTG": "prothèse totale de genou", "QI": "quotient intellectuel", "QIT": "quotient intellectuel total", "RCIU": "retard de croissance intrautérin", "RCP": "Reunion de Concertation pluridisciplinaire", "RDPM": "retard de developpement psychomoteur", "ROT": "reflexes osteo-tendineux", "RT": "transcription inverse", "SAHS": "syndrome d'apnées/hypopnée du sommeil", "SAMU": "service d'aide médicale urgente", "SDRA": "syndrome de détresse respiratoire aiguë", "SEP": "Sclérose En Plaques", "SESSAD": "Service d’Education Spéciale et de Soins à Domicile", "SIADH": "syndrome de sécrétion inappropriée d'hormone anti-diurétique", "SOPK": "syndrome des ovaires polykystiques", "SPW": "syndrome de Prader-Willi", "SSR": "Soins de Suite et de Réadaptation", "STB": "Sclérose tubéreuse de Bourneville", "TAVI": "Implantation percutanée d'une prothèse valvulaire aortique", "TC": "traumatisme crânien", "TDA": "Trouble de déficit de l'attention avec ou sans hyperactivité", "TDAH": "Trouble de déficit de l'attention avec ou sans hyperactivité", "TDM": "tomodensitométrie", "TED": "Trouble envahissant du développement", "TEP": "tomographie par émission de positon", "TN": "taille de naissance", "TP": "taux de prothrombine", "TSA": "trouble de spectre de l'autisme", "TSH": "thyréostimuline", "TSLA ": "Trouble spécifique des apprentissages et du langage", "TSLO": "trouble spécifique du langage oral", "TTT": "traitement", "UGD": "ulcère gastroduodénal", "UHCD": "unité d'hospitalisation de courte durée", "USI": "unité de soins intensifs", "VD": "ventricule droite", "VG": "ventricule gauche", "WB": "western blot"}
 
1
+ {}
data/fr_abbreviations_translation.json CHANGED
@@ -1 +1 @@
1
- {"abdominal aorta aneurysm": "abdominal aortic aneurysm", "severe acute asthma": "Status asthmaticus", "Aarskog-Scott Syndrome ": "Aarskog-Scott syndrome", "bilateral absence of deferent channels": "Absent vas deferens", "atrophic cerebellar hypoplasia": "Atrophic cerebellar hypoplasia", "adrenocorticotropic hormone": "adrenocorticotropic hormone", "comparative genomic hybridization chip": "comparative genomic hybridization", "Hyperactivity disorder with attention deficit": "attention deficit hyperactivity disorder", "carcinoma": "carcinoma", "transient ischemic accident": "transient ischemic attack", "alanine aminotransferase": "alanine aminotransferase", "Association Nationale de Prevention en Alcoologie et Addictologie": "National association for treating addictive behavior", "obliterating arteriopathy of the lower limbs": "Inflammatory arteriopathy", "anomaly of the fetal cardiorrhythmia": "fetal distress", "aspartate aminotransferase": "aspartate aminotransferase", "autism spectrum disorder": "autism", "Stroke": "Stroke", "Support for school life": "school life assistance", "Atrioventricular block": "Atrioventricular block", "Chronic obstructive bronchopneumopathy": "Chronic pulmonary obstruction", "Part-time therapeutic reception centre": "Part-time Therapeutic Reception Centre", "colorectal cancer": "colorectal cancer", "University Hospital Centre": "University Hospital Center", "nuchal clarity": "nuchal skin", "variation of the number of copies": "copy nulmber variation", "Cranial pericmeter": "head circumference", "Multidisciplinary Centre for Prenatal Diagnostics": "Pluridisciplinary Center for Prenatal Diagnosis", "Autism Resource Centre": "Autism Resource Center for autistic behavior", "retardation of intrauterine growth": "intrauterin growth retardation", "C-reactive protein": "C-reactive protein", "primitive ciliary dyskinesia": "ciliary dyskinesia", "Frontotemporal dementia": "Frontotemporal dementia", "DiGeorge syndrome critical region": "diGeorge 
chromosomal region", "age-related macular degeneration": "age-related macular degeneration", "bone mineral density": "bone mineral density", "non-insulin-dependent diabetes mellitus": "type II diabetes mellitus", "pre-implentatory diagnosis": "pre-implantation diagnosis", "prenatal diagnosis": "prenatal diagnosis", "Non-invasive prenatal screening": "non-invasive prenatal testing", "disorder of sexual differentiation": "disorders of sex development", "type 1 diabetes": "Insulin-resistant diabetes mellitus", "type 2 diabetes": "type II diabetes mellitus", "Specific cognitive disorders and learning disorders they induce": "specific learning disabilities", "lipid balance": "lipid test", "electrocardiogram": "electrocardiogram", "Ethylenediaminetetraacetic": "Ethylenediaminetetraacetic acid", "Housing for Self-employed persons": "Elderly Dependent Care Accommodation", "Medical-Educational Institute": "medical-educational institute for his intellectual disability", "Establishment and Service of Labour Aids": "establishment and service of help by work for mild global developmental delay", "Patient Therapeutic Education": "Therapeutic Education for the Patient", "Medically assisted foster home ": "medical home for neurodevelopmental delay", "Spontaneous miscarriage": "Miscarriage", "In situ hybridization with epifluorescence": "Fluorescent In Situ Hybridization", "Invitro fertilization": "Fécondation invitro", "permeable oval foramen": "patent foramen ovale", "follicle-stimulating hormone": "follicle-stimulating hormone", "Early ovarian failure due to fragile X": "Fragile X-Associated Primary Ovarian Insufficiency", "tremor/ataxia syndrome associated with fragile X": "Fragile X Tremor Ataxia Syndrome", "fasting blood glucose levels": "fasting blood glucose", "growth hormone": "growth hormone", "pituitary gonadotrophin release hormone": "Gonadotropin Releasing Hormone", "Home Hospitalization": "Home hospitalisation", "Day hospitalization": "day hospital admissions", 
"Hypertension": "high blood pressure", "pulmonary hypertension": "Pulmonary arterial hypertension", "intracranial hypertension": "Increased intracranial pressure", "Myocardial infractions": "myocardial infarction", "Medical termination of pregnancy": "Medical Termination Of Pregnancy", "International Normalized Ratio": "International Normalized Ratio", "Chronic renal failure": "chronic kidney disease", "Magnetic resonance imaging": "Magnetic resonance imaging", "QIT": "FSIQ", "urinary tract infection": "urinary tract infection", "intravenous": "intravenous", "Voluntary termination of pregnancy": "Voluntary termination of pregnancy", "luteinizing hormone": "luteinizing hormone", "Specialised Home": "medical home for neurodevelopmental delay", "fetal death in utero": "Stillbirth", "chronic inflammatory diseases of the intestine": "Inflammatory bowel disease", "Repair of mismatches": "mismatch repair", "MODY type diabetes": "maturity onset diabetes of the young", "Physical medicine and rehabilitation": "Physical Medicine and Rehabilitation", "Microsatellite instability": "microsatellite instability", "multiple endocrine neoplasia": "Multiple endocrine neoplasia", "Numbering of blood formula": "numeration formule sanguine", "acute oedema of the lung": "Pulmonary edema", "external genital organs": "external genitals", "oedema of the lower limbs": "Pedal edema", "Oto-rhino-laryngologist": "otolaryngologist", "family adenomatous polyposia": "Familial adenomatous polyposis", "Cranial perimeter": "head circumference", "Cranial perimeter of birth": "head circumference at birth", "polymerization chain reaction": "polymerase chain reaction", "autosomal renal polycystosis dominant": "polycystic kidney dysplasia", "Maternal and child protection": "Maternal and child protection", "birth weight": "birth weight", "per os": "per os", "Continuous Sleep Wave Point Syndrome": "Continuous spike and waves during slow sleep", "total knee prosthesis": "Total knee arthroplasty", 
"intellectual quotient": "IQ", "total intellectual quotient": "FSIQ", "Multidisciplinary Concertation Meeting": "Multidisciplinary Consultation Meeting", "retardation of psychomotor development": "global developmental delay", "osteo-tendinous reflexes": "numeration formule sanguine", "reverse transcription": "reverse transcription", "apnea/sleep hypopnoea syndrome": "Sleep apnea", "emergency medical aid service": "Emergency medical services", "Acute Respiratory Distress Syndrome": "acute respiratory distress syndrome", "Plated Sclerosis": "multiple sclerosis", "Special Education and Home Care Service": "specific education services for his mild global developmental delay", "Inappropriate anti-diuretic hormone secretion syndrome": "syndrome of inappropriate anti-diuretic hormone secretion", "Polycystic Ovarian Syndrome": "Polycystic ovary syndrome", "Prader-Willi syndrome": "Prader-Willi syndrome", "Follow-up and Rehabilitation Care": "Follow-up and Rehabilitation Care", "Tuberous sclerosis of Bourneville": "Tuberous sclerosis", "Percutaneous implantation of aortic valve prosthesis": "Transcatheter Aortic Valve Implantation for abnormal aortic valve morphology", "head injury": "head trauma", "Attention deficit disorder with or without hyperactivity": "attention deficit hyperactivity disorder", "CT-modensitometry": "CT-scan", "Invasive development disorder": "autism", "positon emission tomography": "positron emission tomography", "birth size": "birth heigh", "prothrombin level": "Prothrombin Ratio", "thyreostimulin": "thyroid-stimulating hormone", "Specific learning and language problems": "delayed speech and language development", "speech-specific disorder": "delayed speech and language development", "treatment": "treatment", "Gastroduodenal ulcer": "Peptic ulcer", "Short-term hospitalization unit": "short-term hospitalization unit", "intensive care unit": "intensive care unit", "right ventricle": "right ventricle", "left ventricle": "left ventricle", "Western blot": 
"western blot"}
 
1
+ {}
data/hp_fr_en_translated_marian_review_lwg.json CHANGED
The diff for this file is too large to render. See raw diff
 
data/proper_noun_location_sort.csv ADDED
The diff for this file is too large to render. See raw diff
 
lf_app.py CHANGED
@@ -12,7 +12,7 @@ from typing import List
12
  import transformers
13
  from typing import Sequence
14
  import spacy
15
- from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
16
  from presidio_analyzer.nlp_engine import NlpEngineProvider
17
  from presidio_anonymizer import AnonymizerEngine
18
  from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
@@ -66,6 +66,18 @@ def get_models():
66
  print(spacy_model_name + " already downloaded")
67
  return "Done"
68
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  @st.cache_data(max_entries=30)
71
  def get_list_not_deidentify():
@@ -116,12 +128,15 @@ def config_deidentify():
116
  # Create NLP engine based on configuration
117
  provider = NlpEngineProvider(nlp_configuration=configuration)
118
  nlp_engine = provider.create_engine()
 
119
 
120
  analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
 
121
  engine = AnonymizerEngine()
122
  del configuration
123
  del provider
124
  del nlp_engine
 
125
  return analyzer, engine
126
 
127
 
@@ -248,16 +263,21 @@ def anonymize_analyzer(MarianText_letter, _analyzer, nom_propre, nom, prenom):
248
  analyzer_results_keep = []
249
  analyzer_results_return = []
250
  analyzer_results_saved = []
251
- analyzer_results = _analyzer.analyze(text=MarianText_letter, language="en", entities=["DATE_TIME", "PERSON"], allow_list=['evening', 'day', 'the day', 'the age of', 'age', 'years', 'week', 'years old', 'months', 'hours', 'night', 'noon', 'nights', 'tomorrow', 'today', 'yesterday'])
252
  len_to_add = 0
253
  analyser_results_to_sort = {}
254
  i = 0
 
255
  for element in analyzer_results:
256
- analyser_results_to_sort[i] = element.start
 
 
 
 
257
  i = i + 1
258
  sorted_tuples = sorted(analyser_results_to_sort.items(), key=lambda x: x[1])
259
  sorted_dict = {k: v for k, v in sorted_tuples}
260
- # st.write(sorted_dict)
261
  exception_list_presidio = ['age', 'year', 'month', 'day', 'hour', 'week']
262
 
263
  for element_raw in sorted_dict:
@@ -337,6 +357,7 @@ def anonymize_engine(MarianText_letter, _analyzer_results_return, _engine, _nlp)
337
  operators={
338
  "PERSON": OperatorConfig("replace", {"new_value": ""}),
339
  "LOCATION": OperatorConfig("replace", {"new_value": ""}),
 
340
  },
341
  )
342
  return reformat_to_letter(result.text, _nlp)
@@ -729,6 +750,8 @@ def main_function(inputStr):
729
  return returnDf, returnDfUnsafe
730
 
731
  models_status = get_models()
 
 
732
  nlp_fr, marian_fr_en = get_nlp_marian()
733
  #nlp_en = get_nlp_en()
734
  dict_correction = get_translation_dict_correction()
@@ -736,6 +759,7 @@ dict_abbreviation_correction = get_abbreviation_dict_correction()
736
  nom_propre = get_list_not_deidentify()
737
  analyzer, engine = config_deidentify()
738
 
 
739
  if "load_state" not in st.session_state:
740
  st.session_state.load_state = False
741
 
@@ -928,7 +952,7 @@ if submit_button or st.session_state.load_state:
928
  st.download_button(
929
  "Download summarized letter in PhenoGenius list of HPO format",
930
  convert_list_phenogenius(clinphen_df),
931
- nom + "_" + prenom + "_summarized_letter_list_phenogenius.tsv",
932
  "text",
933
  key="download-summarization-phenogenius",
934
  )
 
12
  import transformers
13
  from typing import Sequence
14
  import spacy
15
+ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer
16
  from presidio_analyzer.nlp_engine import NlpEngineProvider
17
  from presidio_anonymizer import AnonymizerEngine
18
  from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
 
66
  print(spacy_model_name + " already downloaded")
67
  return "Done"
68
 
69
@st.cache_data(max_entries=30)
def get_cities_list():
    """Build the list of French city names used as a de-identification deny-list.

    Reads ``data/proper_noun_location_sort.csv``, renames its single column
    to ``ville``, and collects every name both as written in the file and in
    its ``str.capitalize()`` form (first letter upper-case, rest lower-case).

    Returns:
        A list of unique strings, in first-seen order.
    """
    cities = pd.read_csv('data/proper_noun_location_sort.csv')
    cities.columns = ['ville']

    # A dict is used as an insertion-ordered set: a name that is already in
    # capitalized form would otherwise be added twice.
    unique_patterns = {}
    # dropna()/astype(str): a blank or numeric cell is not a str, and calling
    # a string method on it would raise AttributeError.
    for city in cities['ville'].dropna().astype(str):
        unique_patterns[city] = None
        # capitalize() already lower-cases the tail, so no lower() is needed.
        unique_patterns[city.capitalize()] = None
    return list(unique_patterns)
81
 
82
  @st.cache_data(max_entries=30)
83
  def get_list_not_deidentify():
 
128
  # Create NLP engine based on configuration
129
  provider = NlpEngineProvider(nlp_configuration=configuration)
130
  nlp_engine = provider.create_engine()
131
+ frcity_recognizer = PatternRecognizer(supported_entity="FRENCH_CITY", deny_list=cities_list)
132
 
133
  analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
134
+ analyzer.registry.add_recognizer(frcity_recognizer)
135
  engine = AnonymizerEngine()
136
  del configuration
137
  del provider
138
  del nlp_engine
139
+ del frcity_recognizer
140
  return analyzer, engine
141
 
142
 
 
263
  analyzer_results_keep = []
264
  analyzer_results_return = []
265
  analyzer_results_saved = []
266
+ analyzer_results = _analyzer.analyze(text=MarianText_letter, language="en", entities=["DATE_TIME", "PERSON", "FRENCH_CITY"], allow_list=['evening', 'day', 'the day', 'the age of', 'age', 'years', 'week', 'years old', 'months', 'hours', 'night', 'noon', 'nights', 'tomorrow', 'today', 'yesterday'])
267
  len_to_add = 0
268
  analyser_results_to_sort = {}
269
  i = 0
270
+ detect_duplicated = []
271
  for element in analyzer_results:
272
+ if element.start not in detect_duplicated:
273
+ analyser_results_to_sort[i] = element.start
274
+ detect_duplicated.append(element.start)
275
+ else:
276
+ pass
277
  i = i + 1
278
  sorted_tuples = sorted(analyser_results_to_sort.items(), key=lambda x: x[1])
279
  sorted_dict = {k: v for k, v in sorted_tuples}
280
+ print(sorted_dict)
281
  exception_list_presidio = ['age', 'year', 'month', 'day', 'hour', 'week']
282
 
283
  for element_raw in sorted_dict:
 
357
  operators={
358
  "PERSON": OperatorConfig("replace", {"new_value": ""}),
359
  "LOCATION": OperatorConfig("replace", {"new_value": ""}),
360
+ "FRENCH_CITY": OperatorConfig("replace", {"new_value": ""}),
361
  },
362
  )
363
  return reformat_to_letter(result.text, _nlp)
 
750
  return returnDf, returnDfUnsafe
751
 
752
  models_status = get_models()
753
+ cities_list = get_cities_list()
754
+ #print(cities_list)
755
  nlp_fr, marian_fr_en = get_nlp_marian()
756
  #nlp_en = get_nlp_en()
757
  dict_correction = get_translation_dict_correction()
 
759
  nom_propre = get_list_not_deidentify()
760
  analyzer, engine = config_deidentify()
761
 
762
+
763
  if "load_state" not in st.session_state:
764
  st.session_state.load_state = False
765
 
 
952
  st.download_button(
953
  "Download summarized letter in PhenoGenius list of HPO format",
954
  convert_list_phenogenius(clinphen_df),
955
+ nom + "_" + prenom + "_summarized_letter.txt",
956
  "text",
957
  key="download-summarization-phenogenius",
958
  )