GERNET Enody commited on
Commit
44a9dbf
·
unverified ·
1 Parent(s): ccc4c35

Add files via upload

Browse files
Files changed (3) hide show
  1. README.md +32 -12
  2. clinfly_app_cli.py +152 -0
  3. clinfly_app_st.py +195 -0
README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
2
  title: ClinFly
3
  emoji: small_airplane
4
- sdk: streamlit
5
  sdk_version: 1.21.0
6
- app_file: clinfly_app.py
 
7
  pinned: true
8
  ---
9
 
@@ -26,24 +26,44 @@ By facilitating the translation and anonymization of clinical reports, ClinFly h
26
 
27
  ![](img/pipeline.png)
28
 
29
- ## Run the framework
30
-
31
- A webapp is accessible at [https://huggingface.co/spaces/kyauy/ClinFly](https://huggingface.co/spaces/kyauy/ClinFly), **please try it !**
32
-
33
- It's a streamlit application, where code is accessible in ̀`clinfly_app.py` file.
34
 
35
  To install on your local machine, you need `poetry` package manager and launch in the folder:
36
  ```
37
  poetry install
38
  ```
39
 
40
- To make it run in your local computer:
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  ```
42
  poetry shell
43
- streamlit run clinfly_app.py
44
  ```
45
 
46
- Using requirement ?
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  ```
48
- poetry export --without-hashes --format=requirements.txt > requirements.txt
49
- ```
 
1
  ---
2
  title: ClinFly
3
  emoji: small_airplane
 
4
  sdk_version: 1.21.0
5
+ streamlit_file: clinfly_app_st.py
6
+ CLI_file: clinfly_app_cli.py
7
  pinned: true
8
  ---
9
 
 
26
 
27
  ![](img/pipeline.png)
28
 
29
+ ## Poetry Installation
 
 
 
 
30
 
31
  To install on your local machine, you need `poetry` package manager and launch in the folder:
32
  ```
33
  poetry install
34
  ```
35
 
36
+ To export the dependencies to a `requirements.txt` file instead:
37
+ ```
38
+ poetry export --without-hashes --format=requirements.txt > requirements.txt
39
+ ```
40
+
41
+ ## Run the code
42
+
43
+ ### Graphical User Interface - Single report usage with interactive analysis
44
+
45
+ A web app is accessible at https://huggingface.co/spaces/kyauy/ClinFly, please try it!
46
+
47
+ It's a Streamlit application whose code is in the `clinfly_app_st.py` file. The helper functions are in the `utilities` folder.
48
+
49
+ To run the streamlit application on your local computer :
50
  ```
51
  poetry shell
52
+ streamlit run clinfly_app_st.py
53
  ```
54
 
55
+ ### Command Line Interface - Multiple report usage with offline options
56
+
57
+ The code is in the `clinfly_app_cli.py` file. The helper functions are in the `utilities` folder.
58
+
59
+ The outputs are written to sub-folders of the `results` folder, one per output type (`Reports`, `TSV`, `JSON` and `TXT`).
60
+
61
+ The translated and de-identified report will be generated and placed in the `results/Reports` folder.
62
+
63
+ Three HPO extraction outputs will be generated: TSV, TXT and JSON.
64
+
65
+ To run the CLI application on your local computer :
66
+ ```
67
+ poetry shell
68
+ <python running version> clinfly_app_cli.py --file <input CSV file, one report per line: last name;first name;report> --language <language of the reports: fr, es or de> --output_dir <directory where the models are downloaded (OPTIONAL)>
69
  ```
 
 
clinfly_app_cli.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import os
3
+ import argparse
4
+ import pandas as pd
5
+ from utilities.anonymize import get_cities_list,get_abbreviation_dict_correction, reformat_to_report, anonymize_analyzer, anonymize_engine, add_space_to_comma_endpoint,get_list_not_deidentify, config_deidentify
6
+ from utilities.translate import get_translation_dict_correction, translate_report
7
+ from utilities.convert import convert_df_no_header, convert_df, convert_json, convert_list_phenogenius
8
+ from utilities.extract_hpo import add_biometrics, extract_hpo
9
+ from utilities.get_model import get_models, get_nlp_marian
10
+ import gc
11
+
12
+
13
def main():
    """Translate, de-identify and summarise one report, then write the results.

    The function takes no arguments. It reads the module-level names that the
    ``__main__`` section assigns before calling it: ``Report``, ``Last_name``,
    ``First_name``, ``nlp_fr``, ``marian_fr_en``, ``dict_correction``,
    ``dict_abbreviation_correction``, ``analyzer``, ``engine`` and
    ``proper_noun``.

    Four files named ``<Last_name>_<First_name>_...`` are written:

    * ``results/Reports``: translated and de-identified report (text)
    * ``results/TSV``: every extracted HPO term, high and low confidence
    * ``results/JSON``: terms flagged "To keep in list" only
    * ``results/TXT``: terms flagged "To keep in list" only
    """
    print("Code Starting")

    # Create the output folders on demand so that a checkout without a
    # ``results`` tree does not fail with FileNotFoundError on the first write.
    for sub_dir in ("Reports", "TSV", "JSON", "TXT"):
        os.makedirs(os.path.join("results", sub_dir), exist_ok=True)

    file_prefix = Last_name + "_" + First_name

    translated_text, _, _ = translate_report(
        Report,
        Last_name,
        First_name,
        nlp_fr,
        marian_fr_en,
        dict_correction,
        dict_abbreviation_correction,
    )
    translated_report = reformat_to_report(translated_text, nlp_fr)
    del translated_text

    print("Translation and De-identification")
    analyzed_report, analyzer_results, _, _ = anonymize_analyzer(
        translated_report, analyzer, proper_noun, Last_name, First_name
    )
    print(analyzed_report)

    deidentified_report = anonymize_engine(
        translated_report, analyzer_results, engine, nlp_fr
    )
    # One DataFrame row per line of the de-identified report.
    deidentified_df = pd.DataFrame(deidentified_report.split("\n"))

    report_name = file_prefix + "_translated_and_deindentified_report.txt"
    # The payload is decoded as UTF-8, so write it as UTF-8 too instead of
    # relying on the platform's locale encoding.
    with open("results/Reports/" + report_name, "w", encoding="utf-8") as file:
        file.write(convert_df_no_header(deidentified_df).decode("utf-8"))
    print("Text file created successfully : " + report_name)

    print("Summarization")

    spaced_report = add_space_to_comma_endpoint(deidentified_report, nlp_fr)
    biometrics_report, _ = add_biometrics(spaced_report, nlp_fr)
    clinphen, clinphen_unsafe = extract_hpo(biometrics_report)

    # Release the intermediate texts before building the result tables.
    del deidentified_report
    del spaced_report
    del biometrics_report

    # Second extract_hpo result: excluded from the curated outputs by default.
    clinphen_unsafe["To keep in list"] = False
    clinphen_unsafe["Confidence on extraction"] = "low"

    # First extract_hpo result: kept by default.
    clinphen["Confidence on extraction"] = "high"
    clinphen["To keep in list"] = True

    cols = [
        "HPO ID",
        "Phenotype name",
        "To keep in list",
        "No. occurrences",
        "Earliness (lower = earlier)",
        "Confidence on extraction",
        "Example sentence",
    ]
    clinphen_df = pd.concat([clinphen, clinphen_unsafe]).reset_index()[cols]
    # Element-wise comparison on purpose: it yields a clean boolean mask
    # whatever dtype the flag column ends up with after the concat.
    kept_terms_df = clinphen_df[clinphen_df["To keep in list"] == True]  # noqa: E712
    del clinphen
    del clinphen_unsafe
    gc.collect()

    tsv_name = file_prefix + "_summarized_report.tsv"
    with open("results/TSV/" + tsv_name, "w", encoding="utf-8") as file:
        file.write(convert_df(clinphen_df).decode("utf-8"))
    print("Tsv file created successfully : " + tsv_name)

    json_name = file_prefix + "_summarized_report.json"
    with open("results/JSON/" + json_name, "w", encoding="utf-8") as file:
        file.write(convert_json(kept_terms_df))
    print("JSON file created successfully : " + json_name)

    txt_name = file_prefix + "_summarized_report.txt"
    with open("results/TXT/" + txt_name, "w", encoding="utf-8") as file:
        file.write(convert_list_phenogenius(kept_terms_df))
    print("Text file created successfully : " + txt_name)
110
+
111
+
112
+
113
if __name__ == "__main__":
    # Command-line entry point: load the models and resources once, then run
    # main() for every row of the input CSV. main() reads the names assigned
    # here (Last_name, First_name, Report, nlp_fr, ...) as module-level
    # globals, so they must keep these exact names.

    print("Welcome to the Clinfly app")

    parser = argparse.ArgumentParser(description="How to use Clinfly.")
    parser.add_argument("--file", type=str,help="the input file which contains the visits informations", required=True)
    parser.add_argument("--language", choices=['fr', 'es', 'de'],type=str, help="The language of the input : fr, es , de",required=True)
    parser.add_argument("--output_dir",default=os.path.expanduser("~"),type=str, help="The directory where the models will be downloaded.")

    args = parser.parse_args()

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(args.output_dir, exist_ok=True)

    print("Language chosen :", args.language)
    models_status = get_models(args.language, args.output_dir)
    dict_correction = get_translation_dict_correction()
    dict_abbreviation_correction = get_abbreviation_dict_correction()
    proper_noun = get_list_not_deidentify()
    cities_list = get_cities_list()
    analyzer, engine = config_deidentify(cities_list)
    nlp_fr, marian_fr_en = get_nlp_marian(args.language)

    file_name = args.file
    Last_name: str
    First_name: str
    Report: str
    # utf-8-sig strips the byte-order mark that some spreadsheet exports add.
    with open(file_name, newline='', encoding='utf-8-sig') as fichier_csv:
        lecteur_csv = csv.reader(fichier_csv, delimiter=";")
        for ligne in lecteur_csv:
            # Blank lines (typically a trailing newline at the end of the
            # export) carry no report: skip them instead of crashing.
            if not any(champ.strip() for champ in ligne):
                continue
            if len(ligne) != 3:
                raise ValueError(
                    f"{file_name}, line {lecteur_csv.line_num}: expected 3 "
                    f"';'-separated fields (last name;first name;report), "
                    f"got {len(ligne)}"
                )
            Last_name, First_name, Report = ligne
            print("Last_name:", Last_name)
            print("First_name:", First_name)
            print("Report:", Report)
            main()
            print()
151
+
152
+
clinfly_app_st.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from utilities.web_utilities import display_page_title, display_sidebar, stack_checker
3
+ from utilities.anonymize import get_cities_list,get_abbreviation_dict_correction, reformat_to_report, anonymize_analyzer, anonymize_engine, add_space_to_comma_endpoint,get_list_not_deidentify, config_deidentify
4
+ from utilities.translate import get_translation_dict_correction, translate_report
5
+ from utilities.convert import convert_df_no_header, convert_df, convert_json, convert_list_phenogenius
6
+ from utilities.extract_hpo import add_biometrics, extract_hpo
7
+ from utilities.get_model import get_models, get_nlp_marian
8
+ import streamlit as st
9
+ import gc
10
+
11
+ # -- Set page config
12
# -- Page set-up
app_title: str = "ClinFly"

display_page_title(app_title)
display_sidebar()

# -- Resources used by the translation and de-identification steps
cities_list = get_cities_list()
dict_correction = get_translation_dict_correction()
dict_abbreviation_correction = get_abbreviation_dict_correction()
nom_propre = get_list_not_deidentify()
analyzer, engine = config_deidentify(cities_list)

# -- Session state: every flag / slot starts as False on the first run
for _state_key in (
    "load_models",
    "select_lang",
    "nlp_fr",
    "marian_fr_en",
    "load_report",
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = False

# -- Step 1: choose the source language and load the matching models
if st.session_state.load_models is False:
    with st.form("language"):
        source_lang = st.selectbox(
            "Which is the language of the letter :fr: :es: :de: ?",
            ("fr", "es", "de"),
        )
        submit_button_L = st.form_submit_button(label="Submit language")

    if submit_button_L:
        with st.spinner('Downloading models, it takes a moment, please wait'):
            models_status = get_models(source_lang)
            nlp_fr, marian_fr_en = get_nlp_marian(source_lang)
            # Keep the language and models in the session so later reruns
            # skip this step.
            st.session_state["select_lang"] = source_lang
            st.session_state["nlp_fr"] = nlp_fr
            st.session_state["marian_fr_en"] = marian_fr_en
            st.session_state["load_models"] = True
53
+
54
# -- Step 2: once the models are loaded, collect a report, translate and
# de-identify it, extract HPO terms and offer the results for download.
if st.session_state.load_models is True:
    st.info('Selected language is : ' + st.session_state.select_lang)
    with st.form("my_form"):
        c1, c2 = st.columns(2)
        with c1:
            nom = st.text_input("Last name", "Doe", key="name")
        with c2:
            prenom = st.text_input("First name", "John", key="surname")
        courrier = st.text_area(
            "Paste medical letter",
            "Chers collegues, j'ai recu en consultation M. John Doe né le 14/07/1789 pour une fièvre récurrente et une maladie de Crohn. Il a pour antécédent des epistaxis recurrents. Parmi les antécédants familiaux, sa maman a présenté un cancer des ovaires. Il mesure 1.90 m (+2.5 DS), pèse 93 kg (+3.6 DS) et son PC est à 57 cm (+0DS) ...",
            height=200,
            key="letter",
        )

        submit_button = st.form_submit_button(label="Submit report")

    # load_report stays True after the first submission, so reruns triggered
    # by the editors and download buttons below keep showing the results.
    if submit_button or st.session_state.load_report:
        st.session_state.load_report = True
        MarianText, _, _ = translate_report(
            courrier,
            nom,
            prenom,
            st.session_state.nlp_fr,
            st.session_state.marian_fr_en,
            dict_correction,
            dict_abbreviation_correction,
        )
        MarianText_letter = reformat_to_report(MarianText, st.session_state.nlp_fr)
        del MarianText

        st.subheader("Translation and De-identification")
        (
            MarianText_anonymize_letter_analyze,
            analyzer_results_return,
            _,
            _,
        ) = anonymize_analyzer(MarianText_letter, analyzer, nom_propre, nom, prenom)

        st.caption(MarianText_anonymize_letter_analyze)

        MarianText_anonymize_letter_engine = anonymize_engine(
            MarianText_letter, analyzer_results_return, engine, st.session_state.nlp_fr
        )

        # One editable row per line of the de-identified letter.
        MarianText_anonymize_letter_engine_modif = pd.DataFrame(
            MarianText_anonymize_letter_engine.split("\n")
        )
        MarianText_anonymize_letter_engine_modif.columns = [
            "Modify / curate the automatically translated and de-identified letter before downloading:"
        ]
        MarianText_anonymize_letter_engine_df = st.data_editor(
            MarianText_anonymize_letter_engine_modif,
            num_rows="dynamic",
            key="letter_editor",
            use_container_width=True,
        )

        st.caption("Modify cells above 👆 or even ➕ add rows, before downloading 👇")

        st.download_button(
            "Download translated and de-identified letter",
            convert_df_no_header(MarianText_anonymize_letter_engine_df),
            nom + "_" + prenom + "_translated_and_deindentified_letter.txt",
            "text/plain",  # was "text", which is not a valid MIME type
            key="download-translation-deindentification",
        )

        st.subheader("Summarization")

        MarianText_anonymized_reformat_space = add_space_to_comma_endpoint(
            MarianText_anonymize_letter_engine, st.session_state.nlp_fr
        )
        MarianText_anonymized_reformat_biometrics, _ = add_biometrics(
            MarianText_anonymized_reformat_space, st.session_state.nlp_fr
        )
        clinphen, clinphen_unsafe = extract_hpo(MarianText_anonymized_reformat_biometrics)

        # Release the intermediate texts before building the result tables.
        del MarianText_anonymize_letter_engine
        del MarianText_anonymized_reformat_space
        del MarianText_anonymized_reformat_biometrics

        # Second extract_hpo result: unchecked (not kept) by default.
        clinphen_unsafe["To keep in list"] = False
        clinphen_unsafe["Confidence on extraction"] = "low"

        # First extract_hpo result: kept by default.
        clinphen["Confidence on extraction"] = "high"
        clinphen["To keep in list"] = True

        cols = [
            "HPO ID",
            "Phenotype name",
            "To keep in list",
            "No. occurrences",
            "Earliness (lower = earlier)",
            "Confidence on extraction",
            "Example sentence",
        ]
        clinphen_all = pd.concat([clinphen, clinphen_unsafe]).reset_index()
        clinphen_all = clinphen_all[cols]
        clinphen_df = st.data_editor(
            clinphen_all, num_rows="dynamic", key="data_editor"
        )
        # Element-wise "== True" on purpose: rows added in the editor may
        # leave the checkbox empty (presumably None/NaN, not verified here),
        # and those must count as "not kept" rather than break the mask.
        clinphen_df_without_low_confidence = clinphen_df[
            clinphen_df["To keep in list"] == True  # noqa: E712
        ]
        del clinphen
        del clinphen_unsafe
        gc.collect()

        st.caption(
            "Modify cells above 👆, click ☐ to keep low confidence symptoms in list, or even ➕ add rows, before downloading 👇"
        )

        st.download_button(
            "Download summarized letter in HPO CSV format",
            convert_df(clinphen_df),
            nom + "_" + prenom + "_summarized_letter.tsv",
            "text/csv",
            key="download-summarization",
        )

        st.download_button(
            "Download summarized letter in Phenotips JSON format (hygen compatible)",
            convert_json(clinphen_df_without_low_confidence),
            nom + "_" + prenom + "_summarized_letter.json",
            "application/json",  # was "json", which is not a valid MIME type
            key="download-summarization-json",
        )

        st.download_button(
            "Download summarized letter in PhenoGenius list of HPO format",
            convert_list_phenogenius(clinphen_df_without_low_confidence),
            nom + "_" + prenom + "_summarized_letter.txt",
            "text/plain",  # was "text", which is not a valid MIME type
            key="download-summarization-phenogenius",
        )