Spaces:
Running
Running
| import streamlit as st | |
| import numpy as np | |
| import pandas as pd | |
| from PIL import Image | |
| import ujson as json | |
| import pickle as pk | |
| from plotnine import * | |
# -- Page configuration (browser tab title, icon, layout)
apptitle = "PhenoGenius"
st.set_page_config(
    page_title=apptitle,
    page_icon=":genie:",
    layout="wide",
    initial_sidebar_state="auto",
)

# -- Sidebar: project logo, title, tagline and description
image_pg = Image.open("data/img/phenogenius.png")
st.sidebar.image(image_pg, caption=None, width=100)
st.sidebar.title("PhenoGenius")
st.sidebar.header(
    "Learning phenotypic patterns in genetic diseases by symptom interaction modeling"
)
st.sidebar.markdown(
    """
This webapp presents symptom interaction models in genetic diseases to provide:
- Standardized clinical descriptions
- Interpretable matches between symptoms and genes
Code source is available in GitHub:
[https://github.com/kyauy/PhenoGenius](https://github.com/kyauy/PhenoGenius)
Last update: 2024-07-15
PhenoGenius is a collaborative project from:
"""
)

# -- Sidebar: partner logos, rendered in this order with their display width
for logo_path, logo_width in (
    ("data/img/logo-uga.png", 95),
    ("data/img/logo-seqone.png", 95),
    ("data/img/logoMIAI-rvb.png", 95),
    ("data/img/logo-chuga.png", 60),
):
    st.sidebar.image(Image.open(logo_path), caption=None, width=logo_width)
def convert_df(df):
    """Serialize ``df`` as tab-separated text and return it as UTF-8 bytes."""
    tsv_text = df.to_csv(sep="\t")
    return tsv_text.encode("utf-8")
def load_data():
    """Read the gzip-compressed, tab-separated matrix from ``data/resources``.

    The first column of the file is used as the row index of the returned
    DataFrame.
    """
    return pd.read_csv(
        "data/resources/ohe_all_thesaurus_weighted_2024.tsv.gz",
        sep="\t",
        compression="gzip",
        index_col=0,
    )
def load_nmf_model():
    """Unpickle the NMF model and its reduced matrix from ``data/resources``.

    Returns:
        A ``(pheno_NMF, reduced)`` tuple, in that order.
    """
    model_path = "data/resources/pheno_NMF_390_model_42_2024.pkl"
    matrix_path = "data/resources/pheno_NMF_390_matrix_42_2024.pkl"
    # NOTE: pickle is only acceptable here because both files ship with the app.
    with open(model_path, "rb") as model_file:
        pheno_NMF = pk.load(model_file)
    with open(matrix_path, "rb") as matrix_file:
        reduced = pk.load(matrix_file)
    return pheno_NMF, reduced
def symbol_to_id_to_dict():
    """Build both gene lookup tables from the NCBI ``gene_info`` file.

    Only rows whose ``#tax_id`` equals 9606 are kept.

    Returns:
        A tuple ``(symbol_to_id, id_to_symbol)``: the first dict maps the
        ``Symbol`` column to ``GeneID``, the second maps ``GeneID`` to
        ``Symbol``.
    """
    gene_info = pd.read_csv("data/resources/Homo_sapiens.gene_info.gz", sep="\t")
    human_genes = gene_info[gene_info["#tax_id"] == 9606]
    symbol_to_id = human_genes.set_index("Symbol")["GeneID"].to_dict()
    id_to_symbol = human_genes.set_index("GeneID")["Symbol"].to_dict()
    return symbol_to_id, id_to_symbol
def load_hp_ontology():
    """Parse ``data/resources/hpo_obo_2024.json`` and return its content."""
    with open("data/resources/hpo_obo_2024.json") as ontology_file:
        return json.load(ontology_file)
def hpo_description_to_id():
    """Invert the module-level ``hp_onto`` mapping into ``{name: key}``.

    When several entries share the same ``"name"``, the one iterated last wins.
    """
    return {entry["name"]: hpo_id for hpo_id, entry in hp_onto.items()}
def load_topic_data():
    """Read the tab-separated topic table, using its first column as index."""
    return pd.read_csv(
        "data/resources/main_topics_hpo_390_42_filtered_norm_004_2024.tsv",
        sep="\t",
        index_col=0,
    )
def load_similarity_dict():
    """Parse ``similarity_dict_threshold_80_2024.json`` and return its content."""
    with open("data/resources/similarity_dict_threshold_80_2024.json") as similarity_file:
        return json.load(similarity_file)
def get_symbol(gene):
    """Look ``gene`` up in the module-level ``symbol`` dict; ``None`` if absent."""
    return symbol.get(gene)
def get_hpo_name(hpo):
    """Return ``{hpo: name}`` from the module-level ``hp_onto``, or ``{}`` if unknown."""
    if hpo not in hp_onto:
        return {}
    return {hpo: hp_onto[hpo]["name"]}
def get_hpo_name_only(hpo):
    """Return the ``"name"`` of ``hpo`` in the module-level ``hp_onto``, else ``None``."""
    if hpo not in hp_onto:
        return None
    return hp_onto[hpo]["name"]
def get_hpo_name_list(hpo_list, hp_onto):
    """Map each term of ``hpo_list`` found in ``hp_onto`` to its ``"name"``.

    Terms that are not keys of ``hp_onto`` are left out of the result.
    """
    return {term: hp_onto[term]["name"] for term in hpo_list if term in hp_onto}
def get_similar_terms(hpo_list, similarity_terms_dict, threshold=0.8):
    """Expand ``hpo_list`` with similar terms and give every term a weight.

    Each input term gets weight 1. For every neighbour of an input term whose
    similarity is strictly greater than ``threshold``, the candidate weight is
    the similarity divided by the total number of neighbours listed for that
    input term (including those below the threshold). A neighbour keeps the
    highest candidate weight seen so far.

    Args:
        hpo_list: input terms.
        similarity_terms_dict: ``{term: {neighbour: similarity}}``.
        threshold: minimum (exclusive) similarity for a neighbour to be kept;
            defaults to 0.8, the value that used to be hard-coded.

    Returns:
        ``(weights, terms)`` where ``weights`` maps term -> weight and
        ``terms`` is the list of its keys in insertion order.
    """
    weights = {}
    for term in hpo_list:
        # An explicitly provided term always counts fully, even if it was
        # previously added as a neighbour with a lower weight.
        weights[term] = 1
        if term not in similarity_terms_dict:
            continue
        neighbours = similarity_terms_dict[term]
        neighbour_count = len(neighbours)
        for neighbour, similarity in neighbours.items():
            if similarity <= threshold:
                continue
            candidate = similarity / neighbour_count
            if neighbour not in weights or candidate > weights[neighbour]:
                weights[neighbour] = candidate
    return weights, list(weights)
def score(hpo_list, matrix):
    """Score every row of ``matrix`` on the ``hpo_list`` columns.

    Returns a new DataFrame holding the selected columns followed by ``"sum"``
    (row-wise total of those columns) and ``"gene_symbol"`` (``get_symbol``
    applied to the row index), sorted by ``"sum"`` in descending order.
    ``matrix`` itself is not modified.
    """
    selected = matrix[hpo_list].copy()
    # Both extra columns are computed from the selection before being attached,
    # so "sum" only covers the requested columns.
    row_totals = selected.sum(axis=1)
    symbols = selected.index.to_series().apply(get_symbol)
    scored = selected.assign(**{"sum": row_totals, "gene_symbol": symbols})
    return scored.sort_values("sum", ascending=False)
def score_sim_add(hpo_list_add, matrix, sim_dict):
    """Score every row of ``matrix`` on weighted ``hpo_list_add`` columns.

    Each selected column that is also a key of ``sim_dict`` is multiplied by
    the matching value. The result gets a ``"sum"`` column (row-wise total of
    the weighted columns) and a ``"gene_symbol"`` column (``get_symbol``
    applied to the row index), and is returned sorted by ``"sum"`` in
    descending order. ``matrix`` itself is not modified.
    """
    weighted = matrix[hpo_list_add].copy()
    for term, weight in sim_dict.items():
        if term not in weighted.columns:
            continue
        weighted[term] = weighted[term] * weight
    # Total is taken before "gene_symbol" is attached so it only covers terms.
    weighted["sum"] = weighted.sum(axis=1)
    weighted["gene_symbol"] = weighted.index.to_series().apply(get_symbol)
    return weighted.sort_values("sum", ascending=False)
def get_phenotype_specificity(gene_diag, data_patient):
    """Translate the rank of ``gene_diag`` into a phenotype-specificity label.

    The gene symbol is converted to a row label through the module-level
    ``ncbi`` dict, then its ``"rank"`` in ``data_patient`` is classified:

    * D - rank equals the largest rank of the table;
    * A - rank within the top 40;
    * B - rank within the top 250;
    * C - anything else.

    Both tiers are inclusive. The B tier used to test ``rank < 250``, which
    sent rank 250 to class C although the label says "top 250".

    Args:
        gene_diag: gene symbol, must be a key of ``ncbi``.
        data_patient: DataFrame with a ``"rank"`` column, indexed by the
            integer obtained from ``ncbi``.

    Returns:
        The label as a string starting with its class letter.

    Raises:
        KeyError: if the symbol is not in ``ncbi`` or its id is not in the index.
    """
    top_tier_a = 40
    top_tier_b = 250

    rank = data_patient.loc[int(ncbi[gene_diag]), "rank"]
    # Checked first: a gene tied at the worst rank is never better than D.
    if rank == data_patient["rank"].max():
        return "D - the reported phenotype is NOT consistent with what is expected for the gene/genomic region or not consistent in general."
    if rank <= top_tier_a:
        return "A - the reported phenotype is highly specific and relatively unique to the gene (top 40, 50 perc of diagnosis in PhenoGenius cohort)."
    if rank <= top_tier_b:
        return "B - the reported phenotype is consistent with the gene, is highly specific, but not necessarily unique to the gene (top 250, 75 perc of diagnosis in PhenoGenius cohort)."
    return "C - the phenotype is reported with limited association with the gene, not highly specific and/or with high genetic heterogeneity."
def get_relatives_list(hpo_list, hp_onto):
    """Return the terms of ``hpo_list`` plus their listed parents and children.

    For each term present in ``hp_onto``, the entries of its ``"parents"`` and
    ``"childrens"`` fields are added. The result is de-duplicated and its
    order is not defined.
    """
    relatives = set()
    for term in hpo_list:
        relatives.add(term)
        if term not in hp_onto:
            continue
        entry = hp_onto[term]
        relatives.update(entry["parents"])
        relatives.update(entry["childrens"])
    return list(relatives)
def get_hpo_id(hpo_list):
    """Convert descriptions to ids via ``hp_desc_id`` and join them with commas.

    Raises:
        KeyError: if a description is not a key of the module-level
            ``hp_desc_id`` dict.
    """
    return ",".join(hp_desc_id[description] for description in hpo_list)
# -- Module-level lookup tables read by the helper functions above
hp_onto = load_hp_ontology()
hp_desc_id = hpo_description_to_id()
# ncbi: gene symbol -> GeneID; symbol: GeneID -> gene symbol
ncbi, symbol = symbol_to_id_to_dict()

# -- Input form: HPO terms (picked by name or pasted as ids) and optional gene
with st.form("my_form"):
    left_col, right_col = st.columns(2)
    with left_col:
        hpo_raw = st.multiselect(
            "Select interactively your HPOs or...",
            list(hp_desc_id),
            ["Renal cyst", "Hepatic cysts"],
        )
    with right_col:
        hpo = st.text_input(
            "copy/paste your HPOs, separated with comma",
            "HP:0000107,HP:0001407",
        )
    gene_diag_input = st.multiselect(
        "Optional: provide HGNC gene symbol to be tested",
        options=list(ncbi),
        default=["PKD1"],
        max_selections=1,
    )
    submit_button = st.form_submit_button(label="Submit")
def _resolve_hpo_term(term, data, hp_onto):
    """Return ``term`` or its nearest ancestor that has gene associations.

    Starting at ``term``, follow the first ``"direct_parent"`` of each term in
    ``hp_onto`` until a term is found whose column in ``data`` has at least
    one non-zero value. Returns ``None`` when the root ``HP:0000001`` is
    reached or the chain cannot be followed (term missing from ``hp_onto`` or
    without a direct parent). Ancestors absent from ``data`` are skipped
    instead of raising ``KeyError``.
    """
    root = "HP:0000001"
    current = term
    while current != root:
        if current in data.columns and data[current].astype(bool).sum(axis=0) != 0:
            return current
        parents = hp_onto.get(current, {}).get("direct_parent")
        if not parents:
            return None
        current = parents[0]
    return None


def _report_gene_diagnosis(gene_diag, ranked, plot_df, score_col, show_row=False):
    """Render the density plot, rank, count and specificity of the tested gene.

    Args:
        gene_diag: gene symbol, a key of the module-level ``ncbi`` dict.
        ranked: DataFrame indexed by gene id with ``"rank"``, ``"sum"`` and
            ``score_col`` columns.
        plot_df: DataFrame whose ``score_col`` distribution is plotted.
        score_col: column used for the density and the dashed marker.
        show_row: when true, also display the gene's full row of ``ranked``.
    """
    gene_id = int(ncbi[gene_diag])
    if gene_id not in ranked.index:
        st.write("Gene ID rank:", " Gene not available in PhenoGenius database")
        return

    density_plot = (
        ggplot(plot_df, aes(score_col))
        + geom_density()
        + geom_vline(
            xintercept=ranked.loc[gene_id, score_col],
            linetype="dashed",
            color="red",
            size=1.5,
        )
        + ggtitle("Matching score distribution")
        + xlab("Gene matching score")
        + ylab("% of genes")
        + theme_bw()
        + theme(
            text=element_text(size=12),
            figure_size=(5, 5),
            axis_ticks=element_line(colour="black", size=4),
            axis_line=element_line(colour="black", size=2),
            axis_text_x=element_text(angle=45, hjust=1),
            axis_text_y=element_text(angle=60, hjust=1),
            subplots_adjust={"wspace": 0.1},
            legend_position=(0.7, 0.35),
        )
    )
    # Only the first of three columns is used so the figure stays narrow.
    plot_col, _, _ = st.columns(3)
    with plot_col:
        st.pyplot(ggplot.draw(density_plot))
    st.write(
        "Gene ID rank:",
        ranked.loc[gene_id, "rank"],
        " | ",
        "Gene ID count:",
        round(ranked.loc[gene_id, "sum"], 4),
    )
    if show_row:
        st.write(ranked.loc[[gene_id]])
    st.write(
        "Gene ID phenotype specificity:",
        get_phenotype_specificity(gene_diag, ranked),
    )


if submit_button:
    # The interactive selection replaces the pasted ids only when the user
    # changed it from its default value and left it non-empty.
    if hpo_raw != ["Renal cyst", "Hepatic cysts"] and len(hpo_raw) > 0:
        hpo = get_hpo_id(hpo_raw)

    data = load_data()
    pheno_NMF, reduced = load_nmf_model()
    topic = load_topic_data()
    similarity_terms_dict = load_similarity_dict()

    # Strip each term so "HP:0000107, HP:0001407" is accepted, and drop the
    # empty items produced by stray or trailing commas.
    hpo_list_ini = [term.strip() for term in hpo.split(",") if term.strip()]

    gene_diag = None
    if gene_diag_input:
        if gene_diag_input[0] in ncbi:
            gene_diag = gene_diag_input[0]
        else:
            # gene_diag_input is a list: use its single element in the message
            # (concatenating the list itself with a str raises TypeError).
            st.write(
                gene_diag_input[0]
                + " gene are not in our database. Please check gene name (need to be in CAPITAL format)."
            )

    # Validate each term; terms without any gene association are replaced by
    # their nearest ancestor that has some.
    hpo_list_up = []
    for term in hpo_list_ini:
        if term == "HP:0000001":
            continue
        if len(term) != 10:
            st.write(
                "Incorrect HPO format: "
                + term
                + ". Please check (7-digits terms with prefix HP:, and separed by commas)."
            )
            continue
        if term not in data.columns:
            st.write(term + " not available in current database. Please modify.")
            continue
        resolved = _resolve_hpo_term(term, data, hp_onto)
        if resolved is None:
            st.write(
                "No gene-HPO associations was found for " + term + " and parents."
            )
            continue
        hpo_list_up.append(resolved)
        if resolved != term:
            # Report the ancestor actually used, which may be further up than
            # the direct parent.
            st.write(
                "We replaced: ",
                term,
                " by ",
                resolved,
                "-",
                get_hpo_name(resolved),
            )

    hpo_list = list(set(hpo_list_up))
    del hpo_list_up

    if hpo_list:
        with st.expander("See HPO inputs"):
            st.write(get_hpo_name_list(hpo_list_ini, hp_onto))
        del hpo_list_ini

        # ---- Clinical description -------------------------------------
        st.header("Clinical description with symptom interaction modeling")
        # Built once and reused by every distance computation below.
        reduced_df = pd.DataFrame(reduced).set_index(data.index)

        # "witness" is the all-zero profile used as the reference point.
        witness = np.zeros(len(data.columns))
        witness_nmf = np.matmul(pheno_NMF.components_, witness)
        patient = np.zeros(len(data.columns))
        for term in hpo_list:
            patient[data.columns.get_loc(term)] = 1
        patient_nmf = np.matmul(pheno_NMF.components_, patient)

        witness_sugg_df = reduced_df.apply(lambda x: (x - witness_nmf) ** 2, axis=1)
        patient_sugg_df = reduced_df.apply(lambda x: (x - patient_nmf) ** 2, axis=1)
        case_sugg_df = (patient_sugg_df - witness_sugg_df).sum()
        patient_df_info = pd.DataFrame(case_sugg_df).merge(
            topic, left_index=True, right_index=True
        )
        patient_df_info["mean_score"] = round(
            patient_df_info[0] / (patient_df_info["total_weight"] ** 2), 4
        )
        patient_df_info_write = patient_df_info[
            ["mean_score", "main_term", "n_hpo", "hpo_name", "hpo_list", "weight"]
        ].sort_values("mean_score", ascending=False)
        # Intermediate frames are released early to keep peak memory down.
        del case_sugg_df
        del patient_sugg_df
        del witness_sugg_df
        del patient

        with st.expander("See projection in groups of symptoms dimension*"):
            st.dataframe(patient_df_info_write)
            # "\\*" keeps the literal backslash that escapes the asterisk in
            # markdown without relying on an invalid Python escape sequence.
            st.write(
                "\\* For interpretability, we report only the top 10% of the 390 groups of interacting symptom associations"
            )
            match_proj_csv = convert_df(patient_df_info_write)
            st.download_button(
                "Download description projection",
                match_proj_csv,
                "clin_desc_projected.tsv",
                "text/csv",
                key="download-csv-proj",
            )

        # ---- Similar symptoms -----------------------------------------
        sim_dict, hpo_list_add_raw = get_similar_terms(hpo_list, similarity_terms_dict)
        hpo_list_add = list(set(hpo_list_add_raw) & set(data.columns.tolist()))
        similar_list = list(set(hpo_list_add) - set(hpo_list))
        similar_list_desc = get_hpo_name_list(similar_list, hp_onto)
        if similar_list_desc:
            with st.expander("See symptoms with similarity > 80%"):
                similar_list_desc_df = pd.DataFrame.from_dict(
                    similar_list_desc, orient="index"
                )
                similar_list_desc_df.columns = ["description"]
                st.write(similar_list_desc_df)
                del similar_list_desc_df
        del similar_list
        del similar_list_desc

        # ---- Phenotype matching ---------------------------------------
        st.header("Phenotype matching")
        results_sum = score(hpo_list, data)
        results_sum["matchs"] = results_sum[hpo_list].astype(bool).sum(axis=1)
        results_sum["score"] = results_sum["matchs"] + results_sum["sum"]
        results_sum["rank"] = (
            results_sum["score"].rank(ascending=False, method="max").astype(int)
        )
        # Move the four summary columns in front of the per-term columns.
        cols = results_sum.columns.tolist()
        cols = cols[-4:] + cols[:-4]
        match = results_sum[cols].sort_values(by=["score"], ascending=False)
        st.dataframe(match[match["score"] > 1.01].drop(columns=["sum"]))
        match_csv = convert_df(match)
        st.download_button(
            "Download matching results",
            match_csv,
            "match.tsv",
            "text/csv",
            key="download-csv-match",
        )
        if gene_diag:
            _report_gene_diagnosis(
                gene_diag, results_sum, match, "score", show_row=True
            )
        del results_sum
        del match

        # ---- Phenotype matching by similarity -------------------------
        st.header("Phenotype matching by similarity of symptoms")
        results_sum_add = score_sim_add(hpo_list_add, data, sim_dict)
        results_sum_add["rank"] = (
            results_sum_add["sum"].rank(ascending=False, method="max").astype(int)
        )
        # Move "gene_symbol" and "rank" in front of the per-term columns.
        cols = results_sum_add.columns.tolist()
        cols = cols[-2:] + cols[:-2]
        match_sim = results_sum_add[cols].sort_values(by=["sum"], ascending=False)
        st.dataframe(match_sim[match_sim["sum"] > 0.01])
        match_sim_csv = convert_df(match_sim)
        st.download_button(
            "Download matching results",
            match_sim_csv,
            "match_sim.tsv",
            "text/csv",
            key="download-csv-match-sim",
        )
        if gene_diag:
            _report_gene_diagnosis(gene_diag, results_sum_add, match_sim, "sum")
        del sim_dict
        del hpo_list_add
        del results_sum_add
        del match_sim

        # ---- Phenotype matching by groups of symptoms -----------------
        st.header("Phenotype matching by groups of symptoms")
        patient_df = reduced_df.apply(lambda x: sum((x - patient_nmf) ** 2), axis=1)
        witness_df = reduced_df.apply(lambda x: sum((x - witness_nmf) ** 2), axis=1)
        del patient_nmf
        del witness
        del witness_nmf
        del reduced_df
        case_df = pd.DataFrame(patient_df - witness_df)
        case_df.columns = ["score"]
        # Flip the scale so the lowest raw score gets the highest value.
        case_df["score_norm"] = abs(case_df["score"] - case_df["score"].max())
        case_df["sum"] = case_df["score_norm"]
        case_df_sort = case_df.sort_values(by="sum", ascending=False)
        case_df_sort["rank"] = (
            case_df_sort["sum"].rank(ascending=False, method="max").astype(int)
        )
        case_df_sort["gene_symbol"] = case_df_sort.index.to_series().apply(get_symbol)
        match_nmf = case_df_sort[["gene_symbol", "rank", "sum"]]
        st.dataframe(match_nmf[match_nmf["sum"] > 0.01])
        match_nmf_csv = convert_df(match_nmf)
        st.download_button(
            "Download matching results",
            match_nmf_csv,
            "match_groups.tsv",
            "text/csv",
            key="download-csv-match-groups",
        )
        if gene_diag:
            _report_gene_diagnosis(gene_diag, case_df_sort, match_nmf, "sum")
        del case_df_sort
        del match_nmf
        del case_df
    else:
        st.write(
            "No HPO terms provided in correct format.",
        )