# PhenoGenius / phenogenius_app.py
# Kévin Yauy
# fix(update): update similarity dict to 2024 and remove unavailable similar terms
# 883337e
import streamlit as st
import numpy as np
import pandas as pd
from PIL import Image
import ujson as json
import pickle as pk
from plotnine import *
# -- Set page config
# Title shown in the browser tab.
apptitle = "PhenoGenius"
# Page-level settings: tab title and icon, wide layout, default sidebar state.
st.set_page_config(
    page_title=apptitle,
    page_icon=":genie:",
    layout="wide",
    initial_sidebar_state="auto",
)
# -- Set Sidebar
# The sidebar is filled top to bottom in call order: project logo, title,
# tagline, description text, then the partner logos.
# NOTE(review): image paths are relative — presumably resolved against the
# directory the app is launched from; confirm for your deployment.
image_pg = Image.open("data/img/phenogenius.png")
st.sidebar.image(image_pg, caption=None, width=100)
st.sidebar.title("PhenoGenius")
st.sidebar.header(
    "Learning phenotypic patterns in genetic diseases by symptom interaction modeling"
)
# Markdown description; the text below is rendered as-is in the sidebar.
st.sidebar.markdown(
"""
This webapp presents symptom interaction models in genetic diseases to provide:
- Standardized clinical descriptions
- Interpretable matches between symptoms and genes
Code source is available in GitHub:
[https://github.com/kyauy/PhenoGenius](https://github.com/kyauy/PhenoGenius)
Last update: 2024-07-15
PhenoGenius is a collaborative project from:
"""
)
# Partner logos, displayed under the "collaborative project" sentence above.
image_uga = Image.open("data/img/logo-uga.png")
st.sidebar.image(image_uga, caption=None, width=95)
image_seqone = Image.open("data/img/logo-seqone.png")
st.sidebar.image(image_seqone, caption=None, width=95)
image_miai = Image.open("data/img/logoMIAI-rvb.png")
st.sidebar.image(image_miai, caption=None, width=95)
image_chuga = Image.open("data/img/logo-chuga.png")
st.sidebar.image(image_chuga, caption=None, width=60)
@st.cache_data(max_entries=50)
def convert_df(df):
    """Serialise ``df`` to tab-separated text and return it as UTF-8 bytes.

    The index is written as the first column (``to_csv`` default). Results are
    memoised through ``st.cache_data`` with at most 50 cached entries.
    """
    tsv_text = df.to_csv(sep="\t")
    return tsv_text.encode("utf-8")
@st.cache_data(max_entries=50)
def load_data():
    """Read the gzip-compressed weighted thesaurus matrix into a DataFrame.

    The first column of the file becomes the index. Elsewhere in this file the
    columns are looked up by HPO identifier and the index is mapped to gene
    symbols through ``get_symbol``.
    """
    matrix_path = "data/resources/ohe_all_thesaurus_weighted_2024.tsv.gz"
    read_options = {"sep": "\t", "compression": "gzip", "index_col": 0}
    return pd.read_csv(matrix_path, **read_options)
@st.cache_data(hash_funcs={"Pickle": lambda _: None}, max_entries=50)
def load_nmf_model():
    """Unpickle the NMF model and its matching reduced matrix.

    Returns:
        tuple: ``(pheno_NMF, reduced)`` — the object stored in the model pickle
        and the object stored in the matrix pickle, in that order.
    """
    # NOTE(review): unpickling runs arbitrary code from the file; this assumes
    # both files are trusted resources shipped with the app — confirm.
    model_path = "data/resources/pheno_NMF_390_model_42_2024.pkl"
    matrix_path = "data/resources/pheno_NMF_390_matrix_42_2024.pkl"
    with open(model_path, "rb") as model_file:
        pheno_NMF = pk.load(model_file)
    with open(matrix_path, "rb") as matrix_file:
        reduced = pk.load(matrix_file)
    return pheno_NMF, reduced
@st.cache_data(max_entries=50)
def symbol_to_id_to_dict():
    """Build both gene lookup tables from the NCBI ``gene_info`` file.

    Only rows whose ``#tax_id`` equals 9606 are kept.

    Returns:
        tuple: ``(symbol -> GeneID, GeneID -> symbol)`` dictionaries.
    """
    # from NCBI
    gene_info = pd.read_csv("data/resources/Homo_sapiens.gene_info.gz", sep="\t")
    human_genes = gene_info[gene_info["#tax_id"] == 9606]
    symbol_to_id = human_genes.set_index("Symbol")["GeneID"].to_dict()
    id_to_symbol = human_genes.set_index("GeneID")["Symbol"].to_dict()
    return symbol_to_id, id_to_symbol
@st.cache_data(hash_funcs={"_json.Scanner": hash}, max_entries=50)
def load_hp_ontology():
    """Parse the HPO ontology JSON file and return the decoded object.

    Other code in this file reads it as a mapping from HPO identifier to a
    record with keys such as ``name``, ``parents``, ``childrens`` and
    ``direct_parent``.
    """
    with open("data/resources/hpo_obo_2024.json") as ontology_file:
        return json.load(ontology_file)
@st.cache_data(max_entries=50)
def hpo_description_to_id():
    """Invert the module-level ``hp_onto`` mapping into ``name -> HPO id``.

    When several terms share a name, the one met last during iteration wins.
    """
    return {record["name"]: hpo_id for hpo_id, record in hp_onto.items()}
@st.cache_data(max_entries=50)
def load_topic_data():
    """Read the tab-separated table of main topics, indexed by its first column."""
    topic_path = "data/resources/main_topics_hpo_390_42_filtered_norm_004_2024.tsv"
    return pd.read_csv(topic_path, index_col=0, sep="\t")
@st.cache_data(hash_funcs={"_json.Scanner": hash}, max_entries=50)
def load_similarity_dict():
    """Parse the term-similarity JSON file and return the decoded object.

    ``get_similar_terms`` consumes it as ``{term: {similar_term: similarity}}``.
    """
    with open("data/resources/similarity_dict_threshold_80_2024.json") as similarity_file:
        return json.load(similarity_file)
def get_symbol(gene):
    """Return the symbol stored for ``gene`` in the module-level ``symbol`` dict.

    Returns ``None`` when the gene is not present.
    """
    return symbol.get(gene)
def get_hpo_name(hpo):
    """Return ``{hpo: name}`` from the module-level ``hp_onto``.

    An empty dict is returned when ``hpo`` is not in the ontology.
    """
    if hpo not in hp_onto:
        return {}
    return {hpo: hp_onto[hpo]["name"]}
def get_hpo_name_only(hpo):
    """Return the name of ``hpo`` from the module-level ``hp_onto``, or ``None`` if unknown."""
    return hp_onto[hpo]["name"] if hpo in hp_onto else None
def get_hpo_name_list(hpo_list, hp_onto):
    """Map every known identifier of ``hpo_list`` to its ontology name.

    Args:
        hpo_list: iterable of HPO identifiers.
        hp_onto: mapping ``identifier -> record`` where each record has a
            ``"name"`` entry.

    Returns:
        dict: ``{identifier: name}`` in first-seen order; identifiers missing
        from ``hp_onto`` are skipped.
    """
    return {
        term: hp_onto[term]["name"]
        for term in hpo_list
        if term in hp_onto
    }
def get_similar_terms(hpo_list, similarity_terms_dict):
    """Expand ``hpo_list`` with similar terms and give every term a weight.

    Each input term gets weight ``1``. For every neighbour of an input term
    whose similarity is strictly above ``0.8``, the candidate weight is the
    similarity divided by the total number of neighbours listed for that input
    term. A neighbour's weight is only ever raised, never lowered, by a later
    candidate.

    Args:
        hpo_list: iterable of HPO identifiers supplied by the user.
        similarity_terms_dict: ``{term: {neighbour: similarity}}``.

    Returns:
        tuple: ``(weights, terms)`` where ``weights`` maps term -> weight and
        ``terms`` is the list of its keys in insertion order.
    """
    weights = {}
    for term in hpo_list:
        weights[term] = 1
        if term not in similarity_terms_dict:
            continue
        neighbours = similarity_terms_dict[term]
        neighbour_count = len(neighbours)
        for neighbour, similarity in neighbours.items():
            if not similarity > 0.8:
                continue
            candidate = similarity / neighbour_count
            if neighbour not in weights or candidate > weights[neighbour]:
                weights[neighbour] = candidate
    return weights, list(weights)
def score(hpo_list, matrix):
    """Score every row of ``matrix`` on the columns named in ``hpo_list``.

    Args:
        hpo_list: column labels to keep.
        matrix: DataFrame holding one column per label.

    Returns:
        DataFrame: a copy restricted to ``hpo_list`` plus a ``"sum"`` column
        (row-wise total of the kept columns) and a ``"gene_symbol"`` column
        (``get_symbol`` applied to the index), sorted by ``"sum"`` descending.
    """
    # Work on a copy so the caller's matrix is never modified.
    selected = matrix[hpo_list].copy()
    row_totals = selected.sum(axis=1)
    selected["sum"] = row_totals
    # The symbol column is added after the total so it cannot affect the sum.
    selected["gene_symbol"] = selected.index.to_series().apply(get_symbol)
    return selected.sort_values("sum", ascending=False)
def score_sim_add(hpo_list_add, matrix, sim_dict):
    """Score every row of ``matrix`` on ``hpo_list_add`` with per-term weights.

    Args:
        hpo_list_add: column labels to keep.
        matrix: DataFrame holding one column per label.
        sim_dict: ``{label: weight}``; each kept column that appears here is
            multiplied by its weight, other entries are ignored.

    Returns:
        DataFrame: the weighted copy plus a ``"sum"`` column (row-wise total)
        and a ``"gene_symbol"`` column (``get_symbol`` applied to the index),
        sorted by ``"sum"`` descending.
    """
    # Work on a copy so the caller's matrix is never modified.
    weighted = matrix[hpo_list_add].copy()
    kept_columns = set(weighted.columns)
    for term, weight in sim_dict.items():
        if term not in kept_columns:
            continue
        weighted[term] = weighted[term] * weight
    weighted["sum"] = weighted.sum(axis=1)
    weighted["gene_symbol"] = weighted.index.to_series().apply(get_symbol)
    return weighted.sort_values("sum", ascending=False)
def get_phenotype_specificity(gene_diag, data_patient):
    """Translate the rank of ``gene_diag`` into an A/B/C/D specificity sentence.

    Args:
        gene_diag: gene symbol; converted to its identifier through the
            module-level ``ncbi`` dict.
        data_patient: DataFrame indexed by gene identifier with a ``"rank"``
            column.

    Returns:
        str: the sentence for class D when the gene holds the largest rank in
        the table, A for ranks below 41, B for ranks below 250, C otherwise.
    """
    gene_rank = data_patient.loc[int(ncbi[gene_diag]), "rank"]
    worst_rank = data_patient["rank"].max()
    # The last-rank test comes first: it takes precedence over the thresholds.
    if gene_rank == worst_rank:
        return "D - the reported phenotype is NOT consistent with what is expected for the gene/genomic region or not consistent in general."
    if gene_rank < 41:
        return "A - the reported phenotype is highly specific and relatively unique to the gene (top 40, 50 perc of diagnosis in PhenoGenius cohort)."
    # NOTE(review): "< 250" admits ranks up to 249 although the sentence says
    # "top 250" — confirm which cutoff is intended.
    if gene_rank < 250:
        return "B - the reported phenotype is consistent with the gene, is highly specific, but not necessarily unique to the gene (top 250, 75 perc of diagnosis in PhenoGenius cohort)."
    return "C - the phenotype is reported with limited association with the gene, not highly specific and/or with high genetic heterogeneity."
def get_relatives_list(hpo_list, hp_onto):
    """Return the terms of ``hpo_list`` together with their parents and children.

    Args:
        hpo_list: iterable of HPO identifiers.
        hp_onto: mapping ``identifier -> record`` with ``"parents"`` and
            ``"childrens"`` lists.

    Returns:
        list: de-duplicated identifiers (order not guaranteed). Terms missing
        from ``hp_onto`` are kept but contribute no relatives.
    """
    collected = []
    for term in hpo_list:
        collected.append(term)
        if term not in hp_onto:
            continue
        record = hp_onto[term]
        collected.extend(record["parents"])
        collected.extend(record["childrens"])
    return list(set(collected))
def get_hpo_id(hpo_list):
    """Convert term descriptions into a comma-separated string of HPO ids.

    Each description is looked up in the module-level ``hp_desc_id`` dict; an
    unknown description raises ``KeyError``.
    """
    return ",".join([hp_desc_id[label] for label in hpo_list])
# Module-level lookup tables used by the helper functions above.
# HPO ontology records, keyed by HPO identifier.
hp_onto = load_hp_ontology()
# hpo_description_to_id() reads the module-level hp_onto, so it has to be
# called after the assignment above.
hp_desc_id = hpo_description_to_id()
# ncbi: gene symbol -> GeneID; symbol: GeneID -> gene symbol.
ncbi, symbol = symbol_to_id_to_dict()
# Input form: HPO terms can be picked by name (left column) or pasted as
# identifiers (right column); an optional gene symbol can be tested as well.
with st.form("my_form"):
    c1, c2 = st.columns(2)
    with c1:
        # Choices are the term names known to the ontology; two are preselected.
        hpo_raw = st.multiselect(
            "Select interactively your HPOs or...",
            list(hp_desc_id.keys()),
            ["Renal cyst", "Hepatic cysts"],
        )
    with c2:
        # Free-text alternative, prefilled with two HPO identifiers.
        hpo = st.text_input(
            "copy/paste your HPOs, separated with comma",
            "HP:0000107,HP:0001407",
        )
    # At most one gene can be selected; choices are the known gene symbols.
    gene_diag_input = st.multiselect(
        "Optional: provide HGNC gene symbol to be tested",
        options=list(ncbi.keys()),
        default=["PKD1"],
        max_selections=1,
    )
    submit_button = st.form_submit_button(
        label="Submit",
    )
# Root of the HPO ontology; it is never used as a query term.
_HPO_ROOT = "HP:0000001"


def _has_gene_association(term, data):
    """Return True when ``term`` is a column of ``data`` with a non-zero entry."""
    return term in data.columns and data[term].astype(bool).sum(axis=0) != 0


def _nearest_informative_parent(hpo, data, hp_onto):
    """Climb first direct parents of ``hpo`` until one has a gene association.

    Returns the ancestor identifier, or ``None`` when the ontology root is
    reached or the parent chain cannot be followed (term missing from
    ``hp_onto`` or without ``direct_parent``). Ancestors that are not columns
    of ``data`` are skipped instead of raising ``KeyError``.
    """
    current = hpo
    while True:
        parents = hp_onto.get(current, {}).get("direct_parent") or []
        if not parents:
            return None
        current = parents[0]
        if current == _HPO_ROOT:
            return None
        if _has_gene_association(current, data):
            return current


def _render_gene_report(gene_diag, ranked, plot_data, score_column, show_row=False):
    """Display the score density, rank and specificity of the tested gene.

    Args:
        gene_diag: gene symbol chosen in the form.
        ranked: DataFrame indexed by gene identifier holding ``"rank"``,
            ``"sum"`` and ``score_column``.
        plot_data: DataFrame whose ``score_column`` distribution is plotted.
        score_column: column used for the density and the dashed marker.
        show_row: when True, also display the gene's full row of ``ranked``.
    """
    gene_id = int(ncbi[gene_diag])
    if gene_id not in ranked.index:
        st.write("Gene ID rank:", " Gene not available in PhenoGenius database")
        return
    density_plot = (
        ggplot(plot_data, aes(score_column))
        + geom_density()
        + geom_vline(
            xintercept=ranked.loc[gene_id, score_column],
            linetype="dashed",
            color="red",
            size=1.5,
        )
        + ggtitle("Matching score distribution")
        + xlab("Gene matching score")
        + ylab("% of genes")
        + theme_bw()
        + theme(
            text=element_text(size=12),
            figure_size=(5, 5),
            axis_ticks=element_line(colour="black", size=4),
            axis_line=element_line(colour="black", size=2),
            axis_text_x=element_text(angle=45, hjust=1),
            axis_text_y=element_text(angle=60, hjust=1),
            subplots_adjust={"wspace": 0.1},
            legend_position=(0.7, 0.35),
        )
    )
    # Only the first of three columns is used, so the plot takes a third of
    # the page width.
    col1, _col2, _col3 = st.columns(3)
    with col1:
        st.pyplot(ggplot.draw(density_plot))
    st.write(
        "Gene ID rank:",
        ranked.loc[gene_id, "rank"],
        " | ",
        "Gene ID count:",
        round(ranked.loc[gene_id, "sum"], 4),
    )
    if show_row:
        st.write(ranked.loc[[gene_id]])
    st.write(
        "Gene ID phenotype specificity:",
        get_phenotype_specificity(gene_diag, ranked),
    )


# Main pipeline, run once the form is submitted: validate the HPO terms, then
# show (1) the projection on groups of symptoms, (2) exact phenotype matching,
# (3) matching extended to similar symptoms and (4) matching in the NMF space.
if submit_button:
    # The interactive selection replaces the pasted text only when it was
    # changed from its default value and is not empty.
    if hpo_raw != ["Renal cyst", "Hepatic cysts"] and len(hpo_raw) > 0:
        hpo = get_hpo_id(hpo_raw)
    data = load_data()
    pheno_NMF, reduced = load_nmf_model()
    topic = load_topic_data()
    similarity_terms_dict = load_similarity_dict()
    # Accept blanks around pasted terms ("HP:0000107, HP:0001407") and ignore
    # empty fragments left by stray commas.
    hpo_list_ini = [term.strip() for term in hpo.split(",") if term.strip()]

    gene_diag = None
    if gene_diag_input:
        if gene_diag_input[0] in ncbi.keys():
            gene_diag = gene_diag_input[0]
        else:
            # gene_diag_input is a list: use its single element in the message.
            st.write(
                gene_diag_input[0]
                + " gene are not in our database. Please check gene name (need to be in CAPITAL format)."
            )

    # Keep terms with at least one gene association; otherwise fall back to the
    # closest ancestor that has one.
    hpo_list_up = []
    for hpo in hpo_list_ini:
        if hpo == _HPO_ROOT:
            continue
        if len(hpo) != 10:
            st.write(
                "Incorrect HPO format: "
                + hpo
                + ". Please check (7-digits terms with prefix HP:, and separed by commas)."
            )
        elif hpo not in data.columns:
            st.write(hpo + " not available in current database. Please modify.")
        elif _has_gene_association(hpo, data):
            hpo_list_up.append(hpo)
        else:
            replacement = _nearest_informative_parent(hpo, data, hp_onto)
            if replacement is None:
                st.write(
                    "No gene-HPO associations was found for "
                    + hpo
                    + " and parents."
                )
            else:
                hpo_list_up.append(replacement)
                # Report the ancestor actually used, which may be several
                # levels above the direct parent.
                st.write(
                    "We replaced: ",
                    hpo,
                    " by ",
                    replacement,
                    "-",
                    get_hpo_name(replacement),
                )
    # De-duplicate while keeping input order so result columns are stable.
    hpo_list = list(dict.fromkeys(hpo_list_up))
    del hpo_list_up

    if hpo_list:
        with st.expander("See HPO inputs"):
            st.write(get_hpo_name_list(hpo_list_ini, hp_onto))
        del hpo_list_ini

        st.header("Clinical description with symptom interaction modeling")
        # "witness" is an all-zero profile used as the reference; "patient" has
        # a 1 for every retained HPO term. Both are multiplied by the NMF
        # components.
        witness = np.zeros(len(data.columns))
        witness_nmf = np.matmul(pheno_NMF.components_, witness)
        patient = np.zeros(len(data.columns))
        column_labels = list(data.columns)
        for hpo in hpo_list:
            patient[column_labels.index(hpo)] = 1
        del column_labels
        patient_nmf = np.matmul(pheno_NMF.components_, patient)

        # Reduced matrix re-indexed like ``data``; built once and reused below.
        reduced_df = pd.DataFrame(reduced).set_index(data.index)
        witness_sugg_df = reduced_df.apply(lambda x: (x - witness_nmf) ** 2, axis=1)
        patient_sugg_df = reduced_df.apply(lambda x: (x - patient_nmf) ** 2, axis=1)
        case_sugg_df = (patient_sugg_df - witness_sugg_df).sum()
        patient_df_info = pd.DataFrame(case_sugg_df).merge(
            topic, left_index=True, right_index=True
        )
        patient_df_info["mean_score"] = round(
            patient_df_info[0] / (patient_df_info["total_weight"] ** 2), 4
        )
        patient_df_info_write = patient_df_info[
            ["mean_score", "main_term", "n_hpo", "hpo_name", "hpo_list", "weight"]
        ].sort_values("mean_score", ascending=False)
        del case_sugg_df
        del patient_sugg_df
        del witness_sugg_df
        del patient
        with st.expander("See projection in groups of symptoms dimension*"):
            st.dataframe(patient_df_info_write)
            # "\\*" yields the same text as the former "\*" without relying on
            # an invalid escape sequence.
            st.write(
                "\\* For interpretability, we report only the top 10% of the 390 groups of interacting symptom associations"
            )
        match_proj_csv = convert_df(patient_df_info_write)
        st.download_button(
            "Download description projection",
            match_proj_csv,
            "clin_desc_projected.tsv",
            "text/csv",
            key="download-csv-proj",
        )

        # Extend the query with similar terms, limited to known matrix columns.
        sim_dict, hpo_list_add_raw = get_similar_terms(hpo_list, similarity_terms_dict)
        hpo_list_add = list(set(hpo_list_add_raw) & set(data.columns.tolist()))
        similar_list = list(set(hpo_list_add) - set(hpo_list))
        similar_list_desc = get_hpo_name_list(similar_list, hp_onto)
        if similar_list_desc:
            with st.expander("See symptoms with similarity > 80%"):
                similar_list_desc_df = pd.DataFrame.from_dict(
                    similar_list_desc, orient="index"
                )
                similar_list_desc_df.columns = ["description"]
                st.write(similar_list_desc_df)
            del similar_list_desc_df
        del similar_list
        del similar_list_desc

        st.header("Phenotype matching")
        results_sum = score(hpo_list, data)
        # score = number of matched terms + sum of their weights.
        results_sum["matchs"] = results_sum[hpo_list].astype(bool).sum(axis=1)
        results_sum["score"] = results_sum["matchs"] + results_sum["sum"]
        results_sum["rank"] = (
            results_sum["score"].rank(ascending=False, method="max").astype(int)
        )
        # Move the four trailing summary columns in front of the HPO columns.
        cols = results_sum.columns.tolist()
        cols = cols[-4:] + cols[:-4]
        match = results_sum[cols].sort_values(by=["score"], ascending=False)
        st.dataframe(match[match["score"] > 1.01].drop(columns=["sum"]))
        match_csv = convert_df(match)
        st.download_button(
            "Download matching results",
            match_csv,
            "match.tsv",
            "text/csv",
            key="download-csv-match",
        )
        if gene_diag:
            _render_gene_report(gene_diag, results_sum, match, "score", show_row=True)
        del results_sum
        del match

        st.header("Phenotype matching by similarity of symptoms")
        results_sum_add = score_sim_add(hpo_list_add, data, sim_dict)
        results_sum_add["rank"] = (
            results_sum_add["sum"].rank(ascending=False, method="max").astype(int)
        )
        # Move the two trailing columns in front of the HPO columns.
        cols = results_sum_add.columns.tolist()
        cols = cols[-2:] + cols[:-2]
        match_sim = results_sum_add[cols].sort_values(by=["sum"], ascending=False)
        st.dataframe(match_sim[match_sim["sum"] > 0.01])
        match_sim_csv = convert_df(match_sim)
        st.download_button(
            "Download matching results",
            match_sim_csv,
            "match_sim.tsv",
            "text/csv",
            key="download-csv-match-sim",
        )
        if gene_diag:
            _render_gene_report(gene_diag, results_sum_add, match_sim, "sum")
        del sim_dict
        del hpo_list_add
        del results_sum_add
        del match_sim

        st.header("Phenotype matching by groups of symptoms")
        # Per-gene squared distance to the patient profile minus the squared
        # distance to the all-zero reference, in the reduced space.
        patient_df = reduced_df.apply(lambda x: sum((x - patient_nmf) ** 2), axis=1)
        witness_df = reduced_df.apply(lambda x: sum((x - witness_nmf) ** 2), axis=1)
        del reduced_df
        del patient_nmf
        del witness
        del witness_nmf
        case_df = pd.DataFrame(patient_df - witness_df)
        case_df.columns = ["score"]
        # Flip the scale: the smallest difference becomes the largest value.
        case_df["score_norm"] = abs(case_df["score"] - case_df["score"].max())
        # case_df["frequency"] = matrix_frequency["variant_number"]
        case_df["sum"] = case_df["score_norm"]  # + case_df["frequency"]
        case_df_sort = case_df.sort_values(by="sum", ascending=False)
        case_df_sort["rank"] = (
            case_df_sort["sum"].rank(ascending=False, method="max").astype(int)
        )
        case_df_sort["gene_symbol"] = case_df_sort.index.to_series().apply(get_symbol)
        match_nmf = case_df_sort[["gene_symbol", "rank", "sum"]]
        st.dataframe(match_nmf[match_nmf["sum"] > 0.01])
        match_nmf_csv = convert_df(match_nmf)
        st.download_button(
            "Download matching results",
            match_nmf_csv,
            "match_groups.tsv",
            "text/csv",
            key="download-csv-match-groups",
        )
        if gene_diag:
            _render_gene_report(gene_diag, case_df_sort, match_nmf, "sum")
        del case_df_sort
        del match_nmf
        del case_df
    else:
        st.write(
            "No HPO terms provided in correct format.",
        )