Spaces:

kyauy
/

PhenoGenius

Running

PhenoGenius / phenogenius_app.py

Kévin Yauy

fix(update): update similarity dict to 2024 and remove unavailable similar terms

883337e over 1 year ago

22.4 kB

	import streamlit as st
	import numpy as np
	import pandas as pd
	from PIL import Image
	import ujson as json
	import pickle as pk
	from plotnine import *

	# -- Set page config
	# Title passed to st.set_page_config below.
	apptitle = "PhenoGenius"

	# Page-level Streamlit settings: title, icon, wide layout, automatic sidebar state.
	st.set_page_config(
	page_title=apptitle,
	page_icon=":genie:",
	layout="wide",
	initial_sidebar_state="auto",
	)

	# -- Set Sidebar
	# Project logo and name at the top of the sidebar.
	image_pg = Image.open("data/img/phenogenius.png")
	st.sidebar.image(image_pg, caption=None, width=100)
	st.sidebar.title("PhenoGenius")

	st.sidebar.header(
	"Learning phenotypic patterns in genetic diseases by symptom interaction modeling"
	)

	# Markdown blurb: what the app provides, source link and last update date.
	st.sidebar.markdown(
	"""
	This webapp presents symptom interaction models in genetic diseases to provide:
	- Standardized clinical descriptions
	- Interpretable matches between symptoms and genes

	Code source is available in GitHub:
	[https://github.com/kyauy/PhenoGenius](https://github.com/kyauy/PhenoGenius)

	Last update: 2024-07-15

	PhenoGenius is a collaborative project from:
	"""
	)

	# Logos shown right after the "collaborative project from:" text above,
	# each opened from data/img and rendered in the sidebar at a fixed width.
	image_uga = Image.open("data/img/logo-uga.png")
	st.sidebar.image(image_uga, caption=None, width=95)

	image_seqone = Image.open("data/img/logo-seqone.png")
	st.sidebar.image(image_seqone, caption=None, width=95)

	image_miai = Image.open("data/img/logoMIAI-rvb.png")
	st.sidebar.image(image_miai, caption=None, width=95)

	image_chuga = Image.open("data/img/logo-chuga.png")
	st.sidebar.image(image_chuga, caption=None, width=60)


	@st.cache_data(max_entries=50)
	def convert_df(df):
	    """Serialize ``df`` as tab-separated text and return it as UTF-8 bytes."""
	    tsv_text = df.to_csv(sep="\t")
	    return tsv_text.encode("utf-8")


	@st.cache_data(max_entries=50)
	def load_data():
	    """Read the gzipped, tab-separated 2024 weighted matrix from data/resources.

	    Returns:
	        pandas.DataFrame whose index is the first column of the file.
	    """
	    matrix_path = "data/resources/ohe_all_thesaurus_weighted_2024.tsv.gz"
	    return pd.read_csv(matrix_path, sep="\t", compression="gzip", index_col=0)


	@st.cache_data(hash_funcs={"Pickle": lambda _: None}, max_entries=50)
	def load_nmf_model():
	    """Unpickle the NMF model and its companion matrix from data/resources.

	    The paths are hard-coded; pickle must never be pointed at untrusted data.

	    Returns:
	        tuple: (pheno_NMF, reduced), the objects stored in
	        pheno_NMF_390_model_42_2024.pkl and pheno_NMF_390_matrix_42_2024.pkl.
	    """
	    with open("data/resources/pheno_NMF_390_model_42_2024.pkl", "rb") as model_file:
	        pheno_NMF = pk.load(model_file)
	    with open("data/resources/pheno_NMF_390_matrix_42_2024.pkl", "rb") as matrix_file:
	        reduced = pk.load(matrix_file)
	    return pheno_NMF, reduced


	@st.cache_data(max_entries=50)
	def symbol_to_id_to_dict():
	    """Build both gene lookup tables from the NCBI gene_info file.

	    Only rows whose ``#tax_id`` equals 9606 are kept.

	    Returns:
	        tuple: (symbol -> GeneID dict, GeneID -> symbol dict).
	    """
	    gene_info = pd.read_csv("data/resources/Homo_sapiens.gene_info.gz", sep="\t")
	    human_genes = gene_info[gene_info["#tax_id"] == 9606]
	    symbol_to_gene_id = human_genes.set_index("Symbol")["GeneID"].to_dict()
	    gene_id_to_symbol = human_genes.set_index("GeneID")["Symbol"].to_dict()
	    return symbol_to_gene_id, gene_id_to_symbol


	@st.cache_data(hash_funcs={"_json.Scanner": hash}, max_entries=50)
	def load_hp_ontology():
	    """Parse data/resources/hpo_obo_2024.json and return the decoded object."""
	    with open("data/resources/hpo_obo_2024.json") as ontology_file:
	        return json.load(ontology_file)


	@st.cache_data(max_entries=50)
	def hpo_description_to_id():
	    """Invert the module-level ``hp_onto`` mapping into ``name -> term id``.

	    When several terms share a name, the last one iterated wins.
	    """
	    return {entry["name"]: term_id for term_id, entry in hp_onto.items()}


	@st.cache_data(max_entries=50)
	def load_topic_data():
	    """Read the tab-separated main-topics table, indexed by its first column."""
	    topic_path = "data/resources/main_topics_hpo_390_42_filtered_norm_004_2024.tsv"
	    return pd.read_csv(topic_path, sep="\t", index_col=0)


	@st.cache_data(hash_funcs={"_json.Scanner": hash}, max_entries=50)
	def load_similarity_dict():
	    """Parse data/resources/similarity_dict_threshold_80_2024.json and return it."""
	    with open("data/resources/similarity_dict_threshold_80_2024.json") as similarity_file:
	        return json.load(similarity_file)


	def get_symbol(gene):
	    """Return the entry of the module-level ``symbol`` table for ``gene``, or None."""
	    if gene not in symbol:
	        return None
	    return symbol[gene]


	def get_hpo_name(hpo):
	    """Return ``{hpo: name}`` from the module-level ``hp_onto``, or ``{}`` if unknown."""
	    if hpo not in hp_onto:
	        return {}
	    return {hpo: hp_onto[hpo]["name"]}


	def get_hpo_name_only(hpo):
	    """Return the name of ``hpo`` from the module-level ``hp_onto``, or None if unknown."""
	    if hpo not in hp_onto:
	        return None
	    return hp_onto[hpo]["name"]


	def get_hpo_name_list(hpo_list, hp_onto):
	    """Map each term of ``hpo_list`` found in ``hp_onto`` to its name.

	    Terms missing from ``hp_onto`` are skipped; the result follows the order
	    of ``hpo_list``.
	    """
	    return {hpo: hp_onto[hpo]["name"] for hpo in hpo_list if hpo in hp_onto}


	def get_similar_terms(hpo_list, similarity_terms_dict):
	    """Expand the input terms with their highly similar terms.

	    Each input term is assigned weight 1 when it is visited. Every neighbour
	    listed for that term in ``similarity_terms_dict`` with a similarity
	    strictly above 0.8 receives ``similarity / number of neighbours listed``;
	    a neighbour that already has a weight only takes the new one if it is
	    strictly larger.

	    Returns:
	        tuple: (weights, terms) where ``weights`` maps term -> weight and
	        ``terms`` lists its keys in insertion order.
	    """
	    weights = {}
	    for term in hpo_list:
	        weights[term] = 1
	        if term not in similarity_terms_dict:
	            continue
	        neighbours = similarity_terms_dict[term]
	        neighbour_count = len(neighbours)
	        for neighbour, similarity in neighbours.items():
	            if not similarity > 0.8:
	                continue
	            candidate = similarity / neighbour_count
	            if neighbour not in weights or candidate > weights[neighbour]:
	                weights[neighbour] = candidate
	    return weights, list(weights.keys())


	def score(hpo_list, matrix):
	    """Rank the rows of ``matrix`` by the total of the selected columns.

	    Args:
	        hpo_list: column labels of ``matrix`` to keep.
	        matrix: DataFrame whose index values are passed to ``get_symbol``.

	    Returns:
	        A copy restricted to ``hpo_list`` with two extra columns, ``sum``
	        (row total of the selected columns) and ``gene_symbol``, sorted by
	        ``sum`` in descending order.
	    """
	    # Work on a copy so the caller's matrix is never modified.
	    selected = matrix[hpo_list].copy()
	    row_totals = selected.sum(axis=1)
	    selected["sum"] = row_totals
	    selected["gene_symbol"] = selected.index.to_series().apply(get_symbol)
	    return selected.sort_values(by="sum", ascending=False)


	def score_sim_add(hpo_list_add, matrix, sim_dict):
	    """Rank the rows of ``matrix`` after weighting the selected columns.

	    Args:
	        hpo_list_add: column labels of ``matrix`` to keep.
	        matrix: DataFrame whose index values are passed to ``get_symbol``.
	        sim_dict: mapping column label -> multiplicative weight; labels that
	            are not selected columns are ignored.

	    Returns:
	        A weighted copy restricted to ``hpo_list_add`` with the extra columns
	        ``sum`` and ``gene_symbol``, sorted by ``sum`` in descending order.
	    """
	    # Work on a copy so the caller's matrix is never modified.
	    weighted = matrix[hpo_list_add].copy()
	    for term, weight in sim_dict.items():
	        if term not in weighted.columns:
	            continue
	        weighted[term] = weighted[term] * weight

	    weighted["sum"] = weighted.sum(axis=1)
	    weighted["gene_symbol"] = weighted.index.to_series().apply(get_symbol)
	    return weighted.sort_values(by="sum", ascending=False)


	def get_phenotype_specificity(gene_diag, data_patient):
	    """Translate the rank of ``gene_diag`` into a specificity grade sentence.

	    Args:
	        gene_diag: gene symbol, resolved to an index label through the
	            module-level ``ncbi`` table.
	        data_patient: DataFrame with a ``rank`` column.

	    Returns:
	        The "D" sentence when the gene holds the largest rank of the table,
	        otherwise "A" for ranks below 41, "B" for ranks below 250, else "C".
	    """
	    gene_rank = data_patient.loc[int(ncbi[gene_diag]), "rank"]
	    worst_rank = data_patient["rank"].max()
	    if gene_rank == worst_rank:
	        return "D - the reported phenotype is NOT consistent with what is expected for the gene/genomic region or not consistent in general."
	    if gene_rank < 41:
	        return "A - the reported phenotype is highly specific and relatively unique to the gene (top 40, 50 perc of diagnosis in PhenoGenius cohort)."
	    # NOTE(review): "< 250" excludes rank 250 although the text says "top 250" - confirm.
	    if gene_rank < 250:
	        return "B - the reported phenotype is consistent with the gene, is highly specific, but not necessarily unique to the gene (top 250, 75 perc of diagnosis in PhenoGenius cohort)."
	    return "C - the phenotype is reported with limited association with the gene, not highly specific and/or with high genetic heterogeneity."


	def get_relatives_list(hpo_list, hp_onto):
	    """Return the input terms plus their ``parents`` and ``childrens`` entries.

	    Terms missing from ``hp_onto`` are kept but contribute no relatives.
	    Duplicates are removed; the order of the returned list is unspecified.
	    """
	    relatives = set()
	    for hpo in hpo_list:
	        relatives.add(hpo)
	        if hpo not in hp_onto:
	            continue
	        relatives.update(hp_onto[hpo]["parents"])
	        relatives.update(hp_onto[hpo]["childrens"])
	    return list(relatives)


	def get_hpo_id(hpo_list):
	    """Convert term names to ids via the module-level ``hp_desc_id`` and join with commas.

	    Raises:
	        KeyError: if a name is not a key of ``hp_desc_id``.
	    """
	    return ",".join([hp_desc_id[description] for description in hpo_list])


	# Module-level lookup tables read by the helper functions above.
	# hp_onto must be assigned first: hpo_description_to_id() reads it as a global.
	hp_onto = load_hp_ontology()
	# name -> HPO id, the inverse of hp_onto's "name" field.
	hp_desc_id = hpo_description_to_id()
	# ncbi: gene symbol -> GeneID; symbol: GeneID -> gene symbol.
	ncbi, symbol = symbol_to_id_to_dict()


	# Input form with two alternative ways to enter HPO terms, an optional gene
	# to test, and a single submit button.
	with st.form("my_form"):
	c1, c2 = st.columns(2)
	with c1:
	# Pick HPO terms by name; options are the keys of hp_desc_id.
	hpo_raw = st.multiselect(
	"Select interactively your HPOs or...",
	list(hp_desc_id.keys()),
	["Renal cyst", "Hepatic cysts"],
	)
	with c2:
	# Or paste HPO ids directly as a comma-separated string.
	hpo = st.text_input(
	"copy/paste your HPOs, separated with comma",
	"HP:0000107,HP:0001407",
	)
	# At most one gene symbol, chosen among the keys of ncbi.
	gene_diag_input = st.multiselect(
	"Optional: provide HGNC gene symbol to be tested",
	options=list(ncbi.keys()),
	default=["PKD1"],
	max_selections=1,
	)
	submit_button = st.form_submit_button(
	label="Submit",
	)


	_HPO_ROOT = "HP:0000001"


	def _first_annotated_ancestor(term, data, hp_onto):
	    """Return the closest ancestor of ``term`` that has a gene association.

	    Climbs the ontology through the first entry of each ``direct_parent``
	    list. An ancestor qualifies when it is a column of ``data`` holding at
	    least one non-zero value.

	    Returns:
	        The ancestor id, or None when the ontology root is reached, a term
	        is missing from ``hp_onto`` or has no parent, or a cycle is found.
	    """
	    visited = {term}
	    current = term
	    while True:
	        parents = hp_onto.get(current, {}).get("direct_parent") or []
	        if not parents:
	            return None
	        current = parents[0]
	        if current == _HPO_ROOT or current in visited:
	            return None
	        visited.add(current)
	        # A parent absent from the matrix has no association: keep climbing.
	        if current in data.columns and data[current].astype(bool).any():
	            return current


	def _report_gene_diag(gene_diag, results_df, score_col, show_row=False):
	    """Show where the tested gene falls in a matching-score distribution.

	    Args:
	        gene_diag: gene symbol, resolved to an index label through ``ncbi``.
	        results_df: result table with ``rank``, ``sum`` and ``score_col``
	            columns.
	        score_col: column plotted as a density and marked for the gene.
	        show_row: also print the gene's full row of ``results_df``.
	    """
	    gene_id = int(ncbi[gene_diag])
	    if gene_id not in results_df.index:
	        st.write("Gene ID rank:", " Gene not available in PhenoGenius database")
	        return

	    density_plot = (
	        ggplot(results_df, aes(score_col))
	        + geom_density()
	        + geom_vline(
	            xintercept=results_df.loc[gene_id, score_col],
	            linetype="dashed",
	            color="red",
	            size=1.5,
	        )
	        + ggtitle("Matching score distribution")
	        + xlab("Gene matching score")
	        + ylab("% of genes")
	        + theme_bw()
	        + theme(
	            text=element_text(size=12),
	            figure_size=(5, 5),
	            axis_ticks=element_line(colour="black", size=4),
	            axis_line=element_line(colour="black", size=2),
	            axis_text_x=element_text(angle=45, hjust=1),
	            axis_text_y=element_text(angle=60, hjust=1),
	            subplots_adjust={"wspace": 0.1},
	            legend_position=(0.7, 0.35),
	        )
	    )
	    # Only the first of three columns is used, so the plot takes a third of the width.
	    col1, _, _ = st.columns(3)
	    with col1:
	        st.pyplot(ggplot.draw(density_plot))

	    st.write(
	        "Gene ID rank:",
	        results_df.loc[gene_id, "rank"],
	        " \\| ",
	        "Gene ID count:",
	        round(results_df.loc[gene_id, "sum"], 4),
	    )
	    if show_row:
	        st.write(results_df.loc[[gene_id]])
	    st.write(
	        "Gene ID phenotype specificity:",
	        get_phenotype_specificity(gene_diag, results_df),
	    )


	# Main analysis, run once the form is submitted.
	if submit_button:
	    # The multiselect overrides the text box only when the user changed it
	    # from its default selection.
	    if hpo_raw and hpo_raw != ["Renal cyst", "Hepatic cysts"]:
	        hpo = get_hpo_id(hpo_raw)
	    data = load_data()
	    pheno_NMF, reduced = load_nmf_model()
	    topic = load_topic_data()
	    similarity_terms_dict = load_similarity_dict()

	    # Tolerate spaces around commas and stray or trailing commas.
	    hpo_list_ini = [term.strip() for term in hpo.split(",") if term.strip()]

	    gene_diag = None
	    if gene_diag_input:
	        if gene_diag_input[0] in ncbi:
	            gene_diag = gene_diag_input[0]
	        else:
	            # gene_diag_input is a list: format its element instead of
	            # concatenating list + str, which raises TypeError.
	            st.write(
	                f"{gene_diag_input[0]}"
	                " gene are not in our database. Please check gene name (need to be in CAPITAL format)."
	            )

	    # Validate each term; terms without any gene association are replaced
	    # by their closest annotated ancestor.
	    hpo_list_up = []
	    for term in hpo_list_ini:
	        if term == _HPO_ROOT:
	            continue
	        if len(term) != 10:
	            st.write(
	                "Incorrect HPO format: "
	                + term
	                + ". Please check (7-digits terms with prefix HP:, and separed by commas)."
	            )
	            continue
	        if term not in data.columns:
	            st.write(term + " not available in current database. Please modify.")
	            continue
	        if data[term].astype(bool).any():
	            hpo_list_up.append(term)
	            continue
	        replacement = _first_annotated_ancestor(term, data, hp_onto)
	        if replacement is None:
	            st.write(
	                "No gene-HPO associations was found for "
	                + term
	                + " and parents."
	            )
	        else:
	            hpo_list_up.append(replacement)
	            # Report the ancestor actually used, not just the direct parent.
	            st.write(
	                "We replaced: ",
	                term,
	                " by ",
	                replacement,
	                "-",
	                get_hpo_name(replacement),
	            )
	    # Deduplicate while keeping input order, so result columns are stable.
	    hpo_list = list(dict.fromkeys(hpo_list_up))
	    del hpo_list_up

	    if hpo_list:
	        with st.expander("See HPO inputs"):
	            st.write(get_hpo_name_list(hpo_list_ini, hp_onto))
	        del hpo_list_ini

	        st.header("Clinical description with symptom interaction modeling")

	        # The all-zero "witness" vector is the baseline the patient is compared to.
	        n_terms = len(data.columns)
	        witness = np.zeros(n_terms)
	        witness_nmf = np.matmul(pheno_NMF.components_, witness)

	        # One-hot encode the patient's terms in matrix column order.
	        patient = np.zeros(n_terms)
	        patient[data.columns.get_indexer(hpo_list)] = 1
	        patient_nmf = np.matmul(pheno_NMF.components_, patient)

	        # Squared differences to each row of ``reduced``, computed once and
	        # reused for the per-column and per-row totals below.
	        reduced_df = pd.DataFrame(reduced).set_index(data.index)
	        witness_sq = (reduced_df - witness_nmf) ** 2
	        patient_sq = (reduced_df - patient_nmf) ** 2
	        del reduced_df, patient, witness, patient_nmf, witness_nmf

	        case_sugg_df = (patient_sq - witness_sq).sum()

	        patient_df_info = pd.DataFrame(case_sugg_df).merge(
	            topic, left_index=True, right_index=True
	        )
	        patient_df_info["mean_score"] = round(
	            patient_df_info[0] / (patient_df_info["total_weight"] ** 2), 4
	        )
	        patient_df_info_write = patient_df_info[
	            ["mean_score", "main_term", "n_hpo", "hpo_name", "hpo_list", "weight"]
	        ].sort_values("mean_score", ascending=False)
	        del case_sugg_df

	        # NOTE(review): the download button is kept inside the expander - confirm placement.
	        with st.expander("See projection in groups of symptoms dimension*"):
	            st.dataframe(patient_df_info_write)
	            st.write(
	                "\\* For interpretability, we report only the top 10% of the 390 groups of interacting symptom associations"
	            )
	            match_proj_csv = convert_df(patient_df_info_write)

	            st.download_button(
	                "Download description projection",
	                match_proj_csv,
	                "clin_desc_projected.tsv",
	                "text/csv",
	                key="download-csv-proj",
	            )

	        # Add terms similar to the inputs, restricted to matrix columns.
	        sim_dict, hpo_list_add_raw = get_similar_terms(hpo_list, similarity_terms_dict)
	        hpo_list_add = list(set(hpo_list_add_raw) & set(data.columns.tolist()))
	        similar_list = list(set(hpo_list_add) - set(hpo_list))
	        similar_list_desc = get_hpo_name_list(similar_list, hp_onto)

	        if similar_list_desc:
	            with st.expander("See symptoms with similarity > 80%"):
	                similar_list_desc_df = pd.DataFrame.from_dict(
	                    similar_list_desc, orient="index"
	                )
	                similar_list_desc_df.columns = ["description"]
	                st.write(similar_list_desc_df)
	        del similar_list, similar_list_desc

	        # -- 1. Direct matching on the input terms.
	        st.header("Phenotype matching")
	        results_sum = score(hpo_list, data)
	        results_sum["matchs"] = results_sum[hpo_list].astype(bool).sum(axis=1)
	        results_sum["score"] = results_sum["matchs"] + results_sum["sum"]
	        results_sum["rank"] = (
	            results_sum["score"].rank(ascending=False, method="max").astype(int)
	        )
	        # Move gene_symbol, matchs, score and rank in front of the term columns.
	        cols = results_sum.columns.tolist()
	        cols = cols[-4:] + cols[:-4]
	        match = results_sum[cols].sort_values(by=["score"], ascending=False)
	        st.dataframe(match[match["score"] > 1.01].drop(columns=["sum"]))
	        match_csv = convert_df(match)

	        st.download_button(
	            "Download matching results",
	            match_csv,
	            "match.tsv",
	            "text/csv",
	            key="download-csv-match",
	        )

	        if gene_diag:
	            _report_gene_diag(gene_diag, results_sum, "score", show_row=True)
	        del results_sum, match

	        # -- 2. Matching extended to similar terms.
	        st.header("Phenotype matching by similarity of symptoms")
	        results_sum_add = score_sim_add(hpo_list_add, data, sim_dict)
	        results_sum_add["rank"] = (
	            results_sum_add["sum"].rank(ascending=False, method="max").astype(int)
	        )
	        # Move gene_symbol and rank in front of the term columns.
	        cols = results_sum_add.columns.tolist()
	        cols = cols[-2:] + cols[:-2]
	        match_sim = results_sum_add[cols].sort_values(by=["sum"], ascending=False)
	        st.dataframe(match_sim[match_sim["sum"] > 0.01])

	        match_sim_csv = convert_df(match_sim)

	        st.download_button(
	            "Download matching results",
	            match_sim_csv,
	            "match_sim.tsv",
	            "text/csv",
	            key="download-csv-match-sim",
	        )

	        if gene_diag:
	            _report_gene_diag(gene_diag, results_sum_add, "sum")
	        del sim_dict, hpo_list_add, results_sum_add, match_sim

	        # -- 3. Matching in the NMF-reduced space.
	        st.header("Phenotype matching by groups of symptoms")

	        patient_df = patient_sq.sum(axis=1)
	        witness_df = witness_sq.sum(axis=1)
	        del patient_sq, witness_sq

	        case_df = pd.DataFrame(patient_df - witness_df)
	        case_df.columns = ["score"]
	        # Flip the scale so the row with the largest ``score`` gets 0.
	        case_df["score_norm"] = abs(case_df["score"] - case_df["score"].max())
	        case_df["sum"] = case_df["score_norm"]
	        case_df_sort = case_df.sort_values(by="sum", ascending=False)
	        case_df_sort["rank"] = (
	            case_df_sort["sum"].rank(ascending=False, method="max").astype(int)
	        )
	        case_df_sort["gene_symbol"] = case_df_sort.index.to_series().apply(get_symbol)
	        match_nmf = case_df_sort[["gene_symbol", "rank", "sum"]]
	        st.dataframe(match_nmf[match_nmf["sum"] > 0.01])

	        match_nmf_csv = convert_df(match_nmf)

	        st.download_button(
	            "Download matching results",
	            match_nmf_csv,
	            "match_groups.tsv",
	            "text/csv",
	            key="download-csv-match-groups",
	        )

	        if gene_diag:
	            _report_gene_diag(gene_diag, case_df_sort, "sum")
	        del case_df_sort, match_nmf, case_df

	    else:
	        st.write(
	            "No HPO terms provided in correct format.",
	        )