Spaces:

ntranoslab
/

diff-tol

Sleeping

diff-tol / app.py

Grant

fix bulk download

72230c3 8 months ago

5.29 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import time
	import plotly.graph_objects as go
	from scipy.ndimage import gaussian_filter1d
	from zipfile import ZipFile

	# Pin NumPy's global RNG so any random draws are repeatable between runs.
	np.random.seed(2024)

	# Entries offered in the protein picker. The file has no header row, so the
	# column is named "selection" explicitly.
	uids = pd.read_csv(
	    "uniprot_ids.tsv.gz",
	    sep="\t",
	    header=None,
	    names=["selection"],
	)

	# Archive from which plot_interactive_scatter reads "<uid>.csv" members.
	zf = ZipFile("ALL_hum_proteins_ESM1b_del_sub.zip")

	width = 600

	def plot_interactive_scatter(uid: str):
	    """Load one protein's scores and build its scatter trace.

	    Reads ``{uid}.csv`` from the module-level archive ``zf`` and plots
	    ``-log10(aPLLR)`` on the x axis against ``avg_LLR`` on the y axis, using
	    each row's ``site`` value as the hover text.

	    Args:
	        uid: Name of the archive member to read, without the ``.csv`` suffix.

	    Returns:
	        A ``(trace, data)`` tuple: the orange marker ``go.Scatter`` trace and
	        the DataFrame it was built from.

	    Raises:
	        KeyError: If ``zf`` has no member named ``{uid}.csv``.
	    """
	    # Close the member handle as soon as it is parsed; the app calls this on
	    # every rerun, so an unclosed handle would otherwise pile up.
	    with zf.open(f"{uid}.csv") as member:
	        user_data = pd.read_csv(member)

	    # Create scatter plot for user-specified data
	    user_trace = go.Scatter(
	        x=-np.log10(user_data.aPLLR),
	        y=user_data.avg_LLR,
	        mode='markers',
	        name=f"{uid}<br>Data",
	        text=user_data.site,
	        hoverinfo='text',
	        marker=dict(color='orange'),
	    )

	    return user_trace, user_data

	def plot_interactive_line(uid_data: pd.DataFrame, uid: str, score: str, mutation: str,
	                          hline1: float, hline2: float):
	    """Build a per-position line figure for one score column.

	    Args:
	        uid_data: Frame with a ``site`` column and the column named by
	            ``score``.
	        uid: Identifier shown in the figure title.
	        score: Column of ``uid_data`` to plot. ``"aPLLR"`` is plotted as
	            ``-log10(value)``; any other column is plotted unchanged.
	        mutation: Label used in the figure title and y-axis title.
	        hline1: y value of the first dashed horizontal reference line.
	        hline2: y value of the second dashed horizontal reference line.

	    Returns:
	        A ``go.Figure`` with one orange line trace, plotted against the
	        1-based row position, plus the two dashed reference lines.
	    """
	    if score == "aPLLR":
	        plot_data = -np.log10(uid_data[score])
	    else:
	        plot_data = uid_data[score]

	    # Hover shows "<site>: <value rounded to 3 decimals>" for each row.
	    hover_text = [f"{x}: {np.round(y, 3)}" for x, y in zip(uid_data.site, plot_data)]

	    line_trace = go.Scatter(
	        x=np.arange(1, len(uid_data) + 1),
	        y=plot_data,
	        mode='lines',
	        text=hover_text,
	        hoverinfo='text',
	        marker=dict(color='orange'),
	    )
	    line_fig = go.Figure(data=[line_trace])
	    line_fig.update_layout(
	        title=f"{uid} {mutation} Scores by Position",
	        yaxis_title=f'{mutation} Score<br>(More Negative = More Damaging)',
	        yaxis=dict(showgrid=False, zeroline=False, showline=False),
	        height=300,
	        hoverlabel=dict(font=dict(size=16)),  # hover text font size
	    )

	    # xref='paper' with x0=0, x1=1 makes each reference line span the full
	    # plot width whatever the x-axis range is.
	    for hline in (hline1, hline2):
	        line_fig.add_shape(
	            type='line',
	            x0=0, x1=1, y0=hline, y1=hline,
	            xref='paper', yref='y',
	            line=dict(color='Black', dash='dash'),
	        )
	    return line_fig

	# Protein picker. The text before the first comma of the chosen entry is the
	# identifier used to look up that protein's data.
	selection = st.selectbox("", uids.selection, index=11409)
	selection_uid = selection.partition(",")[0]

	# Grey background cloud: a sample of genome-wide data, with hover disabled.
	base_data = pd.read_csv("rand_samp_gw_del_sub.csv.gz")
	base_trace = go.Scatter(
	    x=-np.log10(base_data["aPLLR"]),
	    y=base_data["avg_LLR"],
	    mode="markers",
	    name='Sample of<br>Genome-Wide<br>Data',
	    hoverinfo="none",
	    marker={"color": "grey"},
	)

	# Trace and data frame for the selected protein.
	ut, ud = plot_interactive_scatter(selection_uid)

	# Background first, selected protein second.
	fig = go.Figure(data=[base_trace, ut])

	fig.update_layout(
	    title='Deletion v Substitution Effects',
	    xaxis_title='Deletion Score',
	    yaxis_title='Substitution Score',
	    yaxis={"showgrid": False, "showline": False, "zeroline": False},
	    legend={
	        "font": {"size": 15},  # legend text size
	        "bordercolor": "grey",
	        "borderwidth": 1,
	    },
	    hoverlabel={"font": {"size": 16}},  # hover text size
	)

	fig.update_yaxes(showgrid=False)

	# Percentile cutoffs are hard-coded to avoid reading the entire dataset
	# into memory.
	del_bot, del_top = 0.16500809479645437, -0.7801050825906862
	sub_bot, sub_top = -12.004105263157896, -4.871947368421053

	# Vertical lines at the deletion cutoffs, spanning the full plot height.
	for x_cut in (del_bot, del_top):
	    fig.add_shape(
	        type="line",
	        xref="x", yref="paper",
	        x0=x_cut, x1=x_cut, y0=0, y1=1,
	        line={"color": "Black", "width": 2},
	    )

	# Horizontal lines at the substitution cutoffs, spanning the full plot width.
	for y_cut in (sub_bot, sub_top):
	    fig.add_shape(
	        type="line",
	        xref="paper", yref="y",
	        x0=0, x1=1, y0=y_cut, y1=y_cut,
	        line={"color": "Black", "width": 2},
	    )

	# Two text labels placed at fixed data coordinates.
	for label_x, label_y, label_text, label_color in (
	    (2.5, -18, r"D<sup>+</sup>S<sup>—</sup>", "green"),
	    (-1.5, 0.5, r"D<sup>—</sup>S<sup>+</sup>", "red"),
	):
	    fig.add_annotation(
	        x=label_x,
	        y=label_y,
	        text=label_text,
	        font={"color": label_color, "size": 24},
	        showarrow=False,
	    )

	# Show the scatter plot
	st.plotly_chart(fig)

	show_line_plots = st.checkbox("Show Deletion and Substitution Effects Alone")

	# Build the per-position line figures only when they will be displayed, so
	# a run with the box unchecked does not pay for two unused figures.
	if show_line_plots:
	    lt_apllr = plot_interactive_line(ud, selection_uid, "aPLLR", "Deletion", del_bot, del_top)
	    lt_llr = plot_interactive_line(ud, selection_uid, "avg_LLR", "Substitution", sub_bot, sub_top)
	    st.plotly_chart(lt_apllr)
	    st.plotly_chart(lt_llr)

	# CSV export of the selected protein's three score columns.
	# NOTE(review): to_csv() is called with its defaults, so the freshly reset
	# 0..n-1 row index is written as an unnamed first column. Confirm that is
	# intended before changing it.
	download_columns = ["site", "aPLLR", "avg_LLR"]
	csv_payload = ud.reset_index(drop=True)[download_columns].to_csv()

	st.download_button(
	    label=f"Download {selection_uid} data as CSV",
	    data=csv_payload,
	    file_name=f"{selection_uid}_del_sub.csv",
	    mime="text/csv",
	)



	# Static help text rendered as Markdown by the last call in the script.
	readme_markdown = """
	README:
	- Deletion scores are visualized on the -log10 scale.
	- The genome-wide dataset can be downloaded by clicking [here](https://huggingface.co/spaces/goldmangrant/diff-tol/blob/main/ALL_hum_proteins_ESM1b_del_sub.zip) (or go to files tab).
	- Non-aggregated substitution effects can be downloaded or browsed [here](https://huggingface.co/spaces/ntranoslab/esm_variants).
	- Additional supplementary data from the paper can be downloaded [here](https://github.com/ntranoslab/diff-tol).
	"""
	st.markdown(readme_markdown)