Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import time | |
| import plotly.graph_objects as go | |
| from scipy.ndimage import gaussian_filter1d | |
| from zipfile import ZipFile | |
# Pin NumPy's global RNG. Nothing in the visible script draws random numbers,
# so this presumably protects code not shown here -- confirm before removing.
np.random.seed(2024)

# Single-column table of selectable entries. The file has no header row, so
# the lone column is named "selection" explicitly.
uids = pd.read_csv(
    "uniprot_ids.tsv.gz",
    sep="\t",
    header=None,
    names=["selection"],
)

# NOTE(review): disabled loader kept from the original file; presumably
# superseded by the per-protein archive below -- confirm before deleting.
# del_sub_merge = pd.read_csv("del_sub_data.csv.gz")

# Archive holding one CSV per protein. It stays open at module level so that
# plot_interactive_scatter can read individual members on demand.
zf = ZipFile("ALL_hum_proteins_ESM1b_del_sub.zip")

# NOTE(review): not referenced anywhere in the visible code.
width = 600
def plot_interactive_scatter(uid: str):
    """Build the highlighted scatter trace for a single protein.

    Reads ``{uid}.csv`` from the module-level archive ``zf`` and plots
    ``-log10(aPLLR)`` on the x axis against ``avg_LLR`` on the y axis. The
    ``site`` column is used as the hover text of each point.

    Args:
        uid: Stem of the archive member to load (the member read is
            ``f"{uid}.csv"``).

    Returns:
        A ``(trace, data)`` tuple: the orange ``go.Scatter`` marker trace and
        the ``DataFrame`` parsed from the archive member.

    Raises:
        KeyError: If the archive has no member named ``{uid}.csv``.
    """
    # Open the member in a ``with`` block so its handle is released as soon
    # as parsing finishes instead of lingering until garbage collection.
    with zf.open(f"{uid}.csv") as member:
        user_data = pd.read_csv(member)

    # Create scatter plot for user-specified data
    user_trace = go.Scatter(
        x=-np.log10(user_data.aPLLR),
        y=user_data.avg_LLR,
        mode='markers',
        name=f"{uid}<br>Data",
        text=user_data.site,
        hoverinfo='text',
        marker=dict(color='orange'),
    )
    return user_trace, user_data
def plot_interactive_line(uid_data: pd.DataFrame, uid: str, score: str, mutation: str,
                          hline1: float, hline2: float):
    """Build a per-position line figure for one score column of a protein.

    Args:
        uid_data: Table for one protein; must contain a ``site`` column and
            the column named by ``score``.
        uid: Identifier shown in the figure title.
        score: Column of ``uid_data`` to plot. When it is ``"aPLLR"`` the
            values are transformed with ``-log10``; any other column is
            plotted as-is.
        mutation: Human-readable mutation type used in the title and the
            y-axis label (e.g. ``"Deletion"``).
        hline1: y value of the first dashed horizontal reference line.
        hline2: y value of the second dashed horizontal reference line.

    Returns:
        A ``go.Figure`` with a single line trace plotted against the 1-based
        row position, plus the two reference lines.
    """
    # Only the aPLLR column is shown on the -log10 scale.
    plot_data = -np.log10(uid_data[score]) if score == "aPLLR" else uid_data[score]

    # Hover shows "<site>: <value rounded to 3 decimals>" for every position.
    hover_text = [f"{x}: {np.round(y, 3)}" for x, y in zip(uid_data.site, plot_data)]

    line_trace = go.Scatter(
        x=np.arange(1, len(uid_data) + 1),  # 1-based row position
        y=plot_data,
        mode='lines',
        text=hover_text,
        hoverinfo='text',
        marker=dict(color='orange'),
    )
    line_fig = go.Figure(data=[line_trace])
    line_fig.update_layout(
        title=f"{uid} {mutation} Scores by Position",
        yaxis_title=f'{mutation} Score<br>(More Negative = More Damaging)',
        yaxis=dict(showgrid=False, zeroline=False, showline=False),
        height=300,
        hoverlabel=dict(font=dict(size=16)),  # larger hover text
    )

    # Dashed reference lines spanning the full plot width: x is given in
    # paper coordinates (0..1), y in data coordinates.
    for hline in [hline1, hline2]:
        line_fig.add_shape(
            type='line',
            x0=0, x1=1, y0=hline, y1=hline,
            xref='paper', yref='y',
            line=dict(color='Black', dash='dash'),
        )
    return line_fig
# Protein picker (rendered without a label); the entry at row 11409 of the
# list is pre-selected.
selection = st.selectbox("", uids["selection"], index=11409)

# Keep only the text before the first comma; it is used below as the
# archive member stem and in titles and file names.
selection_uid = selection.partition(",")[0]
# Background layer: a sample of the genome-wide data, drawn as grey markers.
base_data = pd.read_csv("rand_samp_gw_del_sub.csv.gz")

base_x = -np.log10(base_data["aPLLR"])  # same -log10 scale as the selected protein
base_y = base_data["avg_LLR"]

base_trace = go.Scatter(
    x=base_x,
    y=base_y,
    mode='markers',
    name='Sample of<br>Genome-Wide<br>Data',
    hoverinfo='none',  # no hover box for background points
    marker={"color": 'grey'},
)
# Trace and underlying table for the protein chosen in the select box.
ut, ud = plot_interactive_scatter(selection_uid)

# Background sample first, selected protein second.
fig = go.Figure(data=[base_trace, ut])

# Titles, legend box and hover font for the combined scatter plot.
legend_style = dict(
    font=dict(size=15),
    bordercolor="grey",
    borderwidth=1,
)
fig.update_layout(
    title='Deletion v Substitution Effects',
    xaxis_title='Deletion Score',
    yaxis_title='Substitution Score',
    yaxis=dict(showgrid=False, showline=False, zeroline=False),
    legend=legend_style,
    hoverlabel=dict(font=dict(size=16)),
)
fig.update_yaxes(showgrid=False)

# Cutoff values are hard-coded (presumably precomputed percentiles) to avoid
# reading the entire dataset into memory.
del_bot, del_top = 0.16500809479645437, -0.7801050825906862
sub_bot, sub_top = -12.004105263157896, -4.871947368421053

# Deletion cutoffs are vertical lines (x in data units, y spanning the whole
# plot in paper units); substitution cutoffs are horizontal lines (the
# reverse). They are added in the same order as before: deletion first.
cutoff_spans = [
    dict(x0=cut, x1=cut, y0=0, y1=1, xref='x', yref='paper')
    for cut in (del_bot, del_top)
] + [
    dict(x0=0, x1=1, y0=cut, y1=cut, xref='paper', yref='y')
    for cut in (sub_bot, sub_top)
]
for span in cutoff_spans:
    fig.add_shape(type='line', line=dict(color='Black', width=2), **span)

# Coloured corner labels: (x, y, text, colour), positioned in data units.
corner_labels = [
    (2.5, -18, r"D<sup>+</sup>S<sup>—</sup>", "green"),
    (-1.5, 0.5, r"D<sup>—</sup>S<sup>+</sup>", "red"),
]
for label_x, label_y, label_text, label_colour in corner_labels:
    fig.add_annotation(
        x=label_x,
        y=label_y,
        text=label_text,
        font=dict(color=label_colour, size=24),
        showarrow=False,
    )
# Show the scatter plot
st.plotly_chart(fig)

show_line_plots = st.checkbox("Show Deletion and Substitution Effects Alone")
if show_line_plots:
    # Streamlit re-executes the whole script on every interaction, so the
    # per-position figures are built only when they will actually be shown.
    lt_apllr = plot_interactive_line(ud, selection_uid, "aPLLR", "Deletion", del_bot, del_top)
    lt_llr = plot_interactive_line(ud, selection_uid, "avg_LLR", "Substitution", sub_bot, sub_top)
    st.plotly_chart(lt_apllr)
    st.plotly_chart(lt_llr)

# Offer the selected protein's table as a CSV download. to_csv() is called
# with its defaults, so the fresh 0..n-1 row index is written as the first
# column ahead of site / aPLLR / avg_LLR.
st.download_button(
    label=f"Download {selection_uid} data as CSV",
    data=ud.reset_index(drop=True)[["site", "aPLLR", "avg_LLR"]].to_csv(),
    file_name=f"{selection_uid}_del_sub.csv",
    mime='text/csv',
)
# Footer notes rendered as Markdown below the download button.
readme_md = """
**README**:
- Deletion scores are *visualized* on the -log10 scale.
- The genome-wide dataset can be downloaded by clicking [here](https://huggingface.co/spaces/goldmangrant/diff-tol/blob/main/ALL_hum_proteins_ESM1b_del_sub.zip) (or go to files tab).
- Non-aggregated substitution effects can be downloaded or browsed [here](https://huggingface.co/spaces/ntranoslab/esm_variants).
- Additional supplementary data from the paper can be downloaded [here](https://github.com/ntranoslab/diff-tol).
"""
st.markdown(readme_md)