# 2 / app.py
# ligdis's picture
# Update app.py
# 37b61da verified
import os
import streamlit as st
import pandas as pd
import csv
import collections
import joblib
# Directory that contains this script; data files are resolved relative to it.
root = os.path.dirname(os.path.abspath(__file__))

# Thresholds on the number of fragments that enriched a protein. They are
# compared with ">=" when proteins are split into frequent / medium / specific
# hitters further down in this script.
FREQUENT_CUTOFF = 40
MEDIUM_CUTOFF = 10

# Page-level configuration. The title previously read "Ligand Disovery 2",
# which did not match the sidebar title shown by the app.
st.set_page_config(
    page_title="Ligand Discovery 2: Explore Protein-sets",
    page_icon=":home:",
    layout="wide",  # "centered",
    initial_sidebar_state="expanded"
)
# CSS rule applying a negative top margin to one generated Streamlit class.
top_margin_style = """
<style>
.css-13sdm1b.e16nr0p33 {
margin-top: -75px;
}
</style>
"""
st.markdown(top_margin_style, unsafe_allow_html=True)

# CSS rules that set visibility to hidden for #MainMenu, footer and #header.
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
#header {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
# read data
@st.cache_data()
def load_screening_hits():
    """Read ``screening_hits.tsv`` (tab-separated) into a DataFrame.

    The file is looked up next to this script and the result is cached by
    Streamlit.
    """
    hits_path = os.path.join(root, "./screening_hits.tsv")
    return pd.read_csv(hits_path, sep="\t")
@st.cache_data()
def load_human_proteome():
    """Read ``human_proteome_with_gene_names.tab`` (tab-separated) into a DataFrame.

    The file is looked up next to this script and the result is cached by
    Streamlit.
    """
    proteome_path = os.path.join(root, "./human_proteome_with_gene_names.tab")
    return pd.read_csv(proteome_path, sep="\t")
@st.cache_data()
def load_hek_proteome():
    """Return the set of first-column values found in ``hek293t_core.tsv``.

    The file is looked up next to this script. Blank lines are skipped; they
    previously raised ``IndexError`` because ``csv.reader`` yields an empty
    row for them.

    Returns:
        set: the first field of every non-empty row.
    """
    path = os.path.join(root, "./hek293t_core.tsv")
    # NOTE(review): the file is named .tsv but is parsed with the csv module's
    # default comma delimiter, exactly as before. This is only equivalent if
    # the file holds a single column -- confirm against the data file.
    # newline="" is what the csv module documentation asks for.
    with open(path, "r", newline="") as f:
        return {row[0] for row in csv.reader(f) if row}
@st.cache_data()
def load_pid2name_primary():
    """Load the object stored in ``pid2name_primary.joblib`` next to this script.

    The rest of the script uses the result as a mapping (``.items()``,
    ``.keys()``, item lookup). The result is cached by Streamlit.
    """
    mapping_path = os.path.join(root, "./pid2name_primary.joblib")
    return joblib.load(mapping_path)
@st.cache_data()
def convert_df(df):
    """Serialise *df* to UTF-8 encoded CSV bytes, with header and without index."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
@st.cache_data()
def convert_df_no_header(df):
    """Serialise *df* to UTF-8 encoded CSV bytes, without header and without index."""
    csv_text = df.to_csv(index=False, header=False)
    return csv_text.encode('utf-8')
@st.cache_data()
def example_input_load():
    """Return the first field of every row of ``example_input.csv``, in file order.

    The file is looked up next to this script. Blank lines are skipped; they
    previously raised ``IndexError`` because ``csv.reader`` yields an empty
    row for them.

    Returns:
        list: one entry per non-empty row.
    """
    path = os.path.join(root, "./example_input.csv")
    # newline="" is what the csv module documentation asks for.
    with open(path, "r", newline="") as f:
        return [row[0] for row in csv.reader(f) if row]
# Load every data source once (each loader is cached by Streamlit).
db = load_screening_hits()
hek_proteome = load_hek_proteome()
pid2name_primary = load_pid2name_primary()
human_proteome = set(pid2name_primary)
example_input = example_input_load()

# Resolve either kind of identifier (mapping key or mapping value) to the key.
# The value is written before the key, so on a clash the key entry wins for
# that pair and later pairs overwrite earlier ones.
any2pid = {}
for accession, gene_name in pid2name_primary.items():
    any2pid.update({gene_name: accession, accession: accession})

# Accession -> fragments and fragment -> accessions, one entry per table row.
pid2fid = collections.defaultdict(list)
fid2pid = collections.defaultdict(list)
for accession, frag_id in db[["Accession", "FragID"]].values:
    pid2fid[accession].append(frag_id)
    fid2pid[frag_id].append(accession)

# Split the enriched proteins into three disjoint classes by number of hits.
frequent_hitters = set()
normal_hitters = set()
specific_hitters = set()
for accession, frag_ids in pid2fid.items():
    n_hits = len(frag_ids)
    if n_hits >= FREQUENT_CUTOFF:
        frequent_hitters.add(accession)
    elif n_hits >= MEDIUM_CUTOFF:
        normal_hitters.add(accession)
    else:
        specific_hitters.add(accession)

# Every key and every value of the mapping, sorted, for the manual selector.
options = sorted(
    identifier
    for pair in pid2name_primary.items()
    for identifier in pair
)
# layout
st.sidebar.title("Ligand Discovery 2: Explore Protein-sets")
st.sidebar.write("We screened 407 fully-functionalized small molecule fragments ('Ligands') in HEK293t cells. For {0} of the Ligands, we found at least one protein enriched. In total, we enriched {1} proteins at least once. Query your protein sets of interest and explore them in light of our dataset!".format(len(fid2pid), len(pid2fid)))

# --- Option 1: pick proteins by hand ---------------------------------------
manual_input = st.sidebar.multiselect(label="Input proteins manually", options = [""] + sorted(options), default=[], help="Select proteins by UniProt Accession code or Gene Symbol")
# The selector offers a blank "" entry that has no counterpart in any2pid;
# selecting it used to raise KeyError. Keep only resolvable selections so a
# blank-only selection counts as "nothing provided" in the later checks.
manual_input = [i for i in manual_input if i in any2pid]
# user_input holds the identifiers as supplied; user_pids resolves each of
# them to the accession used by the lookup tables.
user_pids = {i: any2pid[i] for i in manual_input}
user_input = list(manual_input)

# --- Option 2: take the proteins enriched by one screened ligand -----------
st.sidebar.subheader("OR")
fids = sorted(set(db["FragID"]))
fid_input = st.sidebar.selectbox(label="Select pre-screened Ligand by identifier", options = [""] + fids, help="Select an already profiled Ligand in our primary screening (page Interactions). Use the Ligand identifier (example, C001)")
if fid_input:
    # Copy so that the list stored in fid2pid is never aliased by user_input.
    user_input = list(fid2pid[fid_input])
    user_pids = dict((r, r) for r in user_input)

# --- Option 3: upload a one-identifier-per-row file -------------------------
st.sidebar.subheader("OR")
example_file = db  # NOTE(review): not referenced anywhere in the visible code
file_input = st.sidebar.file_uploader(label="Upload a file", help="Provide a file containing one UniProt Accession code or Gene Symbol per row.")
if file_input:
    try:
        uploaded = list(pd.read_csv(file_input, header=None)[0])
    except pd.errors.EmptyDataError:
        # An empty upload is treated as a file with no identifiers.
        uploaded = []
    user_input = []
    unknown = []
    for i in uploaded:
        # Tolerate stray whitespace around identifiers.
        token = i.strip() if isinstance(i, str) else i
        if token in any2pid:
            user_pids[token] = any2pid[token]
            user_input += [token]
        else:
            # Unknown identifiers used to raise KeyError and abort the page.
            unknown += [token]
    if unknown:
        st.sidebar.warning("{0} row(s) of the uploaded file are not a known UniProt Accession code or Gene Symbol and were ignored: {1}".format(len(unknown), ", ".join(str(u) for u in unknown[:10])))

st.sidebar.download_button(label="Download example file", data=convert_df_no_header(pd.DataFrame({"uniprot_ac": example_input})), file_name="protein_profile_example.csv", mime="text/csv")
# checks
# Collapse every "nothing provided" value (empty list, empty string, no file)
# to None so the three input sources can be counted uniformly.
manual_input = manual_input or None
fid_input = fid_input or None
file_input = file_input or None

provided_sources = sum(
    source is not None for source in (manual_input, fid_input, file_input)
)

# Exactly one input source must be in use for a query to run.
if provided_sources == 0:
    st.sidebar.info("Use any of the options above to explore a protein profile...")
    query_is_available = False
elif provided_sources > 1:
    st.sidebar.error("More than one input type has been provided! Please only choose one of the options, i.e. input proteins manually, or select a pre-screened Ligand, or upload a file. Refresh this window to get started again.")
    query_is_available = False
else:
    query_is_available = True
def serialize_s(cat, r):
    """Flatten one result row for export.

    Returns a new list made of *cat*, every element of *r* except the last,
    and the last element of *r* joined into one space-separated string.
    """
    leading_fields = r[:-1]
    joined_last = " ".join(r[-1])
    return [cat] + leading_fields + [joined_last]
# Column layout shared by every per-category table.
RESULT_COLUMNS = ["UniProt", "GeneName", "Hits", "Fragments"]


def _render_category(col, cat_name, members, export_rows, seen_pids, heading=None, shown_columns=None):
    """Show the queried proteins that fall into one category.

    Walks ``user_input`` in order, resolves each entry through ``user_pids``
    and keeps the proteins that are in *members* and have not been reported
    by an earlier category. Each protein is reported at most once, even when
    it was supplied several times (for example by accession and gene symbol).

    Args:
        col: Streamlit column the heading and table are written to.
        cat_name: category label stored in the exported rows.
        members: container of accessions that belong to this category.
        export_rows: list extended in place with one serialised row per hit.
        seen_pids: set of accessions already reported; updated in place.
        heading: text shown above the table; defaults to *cat_name*.
        shown_columns: subset of RESULT_COLUMNS to display; all when None.
    """
    rows = []
    for token in user_input:
        pid = user_pids[token]
        if pid in seen_pids or pid not in members:
            continue
        # Fragments are sorted for every category. ".get" avoids inserting
        # empty entries into the defaultdict for proteins that have no hits.
        fragments = sorted(pid2fid.get(pid, []))
        rows.append([pid, pid2name_primary[pid], len(fragments), fragments])
        export_rows.append(serialize_s(cat_name, rows[-1]))
        seen_pids.add(pid)
    df = pd.DataFrame(rows, columns=RESULT_COLUMNS)
    col.markdown("**{0}** : {1}".format(heading or cat_name, df.shape[0]))
    col.dataframe(df if shown_columns is None else df[shown_columns], use_container_width=True)


if query_is_available:
    S = []  # serialised rows of every category, for the download
    reported = set()  # accessions already assigned to a category

    # First row: the two least specific classes side by side.
    top_left, top_right = st.columns([0.5, 0.5])
    _render_category(top_left, "Frequently enriched", frequent_hitters, S, reported, heading="Frequently enriched (Low specificity)")
    _render_category(top_right, "Medium specificity", normal_hitters, S, reported)

    st.divider()

    # Second row: specific hitters, then proteins without any hit. The last
    # two categories only pick up what no earlier category reported.
    bottom_left, bottom_mid, bottom_right = st.columns([0.5, 0.25, 0.25])
    _render_category(bottom_left, "High specificity", specific_hitters, S, reported)
    _render_category(bottom_mid, "Never enriched", hek_proteome, S, reported, shown_columns=["UniProt", "GeneName"])
    _render_category(bottom_right, "Not in HEK293t", human_proteome, S, reported, shown_columns=["UniProt", "GeneName"])

    # Combined, sorted table offered as a CSV download.
    data = pd.DataFrame(S, columns = ["Category", "UniProt", "GeneName", "Hits", "Fragments"])
    data = data.sort_values(by=["Hits", "GeneName", "Category"], ascending=[False, True, True]).reset_index(drop=True)
    data = convert_df(data)
    st.download_button(label="Download search results", data=data, file_name="ligand_discovery_search_results.csv", mime="text/csv")