|
|
import os |
|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import csv |
|
|
import collections |
|
|
import joblib |
|
|
|
|
|
# Absolute directory containing this script; data files are resolved against it.
root = os.path.split(os.path.abspath(__file__))[0]

# A protein hit by at least this many fragments is classed as a frequent hitter.
FREQUENT_CUTOFF = 40
# A protein below FREQUENT_CUTOFF but with at least this many fragment hits is
# classed as a medium-specificity ("normal") hitter; anything lower is specific.
MEDIUM_CUTOFF = 10
|
|
|
|
|
# Configure the browser tab and the overall page layout.
# The title previously read "Ligand Disovery"; it now matches the spelling
# used for the sidebar title further down the script.
st.set_page_config(
    page_title="Ligand Discovery 2: Explore Protein-sets",
    page_icon=":home:",
    layout="wide",
    initial_sidebar_state="expanded",
)
|
|
|
|
|
# Inject a CSS rule that applies a negative top margin to the element matching
# the selector below.
# NOTE(review): the selector looks like an auto-generated Streamlit class name;
# confirm it still matches after Streamlit upgrades.
_top_margin_css = """
<style>
.css-13sdm1b.e16nr0p33 {
margin-top: -75px;
}
</style>
"""
st.markdown(_top_margin_css, unsafe_allow_html=True)
|
|
|
|
|
# CSS that hides the `#MainMenu` element, the `footer` element and whatever
# matches `#header`.
# NOTE(review): `#header` is an id selector; confirm whether the `header`
# element was intended instead.
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
#header {visibility: hidden;}
</style>
"""
st.markdown(body=hide_streamlit_style, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@st.cache_data()
def load_screening_hits():
    """Read ``screening_hits.tsv`` (tab-separated) next to this script.

    Returns:
        The parsed table as a pandas DataFrame. The result is cached by
        Streamlit via ``st.cache_data``.
    """
    hits_path = os.path.join(root, "./screening_hits.tsv")
    return pd.read_csv(hits_path, sep="\t")
|
|
|
|
|
@st.cache_data()
def load_human_proteome():
    """Read ``human_proteome_with_gene_names.tab`` (tab-separated).

    Returns:
        The parsed table as a pandas DataFrame, cached via ``st.cache_data``.
    """
    proteome_path = os.path.join(root, "./human_proteome_with_gene_names.tab")
    return pd.read_csv(proteome_path, sep="\t")
|
|
|
|
|
@st.cache_data()
def load_hek_proteome():
    """Load the first field of every row of ``hek293t_core.tsv`` as a set.

    Blank rows are skipped; previously they raised ``IndexError`` because an
    empty row has no first field.

    Returns:
        A ``set`` with the first field of each non-empty row, cached via
        ``st.cache_data``.
    """
    # NOTE(review): the file is named .tsv but is parsed with csv.reader's
    # default (comma) dialect, so each row's first field is everything up to
    # the first comma. Confirm the file holds a single value per line.
    # newline="" is what the csv module documentation asks for on file objects.
    with open(os.path.join(root, "./hek293t_core.tsv"), "r", newline="") as f:
        return {row[0] for row in csv.reader(f) if row}
|
|
|
|
|
@st.cache_data()
def load_pid2name_primary():
    """Load the object stored in ``pid2name_primary.joblib`` next to this script.

    Returns:
        Whatever ``joblib.load`` deserialises from the file; the rest of the
        script uses it as a mapping (``.items()``, ``.keys()``, ``[]``).
    """
    mapping_path = os.path.join(root, "./pid2name_primary.joblib")
    return joblib.load(mapping_path)
|
|
|
|
|
@st.cache_data()
def convert_df(df):
    """Serialise ``df`` to CSV (with header, without index) as UTF-8 bytes.

    Args:
        df: DataFrame to export.

    Returns:
        The CSV text encoded as ``bytes``.
    """
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
|
|
|
|
|
@st.cache_data()
def convert_df_no_header(df):
    """Serialise ``df`` to CSV without header row or index, as UTF-8 bytes.

    Args:
        df: DataFrame to export.

    Returns:
        The header-less CSV text encoded as ``bytes``.
    """
    csv_text = df.to_csv(index=False, header=False)
    return csv_text.encode('utf-8')
|
|
|
|
|
@st.cache_data()
def example_input_load():
    """Load the first field of every row of ``example_input.csv`` as a list.

    Blank rows are skipped; previously they raised ``IndexError`` because an
    empty row has no first field.

    Returns:
        A list with the first field of each non-empty row, in file order,
        cached via ``st.cache_data``.
    """
    # newline="" is what the csv module documentation asks for on file objects.
    with open(os.path.join(root, "./example_input.csv"), "r", newline="") as f:
        return [row[0] for row in csv.reader(f) if row]
|
|
|
|
|
# Load every cached dataset the page needs.
db = load_screening_hits()
hek_proteome = load_hek_proteome()
pid2name_primary = load_pid2name_primary()
example_input = example_input_load()

# The keys of pid2name_primary define the set of known protein identifiers.
human_proteome = set(pid2name_primary)

# Resolve either a key of pid2name_primary or its mapped name back to the key.
# For each entry the name is written first, then the key itself, so on a
# collision the later write wins.
any2pid = {
    alias: accession
    for accession, name in pid2name_primary.items()
    for alias in (name, accession)
}
|
|
|
|
|
# Index the screening hits in both directions:
#   pid2fid: "Accession" value -> list of "FragID" values it was found with
#   fid2pid: "FragID" value    -> list of "Accession" values found with it
# Lists keep one entry per row of `db`, in row order.
pid2fid = collections.defaultdict(list)
fid2pid = collections.defaultdict(list)
for accession, frag_id in db[["Accession", "FragID"]].to_numpy():
    pid2fid[accession].append(frag_id)
    fid2pid[frag_id].append(accession)
|
|
|
|
|
# Bucket every protein in pid2fid by how many fragment entries it has:
#   >= FREQUENT_CUTOFF -> frequent_hitters
#   >= MEDIUM_CUTOFF   -> normal_hitters
#   otherwise          -> specific_hitters
frequent_hitters, normal_hitters, specific_hitters = set(), set(), set()
for accession, fragment_ids in pid2fid.items():
    n_hits = len(fragment_ids)
    if n_hits >= FREQUENT_CUTOFF:
        frequent_hitters.add(accession)
    elif n_hits >= MEDIUM_CUTOFF:
        normal_hitters.add(accession)
    else:
        specific_hitters.add(accession)
|
|
|
|
|
# Every key and every value of pid2name_primary, flattened into one sorted
# list; used as the choices of the manual-input widget.
options = []
for accession, name in pid2name_primary.items():
    options.extend((accession, name))
options.sort()
|
|
|
|
|
|
|
|
|
|
|
# Sidebar header plus a short summary of the dataset: the number of distinct
# FragID values (keys of fid2pid) and distinct Accession values (keys of pid2fid).
st.sidebar.title("Ligand Discovery 2: Explore Protein-sets")
n_ligands_with_hits = len(fid2pid)
n_enriched_proteins = len(pid2fid)
intro_template = "We screened 407 fully-functionalized small molecule fragments ('Ligands') in HEK293t cells. For {0} of the Ligands, we found at least one protein enriched. In total, we enriched {1} proteins at least once. Query your protein sets of interest and explore them in light of our dataset!"
st.sidebar.write(intro_template.format(n_ligands_with_hits, n_enriched_proteins))
|
|
|
|
|
# Option 1: pick proteins by hand from the list of known identifiers/names.
manual_input = st.sidebar.multiselect(
    label="Input proteins manually",
    options=[""] + sorted(options),
    default=[],
    help="Select proteins by UniProt Accession code or Gene Symbol",
)
# The widget offers a blank "" entry. Selecting it used to raise KeyError on
# the any2pid lookup, so drop anything that cannot be resolved. Rebinding
# manual_input also keeps a blank-only selection from counting as a query.
manual_input = [choice for choice in manual_input if choice in any2pid]

# user_input: the queried identifiers in selection order.
# user_pids:  queried identifier -> resolved key of pid2name_primary.
user_pids = {choice: any2pid[choice] for choice in manual_input}
user_input = list(manual_input)
|
|
|
|
|
st.sidebar.subheader("OR")

# Option 2: pick one FragID from the screening table and query every protein
# recorded for it. A leading blank entry stands for "nothing selected".
fids = sorted(set(db["FragID"]))
fid_choices = [""]
fid_choices.extend(fids)
fid_input = st.sidebar.selectbox(
    label="Select pre-screened Ligand by identifier",
    options=fid_choices,
    help="Select an already profiled Ligand in our primary screening (page Interactions). Use the Ligand identifier (example, C001)",
)
if fid_input != "":
    # The Accession values are used directly as identifiers, hence the
    # identity mapping.
    user_input = fid2pid[fid_input]
    user_pids = {accession: accession for accession in user_input}
|
|
|
|
|
st.sidebar.subheader("OR")

# Option 3: upload a file with one identifier per row (first column is used).
example_file = db  # not referenced elsewhere in the visible script; kept as-is
file_input = st.sidebar.file_uploader(
    label="Upload a file",
    help="Provide a file containing one UniProt Accession code or Gene Symbol per row.",
)
if file_input:
    try:
        # dtype=str keeps identifiers as text even if a row looks numeric.
        first_column = pd.read_csv(file_input, header=None, dtype=str)[0]
        candidates = [value.strip() for value in first_column.dropna()]
    except pd.errors.EmptyDataError:
        # An empty upload is treated as "no identifiers" rather than a crash.
        candidates = []

    # Unknown identifiers used to raise KeyError on the any2pid lookup and take
    # the whole page down; keep the recognised ones and report the rest.
    unknown = [value for value in candidates if value not in any2pid]
    user_input = [value for value in candidates if value in any2pid]
    for i in user_input:
        user_pids[i] = any2pid[i]
    if unknown:
        st.sidebar.warning(
            "{0} identifier(s) in the uploaded file were not recognised and "
            "have been skipped: {1}".format(len(unknown), ", ".join(unknown[:10]))
        )

st.sidebar.download_button(
    label="Download example file",
    data=convert_df_no_header(pd.DataFrame({"uniprot_ac": example_input})),
    file_name="protein_profile_example.csv",
    mime="text/csv",
)
|
|
|
|
|
|
|
|
|
|
|
# Normalise "nothing provided" (empty list, empty string, no upload) to None
# for each of the three input routes.
manual_input = manual_input or None
fid_input = fid_input or None
file_input = file_input or None

# Exactly one input route must be in use for a query to run.
provided_count = sum(
    source is not None for source in (manual_input, fid_input, file_input)
)
if provided_count == 0:
    st.sidebar.info("Use any of the options above to explore a protein profile...")
    query_is_available = False
elif provided_count > 1:
    st.sidebar.error("More than one input type has been provided! Please only choose one of the options, i.e. input proteins manually, or select a pre-screened Ligand, or upload a file. Refresh this window to get started again.")
    query_is_available = False
else:
    query_is_available = True
|
|
|
|
|
|
|
|
def serialize_s(cat, r):
    """Build a flat export row from a category label and a table row.

    Args:
        cat: Category label placed in the first position.
        r: List whose last element is an iterable of strings; all preceding
            elements are copied through unchanged.

    Returns:
        A new list ``[cat, *r[:-1], joined]`` where ``joined`` is the last
        element of ``r`` joined with single spaces. ``r`` is not modified.
    """
    joined_fragments = " ".join(r[-1])
    row = [cat] + r[:-1]
    row.append(joined_fragments)
    return row
|
|
|
|
|
# Column layout shared by every per-category table; the download file adds a
# leading "Category" column.
_RESULT_COLUMNS = ["UniProt", "GeneName", "Hits", "Fragments"]


def _collect_category(members, seen):
    """Return table rows for queried proteins that belong to ``members``.

    Args:
        members: Container of protein identifiers that defines the category.
        seen: Set of identifiers already assigned to a category; updated in
            place so each protein is reported once, in the first category it
            matches.

    Returns:
        A list of ``[pid, gene name, number of fragments, sorted fragments]``
        rows in query order.
    """
    rows = []
    for item in user_input:
        # .get() so an unresolved query item is skipped instead of raising.
        pid = user_pids.get(item)
        if pid is None or pid in seen or pid not in members:
            continue
        # .get() avoids inserting empty entries into the pid2fid defaultdict.
        fragments = sorted(pid2fid.get(pid, []))
        # Accessions taken straight from the screening table may be missing
        # from pid2name_primary; fall back to an empty gene name.
        rows.append([pid, pid2name_primary.get(pid, ""), len(fragments), fragments])
        seen.add(pid)
    return rows


def _render_category(col, cat_name, members, seen, export_rows,
                     heading=None, visible_columns=None):
    """Render one category table into ``col`` and queue its rows for export.

    Args:
        col: Streamlit column container to draw into.
        cat_name: Category label written to the export file.
        members: Container of identifiers that defines the category.
        seen: Shared set of identifiers that have already been reported.
        export_rows: List that receives the serialised rows for the download.
        heading: Optional label for the on-page heading; defaults to
            ``cat_name``.
        visible_columns: Optional subset of columns to display; all columns
            are shown when omitted.
    """
    rows = _collect_category(members, seen)
    export_rows.extend(serialize_s(cat_name, row) for row in rows)
    df = pd.DataFrame(rows, columns=_RESULT_COLUMNS)
    col.markdown("**{0}** : {1}".format(heading or cat_name, df.shape[0]))
    shown = df if visible_columns is None else df[visible_columns]
    col.dataframe(shown, use_container_width=True)


if query_is_available:
    seen = set()
    export_rows = []
    id_columns = ["UniProt", "GeneName"]

    # Top row: the two broadest categories side by side.
    top_left, top_right = st.columns([0.5, 0.5])
    _render_category(
        top_left, "Frequently enriched", frequent_hitters, seen, export_rows,
        heading="Frequently enriched (Low specificity)",
    )
    _render_category(
        top_right, "Medium specificity", normal_hitters, seen, export_rows,
    )

    st.divider()

    # Bottom row: specific hitters, then proteins without any hit.
    bottom_left, bottom_mid, bottom_right = st.columns([0.5, 0.25, 0.25])
    _render_category(
        bottom_left, "High specificity", specific_hitters, seen, export_rows,
    )
    # Proteins not reported above that are present in hek_proteome.
    _render_category(
        bottom_mid, "Never enriched", hek_proteome, seen, export_rows,
        visible_columns=id_columns,
    )
    # Remaining proteins that are known to pid2name_primary.
    _render_category(
        bottom_right, "Not in HEK293t", human_proteome, seen, export_rows,
        visible_columns=id_columns,
    )

    # Combined export of every category, most-hit proteins first.
    data = pd.DataFrame(export_rows, columns=["Category"] + _RESULT_COLUMNS)
    data = data.sort_values(
        by=["Hits", "GeneName", "Category"], ascending=[False, True, True]
    ).reset_index(drop=True)
    data = convert_df(data)
    st.download_button(
        label="Download search results",
        data=data,
        file_name="ligand_discovery_search_results.csv",
        mime="text/csv",
    )