File size: 8,441 Bytes
33c4cb4 a8bb0fc 647aa0d a8bb0fc 33c4cb4 a8bb0fc 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da 33c4cb4 37b61da |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 |
import os
import streamlit as st
import pandas as pd
import csv
import collections
import joblib
# Directory containing this script; all data files are resolved relative to it.
root = os.path.dirname(os.path.abspath(__file__))
# A protein enriched by >= FREQUENT_CUTOFF fragments is a frequent (low-specificity)
# hitter; >= MEDIUM_CUTOFF is medium specificity; anything below is high specificity.
FREQUENT_CUTOFF = 40
MEDIUM_CUTOFF = 10

st.set_page_config(
    # Fixed typo "Disovery" -> "Discovery" (matches the sidebar title below).
    page_title="Ligand Discovery 2: Explore Protein-sets",
    page_icon=":home:",
    layout="wide",  # "centered",
    initial_sidebar_state="expanded"
)

# Pull the main container up to reclaim vertical space taken by the header.
st.markdown("""
<style>
.css-13sdm1b.e16nr0p33 {
margin-top: -75px;
}
</style>
""", unsafe_allow_html=True)

# Hide the default Streamlit chrome (hamburger menu, footer, header).
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
#header {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
# read data
@st.cache_data()
def load_screening_hits():
    """Load the primary-screen hit table (tab-separated) shipped next to this script."""
    return pd.read_csv(os.path.join(root, "./screening_hits.tsv"), sep="\t")
@st.cache_data()
def load_human_proteome():
    """Load the human proteome table (accession + gene names, tab-separated)."""
    path = os.path.join(root, "./human_proteome_with_gene_names.tab")
    return pd.read_csv(path, sep="\t")
@st.cache_data()
def load_hek_proteome():
    """Return the set of accessions in the HEK293T core proteome (first CSV column)."""
    with open(os.path.join(root, "./hek293t_core.tsv"), "r") as fh:
        return {row[0] for row in csv.reader(fh)}
@st.cache_data()
def load_pid2name_primary():
    """Load the UniProt accession -> primary gene name mapping from a joblib dump."""
    mapping_path = os.path.join(root, "./pid2name_primary.joblib")
    return joblib.load(mapping_path)
@st.cache_data()
def convert_df(df):
    """Serialize *df* to UTF-8 encoded CSV bytes (no index) for a download button."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
@st.cache_data()
def convert_df_no_header(df):
    """Serialize *df* to UTF-8 encoded CSV bytes with neither index nor header row."""
    csv_text = df.to_csv(index=False, header=False)
    return csv_text.encode('utf-8')
@st.cache_data()
def example_input_load():
    """Read the bundled example input and return its first column as a list of IDs."""
    with open(os.path.join(root, "./example_input.csv"), "r") as fh:
        return [row[0] for row in csv.reader(fh)]
# Load all datasets once (cached) and derive the lookup tables used below.
db = load_screening_hits()
hek_proteome = load_hek_proteome()
pid2name_primary = load_pid2name_primary()
human_proteome = set(pid2name_primary.keys())
example_input = example_input_load()

# Map both gene symbols and accessions back to the canonical UniProt accession.
any2pid = {}
for pid, gene in pid2name_primary.items():
    any2pid[gene] = pid
    any2pid[pid] = pid

# Bidirectional protein <-> fragment hit maps built from the screening table.
pid2fid = collections.defaultdict(list)
fid2pid = collections.defaultdict(list)
for pid, fid in db[["Accession", "FragID"]].values:
    pid2fid[pid].append(fid)
    fid2pid[fid].append(pid)

# Bucket each enriched protein by how promiscuously it is hit.
frequent_hitters = set()
normal_hitters = set()
specific_hitters = set()
for pid, fids in pid2fid.items():
    n_hits = len(fids)
    if n_hits >= FREQUENT_CUTOFF:
        frequent_hitters.add(pid)
    elif n_hits >= MEDIUM_CUTOFF:
        normal_hitters.add(pid)
    else:
        specific_hitters.add(pid)

# Every accession and gene symbol, sorted, for the manual-selection widget.
options = sorted([x for k, v in pid2name_primary.items() for x in [k, v]])
# layout
# Sidebar: three mutually exclusive ways to define a protein set.
st.sidebar.title("Ligand Discovery 2: Explore Protein-sets")
st.sidebar.write("We screened 407 fully-functionalized small molecule fragments ('Ligands') in HEK293t cells. For {0} of the Ligands, we found at least one protein enriched. In total, we enriched {1} proteins at least once. Query your protein sets of interest and explore them in light of our dataset!".format(len(fid2pid), len(pid2fid)))

# Option 1: pick proteins from the full accession/gene-symbol vocabulary.
manual_input = st.sidebar.multiselect(label="Input proteins manually", options = [""] + sorted(options), default=[], help="Select proteins by UniProt Accession code or Gene Symbol")
user_pids = {}   # raw user entry -> canonical UniProt accession
user_input = []  # raw user entries, in input order
for i in manual_input:
    user_pids[i] = any2pid[i]
    user_input += [i]

st.sidebar.subheader("OR")
# Option 2: use the full hit list of one pre-screened Ligand.
fids = sorted(set(db["FragID"]))
fid_input = st.sidebar.selectbox(label="Select pre-screened Ligand by identifier", options = [""] + fids, help="Select an already profiled Ligand in our primary screening (page Interactions). Use the Ligand identifier (example, C001)")
if fid_input != "":
    user_input = fid2pid[fid_input]
    user_pids = dict((r, r) for r in user_input)

st.sidebar.subheader("OR")
# Option 3: upload a one-column file of accessions/gene symbols.
file_input = st.sidebar.file_uploader(label="Upload a file", help="Provide a file containing one UniProt Accession code or Gene Symbol per row.")
if file_input:
    uploaded = list(pd.read_csv(file_input, header=None)[0])
    # Filter out identifiers we cannot map; previously an unrecognized ID
    # raised an uncaught KeyError and crashed the app.
    unknown = [i for i in uploaded if i not in any2pid]
    if unknown:
        st.sidebar.warning("Ignoring {0} unrecognized identifier(s), e.g. {1}".format(len(unknown), unknown[0]))
    user_input = [i for i in uploaded if i in any2pid]
    for i in user_input:
        user_pids[i] = any2pid[i]
st.sidebar.download_button(label="Download example file", data=convert_df_no_header(pd.DataFrame({"uniprot_ac": example_input})), file_name="protein_profile_example.csv", mime="text/csv")
# checks
# Normalize empty widget values to None so the input-count check is uniform.
manual_input = manual_input if manual_input else None
fid_input = fid_input if fid_input else None
file_input = file_input if file_input else None

provided = [x for x in (manual_input, fid_input, file_input) if x is not None]
if not provided:
    st.sidebar.info("Use any of the options above to explore a protein profile...")
    query_is_available = False
elif len(provided) > 1:
    # Exactly one input mode must be used at a time.
    st.sidebar.error("More than one input type has been provided! Please only choose one of the options, i.e. input proteins manually, or select a pre-screened Ligand, or upload a file. Refresh this window to get started again.")
    query_is_available = False
else:
    query_is_available = True
def serialize_s(cat, r):
    """Flatten a result row for CSV export: prepend the category and join the trailing fragment list into one space-separated string."""
    *fields, fragments = r
    return [cat, *fields, " ".join(fragments)]
if query_is_available:
    S = []  # serialized rows (via serialize_s) for the downloadable summary CSV

    def _category_rows(cat_name, members, done, only_undone=False):
        """Build the result table for one specificity category.

        Scans user_input, keeps entries whose mapped accession is in
        *members*, marks them in *done*, and appends a serialized row to S.
        With only_undone=True, entries already categorized are skipped.
        Returns a DataFrame with UniProt/GeneName/Hits/Fragments columns.
        """
        rows = []
        for r in user_input:
            if only_undone and r in done:
                continue
            pid = user_pids[r]
            if pid in members:
                # Fragments are sorted in every category for a stable display
                # (previously "Frequently enriched" showed them in file order).
                rows.append([pid, pid2name_primary[pid], len(pid2fid[pid]), sorted(pid2fid[pid])])
                S.append(serialize_s(cat_name, rows[-1]))
                done.add(r)
        return pd.DataFrame(rows, columns=["UniProt", "GeneName", "Hits", "Fragments"])

    done = set()  # user entries already assigned to a category
    columns = st.columns([0.5, 0.5])

    col = columns[0]
    df = _category_rows("Frequently enriched", frequent_hitters, done)
    col.markdown("**{0} (Low specificity)** : {1}".format("Frequently enriched", df.shape[0]))
    col.dataframe(df, use_container_width=True)

    col = columns[1]
    df = _category_rows("Medium specificity", normal_hitters, done)
    col.markdown("**{0}** : {1}".format("Medium specificity", df.shape[0]))
    col.dataframe(df, use_container_width=True)

    st.divider()
    columns = st.columns([0.5, 0.25, 0.25])

    col = columns[0]
    df = _category_rows("High specificity", specific_hitters, done)
    col.markdown("**{0}** : {1}".format("High specificity", df.shape[0]))
    col.dataframe(df, use_container_width=True)

    col = columns[1]
    # Proteins present in the HEK293T background proteome but never enriched.
    df = _category_rows("Never enriched", hek_proteome, done, only_undone=True)
    col.markdown("**{0}** : {1}".format("Never enriched", df.shape[0]))
    col.dataframe(df[["UniProt", "GeneName"]], use_container_width=True)

    col = columns[2]
    # Human proteins not detected in the HEK293T background proteome.
    df = _category_rows("Not in HEK293t", human_proteome, done, only_undone=True)
    col.markdown("**{0}** : {1}".format("Not in HEK293t", df.shape[0]))
    col.dataframe(df[["UniProt", "GeneName"]], use_container_width=True)

    # Downloadable summary: all categories, sorted by hit count then gene name.
    data = pd.DataFrame(S, columns=["Category", "UniProt", "GeneName", "Hits", "Fragments"])
    data = data.sort_values(by=["Hits", "GeneName", "Category"], ascending=[False, True, True]).reset_index(drop=True)
    data = convert_df(data)
    st.download_button(label="Download search results", data=data, file_name="ligand_discovery_search_results.csv", mime="text/csv")