2
File size: 8,441 Bytes
33c4cb4
 
 
 
 
 
 
 
 
 
 
 
a8bb0fc
647aa0d
a8bb0fc
 
 
 
 
33c4cb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8bb0fc
33c4cb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37b61da
 
33c4cb4
37b61da
33c4cb4
 
 
 
 
 
37b61da
33c4cb4
 
37b61da
33c4cb4
 
 
 
37b61da
33c4cb4
 
37b61da
33c4cb4
 
 
 
 
37b61da
33c4cb4
 
 
 
 
 
 
 
 
 
 
 
 
37b61da
33c4cb4
 
 
 
 
 
 
37b61da
33c4cb4
 
 
 
 
 
 
 
 
 
37b61da
33c4cb4
 
 
 
 
 
 
 
 
 
 
 
 
 
37b61da
33c4cb4
 
 
 
 
 
 
 
 
 
 
 
37b61da
33c4cb4
 
37b61da
 
 
 
33c4cb4
 
 
 
 
 
 
 
 
37b61da
33c4cb4
 
37b61da
33c4cb4
 
 
 
 
 
 
 
 
 
 
37b61da
33c4cb4
 
37b61da
33c4cb4
 
 
 
 
 
 
 
 
 
 
37b61da
33c4cb4
 
 
 
 
37b61da
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
import os
import streamlit as st
import pandas as pd
import csv
import collections
import joblib

# Directory containing this script; all data files are resolved relative to it.
root = os.path.dirname(os.path.abspath(__file__))

# Specificity thresholds on the number of fragments that enrich a protein:
# >= FREQUENT_CUTOFF -> "frequently enriched" (low specificity),
# >= MEDIUM_CUTOFF   -> "medium specificity",
# below              -> "high specificity".
FREQUENT_CUTOFF = 40
MEDIUM_CUTOFF = 10

st.set_page_config(
    page_title="Ligand Discovery 2: Explore Protein-sets",  # fixed typo: was "Disovery"
    page_icon=":home:",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Pull the main content block upwards to reduce Streamlit's default top margin.
# NOTE(review): this targets an auto-generated Streamlit CSS class name and is
# likely to break on a Streamlit version upgrade -- confirm after upgrading.
st.markdown("""
  <style>
    .css-13sdm1b.e16nr0p33 {
      margin-top: -75px;
    }
  </style>
""", unsafe_allow_html=True)

# Hide Streamlit's default chrome (hamburger menu, footer, header).
hide_streamlit_style = """
            <style>
            #MainMenu {visibility: hidden;}
            footer {visibility: hidden;}
            #header {visibility: hidden;}
            </style>
            """
st.markdown(hide_streamlit_style, unsafe_allow_html=True) 


# read data

@st.cache_data()
def load_screening_hits():
    """Load the primary screening hits table (one protein-fragment pair per row)."""
    return pd.read_csv(os.path.join(root, "./screening_hits.tsv"), sep="\t")

@st.cache_data()
def load_human_proteome():
    """Load the human proteome table annotated with gene names."""
    path = os.path.join(root, "./human_proteome_with_gene_names.tab")
    return pd.read_csv(path, sep="\t")

@st.cache_data()
def load_hek_proteome():
    """Return the set of UniProt accessions in the HEK293T core proteome (first column of the TSV)."""
    with open(os.path.join(root, "./hek293t_core.tsv"), "r") as f:
        return {row[0] for row in csv.reader(f)}

@st.cache_data()
def load_pid2name_primary():
    """Load the UniProt accession -> primary gene name mapping from disk."""
    mapping_path = os.path.join(root, "./pid2name_primary.joblib")
    return joblib.load(mapping_path)

@st.cache_data()
def convert_df(df):
    """Serialize a dataframe to UTF-8 CSV bytes (header included, index dropped)."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')

@st.cache_data()
def convert_df_no_header(df):
    """Serialize a dataframe to UTF-8 CSV bytes without header or index."""
    csv_text = df.to_csv(index=False, header=False)
    return csv_text.encode('utf-8')

@st.cache_data()
def example_input_load():
    """Read the example input file: one UniProt accession per row, first column only."""
    with open(os.path.join(root, "./example_input.csv"), "r") as f:
        return [row[0] for row in csv.reader(f)]

# Load all cached resources once at startup.
db = load_screening_hits()
hek_proteome = load_hek_proteome()
pid2name_primary = load_pid2name_primary()
human_proteome = set(pid2name_primary)
example_input = example_input_load()

# Resolve either a gene name or an accession back to the canonical accession.
any2pid = {}
for pid, gene in pid2name_primary.items():
    any2pid[gene] = pid
    any2pid[pid] = pid

# Bidirectional lookup tables: protein -> enriching fragments and
# fragment -> enriched proteins.
pid2fid = collections.defaultdict(list)
fid2pid = collections.defaultdict(list)
for pid, fid in db[["Accession", "FragID"]].values:
    pid2fid[pid].append(fid)
    fid2pid[fid].append(pid)

# Partition proteins into specificity tiers by how many fragments hit them.
frequent_hitters = set()
normal_hitters = set()
specific_hitters = set()
for pid, fids_for_pid in pid2fid.items():
    n_hits = len(fids_for_pid)
    if n_hits >= FREQUENT_CUTOFF:
        frequent_hitters.add(pid)
    elif n_hits >= MEDIUM_CUTOFF:
        normal_hitters.add(pid)
    else:
        specific_hitters.add(pid)

# Searchable options: every accession and every gene name.
options = sorted(x for pair in pid2name_primary.items() for x in pair)

# layout

st.sidebar.title("Ligand Discovery 2: Explore Protein-sets")
st.sidebar.write("We screened 407 fully-functionalized small molecule fragments ('Ligands') in HEK293t cells. For {0} of the Ligands, we found at least one protein enriched. In total, we enriched {1} proteins at least once. Query your protein sets of interest and explore them in light of our dataset!".format(len(fid2pid), len(pid2fid)))

# Input option 1: manual selection. Options are restricted to known
# identifiers, so the any2pid lookup cannot fail here.
manual_input = st.sidebar.multiselect(label="Input proteins manually", options = [""] + sorted(options), default=[], help="Select proteins by UniProt Accession code or Gene Symbol")
user_pids = {}
user_input = []
for i in manual_input:
    user_pids[i] = any2pid[i]
    user_input += [i]

st.sidebar.subheader("OR")

# Input option 2: a pre-screened Ligand; its enriched proteins become the query.
fids = sorted(set(db["FragID"]))
fid_input = st.sidebar.selectbox(label="Select pre-screened Ligand by identifier", options = [""] + fids, help="Select an already profiled Ligand in our primary screening (page Interactions). Use the Ligand identifier (example, C001)")
if fid_input != "":
    user_input = fid2pid[fid_input]
    user_pids = dict((r,r) for r in user_input)

st.sidebar.subheader("OR")

# Input option 3: uploaded file, one identifier per row. Unlike the manual
# selector, file content is unconstrained user input: unknown identifiers
# previously raised an uncaught KeyError; now they are reported and skipped.
file_input = st.sidebar.file_uploader(label="Upload a file", help="Provide a file containing one UniProt Accession code or Gene Symbol per row.")
if file_input:
    user_input = list(pd.read_csv(file_input, header=None)[0])
    unknown = [i for i in user_input if i not in any2pid]
    if unknown:
        st.sidebar.warning("Skipped {0} unrecognized identifier(s): {1}".format(len(unknown), ", ".join(str(x) for x in unknown[:10])))
        user_input = [i for i in user_input if i in any2pid]
    for i in user_input:
        user_pids[i] = any2pid[i]

st.sidebar.download_button(label="Download example file", data=convert_df_no_header(pd.DataFrame({"uniprot_ac": example_input})), file_name="protein_profile_example.csv", mime="text/csv")

# checks

# Normalize empty widget values ("" / empty list / None upload) to None so
# the provided-inputs count below is uniform.
if not manual_input:
    manual_input = None

if not fid_input:
    fid_input = None

if not file_input:
    file_input = None

# Exactly one input channel must be used for a query to be runnable.
provided = [x for x in (manual_input, fid_input, file_input) if x is not None]
if not provided:
    st.sidebar.info("Use any of the options above to explore a protein profile...")
    query_is_available = False
elif len(provided) > 1:
    st.sidebar.error("More than one input type has been provided! Please only choose one of the options, i.e. input proteins manually, or select a pre-screened Ligand, or upload a file. Refresh this window to get started again.")
    query_is_available = False
else:
    query_is_available = True


def serialize_s(cat, r):
    """Flatten a result row for CSV export.

    Prepends the category label and replaces the trailing fragment list
    with a single space-joined string.
    """
    *leading_fields, fragment_list = r
    return [cat, *leading_fields, " ".join(fragment_list)]

if query_is_available:
    # Render the query results in five categories across two rows of columns.
    # `done` tracks inputs already assigned so the fall-through categories
    # ("Never enriched", "Not in HEK293t") only see the remainder.
    # `S` accumulates serialized rows across ALL categories for the download.
    columns = st.columns([0.5, 0.5])

    done = set()

    # --- Frequently enriched (low specificity) ---
    col = columns[0]
    cat_name = "Frequently enriched"
    S = []
    R = []
    for r in user_input:
        pid = user_pids[r]
        if pid in frequent_hitters:
            # Fragments are sorted for consistency with the other categories
            # (previously this category alone left them unsorted).
            R += [[pid, pid2name_primary[pid], len(pid2fid[pid]), sorted(pid2fid[pid])]]
            S += [serialize_s(cat_name, R[-1])]
            done.update([r])
    df = pd.DataFrame(R, columns=["UniProt", "GeneName", "Hits", "Fragments"])
    col.markdown("**{0} (Low specificity)** : {1}".format(cat_name, df.shape[0]))
    col.dataframe(df, use_container_width=True)

    # --- Medium specificity ---
    col = columns[1]
    cat_name = "Medium specificity"
    R = []
    for r in user_input:
        pid = user_pids[r]
        if pid in normal_hitters:
            R += [[pid, pid2name_primary[pid], len(pid2fid[pid]), sorted(pid2fid[pid])]]
            S += [serialize_s(cat_name, R[-1])]
            done.update([r])
    df = pd.DataFrame(R, columns=["UniProt", "GeneName", "Hits", "Fragments"])
    col.markdown("**{0}** : {1}".format(cat_name, df.shape[0]))
    col.dataframe(df, use_container_width=True)

    st.divider()
    columns = st.columns([0.5, 0.25, 0.25])

    # --- High specificity ---
    col = columns[0]
    cat_name = "High specificity"
    R = []
    for r in user_input:
        pid = user_pids[r]
        if pid in specific_hitters:
            R += [[pid, pid2name_primary[pid], len(pid2fid[pid]), sorted(pid2fid[pid])]]
            S += [serialize_s(cat_name, R[-1])]
            done.update([r])
    df = pd.DataFrame(R, columns=["UniProt", "GeneName", "Hits", "Fragments"])
    col.markdown("**{0}** : {1}".format(cat_name, df.shape[0]))
    col.dataframe(df, use_container_width=True)

    # --- Never enriched (present in HEK293T but no hits) ---
    col = columns[1]
    cat_name = "Never enriched"
    R = []
    for r in user_input:
        if r in done:
            continue
        pid = user_pids[r]
        if pid in hek_proteome:
            R += [[pid, pid2name_primary[pid], len(pid2fid[pid]), sorted(pid2fid[pid])]]
            S += [serialize_s(cat_name, R[-1])]
            done.update([r])
    df = pd.DataFrame(R, columns=["UniProt", "GeneName", "Hits", "Fragments"])
    col.markdown("**{0}** : {1}".format(cat_name, df.shape[0]))
    # Hits/Fragments are meaningless here; only identity columns are shown.
    col.dataframe(df[["UniProt", "GeneName"]], use_container_width=True)

    # --- Not in HEK293t (human proteins outside the screened proteome) ---
    col = columns[2]
    cat_name = "Not in HEK293t"
    R = []
    for r in user_input:
        if r in done:
            continue
        pid = user_pids[r]
        if pid in human_proteome:
            fids_ = sorted(pid2fid[pid])
            R += [[pid, pid2name_primary[pid], len(pid2fid[pid]), fids_]]
            S += [serialize_s(cat_name, R[-1])]
    df = pd.DataFrame(R, columns=["UniProt", "GeneName", "Hits", "Fragments"])
    col.markdown("**{0}** : {1}".format(cat_name, df.shape[0]))
    col.dataframe(df[["UniProt", "GeneName"]], use_container_width=True)

    # Downloadable summary of all categories, most-hit proteins first.
    data = pd.DataFrame(S, columns = ["Category", "UniProt", "GeneName", "Hits", "Fragments"])
    data = data.sort_values(by=["Hits", "GeneName", "Category"], ascending=[False, True, True]).reset_index(drop=True)
    data = convert_df(data)
    st.download_button(label="Download search results", data=data, file_name="ligand_discovery_search_results.csv", mime="text/csv")