ronboger commited on
Commit
83010ef
·
1 Parent(s): 16697aa

separate util for analyzing protein vec results

Browse files
Interactive-1.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad0aca44189d80ee7752b2fa0509d027d12e7d18d47bfe9737a50131054c1d96
3
+ size 402714
analyze_protein_vec_results.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d56f906274fd48874071190c7cbf9056aa9f0681d49cded47b3bb7cab6951ea7
3
- size 418551
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e06ff17665134f9e8946846d5536d234213e31cef667b678b51fde2707b6740
3
+ size 13863852
create_learn_then_test.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e86ca6eda648119e178962c5ce4e6051f807a68aadba1404621f20987634af78
3
- size 73206
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:375209f78d75e7abe857572b15cccfc82fc6af4d99c93b501c39d88e9d66dcd8
3
+ size 74759
util.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.isotonic import IsotonicRegression
2
+ import numpy as np
3
+ from scipy.stats import binom, norm
4
+
5
+
6
def get_sims_labels(data, partial=False):
    """Flatten per-query similarities and match labels into parallel lists.

    Args:
        data: iterable of query dicts, each with "S_i" (numpy array of
            similarity scores), "exact" (per-hit exact-match labels) and
            "partial" (per-hit partial-match indicator row(s)).
        partial: if True, a hit is labeled positive when ANY of its partial
            indicator columns is set; otherwise the "exact" labels are used.

    Returns:
        (sims, labels): two equal-length Python lists.
    """
    sims = []
    labels = []
    for query in data:
        similarity = query["S_i"]
        sims += similarity.tolist()
        if partial:
            partial_arr = np.array(query["partial"])
            # Mirror validate_lhat: collapse multi-column partial indicators
            # with a row-wise OR. A 1-D row is already the per-hit labels —
            # the original unconditionally reduced over axis=1 and raised on
            # 1-D input.
            if partial_arr.ndim > 1:
                labels_to_append = np.logical_or.reduce(partial_arr, axis=1).tolist()
            else:
                labels_to_append = partial_arr.tolist()
        else:
            labels_to_append = query["exact"]
        labels += labels_to_append
    return sims, labels
18
+
19
+
20
def get_thresh(data, alpha):
    """Conformal-risk-control threshold on exact-match similarities.

    Pools the similarity scores of all exact matches across queries and
    returns their empirical quantile at level max(alpha - (1 - alpha)/n, 0),
    using the "lower" method so the threshold is an actually-achieved score.

    Args:
        data: iterable of query dicts with "S_i" (scores) and "exact"
            (boolean mask of exact matches).
        alpha: target risk level in [0, 1].

    Returns:
        Threshold lhat (0 when there are no exact matches at all).
    """
    all_sim_exact = []
    for query in data:
        idx = query["exact"]
        similarity = query["S_i"]
        sims_to_append = similarity[idx]
        all_sim_exact += list(sims_to_append)
    n = len(all_sim_exact)
    if n > 0:
        # `method=` replaces the `interpolation=` keyword, which was
        # deprecated in NumPy 1.22 and removed in NumPy 2.0.
        lhat = np.quantile(
            all_sim_exact,
            np.maximum(alpha - (1 - alpha) / n, 0),
            method="lower",
        )
    else:
        lhat = 0
    return lhat
38
+
39
+
40
def bentkus_p_value(r_hat, n, alpha):
    """Bentkus p-value for H0: risk > alpha, given empirical risk r_hat over n samples."""
    ceil_count = np.ceil(n * r_hat)
    return binom.cdf(ceil_count, n, alpha / np.e)
43
+
44
+
45
def clt_p_value(r_hat, std_hat, n, alpha):
    """Gaussian (CLT) p-value for H0: risk > alpha.

    Standardizes the empirical risk r_hat (sample std std_hat over n
    observations) against alpha and returns the lower-tail probability.
    """
    standard_error = std_hat / np.sqrt(n)
    z_score = (r_hat - alpha) / standard_error
    return norm.cdf(z_score)
52
+
53
+
54
+ def percentage_of_discoveries(sims, labels, lam):
55
+ # FDR: Number of false matches / number of matches
56
+ total_discoveries = (sims >= lam).sum(axis=1)
57
+ return total_discoveries.mean() / len(labels) # or sims.shape[1]
58
+
59
+
60
def risk(sims, labels, lam):
    """Empirical FDR at threshold lam: per query, false matches / matches
    (denominator clipped to at least 1), averaged over queries."""
    matches = sims >= lam
    discoveries = matches.sum(axis=1)
    false_discoveries = ((1 - labels) * matches).sum(axis=1)
    per_query_fdp = false_discoveries / np.maximum(discoveries, 1)
    return per_query_fdp.mean()
66
+
67
+
68
def calculate_false_negatives(sims, labels, lam):
    """Empirical FNR at threshold lam: per query, true matches scored below
    lam divided by total true matches (denominator clipped to at least 1),
    averaged over queries.

    NOTE(review): labels must be boolean for the `&` — matches original.
    """
    positives = labels.sum(axis=1)
    missed = (labels & (sims < lam)).sum(axis=1)
    return (missed / np.maximum(positives, 1)).mean()
74
+
75
+
76
def risk_no_empties(sims, labels, lam):
    """Empirical FDR at threshold lam, averaged only over queries that have
    at least one discovery.

    Queries with zero discoveries are excluded from the average rather than
    clipped (contrast with `risk`). Returns 0.0 when NO query has any
    discovery — the original took the mean of an empty slice, yielding NaN
    plus a RuntimeWarning.
    """
    matches = sims >= lam
    total_discoveries = matches.sum(axis=1)
    false_discoveries = ((1 - labels) * matches).sum(axis=1)
    nonempty = total_discoveries > 0
    if not np.any(nonempty):
        return 0.0
    return (false_discoveries[nonempty] / total_discoveries[nonempty]).mean()
84
+
85
+
86
def std_loss(sims, labels, lam):
    """Std-dev across queries of the per-query false-discovery proportion at
    threshold lam (denominator clipped to at least 1)."""
    matches = sims >= lam
    clipped_discoveries = np.maximum(matches.sum(axis=1), 1)
    false_discoveries = ((1 - labels) * matches).sum(axis=1)
    return (false_discoveries / clipped_discoveries).std()
92
+
93
+
94
def get_thresh_FDR(labels, sims, alpha, delta=0.5, N=5000):
    """FDR-controlling threshold via Learn-then-Test (LTT).

    Scans N candidate thresholds across the score range, computes a CLT
    p-value for the empirical FDR at each, and returns the smallest lambda
    such that every candidate at or above it has p-value <= delta.
    """
    print(f"sims.max: {sims.max()}")
    n = len(labels)
    candidates = np.linspace(sims.min(), sims.max(), N)
    fdr_hat = np.array([risk(sims, labels, lam) for lam in candidates])
    fdr_std = np.array([std_loss(sims, labels, lam) for lam in candidates])
    pvals = np.array([clt_p_value(r, s, n, alpha) for r, s in zip(fdr_hat, fdr_std)])
    accepted = pvals <= delta
    # suffix_ok[i] is True iff every candidate from i onward is accepted;
    # a reversed running AND is equivalent to np.all(accepted[i:]) per i.
    suffix_ok = np.logical_and.accumulate(accepted[::-1])[::-1]
    lhat = candidates[np.argmax(suffix_ok)]
    print(f"lhat: {lhat}")
    print(f"risk: {risk(sims, labels, lhat)}")
    return lhat
112
+
113
+
114
def get_isotone_regression(data):
    """Fit a clipping isotonic regression of partial-match labels on similarity scores."""
    scores, targets = get_sims_labels(data, partial=True)
    model = IsotonicRegression(out_of_bounds="clip")
    model.fit(scores, targets)
    return model
119
+
120
+
121
def simplifed_venn_abers_prediction(calib_data, test_data_point):
    """Simplified Venn-Abers prediction for a single test score.

    Appends the test point to the calibration scores under each hypothetical
    label (True, then False), fits an isotonic regression per hypothesis,
    and returns the pair of calibrated probabilities for the test score.
    Removes the six leftover debug print() calls from the original.
    (Name keeps the original spelling for caller compatibility.)

    Args:
        calib_data: calibration queries consumed by get_sims_labels
            (exact-match labels).
        test_data_point: similarity score of the point to calibrate.

    Returns:
        (p_0, p_1): predictions from the True-labeled and False-labeled fits.
    """
    sims, labels = get_sims_labels(calib_data, partial=False)

    # Hypothesize the test point is a true match and fit.
    sims.append(test_data_point)
    labels.append(True)
    ir_0 = IsotonicRegression(out_of_bounds="clip")
    ir_0.fit(sims, labels)

    # Flip the hypothesis to a non-match and refit.
    labels[-1] = False
    ir_1 = IsotonicRegression(out_of_bounds="clip")
    ir_1.fit(sims, labels)

    p_0 = ir_0.predict([test_data_point])[0]
    p_1 = ir_1.predict([test_data_point])[0]

    return p_0, p_1
145
+
146
+
147
def validate_lhat(data, lhat):
    """Evaluate a similarity threshold lhat on held-out queries.

    Hits with score >= lhat count as identified; "exact" and "partial"
    labels are used to tally misses and false identifications.

    Args:
        data: iterable of query dicts with "S_i" (scores), "exact" (boolean
            exact-match mask) and "partial" (one or more indicator rows).
        lhat: similarity threshold.

    Returns:
        Tuple of rates:
            (missed exact / total exact,
             inexact identified / total identified,
             missed partial / total partial,
             partial identified / total identified).
        Denominators are clipped to at least 1 — resolving the original
        "TODO: are there any divisions by zero here?" — so an empty category
        yields 0.0 instead of raising ZeroDivisionError.
    """
    total_missed = 0
    total_missed_partial = 0
    total_exact = 0
    total_inexact_identified = 0
    total_identified = 0
    total_partial = 0
    total_partial_identified = 0
    for query in data:
        idx = query["exact"]
        # Collapse multi-row partial indicators with a row-wise OR; a single
        # row is already the per-hit partial labels.
        if len(np.array(query["partial"]).shape) > 1:
            idx_partial = np.logical_or.reduce(query["partial"], axis=1)
        else:
            idx_partial = query["partial"]

        sims = query["S_i"]
        sims_exact = sims[idx]
        sims_partial = sims[idx_partial]
        total_missed += (sims_exact < lhat).sum()

        total_missed_partial += (sims_partial < lhat).sum()
        total_partial_identified += (sims_partial >= lhat).sum()
        total_partial += len(sims_partial)

        total_exact += len(sims_exact)
        total_inexact_identified += (sims[~np.array(idx)] >= lhat).sum()
        total_identified += (sims >= lhat).sum()
    return (
        total_missed / max(total_exact, 1),
        total_inexact_identified / max(total_identified, 1),
        total_missed_partial / max(total_partial, 1),
        total_partial_identified / max(total_identified, 1),
    )