seyonec committed on
Commit
4520ee8
·
1 Parent(s): 2da269e

clean basic LTT

Browse files
clean_selection/analyze_clean_hierarchical_loss_protein_vec.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4d4e54f9a6d53d0fa29ae72291e719a394f18bae83a55408ad867934a19657b
3
+ size 49522
clean_selection/process_clean_ec.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2fd5b4ecd25db8f579847e7a21230f98dff5a850de3cf418e0e6cca8fe227f1
3
- size 5471
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6a7b8853fa3df8fe380cea2391bdace7b76605c3e80770b397c1ba5776aee6c
3
+ size 5860
protein_conformal/scope_utils.py CHANGED
@@ -287,14 +287,17 @@ def get_hierarchical_loss(data_, lambda_):
287
  return np.mean(losses)
288
 
289
 
290
- def get_thresh_max_hierarchical(data, lambdas, alpha):
291
  # get the worst case loss
292
  wc_loss = 4 # in the max case, the max_loss is simply retrieving a protein with different class
293
  loss_thresh = alpha - (wc_loss - alpha)/len(data) # normalize by size of calib set
294
  losses = []
295
  best_lam = None
296
- for lam in reversed(lambdas): # start from the largest lambda since we are dealing with raw similarity scores
297
- per_lam_loss = get_hierarchical_max_loss(data, lam)
 
 
 
298
  if per_lam_loss > loss_thresh:
299
  break
300
  best_lam = lam
@@ -306,10 +309,13 @@ def get_thresh_max_hierarchical(data, lambdas, alpha):
306
 
307
  return best_lam, losses
308
 
309
- def get_hierarchical_max_loss(data_, lambda_):
310
  losses = []
311
  for query in data_:
312
- thresh_idx = query['S_i'] >= lambda_
 
 
 
313
  if np.sum(thresh_idx) == 0:
314
  loss = 0
315
  else:
 
287
  return np.mean(losses)
288
 
289
 
290
+ def get_thresh_max_hierarchical(data, lambdas, alpha, sim="cosine"):
291
  # get the worst case loss
292
  wc_loss = 4 # in the max case, the max_loss is simply retrieving a protein with different class
293
  loss_thresh = alpha - (wc_loss - alpha)/len(data) # normalize by size of calib set
294
  losses = []
295
  best_lam = None
296
+ if sim == "cosine":
297
+ ## reverse lambdas and return list
298
+ lambdas = list(reversed(lambdas))
299
+ for lam in lambdas: # start from the largest lambda since we are dealing with raw similarity scores
300
+ per_lam_loss = get_hierarchical_max_loss(data, lam, sim=sim)
301
  if per_lam_loss > loss_thresh:
302
  break
303
  best_lam = lam
 
309
 
310
  return best_lam, losses
311
 
312
+ def get_hierarchical_max_loss(data_, lambda_, sim="cosine"):
313
  losses = []
314
  for query in data_:
315
+ if sim == "cosine":
316
+ thresh_idx = query['S_i'] >= lambda_
317
+ else:
318
+ thresh_idx = query['S_i'] <= lambda_
319
  if np.sum(thresh_idx) == 0:
320
  loss = 0
321
  else: