Spaces:

LoocasGoose
/

cpr

Sleeping

seyonec committed on Apr 19, 2024

Commit

4520ee8

1 Parent(s): 2da269e

clean basic LTT

Files changed (3) hide show

clean_selection/analyze_clean_hierarchical_loss_protein_vec.ipynb ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:b4d4e54f9a6d53d0fa29ae72291e719a394f18bae83a55408ad867934a19657b
+size 49522

clean_selection/process_clean_ec.ipynb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e2fd5b4ecd25db8f579847e7a21230f98dff5a850de3cf418e0e6cca8fe227f1
-size 5471

 version https://git-lfs.github.com/spec/v1
+oid sha256:b6a7b8853fa3df8fe380cea2391bdace7b76605c3e80770b397c1ba5776aee6c
+size 5860

protein_conformal/scope_utils.py CHANGED Viewed

@@ -287,14 +287,17 @@ def get_hierarchical_loss(data_, lambda_):
     return np.mean(losses)
-def get_thresh_max_hierarchical(data, lambdas, alpha):
     # get the worst case loss
     wc_loss = 4 # in the max case, the max_loss is simply retrieving a protein with different class
     loss_thresh = alpha - (wc_loss - alpha)/len(data) # normalize by size of calib set
     losses = []
     best_lam = None
-    for lam in reversed(lambdas): # start from the largest lambda since we are dealing with raw similarity scores
-        per_lam_loss = get_hierarchical_max_loss(data, lam)
         if per_lam_loss > loss_thresh:
             break
         best_lam = lam
@@ -306,10 +309,13 @@ def get_thresh_max_hierarchical(data, lambdas, alpha):
     return best_lam, losses
-def get_hierarchical_max_loss(data_, lambda_):
     losses = []
     for query in data_:
-        thresh_idx = query['S_i'] >= lambda_
         if np.sum(thresh_idx) == 0:
             loss = 0
         else:

     return np.mean(losses)
+def get_thresh_max_hierarchical(data, lambdas, alpha, sim="cosine"):
     # get the worst case loss
     wc_loss = 4 # in the max case, the max_loss is simply retrieving a protein with different class
     loss_thresh = alpha - (wc_loss - alpha)/len(data) # normalize by size of calib set
     losses = []
     best_lam = None
+    if sim == "cosine":
+        ## reverse lambdas and return list
+        lambdas = list(reversed(lambdas))
+    for lam in lambdas: # start from the largest lambda since we are dealing with raw similarity scores
+        per_lam_loss = get_hierarchical_max_loss(data, lam, sim=sim)
         if per_lam_loss > loss_thresh:
             break
         best_lam = lam
     return best_lam, losses
+def get_hierarchical_max_loss(data_, lambda_, sim="cosine"):
     losses = []
     for query in data_:
+        if sim == "cosine":
+            thresh_idx = query['S_i'] >= lambda_
+        else:
+            thresh_idx = query['S_i'] <= lambda_
         if np.sum(thresh_idx) == 0:
             loss = 0
         else: