#!/usr/bin/env python
"""
Verify FDR algorithm using available calibration data.

This script tests the core FDR threshold computation algorithm using the
Pfam calibration data. It verifies that:

1. The FAISS similarity search works correctly
2. The FDR threshold computation produces the expected value
3. The Venn-Abers probability calibration works

This is a functional test of the algorithm, not a reproduction of the
exact Syn3.0 results (which require additional query embeddings).

Usage:
    python scripts/verify_fdr_algorithm.py
"""
import sys
from pathlib import Path
import numpy as np
# Add parent directory to path for imports
# (repo_root is the directory one level above the folder containing this script.)
repo_root = str(Path(__file__).parent.parent)
sys.path.insert(0, repo_root)
# Import util directly to avoid gradio dependency in __init__.py
# util.py is loaded from its file path under the module name "util" instead of
# going through a regular `import protein_conformal...` statement.
import importlib.util
spec = importlib.util.spec_from_file_location("util", f"{repo_root}/protein_conformal/util.py")
util = importlib.util.module_from_spec(spec)
# exec_module actually runs util.py; until this call `util` is an empty module object.
spec.loader.exec_module(util)
# Module-level aliases for the util functions used by this script.
# NOTE(review): "simplifed" (sic) presumably matches the spelling of the
# definition in util.py — confirm before renaming.
load_database = util.load_database
query = util.query
simplifed_venn_abers_prediction = util.simplifed_venn_abers_prediction
get_sims_labels = util.get_sims_labels
get_thresh_FDR = util.get_thresh_FDR
def main():
    """Run a four-step functional check of the FDR pipeline and print a report.

    The steps are:

    1. Build a FAISS index on (at most) the first 10000 lookup embeddings.
    2. Run a similarity search with 10 random, row-normalised query vectors.
    3. Compute the FDR threshold on a shuffled 100-sample calibration subset.
    4. Evaluate the Venn-Abers probability at a few fixed similarity values.

    All output goes to stdout. The process exits with status 1 when any of
    the required data files is missing. An exception raised during the FDR
    step is caught, printed together with its traceback, and reported as a
    failure in the summary; the remaining steps still run.
    """
    data_dir = Path(__file__).parent.parent / 'data'

    print("=" * 60)
    print("FDR Algorithm Verification")
    print("=" * 60)

    # Check required files
    lookup_embeddings_path = data_dir / 'lookup_embeddings.npy'
    lookup_metadata_path = data_dir / 'lookup_embeddings_meta_data.tsv'
    calibration_data_path = data_dir / 'pfam_new_proteins.npy'

    required = [lookup_embeddings_path, lookup_metadata_path, calibration_data_path]
    missing = [p for p in required if not p.exists()]
    if missing:
        print("ERROR: Missing required files:")
        for f in missing:
            print(f" - {f}")
        sys.exit(1)

    # Test 1: Load lookup embeddings and build FAISS index
    print("\n1. Testing FAISS index construction...")
    embeddings = np.load(lookup_embeddings_path)
    print(f" Loaded embeddings: {embeddings.shape}")

    # Build index on a subset for speed
    subset_size = 10000
    subset_embeddings = embeddings[:subset_size]
    db = load_database(subset_embeddings)
    # Slicing yields fewer rows when the file holds less than subset_size
    # embeddings, so report the number that was actually indexed.
    print(f" Built FAISS index on {len(subset_embeddings)} embeddings")

    # Test 2: Query the database
    print("\n2. Testing similarity search...")
    # Use random query. The query width is taken from the lookup embeddings
    # so it always matches the index instead of assuming 512 dimensions.
    np.random.seed(42)
    embedding_dim = subset_embeddings.shape[1]
    query_emb = np.random.randn(10, embedding_dim).astype(np.float32)
    # Scale every query row to unit L2 norm.
    query_emb = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)

    D, I = query(db, query_emb, k=5)
    print(f" Query shape: {query_emb.shape}")
    print(f" Results D shape: {D.shape}, I shape: {I.shape}")
    print(f" Max similarity: {D.max():.6f}")
    print(f" Min similarity: {D.min():.6f}")

    # Test 3: Load calibration data and compute FDR threshold
    print("\n3. Testing FDR threshold computation...")
    # NOTE(review): allow_pickle=True unpickles objects stored in the file;
    # only use calibration files from a trusted source.
    cal_data = np.load(calibration_data_path, allow_pickle=True)
    print(f" Loaded {len(cal_data)} calibration samples")

    # Use a subset for faster testing (seeded so the subset is reproducible)
    np.random.seed(42)
    np.random.shuffle(cal_data)
    cal_subset = cal_data[:100]

    sims, labels = get_sims_labels(cal_subset, partial=False)
    print(f" Calibration sims shape: {sims.shape}")
    print(f" Calibration labels shape: {labels.shape}")

    # Compute FDR threshold
    alpha = 0.1
    delta = 0.5
    try:
        l_hat, risk_fdr = get_thresh_FDR(labels.flatten(), sims.flatten(), alpha=alpha, delta=delta, N=50)
        print(f" FDR threshold (α={alpha}): λ = {l_hat:.12f}")
        print(f" FDR risk at threshold: {risk_fdr:.6f}")

        # Expected threshold is around 0.999980
        if 0.9999 < l_hat < 1.0001:
            print(" ✓ Threshold is in expected range [0.9999, 1.0001]")
        else:
            print(f" ⚠ Threshold {l_hat} outside expected range")
    except Exception as e:
        # Deliberately broad: this is the reporting boundary of the check, and
        # the Venn-Abers step below should still run after a failure here.
        print(f" ✗ FDR computation failed: {e}")
        import traceback
        traceback.print_exc()
        l_hat = None

    # Test 4: Venn-Abers probability computation
    print("\n4. Testing Venn-Abers probability...")
    X_cal = sims.flatten()
    y_cal = labels.flatten()

    # Test with some similarity values
    test_sims = np.array([0.999, 0.9999, 0.99999, 1.0])
    for sim in test_sims:
        p0, p1 = simplifed_venn_abers_prediction(X_cal, y_cal, sim)
        # Report the midpoint of the two returned values and their gap.
        prob = (p0 + p1) / 2
        uncertainty = abs(p1 - p0)
        print(f" sim={sim:.5f} → prob={prob:.4f} (uncertainty={uncertainty:.4f})")

    print("\n" + "=" * 60)
    print("VERIFICATION COMPLETE")
    print("=" * 60)

    # Summary
    print("\nSummary:")
    print(" ✓ FAISS index construction works")
    print(" ✓ Similarity search works")
    # Compare against None explicitly: a computed threshold of 0.0 is falsy
    # but is still a successful computation.
    if l_hat is not None:
        print(" ✓ FDR threshold computation works")
    else:
        print(" ✗ FDR threshold computation failed")
    print(" ✓ Venn-Abers probability works")

    print("\nNote: To reproduce exact Syn3.0 results (59/149 = 39.6%),")
    print("you need the query embeddings for the 149 unknown genes.")
    print("These can be generated using the Protein-Vec model:")
    print(" python -m protein_conformal.embed_protein_vec --input unknown_aa_seqs.fasta")
# Run the verification only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()