#!/usr/bin/env python
"""
Verify FDR algorithm using available calibration data.

This script tests the core FDR threshold computation algorithm using the
Pfam calibration data. It verifies that:

1. The FAISS similarity search works correctly
2. The FDR threshold computation produces the expected value
3. The Venn-Abers probability calibration works

This is a functional test of the algorithm, not a reproduction of the exact
Syn3.0 results (which require additional query embeddings).

The process exits with status 1 when a required data file is missing or when
the FDR threshold computation raises, and with status 0 otherwise.

Usage:
    python scripts/verify_fdr_algorithm.py
"""

import importlib.util
import sys
import traceback
from pathlib import Path

import numpy as np

# Add parent directory to path for imports
repo_root = str(Path(__file__).parent.parent)
sys.path.insert(0, repo_root)

# Import util directly to avoid gradio dependency in __init__.py
spec = importlib.util.spec_from_file_location(
    "util", f"{repo_root}/protein_conformal/util.py"
)
util = importlib.util.module_from_spec(spec)
spec.loader.exec_module(util)

load_database = util.load_database
query = util.query
simplifed_venn_abers_prediction = util.simplifed_venn_abers_prediction
get_sims_labels = util.get_sims_labels
get_thresh_FDR = util.get_thresh_FDR


def main():
    """Run the four verification steps and print a summary.

    Steps: build an index over a subset of the lookup embeddings, run a
    random similarity query against it, compute an FDR threshold from a
    subset of the calibration data, and print Venn-Abers probabilities for
    a few fixed similarity values.

    Returns:
        int: 0 if the FDR threshold computation completed, 1 if it raised.

    Raises:
        SystemExit: with code 1 when any required data file is missing.
    """
    data_dir = Path(__file__).parent.parent / 'data'

    print("=" * 60)
    print("FDR Algorithm Verification")
    print("=" * 60)

    # Check required files
    lookup_embeddings_path = data_dir / 'lookup_embeddings.npy'
    lookup_metadata_path = data_dir / 'lookup_embeddings_meta_data.tsv'
    calibration_data_path = data_dir / 'pfam_new_proteins.npy'

    required = [lookup_embeddings_path, lookup_metadata_path, calibration_data_path]
    missing = [p for p in required if not p.exists()]

    if missing:
        print("ERROR: Missing required files:")
        for path in missing:
            print(f" - {path}")
        sys.exit(1)

    # Test 1: Load lookup embeddings and build FAISS index
    print("\n1. Testing FAISS index construction...")
    embeddings = np.load(lookup_embeddings_path)
    print(f" Loaded embeddings: {embeddings.shape}")

    # Build index on a subset for speed
    subset_size = 10000
    subset_embeddings = embeddings[:subset_size]
    db = load_database(subset_embeddings)
    # Report the real row count: the file may hold fewer than subset_size rows.
    print(f" Built FAISS index on {len(subset_embeddings)} embeddings")

    # Test 2: Query the database
    print("\n2. Testing similarity search...")
    # Use random query whose width matches the indexed embeddings instead of
    # a hard-coded 512 (assumes embeddings is 2-D: rows x features).
    embedding_dim = embeddings.shape[1]
    np.random.seed(42)
    query_emb = np.random.randn(10, embedding_dim).astype(np.float32)
    query_emb = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)

    D, I = query(db, query_emb, k=5)
    print(f" Query shape: {query_emb.shape}")
    print(f" Results D shape: {D.shape}, I shape: {I.shape}")
    print(f" Max similarity: {D.max():.6f}")
    print(f" Min similarity: {D.min():.6f}")

    # Test 3: Load calibration data and compute FDR threshold
    print("\n3. Testing FDR threshold computation...")
    # NOTE(review): allow_pickle=True unpickles the file's contents; only use
    # calibration files from a trusted source.
    cal_data = np.load(calibration_data_path, allow_pickle=True)
    print(f" Loaded {len(cal_data)} calibration samples")

    # Use a subset for faster testing
    np.random.seed(42)
    np.random.shuffle(cal_data)
    cal_subset = cal_data[:100]

    sims, labels = get_sims_labels(cal_subset, partial=False)
    print(f" Calibration sims shape: {sims.shape}")
    print(f" Calibration labels shape: {labels.shape}")

    # Compute FDR threshold
    alpha = 0.1
    delta = 0.5
    l_hat = None

    try:
        l_hat, risk_fdr = get_thresh_FDR(
            labels.flatten(), sims.flatten(), alpha=alpha, delta=delta, N=50
        )
        print(f" FDR threshold (α={alpha}): λ = {l_hat:.12f}")
        print(f" FDR risk at threshold: {risk_fdr:.6f}")

        # Expected threshold is around 0.999980
        if 0.9999 < l_hat < 1.0001:
            print(" ✓ Threshold is in expected range [0.9999, 1.0001]")
        else:
            print(f" ⚠ Threshold {l_hat} outside expected range")

    except Exception as e:
        # Deliberately broad: report the failure and continue with the
        # remaining checks; the failure is reflected in the summary and in
        # the returned exit code.
        print(f" ✗ FDR computation failed: {e}")
        traceback.print_exc()
        l_hat = None

    # Test 4: Venn-Abers probability computation
    print("\n4. Testing Venn-Abers probability...")
    X_cal = sims.flatten()
    y_cal = labels.flatten()

    # Test with some similarity values
    test_sims = np.array([0.999, 0.9999, 0.99999, 1.0])
    for sim in test_sims:
        p0, p1 = simplifed_venn_abers_prediction(X_cal, y_cal, sim)
        prob = (p0 + p1) / 2
        uncertainty = abs(p1 - p0)
        print(f" sim={sim:.5f} → prob={prob:.4f} (uncertainty={uncertainty:.4f})")

    print("\n" + "=" * 60)
    print("VERIFICATION COMPLETE")
    print("=" * 60)

    # Compare against None explicitly: a legitimate threshold of 0.0 is falsy
    # and must not be reported as a failure.
    fdr_ok = l_hat is not None

    # Summary
    print("\nSummary:")
    print(" ✓ FAISS index construction works")
    print(" ✓ Similarity search works")
    if fdr_ok:
        print(" ✓ FDR threshold computation works")
    else:
        print(" ✗ FDR threshold computation failed")
    print(" ✓ Venn-Abers probability works")

    print("\nNote: To reproduce exact Syn3.0 results (59/149 = 39.6%),")
    print("you need the query embeddings for the 149 unknown genes.")
    print("These can be generated using the Protein-Vec model:")
    print(" python -m protein_conformal.embed_protein_vec --input unknown_aa_seqs.fasta")

    return 0 if fdr_ok else 1


if __name__ == '__main__':
    sys.exit(main())