Spaces:

LoocasGoose
/

cpr

Running

App Files Files Community

cpr / scripts /verify_fdr_algorithm.py

ronboger

feat: add CLI, fix FDR bug, verify paper result (59/149 = 39.6%)

c95d941 3 months ago

raw

history blame contribute delete

5.29 kB

	#!/usr/bin/env python
	"""
	Verify FDR algorithm using available calibration data.

	This script tests the core FDR threshold computation algorithm using the
	Pfam calibration data. It verifies that:
	1. The FAISS similarity search works correctly
	2. The FDR threshold computation produces the expected value
	3. The Venn-Abers probability calibration works

	This is a functional test of the algorithm, not a reproduction of the
	exact Syn3.0 results (which require additional query embeddings).

	Usage:
	python scripts/verify_fdr_algorithm.py
	"""

	import sys
	from pathlib import Path

	import numpy as np

	# Add the repository root (the directory above scripts/) to the import path.
	# resolve() keeps this independent of the working directory: before
	# Python 3.9 a script's __file__ could be a relative path, in which case
	# .parent.parent would not reach the repository root.
	repo_root = str(Path(__file__).resolve().parent.parent)
	sys.path.insert(0, repo_root)

	# Import util directly from its file path to avoid the gradio dependency
	# pulled in by the package's __init__.py.
	import importlib.util
	spec = importlib.util.spec_from_file_location("util", f"{repo_root}/protein_conformal/util.py")
	if spec is None or spec.loader is None:
	    # Fail with a clear message instead of an AttributeError on None below.
	    raise ImportError(f"Cannot load util module from {repo_root}/protein_conformal/util.py")
	util = importlib.util.module_from_spec(spec)
	spec.loader.exec_module(util)

	# Module-level aliases for the util helpers exercised by main().
	# NOTE: "simplifed" is the spelling of the attribute looked up on util;
	# do not correct it here without renaming it there.
	load_database = util.load_database
	query = util.query
	simplifed_venn_abers_prediction = util.simplifed_venn_abers_prediction
	get_sims_labels = util.get_sims_labels
	get_thresh_FDR = util.get_thresh_FDR


	def main():
	    """Run four smoke checks of the FDR pipeline against the local data files.

	    The checks run in order and print their results to stdout:

	    1. Build an index with ``load_database`` over the first rows of
	       ``data/lookup_embeddings.npy``.
	    2. Search that index with ``query`` using seeded random unit vectors.
	    3. Compute the FDR threshold with ``get_thresh_FDR`` on a seeded
	       100-sample subset of ``data/pfam_new_proteins.npy``.
	    4. Evaluate ``simplifed_venn_abers_prediction`` at a few similarity
	       values using the same calibration subset.

	    An exception raised inside step 3 is caught and reported so that step 4
	    and the summary still run; the process then exits with status 1.

	    Raises:
	        SystemExit: with status 1 if a required data file is missing, or
	            after the summary if the FDR threshold computation failed.
	    """
	    # resolve() so the data directory is found even when __file__ is a
	    # relative path (scripts run on Python < 3.9 from inside scripts/).
	    data_dir = Path(__file__).resolve().parent.parent / 'data'

	    print("=" * 60)
	    print("FDR Algorithm Verification")
	    print("=" * 60)

	    # Check required files up front so every missing path is reported at once.
	    lookup_embeddings_path = data_dir / 'lookup_embeddings.npy'
	    lookup_metadata_path = data_dir / 'lookup_embeddings_meta_data.tsv'
	    calibration_data_path = data_dir / 'pfam_new_proteins.npy'

	    # The metadata file is only checked for existence; it is not read below.
	    required = [lookup_embeddings_path, lookup_metadata_path, calibration_data_path]
	    missing = [p for p in required if not p.exists()]

	    if missing:
	        print("ERROR: Missing required files:")
	        for path in missing:
	            print(f" - {path}")
	        sys.exit(1)

	    # Test 1: Load lookup embeddings and build FAISS index
	    print("\n1. Testing FAISS index construction...")
	    embeddings = np.load(lookup_embeddings_path)
	    print(f" Loaded embeddings: {embeddings.shape}")

	    # Build index on a subset for speed
	    subset_size = 10000
	    subset_embeddings = embeddings[:subset_size]
	    db = load_database(subset_embeddings)
	    # Report the rows actually indexed; the file may hold fewer than subset_size.
	    print(f" Built FAISS index on {len(subset_embeddings)} embeddings")

	    # Test 2: Query the database
	    print("\n2. Testing similarity search...")
	    # Use seeded random unit vectors whose width matches the indexed
	    # embeddings, rather than assuming a fixed dimension of 512.
	    np.random.seed(42)
	    query_dim = subset_embeddings.shape[1]
	    query_emb = np.random.randn(10, query_dim).astype(np.float32)
	    query_emb = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)

	    D, I = query(db, query_emb, k=5)
	    print(f" Query shape: {query_emb.shape}")
	    print(f" Results D shape: {D.shape}, I shape: {I.shape}")
	    print(f" Max similarity: {D.max():.6f}")
	    print(f" Min similarity: {D.min():.6f}")

	    # Test 3: Load calibration data and compute FDR threshold
	    print("\n3. Testing FDR threshold computation...")
	    # NOTE(security): allow_pickle=True can execute arbitrary code embedded
	    # in the file; only load calibration data obtained from a trusted source.
	    cal_data = np.load(calibration_data_path, allow_pickle=True)
	    print(f" Loaded {len(cal_data)} calibration samples")

	    # Use a seeded shuffle (in place) and a small subset for faster testing.
	    np.random.seed(42)
	    np.random.shuffle(cal_data)
	    cal_subset = cal_data[:100]

	    sims, labels = get_sims_labels(cal_subset, partial=False)
	    print(f" Calibration sims shape: {sims.shape}")
	    print(f" Calibration labels shape: {labels.shape}")

	    # Compute FDR threshold
	    alpha = 0.1
	    delta = 0.5
	    try:
	        l_hat, risk_fdr = get_thresh_FDR(labels.flatten(), sims.flatten(), alpha=alpha, delta=delta, N=50)
	        print(f" FDR threshold (α={alpha}): λ = {l_hat:.12f}")
	        print(f" FDR risk at threshold: {risk_fdr:.6f}")

	        # Expected threshold is around 0.999980
	        if 0.9999 < l_hat < 1.0001:
	            print(" ✓ Threshold is in expected range [0.9999, 1.0001]")
	        else:
	            print(f" ⚠ Threshold {l_hat} outside expected range")
	    except Exception as e:
	        # Best-effort: report the failure and keep going so the remaining
	        # check and the summary still run. l_hat = None marks the failure.
	        print(f" ✗ FDR computation failed: {e}")
	        import traceback
	        traceback.print_exc()
	        l_hat = None

	    # Test 4: Venn-Abers probability computation
	    print("\n4. Testing Venn-Abers probability...")
	    X_cal = sims.flatten()
	    y_cal = labels.flatten()

	    # Test with some similarity values
	    test_sims = np.array([0.999, 0.9999, 0.99999, 1.0])
	    for sim in test_sims:
	        p0, p1 = simplifed_venn_abers_prediction(X_cal, y_cal, sim)
	        # Report the midpoint of the two estimates and their spread.
	        prob = (p0 + p1) / 2
	        uncertainty = abs(p1 - p0)
	        print(f" sim={sim:.5f} → prob={prob:.4f} (uncertainty={uncertainty:.4f})")

	    print("\n" + "=" * 60)
	    print("VERIFICATION COMPLETE")
	    print("=" * 60)

	    # Summary. Steps 1, 2 and 4 have no error handling, so reaching this
	    # point means they did not raise.
	    fdr_ok = l_hat is not None  # explicit None test: a threshold of 0.0 is still a result
	    print("\nSummary:")
	    print(" ✓ FAISS index construction works")
	    print(" ✓ Similarity search works")
	    if fdr_ok:
	        print(" ✓ FDR threshold computation works")
	    else:
	        print(" ✗ FDR threshold computation failed")
	    print(" ✓ Venn-Abers probability works")

	    print("\nNote: To reproduce exact Syn3.0 results (59/149 = 39.6%),")
	    print("you need the query embeddings for the 149 unknown genes.")
	    print("These can be generated using the Protein-Vec model:")
	    print(" python -m protein_conformal.embed_protein_vec --input unknown_aa_seqs.fasta")

	    # Surface a failed check through the exit status so automated callers
	    # do not mistake a failed verification for a pass.
	    if not fdr_ok:
	        sys.exit(1)


	# Run the verification only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()