Spaces:

LoocasGoose
/

cpr

Running

File size: 5,287 Bytes

c95d941

#!/usr/bin/env python
"""
Verify FDR algorithm using available calibration data.

This script tests the core FDR threshold computation algorithm using the
Pfam calibration data. It verifies that:
1. The FAISS similarity search works correctly
2. The FDR threshold computation produces the expected value
3. The Venn-Abers probability calibration works

This is a functional test of the algorithm, not a reproduction of the
exact Syn3.0 results (which require additional query embeddings).

Usage:
    python scripts/verify_fdr_algorithm.py
"""

import sys
from pathlib import Path

import numpy as np

# Add parent directory to path for imports
# (repo_root is the directory two levels up from this file, as a string.)
repo_root = str(Path(__file__).parent.parent)
sys.path.insert(0, repo_root)

# Import util directly to avoid gradio dependency in __init__.py
# The file protein_conformal/util.py is loaded by path under the standalone
# module name "util", so the protein_conformal package itself is not imported.
import importlib.util
spec = importlib.util.spec_from_file_location("util", f"{repo_root}/protein_conformal/util.py")
util = importlib.util.module_from_spec(spec)
spec.loader.exec_module(util)

# Module-level aliases for the util helpers that main() calls.
# NOTE(review): the spelling "simplifed" presumably matches the name defined in
# util.py -- confirm there before "correcting" it here.
load_database = util.load_database
query = util.query
simplifed_venn_abers_prediction = util.simplifed_venn_abers_prediction
get_sims_labels = util.get_sims_labels
get_thresh_FDR = util.get_thresh_FDR


def main():
    """Run the functional checks of the FDR pipeline and print a report.

    Steps, in order:
      1. Load the lookup embeddings and build an index (via ``load_database``)
         on a leading subset of them.
      2. Run a similarity search with random, unit-normalised query vectors.
      3. Compute the FDR threshold on a shuffled subset of the calibration data.
      4. Evaluate Venn-Abers probabilities at a few fixed similarity values.

    Exits with status 1 (via ``sys.exit``) when a required data file is
    missing. An exception raised by the FDR threshold computation is caught,
    its traceback is printed, and the failure is reported in the summary;
    exceptions from the other steps propagate to the caller.
    """
    data_dir = Path(__file__).parent.parent / 'data'

    print("=" * 60)
    print("FDR Algorithm Verification")
    print("=" * 60)

    # Check required files
    lookup_embeddings_path = data_dir / 'lookup_embeddings.npy'
    lookup_metadata_path = data_dir / 'lookup_embeddings_meta_data.tsv'
    calibration_data_path = data_dir / 'pfam_new_proteins.npy'

    required = [lookup_embeddings_path, lookup_metadata_path, calibration_data_path]
    missing = [p for p in required if not p.exists()]

    if missing:
        print("ERROR: Missing required files:")
        for f in missing:
            print(f"  - {f}")
        sys.exit(1)

    # Test 1: Load lookup embeddings and build FAISS index
    print("\n1. Testing FAISS index construction...")
    embeddings = np.load(lookup_embeddings_path)
    print(f"   Loaded embeddings: {embeddings.shape}")

    # Build index on a subset for speed
    subset_size = 10000
    subset_embeddings = embeddings[:subset_size]
    db = load_database(subset_embeddings)
    # Report the number of rows actually indexed; the file may hold fewer
    # than subset_size embeddings.
    print(f"   Built FAISS index on {len(subset_embeddings)} embeddings")

    # Test 2: Query the database
    print("\n2. Testing similarity search...")
    # Use random queries whose dimensionality matches the indexed embeddings
    # instead of a hard-coded width.
    # Assumes the embeddings array is 2-D (rows x dim) -- TODO confirm.
    embedding_dim = embeddings.shape[1]
    np.random.seed(42)
    query_emb = np.random.randn(10, embedding_dim).astype(np.float32)
    # Scale every query row to unit L2 norm.
    query_emb = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)

    D, I = query(db, query_emb, k=5)
    print(f"   Query shape: {query_emb.shape}")
    print(f"   Results D shape: {D.shape}, I shape: {I.shape}")
    print(f"   Max similarity: {D.max():.6f}")
    print(f"   Min similarity: {D.min():.6f}")

    # Test 3: Load calibration data and compute FDR threshold
    print("\n3. Testing FDR threshold computation...")
    cal_data = np.load(calibration_data_path, allow_pickle=True)
    print(f"   Loaded {len(cal_data)} calibration samples")

    # Use a subset for faster testing (fixed seed keeps the subset reproducible)
    np.random.seed(42)
    np.random.shuffle(cal_data)
    cal_subset = cal_data[:100]

    sims, labels = get_sims_labels(cal_subset, partial=False)
    print(f"   Calibration sims shape: {sims.shape}")
    print(f"   Calibration labels shape: {labels.shape}")

    # Compute FDR threshold; alpha and delta are passed straight through to
    # get_thresh_FDR.
    alpha = 0.1
    delta = 0.5
    l_hat = None  # stays None only if the computation below raises
    try:
        l_hat, risk_fdr = get_thresh_FDR(labels.flatten(), sims.flatten(), alpha=alpha, delta=delta, N=50)
        print(f"   FDR threshold (α={alpha}): λ = {l_hat:.12f}")
        print(f"   FDR risk at threshold: {risk_fdr:.6f}")

        # Expected threshold is around 0.999980
        if 0.9999 < l_hat < 1.0001:
            print("   ✓ Threshold is in expected range [0.9999, 1.0001]")
        else:
            print(f"   ⚠ Threshold {l_hat} outside expected range")
    except Exception as e:
        # Deliberately broad: this is a diagnostic script, so report the
        # failure and carry on with the remaining checks.
        print(f"   ✗ FDR computation failed: {e}")
        import traceback
        traceback.print_exc()
        l_hat = None

    # Test 4: Venn-Abers probability computation
    print("\n4. Testing Venn-Abers probability...")
    X_cal = sims.flatten()
    y_cal = labels.flatten()

    # Test with some similarity values
    test_sims = np.array([0.999, 0.9999, 0.99999, 1.0])
    for sim in test_sims:
        p0, p1 = simplifed_venn_abers_prediction(X_cal, y_cal, sim)
        # Report the midpoint of the two returned values as the probability
        # and their gap as the uncertainty.
        prob = (p0 + p1) / 2
        uncertainty = abs(p1 - p0)
        print(f"   sim={sim:.5f} → prob={prob:.4f} (uncertainty={uncertainty:.4f})")

    print("\n" + "=" * 60)
    print("VERIFICATION COMPLETE")
    print("=" * 60)

    # Summary
    print("\nSummary:")
    print("  ✓ FAISS index construction works")
    print("  ✓ Similarity search works")
    # Compare against None explicitly: a computed threshold of 0.0 is falsy
    # but still means the computation succeeded.
    if l_hat is not None:
        print("  ✓ FDR threshold computation works")
    else:
        print("  ✗ FDR threshold computation failed")
    print("  ✓ Venn-Abers probability works")

    print("\nNote: To reproduce exact Syn3.0 results (59/149 = 39.6%),")
    print("you need the query embeddings for the 149 unknown genes.")
    print("These can be generated using the Protein-Vec model:")
    print("  python -m protein_conformal.embed_protein_vec --input unknown_aa_seqs.fasta")

# Run the verification only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()