# File size: 5,287 Bytes
# c95d941
#!/usr/bin/env python
"""
Verify FDR algorithm using available calibration data.

This script tests the core FDR threshold computation algorithm using the
Pfam calibration data. It verifies that:
1. The FAISS similarity search works correctly
2. The FDR threshold computation produces the expected value
3. The Venn-Abers probability calibration works

This is a functional test of the algorithm, not a reproduction of the
exact Syn3.0 results (which require additional query embeddings).

Usage:
    python scripts/verify_fdr_algorithm.py
"""

import sys
from pathlib import Path

import numpy as np

# Make the repository root (the parent of this script's directory) importable.
repo_root = str(Path(__file__).parent.parent)
sys.path.insert(0, repo_root)

# Load protein_conformal/util.py straight from its file path instead of
# importing the package, to avoid the gradio dependency in __init__.py.
# NOTE(review): the module is executed but never registered in sys.modules --
# confirm nothing in util.py relies on being importable by name.
import importlib.util
spec = importlib.util.spec_from_file_location("util", f"{repo_root}/protein_conformal/util.py")
util = importlib.util.module_from_spec(spec)
spec.loader.exec_module(util)

# Bind the helpers used by main() as module-level names.
load_database = util.load_database
query = util.query
# "simplifed" (sic): the spelling has to match the attribute looked up on util.
simplifed_venn_abers_prediction = util.simplifed_venn_abers_prediction
get_sims_labels = util.get_sims_labels
get_thresh_FDR = util.get_thresh_FDR


def main():
    """Run a four-step functional check of the FDR pipeline and print a report.

    Steps, in order:

    1. Load ``data/lookup_embeddings.npy`` and build an index over (at most)
       its first 10,000 rows with ``load_database``.
    2. Search that index with 10 seeded, unit-normalised random vectors
       (``k=5``) using ``query``.
    3. Load ``data/pfam_new_proteins.npy``, shuffle it with a fixed seed, take
       the first 100 entries and compute a threshold with ``get_thresh_FDR``
       (``alpha=0.1``, ``delta=0.5``, ``N=50``).
    4. Evaluate ``simplifed_venn_abers_prediction`` at a few similarity values.

    Exits with status 1 (via ``sys.exit``) when any of the three required data
    files is missing. An exception raised inside step 3's threshold call is
    caught, printed with its traceback, and reported as a failure in the
    summary; the remaining steps still run.

    Returns:
        None. All results are written to stdout.
    """
    data_dir = Path(__file__).parent.parent / 'data'

    print("=" * 60)
    print("FDR Algorithm Verification")
    print("=" * 60)

    # Check required files
    lookup_embeddings_path = data_dir / 'lookup_embeddings.npy'
    lookup_metadata_path = data_dir / 'lookup_embeddings_meta_data.tsv'
    calibration_data_path = data_dir / 'pfam_new_proteins.npy'

    required = [lookup_embeddings_path, lookup_metadata_path, calibration_data_path]
    missing = [p for p in required if not p.exists()]

    if missing:
        print("ERROR: Missing required files:")
        for f in missing:
            print(f"  - {f}")
        sys.exit(1)

    # Test 1: Load lookup embeddings and build FAISS index
    print("\n1. Testing FAISS index construction...")
    embeddings = np.load(lookup_embeddings_path)
    print(f"   Loaded embeddings: {embeddings.shape}")

    # Build index on a subset for speed
    subset_size = 10000
    subset_embeddings = embeddings[:subset_size]
    db = load_database(subset_embeddings)
    # Report the number of rows actually indexed; the file may hold fewer
    # than subset_size embeddings.
    print(f"   Built FAISS index on {len(subset_embeddings)} embeddings")

    # Test 2: Query the database
    print("\n2. Testing similarity search...")
    # Use random queries whose width follows the loaded embeddings rather
    # than a hard-coded 512.
    # NOTE(review): assumes the embeddings array is 2-D (rows x dim) and that
    # the index keeps that dimensionality -- confirm against load_database.
    embedding_dim = subset_embeddings.shape[1]
    np.random.seed(42)
    query_emb = np.random.randn(10, embedding_dim).astype(np.float32)
    query_emb = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)

    D, I = query(db, query_emb, k=5)
    print(f"   Query shape: {query_emb.shape}")
    print(f"   Results D shape: {D.shape}, I shape: {I.shape}")
    print(f"   Max similarity: {D.max():.6f}")
    print(f"   Min similarity: {D.min():.6f}")

    # Test 3: Load calibration data and compute FDR threshold
    print("\n3. Testing FDR threshold computation...")
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only use
    # with calibration files from a trusted source.
    cal_data = np.load(calibration_data_path, allow_pickle=True)
    print(f"   Loaded {len(cal_data)} calibration samples")

    # Use a subset for faster testing (seeded so the subset is reproducible)
    np.random.seed(42)
    np.random.shuffle(cal_data)
    cal_subset = cal_data[:100]

    sims, labels = get_sims_labels(cal_subset, partial=False)
    print(f"   Calibration sims shape: {sims.shape}")
    print(f"   Calibration labels shape: {labels.shape}")

    # Compute FDR threshold
    alpha = 0.1
    delta = 0.5
    try:
        l_hat, risk_fdr = get_thresh_FDR(labels.flatten(), sims.flatten(), alpha=alpha, delta=delta, N=50)
        print(f"   FDR threshold (α={alpha}): λ = {l_hat:.12f}")
        print(f"   FDR risk at threshold: {risk_fdr:.6f}")

        # Expected threshold is around 0.999980
        if 0.9999 < l_hat < 1.0001:
            print("   ✓ Threshold is in expected range [0.9999, 1.0001]")
        else:
            print(f"   ⚠ Threshold {l_hat} outside expected range")
    except Exception as e:
        # Deliberate best-effort: report the failure and keep going so the
        # Venn-Abers check below still runs.
        print(f"   ✗ FDR computation failed: {e}")
        import traceback
        traceback.print_exc()
        l_hat = None

    # Test 4: Venn-Abers probability computation
    print("\n4. Testing Venn-Abers probability...")
    X_cal = sims.flatten()
    y_cal = labels.flatten()

    # Test with some similarity values
    test_sims = np.array([0.999, 0.9999, 0.99999, 1.0])
    for sim in test_sims:
        p0, p1 = simplifed_venn_abers_prediction(X_cal, y_cal, sim)
        prob = (p0 + p1) / 2
        uncertainty = abs(p1 - p0)
        print(f"   sim={sim:.5f} → prob={prob:.4f} (uncertainty={uncertainty:.4f})")

    print("\n" + "=" * 60)
    print("VERIFICATION COMPLETE")
    print("=" * 60)

    # Summary
    print("\nSummary:")
    print("  ✓ FAISS index construction works")
    print("  ✓ Similarity search works")
    # None is the failure sentinel set in the except branch above; compare
    # against it explicitly so a threshold of 0.0 is not reported as a failure.
    if l_hat is not None:
        print("  ✓ FDR threshold computation works")
    else:
        print("  ✗ FDR threshold computation failed")
    print("  ✓ Venn-Abers probability works")

    print("\nNote: To reproduce exact Syn3.0 results (59/149 = 39.6%),")
    print("you need the query embeddings for the 149 unknown genes.")
    print("These can be generated using the Protein-Vec model:")
    print("  python -m protein_conformal.embed_protein_vec --input unknown_aa_seqs.fasta")


# Run the verification only when this file is executed as a script.
if __name__ == '__main__':
    main()