File size: 3,133 Bytes
95ecd72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python
"""Extract example sequences from test data for the 'Try Example' feature."""

import sys
from pathlib import Path

# Make the project root importable: take the directory two levels above the
# one containing this script and put it at the front of sys.path.
# project_root is also the base for the data and output paths used in main().
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

import numpy as np
from joblib import load
import json


def oh_to_sequence(one_hot):
    """Decode a one-hot encoded sequence into its nucleotide string.

    Each row of ``one_hot`` is mapped to the letter of ``"ACGT"`` whose
    index equals the position of that row's largest entry.
    """
    bases = "ACGT"
    letters = []
    for row in one_hot:
        # argmax of the row selects which of A/C/G/T this position encodes.
        letters.append(bases[np.argmax(row)])
    return "".join(letters)


def _pick_example(psi, exon_seq_oh, mask, label, name, summary):
    """Build the example entry for the first sample selected by ``mask``.

    Args:
        psi: Array of PSI values, indexed by sample.
        exon_seq_oh: One-hot exon sequences, indexed by sample along the
            first axis.
        mask: Boolean array over samples marking the candidates.
        label: Prefix used in the progress line (e.g. ``"High"``).
        name: Value stored under the ``"name"`` key.
        summary: Phrase inserted into the ``"description"`` text.

    Returns:
        A dict with ``name``, ``sequence``, ``description`` and
        ``expected_psi`` keys, or ``None`` when ``mask`` selects no sample.
    """
    matches = np.where(mask)[0]
    if len(matches) == 0:
        return None

    # Only the first matching sample is used as the representative example.
    idx = matches[0]
    seq = oh_to_sequence(exon_seq_oh[idx])
    print(f"{label} PSI example: {seq[:20]}... (PSI={psi[idx]:.3f})")
    return {
        "name": name,
        "sequence": seq,
        "description": f"This sequence {summary} (actual PSI = {psi[idx]:.3f})",
        # Cast to a plain float so the value is JSON serialisable.
        "expected_psi": float(psi[idx]),
    }


def main():
    """Extract example sequences from test data and write them to JSON.

    Loads the test inputs and PSI targets from ``<project_root>/data``,
    picks the first sample in each of three PSI bands (> 0.85, strictly
    between 0.45 and 0.55, < 0.15), decodes its exon sequence and writes the
    collected entries to ``<project_root>/webapp/static/examples.json``.
    Bands with no matching sample are skipped.

    Returns:
        The list of example dicts that was written, in high / medium / low
        order.
    """
    data_path = project_root / "data"

    print("Loading test data...")
    # NOTE: joblib.load unpickles its input; only point it at trusted files.
    xTe = load(data_path / "xTe_ES7_HeLa_ABC.pkl.gz")
    yTe = load(data_path / "yTe_ES7_HeLa_ABC.pkl.gz")

    print(f"Test set size: {len(yTe)} samples")
    print(f"PSI range: {yTe.min():.3f} - {yTe.max():.3f}")
    print(f"PSI mean: {yTe.mean():.3f}")

    # Get sequences from one-hot encoding
    # xTe is tuple: (seq_oh, struct_oh, wobble)
    seq_oh = xTe[0]  # Shape: (N, 90, 4)

    # Extract the 70nt exon part (positions 10-80, end exclusive)
    exon_seq_oh = seq_oh[:, 10:80, :]

    # One row per PSI band: (log label, example name, description phrase, mask).
    bands = (
        (
            "High",
            "High Inclusion Example",
            "demonstrates strong exon inclusion",
            yTe > 0.85,
        ),
        (
            "Medium",
            "Balanced Example",
            "shows balanced inclusion/skipping",
            (yTe > 0.45) & (yTe < 0.55),
        ),
        (
            "Low",
            "High Skipping Example",
            "demonstrates strong exon skipping",
            yTe < 0.15,
        ),
    )

    examples = []
    for label, name, summary, mask in bands:
        example = _pick_example(yTe, exon_seq_oh, mask, label, name, summary)
        if example is not None:
            examples.append(example)

    # Save examples to JSON
    output_path = project_root / "webapp" / "static" / "examples.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Explicit encoding so the file does not depend on the platform locale.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump({"sequences": examples}, f, indent=2)

    print(f"\nExamples saved to: {output_path}")
    print(f"Total examples: {len(examples)}")

    return examples


if __name__ == "__main__":
    main()