Spaces:
Running
Running
File size: 3,133 Bytes
95ecd72 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
#!/usr/bin/env python
"""Extract example sequences from test data for the 'Try Example' feature."""
import sys
from pathlib import Path
# Resolve the project root as the directory three levels up from this file's
# path (i.e. two levels above the directory containing this script). It is
# also the base for the data and output paths used in main().
# NOTE(review): this hard-codes the script's depth in the repo — confirm the
# layout if the file is ever moved.
project_root = Path(__file__).parent.parent.parent
# Prepend (rather than append) so the project root is searched first on import.
sys.path.insert(0, str(project_root))
import numpy as np
from joblib import load
import json
def oh_to_sequence(one_hot):
    """Decode a one-hot encoded sequence into a nucleotide string.

    Iterates over ``one_hot`` position by position; for each position the
    index of its largest value (first one on ties, per ``np.argmax``) selects
    the corresponding letter from ``"ACGT"``.

    Args:
        one_hot: Iterable of per-position score vectors (4 channels expected,
            ordered A, C, G, T).

    Returns:
        The decoded sequence as a string, one letter per position.
    """
    letters = "ACGT"
    decoded = []
    for position in one_hot:
        channel = np.argmax(position)
        decoded.append(letters[channel])
    return "".join(decoded)
def main():
    """Extract example sequences from test data.

    Loads the pickled test inputs and PSI labels from ``<project_root>/data``,
    then, for each PSI category (high: > 0.85, medium: strictly between 0.45
    and 0.55, low: < 0.15), takes the first test sample that matches, decodes
    the ``10:80`` slice of its one-hot sequence into a nucleotide string, and
    records it. The collected records are written as JSON to
    ``<project_root>/webapp/static/examples.json`` under the key
    ``"sequences"``.

    Returns:
        list[dict]: One record per category that had at least one matching
        sample, in the order high, medium, low. Each record has the keys
        ``name``, ``sequence``, ``description`` and ``expected_psi``.
    """
    data_path = project_root / "data"

    print("Loading test data...")
    # NOTE(review): joblib.load unpickles its input and can execute arbitrary
    # code; only point this at trusted, project-generated files.
    xTe = load(data_path / "xTe_ES7_HeLa_ABC.pkl.gz")
    yTe = load(data_path / "yTe_ES7_HeLa_ABC.pkl.gz")

    print(f"Test set size: {len(yTe)} samples")
    print(f"PSI range: {yTe.min():.3f} - {yTe.max():.3f}")
    print(f"PSI mean: {yTe.mean():.3f}")

    # Per the original author's notes: xTe is a tuple
    # (seq_oh, struct_oh, wobble) and seq_oh has shape (N, 90, 4).
    seq_oh = xTe[0]
    # Keep only the 70 positions 10..79, which the original notes describe as
    # the 70nt exon part of the sequence.
    exon_seq_oh = seq_oh[:, 10:80, :]

    # One row per example category:
    # (console label, example name, description lead-in, selection mask).
    categories = (
        (
            "High",
            "High Inclusion Example",
            "This sequence demonstrates strong exon inclusion",
            yTe > 0.85,
        ),
        (
            "Medium",
            "Balanced Example",
            "This sequence shows balanced inclusion/skipping",
            (yTe > 0.45) & (yTe < 0.55),
        ),
        (
            "Low",
            "High Skipping Example",
            "This sequence demonstrates strong exon skipping",
            yTe < 0.15,
        ),
    )

    examples = []
    for label, name, lead_in, mask in categories:
        matches = np.where(mask)[0]
        if matches.size == 0:
            # Make a missing category visible instead of silently producing
            # fewer examples than expected.
            print(f"No {label} PSI example found in test data")
            continue

        # The first matching sample (lowest index) is used as the example.
        idx = matches[0]
        psi = yTe[idx]
        seq = oh_to_sequence(exon_seq_oh[idx])
        examples.append({
            "name": name,
            "sequence": seq,
            "description": f"{lead_in} (actual PSI = {psi:.3f})",
            # Convert to a built-in float so json can serialize it.
            "expected_psi": float(psi),
        })
        print(f"{label} PSI example: {seq[:20]}... (PSI={psi:.3f})")

    # Save examples to JSON
    output_path = project_root / "webapp" / "static" / "examples.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding so the output does not depend on the platform locale.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump({"sequences": examples}, f, indent=2)

    print(f"\nExamples saved to: {output_path}")
    print(f"Total examples: {len(examples)}")
    return examples
if __name__ == "__main__":
main()
|