# SparseAE / check_vocab.py — vocabulary health check for a sparse autoencoder.
# (Uploaded to the Hugging Face Hub via huggingface_hub; commit b46126b.)
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F
# =====================
# 6. MODEL: SPARSE AUTOENCODER
# =====================
# BUG FIX: L (the window length in bp) was referenced here but only assigned
# much later in the "Setup" section, so `INPUT_DIM = L * 5` raised NameError
# at import time. Define it before first use.
L = 50  # sequence window length (bp)
INPUT_DIM = L * 5  # per position: 4 one-hot DNA channels + 1 phyloP score
LATENT_DIM = 2048  # width of the sparse latent code
HIDDEN_DIM = 1024  # encoder/decoder hidden width
class SparseAE(nn.Module):
    """Sparse autoencoder over (one-hot DNA + phyloP) windows.

    Each window has ``seq_len`` positions encoded as 5 values per position
    (4 one-hot DNA channels + 1 phyloP score). The encoder produces a wide
    ReLU latent code ``h`` (sparsity is expected to come from an L1/KL
    penalty during training); two decoder heads reconstruct the DNA logits
    and the phyloP track.

    Args:
        input_dim: flattened input width; defaults to module-level INPUT_DIM.
        latent_dim: latent code width; defaults to module-level LATENT_DIM.
        hidden_dim: hidden layer width; defaults to module-level HIDDEN_DIM.
        seq_len: window length; if None, derived as input_dim // 5.
    """

    def __init__(self, input_dim=None, latent_dim=None, hidden_dim=None,
                 seq_len=None):
        super().__init__()
        # Defaults are resolved lazily (None sentinels) rather than bound at
        # class-creation time, so the class no longer depends on the module
        # constants existing before the `class` statement executes — and a
        # plain `SparseAE()` still behaves exactly as before.
        input_dim = INPUT_DIM if input_dim is None else input_dim
        latent_dim = LATENT_DIM if latent_dim is None else latent_dim
        hidden_dim = HIDDEN_DIM if hidden_dim is None else hidden_dim
        # Derive window length from input_dim (5 values per position) instead
        # of reading the module-level global L, so the model is self-contained.
        self.seq_len = input_dim // 5 if seq_len is None else seq_len
        # Encoder: input -> hidden -> latent. ReLU on the code lets an L1
        # penalty drive activations to exact zeros (sparsity).
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU(),  # ReLU helps sparsity with L1
        )
        # Shared decoder trunk.
        self.dec_hidden = nn.Linear(latent_dim, hidden_dim)
        # Per-modality decoder heads.
        self.dec_dna = nn.Linear(hidden_dim, self.seq_len * 4)  # DNA logits
        self.dec_phy = nn.Linear(hidden_dim, self.seq_len)      # phyloP

    def forward(self, dna, phy):
        """Encode one batch and reconstruct both modalities.

        Args:
            dna: (B, seq_len, 4) one-hot DNA tensor.
            phy: (B, seq_len) phyloP scores.

        Returns:
            recon_dna: (B, seq_len, 4) DNA reconstruction logits.
            recon_phy: (B, seq_len) phyloP reconstruction, tanh-squashed.
            h: (B, latent_dim) non-negative sparse latent code.
        """
        B = dna.size(0)
        x = torch.cat(
            [dna.reshape(B, -1), phy.reshape(B, -1)],
            dim=1,
        )  # (B, input_dim)
        h = self.encoder(x)
        dec = F.relu(self.dec_hidden(h))
        recon_dna = self.dec_dna(dec).reshape(B, self.seq_len, 4)
        recon_phy = torch.tanh(self.dec_phy(dec)).reshape(B, self.seq_len)
        return recon_dna, recon_phy, h
# Setup
L = 50
# Prefer GPU when available; every tensor below is moved to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SparseAE().to(device)
# Load the final checkpoint
# NOTE(review): torch.load unpickles the file — only load checkpoints you
# trust; consider weights_only=True on torch >= 1.13 for state dicts.
model.load_state_dict(torch.load("sparse_ae_50bp_epoch3.pt", map_location=device))
model.eval()
print("Model loaded.")
# --- GENERATE FAKE DATA (Or load real if you prefer) ---
print("Generating test data...")
# Create random DNA (approximate genomic distribution)
N_SAMPLES = 10000
# Uniform base composition over A, C, G, T.
base_freqs = torch.full((4,), 0.25)
sampled_bases = torch.multinomial(
    base_freqs, N_SAMPLES * L, replacement=True
).view(N_SAMPLES, L)
test_dna = F.one_hot(sampled_bases, num_classes=4).float().to(device)
test_phy = torch.randn(N_SAMPLES, L).to(device)
# --- RUN INFERENCE ---
print("Running inference...")
with torch.no_grad():
    # Only the encoder output 'h' is needed for the vocabulary analysis;
    # the decoder heads are irrelevant here.
    # Note: If you used Top-K in training, ensure you use it here too.
    # If you used standard L1/KL, just get 'h' from encoder.
    n = test_dna.size(0)
    flat_input = torch.cat(
        [test_dna.reshape(n, -1), test_phy.reshape(n, -1)], dim=1
    )
    h = model.encoder(flat_input)
# --- ANALYZE VOCABULARY ---
h_np = h.cpu().numpy()
# 1. How often is each token used? (Frequency)
# A neuron counts as "firing" on a sample if its activation exceeds 0.1.
FIRING_THRESHOLD = 0.1
neuron_firing_counts = np.sum(h_np > FIRING_THRESHOLD, axis=0)  # (latent_dim,)
# 2. Sort descending (kept for e.g. a rank-frequency plot with matplotlib).
sorted_counts = np.sort(neuron_firing_counts)[::-1]
print("\n--- VOCABULARY HEALTH CHECK ---")
# BUG FIX: report the actual latent width rather than a hard-coded 2048,
# so this check stays correct if LATENT_DIM ever changes.
print(f"Total Neurons: {h_np.shape[1]}")
print(f"Dead Neurons (Never fire): {np.sum(neuron_firing_counts == 0)}")
print(f"Rare Neurons (Fire < 10 times): {np.sum(neuron_firing_counts < 10)}")
print(f"Common Neurons (Fire > 1000 times): {np.sum(neuron_firing_counts > 1000)}")