"""Vocabulary health check for a trained sparse autoencoder.

Defines the ``SparseAE`` model, and — when run as a script — loads the
``sparse_ae_50bp_epoch3.pt`` checkpoint, encodes a batch of synthetic
DNA + phyloP windows, and prints how many latent units are dead, rarely
active, or commonly active.
"""

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# =====================
# 6. MODEL: SPARSE AUTOENCODER
# =====================

# Window length. It must be assigned before INPUT_DIM and the class body,
# both of which read it.
L = 50

INPUT_DIM = L * 5  # 4 DNA + 1 phyloP
LATENT_DIM = 2048
HIDDEN_DIM = 1024

# A latent unit counts as "firing" on a sample when its activation is
# strictly greater than this value.
FIRING_THRESHOLD = 0.1
# Units firing on fewer than this many samples are reported as "rare".
RARE_MAX_COUNT = 10
# Units firing on more than this many samples are reported as "common".
COMMON_MIN_COUNT = 1000


class SparseAE(nn.Module):
    """Autoencoder with a ReLU latent layer and two decoder heads.

    The encoder maps the flattened, concatenated (dna, phy) input to a
    non-negative latent vector. A shared decoder layer feeds one linear
    head that reconstructs the DNA channels and one tanh head that
    reconstructs the phyloP track.

    Args:
        input_dim: Width of the flattened encoder input.
        latent_dim: Width of the latent layer.
        hidden_dim: Width of the hidden layers in encoder and decoder.

    Note:
        The decoder heads and the output reshapes use the module-level
        ``L``, so ``input_dim`` is expected to equal ``L * 5``.
    """

    def __init__(self, input_dim=INPUT_DIM, latent_dim=LATENT_DIM, hidden_dim=HIDDEN_DIM):
        super().__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU(),  # ReLU helps sparsity with L1
        )

        # Decoder shared
        self.dec_hidden = nn.Linear(latent_dim, hidden_dim)

        # Decoder heads
        self.dec_dna = nn.Linear(hidden_dim, L * 4)
        self.dec_phy = nn.Linear(hidden_dim, L * 1)

    def forward(self, dna, phy):
        """Encode and reconstruct one batch.

        Args:
            dna: Tensor whose first dimension is the batch; the remaining
                dimensions are flattened (the script passes (B, L, 4)).
            phy: Tensor whose first dimension is the batch; the remaining
                dimensions are flattened (the script passes (B, L)).

        Returns:
            Tuple ``(recon_dna, recon_phy, h)`` with shapes (B, L, 4),
            (B, L) and (B, latent_dim). ``recon_phy`` has passed through
            tanh; ``h`` is the post-ReLU latent activation.
        """
        B = dna.size(0)
        x = torch.cat(
            [dna.reshape(B, -1), phy.reshape(B, -1)],
            dim=1,
        )  # (B, INPUT_DIM)

        h = self.encoder(x)
        dec = F.relu(self.dec_hidden(h))

        recon_dna = self.dec_dna(dec).reshape(B, L, 4)  # (B, L, 4)
        recon_phy = torch.tanh(self.dec_phy(dec)).reshape(B, L)  # (B, L)

        return recon_dna, recon_phy, h


def count_firing(activations, threshold=FIRING_THRESHOLD):
    """Count, for each latent unit, the samples on which it fires.

    Args:
        activations: Array of shape (n_samples, n_units).
        threshold: A unit fires on a sample when its activation is
            strictly greater than this value.

    Returns:
        Integer array of shape (n_units,).
    """
    return np.sum(activations > threshold, axis=0)


def vocabulary_stats(firing_counts):
    """Summarise per-unit firing counts.

    Args:
        firing_counts: 1-D array of per-unit firing counts.

    Returns:
        Dict with integer entries ``total``, ``dead`` (count == 0),
        ``rare`` (count < RARE_MAX_COUNT, which includes dead units) and
        ``common`` (count > COMMON_MIN_COUNT).
    """
    return {
        "total": int(firing_counts.shape[0]),
        "dead": int(np.sum(firing_counts == 0)),
        "rare": int(np.sum(firing_counts < RARE_MAX_COUNT)),
        "common": int(np.sum(firing_counts > COMMON_MIN_COUNT)),
    }


# The statements below stay at module scope (not inside a function) so that
# every name they bind remains a global when the file is run as a script or
# pasted into a notebook; the guard only prevents the checkpoint load and
# inference from running on a plain import.
if __name__ == "__main__":
    # Setup
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SparseAE().to(device)

    # Load the final checkpoint
    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # a trusted source, or pass weights_only=True where the installed torch
    # version supports it.
    model.load_state_dict(torch.load("sparse_ae_50bp_epoch3.pt", map_location=device))
    model.eval()
    print("Model loaded.")

    # --- GENERATE FAKE DATA (Or load real if you prefer) ---
    print("Generating test data...")
    # Random DNA drawn with equal probability for each of the four bases.
    N_SAMPLES = 10000
    probs = torch.tensor([0.25, 0.25, 0.25, 0.25])  # A, C, G, T
    test_dna_idx = torch.multinomial(probs, N_SAMPLES * L, replacement=True).view(N_SAMPLES, L)
    test_dna = F.one_hot(test_dna_idx, num_classes=4).float().to(device)
    # Standard-normal placeholder for the phyloP track.
    test_phy = torch.randn(N_SAMPLES, L).to(device)

    # --- RUN INFERENCE ---
    print("Running inference...")
    with torch.no_grad():
        # Run the encoder to get the latent 'h'.
        # Note: If you used Top-K in training, ensure you use it here too.
        # If you used standard L1/KL, just get 'h' from the encoder.
        B = test_dna.size(0)
        x = torch.cat([test_dna.reshape(B, -1), test_phy.reshape(B, -1)], dim=1)
        h = model.encoder(x)

    # --- ANALYZE VOCABULARY ---
    h_np = h.cpu().numpy()

    # 1. How often is each token used? (Frequency)
    neuron_firing_counts = count_firing(h_np)  # Shape (latent_dim,)

    # 2. Sort them, most frequently firing first
    sorted_counts = np.sort(neuron_firing_counts)[::-1]

    stats = vocabulary_stats(neuron_firing_counts)
    print("\n--- VOCABULARY HEALTH CHECK ---")
    print(f"Total Neurons: {stats['total']}")
    print(f"Dead Neurons (Never fire): {stats['dead']}")
    print(f"Rare Neurons (Fire < {RARE_MAX_COUNT} times): {stats['rare']}")
    print(f"Common Neurons (Fire > {COMMON_MIN_COUNT} times): {stats['common']}")