import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F

# =====================
# 6. MODEL: SPARSE AUTOENCODER
# =====================
# Sequence window length (positions per sample). It has to be bound before
# INPUT_DIM is computed; otherwise running this file top to bottom raises
# NameError on the next line.
L = 50
INPUT_DIM = L * 5      # per position: 4 DNA channels + 1 phyloP value
LATENT_DIM = 2048      # width of the latent code produced by the encoder
HIDDEN_DIM = 1024      # width of the encoder/decoder hidden layers

class SparseAE(nn.Module):
    """Autoencoder with a ReLU latent code and separate DNA / phyloP heads.

    The encoder maps the flattened, concatenated ``(dna, phy)`` input to a
    non-negative latent vector. A shared decoder layer then feeds two linear
    heads: one emitting 4 values per position (DNA) and one emitting a
    tanh-squashed value per position (phyloP).

    The sequence length is derived from ``input_dim`` and stored on the
    instance, so the model's shapes do not depend on any module-level
    variable at call time.

    Args:
        input_dim: Flattened input width. Must be a positive multiple of 5
            (4 DNA channels + 1 phyloP value per position).
        latent_dim: Width of the latent code.
        hidden_dim: Width of the encoder and decoder hidden layers.

    Raises:
        ValueError: If ``input_dim`` is not a positive multiple of 5.
    """

    def __init__(self, input_dim=INPUT_DIM, latent_dim=LATENT_DIM, hidden_dim=HIDDEN_DIM):
        super().__init__()

        if input_dim <= 0 or input_dim % 5 != 0:
            raise ValueError(
                f"input_dim must be a positive multiple of 5, got {input_dim}"
            )
        # Plain int attribute: not a parameter/buffer, so state_dict keys
        # (and therefore existing checkpoints) are unaffected.
        self.seq_len = input_dim // 5

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU()  # non-negative code; ReLU helps sparsity with L1
        )

        # Decoder shared
        self.dec_hidden = nn.Linear(latent_dim, hidden_dim)

        # Decoder heads
        self.dec_dna = nn.Linear(hidden_dim, self.seq_len * 4)
        self.dec_phy = nn.Linear(hidden_dim, self.seq_len * 1)

    def forward(self, dna, phy):
        """Encode ``(dna, phy)`` and reconstruct both tracks.

        Args:
            dna: Tensor whose per-sample elements flatten to ``seq_len * 4``
                values (e.g. shape ``(B, seq_len, 4)``).
            phy: Tensor whose per-sample elements flatten to ``seq_len``
                values (e.g. shape ``(B, seq_len)``).

        Returns:
            A tuple ``(recon_dna, recon_phy, h)``:
            ``recon_dna`` has shape ``(B, seq_len, 4)`` and holds the raw
            linear outputs of the DNA head (no normalisation applied here);
            ``recon_phy`` has shape ``(B, seq_len)`` with values in (-1, 1)
            from the tanh; ``h`` is the ``(B, latent_dim)`` latent code.
        """
        B = dna.size(0)
        seq_len = self.seq_len

        x = torch.cat(
            [dna.reshape(B, -1), phy.reshape(B, -1)],
            dim=1
        )  # (B, input_dim)

        h = self.encoder(x)
        dec = F.relu(self.dec_hidden(h))

        recon_dna = self.dec_dna(dec).reshape(B, seq_len, 4)      # (B, seq_len, 4)
        recon_phy = torch.tanh(self.dec_phy(dec)).reshape(B, seq_len)  # (B, seq_len)

        return recon_dna, recon_phy, h


# Setup
L = 50  # window length; the checkpoint file name below refers to 50bp
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the network, then move its parameters onto the chosen device
# (nn.Module.to works in place and returns the same module).
model = SparseAE()
model.to(device)

# Load the final checkpoint
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
model.load_state_dict(torch.load("sparse_ae_50bp_epoch3.pt", map_location=device))
model.eval()  # switch the module to evaluation mode
print("Model loaded.")

# --- GENERATE FAKE DATA (Or load real if you prefer) ---
print("Generating test data...")
N_SAMPLES = 10000

# Uniform base probabilities, in the order A, C, G, T.
probs = torch.tensor([0.25] * 4)

# One categorical draw per base (with replacement), folded into a
# (N_SAMPLES, L) grid of class indices.
test_dna_idx = torch.multinomial(
    probs, num_samples=N_SAMPLES * L, replacement=True
).reshape(N_SAMPLES, L)

# One-hot encode the indices as float32: (N_SAMPLES, L, 4).
test_dna = F.one_hot(test_dna_idx, num_classes=4).to(torch.float32).to(device)

# Standard-normal noise standing in for the phyloP track: (N_SAMPLES, L).
test_phy = torch.randn(N_SAMPLES, L).to(device)

# --- RUN INFERENCE ---
print("Running inference...")
# Only the encoder is evaluated here; the latent code 'h' is all the analysis
# below needs. The input is assembled the same way SparseAE.forward does it.
# Note: if training applied Top-K to the latent, apply it here as well; with a
# plain L1/KL penalty the raw encoder output is the code.
with torch.no_grad():
    B = test_dna.shape[0]
    x = torch.cat(
        (
            test_dna.reshape(B, -1),  # flattened one-hot DNA per sample
            test_phy.reshape(B, -1),  # phyloP values per sample
        ),
        dim=-1,
    )
    h = model.encoder(x)


# --- ANALYZE VOCABULARY ---
# Move the latent codes to host memory as a NumPy array: (samples, latent units).
h_np = h.cpu().numpy()

# 1. How often is each token used? (Frequency)
# A neuron counts as "firing" on a sample when its activation exceeds 0.1.
neuron_firing_counts = np.sum(h_np > 0.1, axis=0)  # one count per latent unit

# 2. Sort them (descending: most frequently firing neuron first)
sorted_counts = np.sort(neuron_firing_counts)[::-1]

print("\n--- VOCABULARY HEALTH CHECK ---")
# Report the measured latent width rather than a hard-coded literal, so the
# summary stays correct if the model's latent size changes.
print(f"Total Neurons: {neuron_firing_counts.size}")
print(f"Dead Neurons (Never fire): {np.sum(neuron_firing_counts == 0)}")
# Note: the "rare" bucket (< 10) also includes the dead neurons counted above.
print(f"Rare Neurons (Fire < 10 times): {np.sum(neuron_firing_counts < 10)}")
print(f"Common Neurons (Fire > 1000 times): {np.sum(neuron_firing_counts > 1000)}")