"""
Train a morphism model on Eigenverse structure-preserving maps.

Architecture: MorphismNet — a multi-head model where:
- Shared encoder learns the common Eigenverse structure
- Per-morphism heads specialize in each transformation
- Domain embedding distinguishes ℝ vs GF(p)
- Residual prediction head learns to verify morphism properties
  (all residuals should be ≈ 0 when the morphism holds)

The model learns the Eigenverse's "grammar" — the rules connecting
different mathematical objects through structure-preserving maps.
"""

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import os
import json
import time

# ════════════════════════════════════════════════════════════════════════
# Load data
# ════════════════════════════════════════════════════════════════════════

print("Loading dataset...")
inputs = np.load("data/inputs.npy")
outputs = np.load("data/outputs.npy")
morphism_ids = np.load("data/morphism_ids.npy")
domain_ids = np.load("data/domain_ids.npy")

N = len(inputs)
IN_DIM = inputs.shape[1]   # expected 4 (MorphismNet's default in_dim)
OUT_DIM = outputs.shape[1]  # expected 6 (MorphismNet's default out_dim)
N_MORPHISMS = 7  # 0-6
N_DOMAINS = 2    # ℝ, GF(p)

# Fail fast on inconsistent files: a row-count mismatch would otherwise be
# silently truncated by the index-based split below, and an out-of-range id
# only surfaces later as an embedding index error inside the model.
for _label, _arr in (
    ("outputs", outputs),
    ("morphism_ids", morphism_ids),
    ("domain_ids", domain_ids),
):
    if len(_arr) != N:
        raise ValueError(
            f"data/{_label}.npy has {len(_arr)} rows but data/inputs.npy has {N}"
        )
for _label, _ids, _limit in (
    ("morphism_ids", morphism_ids, N_MORPHISMS),
    ("domain_ids", domain_ids, N_DOMAINS),
):
    if _ids.size and (_ids.min() < 0 or _ids.max() >= _limit):
        raise ValueError(
            f"data/{_label}.npy contains ids outside [0, {_limit}): "
            f"min={_ids.min()}, max={_ids.max()}"
        )

print(f"Dataset: {N} samples, in={IN_DIM}, out={OUT_DIM}")

# Train/val split (90/10) over a random permutation of the row indices
perm = np.random.permutation(N)
split = int(0.9 * N)
train_idx, val_idx = perm[:split], perm[split:]


def _as_tensors(idx):
    """Return the (X, Y, M, D) tensors for the dataset rows selected by ``idx``.

    Inputs/outputs are converted to float32, morphism/domain ids to int64
    (the index dtype ``nn.Embedding`` expects).
    """
    return (
        torch.tensor(inputs[idx], dtype=torch.float32),
        torch.tensor(outputs[idx], dtype=torch.float32),
        torch.tensor(morphism_ids[idx], dtype=torch.long),
        torch.tensor(domain_ids[idx], dtype=torch.long),
    )


X_train, Y_train, M_train, D_train = _as_tensors(train_idx)
X_val, Y_val, M_val, D_val = _as_tensors(val_idx)

train_ds = TensorDataset(X_train, Y_train, M_train, D_train)
val_ds = TensorDataset(X_val, Y_val, M_val, D_val)

BATCH = 512
# Only the training loader shuffles; validation order is kept fixed.
train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True, num_workers=0)
val_dl = DataLoader(val_ds, batch_size=BATCH, shuffle=False, num_workers=0)


# ════════════════════════════════════════════════════════════════════════
# Model: MorphismNet
# ════════════════════════════════════════════════════════════════════════

class MorphismNet(nn.Module):
    """Multi-head network for Eigenverse morphism learning.

    Architecture:
    - Morphism embedding (``n_morphisms`` types, 32-d) + domain embedding
      (``n_domains`` types, 16-d)
    - Shared encoder: input + embeddings → hidden representation
    - Per-morphism decoder heads: hidden → output prediction
    - Residual head: sigmoid probability that the morphism property holds
      (residual ≈ 0)

    Args:
        in_dim: Number of input features per sample.
        out_dim: Number of output features predicted by each head.
        hidden: Width of the shared encoder layers.
        n_morphisms: Number of morphism types (one decoder head each).
        n_domains: Number of domain types.
    """

    def __init__(self, in_dim=4, out_dim=6, hidden=256, n_morphisms=7, n_domains=2):
        super().__init__()
        self.n_morphisms = n_morphisms
        self.out_dim = out_dim

        # Embeddings
        self.morph_embed = nn.Embedding(n_morphisms, 32)
        self.domain_embed = nn.Embedding(n_domains, 16)

        # Shared encoder
        enc_in = in_dim + 32 + 16  # input + morph_embed + domain_embed
        self.encoder = nn.Sequential(
            nn.Linear(enc_in, hidden),
            nn.GELU(),
            nn.LayerNorm(hidden),
            nn.Linear(hidden, hidden),
            nn.GELU(),
            nn.LayerNorm(hidden),
            nn.Linear(hidden, hidden),
            nn.GELU(),
            nn.LayerNorm(hidden),
        )

        # Per-morphism heads
        self.heads = nn.ModuleList([
            nn.Sequential(
                nn.Linear(hidden, hidden // 2),
                nn.GELU(),
                nn.Linear(hidden // 2, out_dim),
            )
            for _ in range(n_morphisms)
        ])

        # Residual classifier: does the morphism property hold?
        # (binary: 1 = residual ≈ 0, i.e. property holds)
        self.residual_head = nn.Sequential(
            nn.Linear(hidden, 64),
            nn.GELU(),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        )

    def forward(self, x, morph_id, domain_id):
        """Predict morphism outputs and the property-holds probability.

        Args:
            x: Float tensor of shape (B, in_dim).
            morph_id: Integer tensor of shape (B,) selecting the decoder head
                for each sample.
            domain_id: Integer tensor of shape (B,) with the domain index.

        Returns:
            Tuple ``(out, residual_prob)``: ``out`` has shape (B, out_dim) and
            holds, per row, the prediction of the head chosen by ``morph_id``;
            ``residual_prob`` has shape (B,) with values in [0, 1].
        """
        # Embeddings
        m_emb = self.morph_embed(morph_id)       # (B, 32)
        d_emb = self.domain_embed(domain_id)     # (B, 16)

        # Concatenate and encode
        h = self.encoder(torch.cat([x, m_emb, d_emb], dim=-1))  # (B, hidden)

        # Route to per-morphism heads. The output buffer takes its dtype and
        # device from the encoder output rather than the global float32
        # default, so a model converted with .double()/.half() does not hit a
        # dtype mismatch in the masked assignment below.
        out = h.new_zeros(x.shape[0], self.out_dim)
        for m, head in enumerate(self.heads):
            mask = (morph_id == m)
            if mask.any():
                # Cast covers head outputs whose dtype differs from the buffer.
                out[mask] = head(h[mask]).to(out.dtype)

        # Residual prediction
        residual_prob = self.residual_head(h).squeeze(-1)  # (B,)

        return out, residual_prob


# ════════════════════════════════════════════════════════════════════════
# Training
# ════════════════════════════════════════════════════════════════════════

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

EPOCHS = 50

# Size the network from the loaded data instead of relying on the class
# defaults, so a dataset with different dimensions does not fail with a shape
# error inside the first forward pass.
model = MorphismNet(
    in_dim=IN_DIM,
    out_dim=OUT_DIM,
    n_morphisms=N_MORPHISMS,
    n_domains=N_DOMAINS,
).to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
# The cosine schedule spans exactly the training run.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

# Loss: MSE for output prediction + BCE for residual classification
mse_loss = nn.MSELoss()
bce_loss = nn.BCELoss()

# For residual labels: residual columns are near 0 when morphism holds
# Column index of the residual per morphism id: col 2 for most, col 5 for
# orbit (id 4), col 4 for id 5.
RESIDUAL_COL = {0: 2, 1: 2, 2: 2, 3: 2, 4: 5, 5: 4, 6: 2}
# |residual| below this threshold is labelled "property holds".
RESIDUAL_TOL = 0.01

# Morphism id → residual column, as a tensor so labels can be built for a
# whole batch with one gather instead of a Python loop over morphism ids.
_residual_col_lookup = torch.tensor(
    [RESIDUAL_COL[mi] for mi in range(N_MORPHISMS)], device=device
)


def _residual_labels(y, m):
    """Build "property holds" labels from the true outputs of a batch.

    Args:
        y: Target tensor of shape (B, out_dim).
        m: Integer tensor of shape (B,) with morphism ids.

    Returns:
        Tuple ``(labels, valid)``. ``labels`` is a float tensor of shape (B,)
        that is 1.0 where the sample's residual column has absolute value
        below ``RESIDUAL_TOL`` and 0.0 otherwise. ``valid`` is a bool tensor
        marking samples whose residual column exists in ``y``; samples with
        an out-of-range column keep label 0.0.
    """
    cols = _residual_col_lookup[m]
    valid = cols < y.shape[1]
    # Clamp so gather never indexes past the last column; the clamped rows
    # are masked out again through ``valid``.
    safe_cols = cols.clamp(max=y.shape[1] - 1).unsqueeze(1)
    residual = y.gather(1, safe_cols).squeeze(1)
    labels = ((residual.abs() < RESIDUAL_TOL) & valid).float()
    return labels, valid


best_val_loss = float('inf')
history = []

print(f"\nTraining MorphismNet ({sum(p.numel() for p in model.parameters()):,} params)")
print(f"Epochs: {EPOCHS}, Batch: {BATCH}")
print("=" * 60)

for epoch in range(EPOCHS):
    model.train()
    train_mse, train_n = 0.0, 0
    t0 = time.time()

    for x, y, m, d in train_dl:
        x, y, m, d = x.to(device), y.to(device), m.to(device), d.to(device)

        pred, res_prob = model(x, m, d)

        # Output MSE
        loss_mse = mse_loss(pred, y)

        # Residual labels: 1 if morphism holds (residual near 0),
        # generated from the actual output residuals
        res_labels, _ = _residual_labels(y, m)
        loss_res = bce_loss(res_prob, res_labels)

        loss = loss_mse + 0.1 * loss_res

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Weight by batch size so the epoch figure is a per-sample mean
        train_mse += loss_mse.item() * x.shape[0]
        train_n += x.shape[0]

    # Record the learning rate this epoch actually trained with; after
    # scheduler.step() get_last_lr() already reports the next epoch's value.
    epoch_lr = scheduler.get_last_lr()[0]
    scheduler.step()

    # Validation
    model.eval()
    val_mse, val_res_acc, val_n = 0.0, 0.0, 0
    with torch.no_grad():
        for x, y, m, d in val_dl:
            x, y, m, d = x.to(device), y.to(device), m.to(device), d.to(device)
            pred, res_prob = model(x, m, d)
            val_mse += mse_loss(pred, y).item() * x.shape[0]

            # Residual accuracy: thresholded probability vs. true label
            labels, valid = _residual_labels(y, m)
            preds = (res_prob > 0.5).float()
            val_res_acc += ((preds == labels) & valid).sum().item()
            val_n += x.shape[0]

    train_mse /= train_n
    val_mse /= val_n
    val_res_acc /= max(val_n, 1)
    elapsed = time.time() - t0  # covers both the training and validation pass

    history.append({
        "epoch": epoch + 1,
        "train_mse": train_mse,
        "val_mse": val_mse,
        "val_residual_acc": val_res_acc,
        "lr": epoch_lr,
        "time": elapsed,
    })

    # Checkpoint whenever validation MSE improves
    if val_mse < best_val_loss:
        best_val_loss = val_mse
        torch.save(model.state_dict(), "morphism_net.pt")
        marker = " ★"
    else:
        marker = ""

    if (epoch + 1) % 5 == 0 or epoch == 0:
        print(f"  [{epoch+1:3d}/{EPOCHS}] train_mse={train_mse:.6f} "
              f"val_mse={val_mse:.6f} res_acc={val_res_acc:.3f} "
              f"lr={epoch_lr:.2e} ({elapsed:.1f}s){marker}")

print("=" * 60)
print(f"Best val MSE: {best_val_loss:.6f}")

# ════════════════════════════════════════════════════════════════════════
# Per-morphism evaluation
# ════════════════════════════════════════════════════════════════════════

print("\nPer-morphism validation MSE:")
# Evaluate the checkpoint written during training, not the last-epoch weights
best_state = torch.load("morphism_net.pt", weights_only=True)
model.load_state_dict(best_state)
model.eval()

# Display names, indexed by morphism id
names = [
    "§1 coherence_even",
    "§2 palindrome_odd",
    "§3 lyapunov_bridge",
    "§4 μ_isometry",
    "§5 orbit_hom",
    "§6 reality_linear",
    "§7 composition",
]

with torch.no_grad():
    # Run the whole validation split through the model in a single batch
    x_all, y_all, m_all, d_all = (
        t.to(device) for t in (X_val, Y_val, M_val, D_val)
    )
    pred_all, res_all = model(x_all, m_all, d_all)
    n_cols = y_all.shape[1]

    for morph in range(7):
        rows = m_all == morph
        count = rows.sum().item()
        if count == 0:
            # No validation samples for this morphism: nothing to report
            continue

        mse = (pred_all[rows] - y_all[rows]).pow(2).mean().item()

        # Error on the magnitude of the residual column, when it exists
        res_col = RESIDUAL_COL[morph]
        res_mse = 0.0
        if res_col < n_cols:
            true_mag = y_all[rows, res_col].abs()
            pred_mag = pred_all[rows, res_col].abs()
            res_mse = (pred_mag - true_mag).pow(2).mean().item()

        print(f"  {names[morph]:25s}: MSE={mse:.6f}, residual_MSE={res_mse:.6f}, n={count}")

# ════════════════════════════════════════════════════════════════════════
# Test the mod paradox: does the model distinguish ℝ from GF(p)?
# ════════════════════════════════════════════════════════════════════════

print("\nMod paradox test (§1 coherence_even):")
with torch.no_grad():
    # Morphism id 0 samples, split by domain id (0 = ℝ, 1 = GF(p))
    coherence_rows = m_all == 0
    mask_r = coherence_rows & (d_all == 0)
    mask_gfp = coherence_rows & (d_all == 1)

    n_r = mask_r.sum().item()
    if n_r > 0:
        err_r = pred_all[mask_r] - y_all[mask_r]
        mse_r = err_r.pow(2).mean().item()
        # Column 2 is the residual column used for morphism id 0
        res_r = y_all[mask_r, 2].abs().mean().item()
        pred_res_r = pred_all[mask_r, 2].abs().mean().item()
        print(f"  ℝ domain:    MSE={mse_r:.6f}, true_residual={res_r:.2e}, "
              f"pred_residual={pred_res_r:.2e}, n={n_r}")

    n_gfp = mask_gfp.sum().item()
    if n_gfp == 0:
        print("  (No GF(p) samples in validation set)")
    else:
        err_gfp = pred_all[mask_gfp] - y_all[mask_gfp]
        mse_gfp = err_gfp.pow(2).mean().item()
        res_gfp = y_all[mask_gfp, 2].abs().mean().item()
        pred_res_gfp = pred_all[mask_gfp, 2].abs().mean().item()
        print(f"  GF(p) domain: MSE={mse_gfp:.6f}, true_residual={res_gfp:.2e}, "
              f"pred_residual={pred_res_gfp:.2e}, n={n_gfp}")
        print("\n  The paradox: C(r)=C(1/r) holds exactly over ℝ (residual≈0)")
        print("  but over GF(p), the 'residual' is nonzero — mod breaks symmetry.")

# Persist the per-epoch metrics collected during training
with open("training_history.json", "w") as history_file:
    json.dump(history, history_file, indent=2)

# Persist a short summary of the trained model
n_params = sum(p.numel() for p in model.parameters())
info = dict(
    name="MorphismNet",
    params=n_params,
    morphisms=names,
    best_val_mse=best_val_loss,
    epochs=EPOCHS,
    dataset_size=N,
    architecture="shared_encoder(3x256) + 7_heads(128→6) + residual_classifier",
)
with open("model_info.json", "w") as info_file:
    json.dump(info, info_file, indent=2)

print(f"\nModel saved: morphism_net.pt ({n_params:,} params)")
print("Done. 🧬")