#!/usr/bin/env python3
"""
GLADIUS — Deploy Synthase into Omega Kernel

This script:
1. Loads the Omega checkpoint (step 1000 or latest)
2. Builds the GladiusKernel
3. Loads checkpoint weights
4. Runs upgrade_kernel_to_synthase()
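5. Saves the upgraded checkpoint (model weights and metadata only; optimizer
   state is deliberately dropped so training rebuilds it with the new params)
6. Verifies by running a forward pass (this check runs before the save, so a
   model that fails the forward pass is never written)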

Usage (run on Victus from gladius-training dir):
    python deploy_synthase.py --checkpoint runs/omega/omega_step_1000.pt --output runs/omega/synthase_step_1000.pt

Authors: Ali A. Shakil, Ava Shakil
Date: March 27, 2026
"""

import sys
import os
import json
import time
import argparse
from pathlib import Path

import torch
import torch.nn as nn

# Import roots (same as train_omega.py): the directory holding this script plus
# the gladius-training checkout and its 'staging' and 'omega' subfolders.
script_dir = Path(__file__).resolve().parent
gladius_dir = Path(r'C:\Users\ali\gladius-training')
staging_dir, omega_dir = gladius_dir / 'staging', gladius_dir / 'omega'

# Each root that is not already on sys.path is pushed to the front, so roots
# inserted later in this loop land ahead of the ones inserted earlier.
for p in map(str, (staging_dir, omega_dir, gladius_dir, script_dir)):
    if p in sys.path:
        continue
    sys.path.insert(0, p)

from kernel.config import KernelConfig
from kernel.kernel import GladiusKernel
from synthase_attention import SynthaseDepthAttention, DepthCacheBuilder
from synthase_layer import SynthaseTransformerLayer
from synthase_surgery import upgrade_kernel_to_synthase, get_synthase_diagnostics

try:
    from omega_config import OmegaConfig, omega_rtx2050_config
except ImportError:
    sys.path.insert(0, str(omega_dir))
    from omega_config import OmegaConfig, omega_rtx2050_config


def build_kernel_config(omega_cfg):
    """Translate an OmegaConfig into a KernelConfig (same as train_omega.py).

    Every KernelConfig keyword listed below is read from the attribute of the
    same name on ``omega_cfg``, with one exception: ``vocab_size`` is taken
    from ``omega_cfg.new_vocab_size``.

    Args:
        omega_cfg: Config object exposing the attributes named below.

    Returns:
        The KernelConfig constructed from those values.
    """
    # KernelConfig keywords, in the order they are read and passed.
    kernel_fields = (
        'hidden_dim',
        'num_layers',
        'num_heads',
        'head_dim',
        'ffn_dim',
        'vocab_size',
        'max_seq_len',
        'hot_memory_slots',
        'warm_rank',
        'time_dim',
        'time_num_frequencies',
        'time_max_events',
        'cognition_state_dim',
        'cognition_modes',
        'cognition_prompt_types',
        'register_dim',
        'intent_dim',
        'max_tools',
        'num_specialists',
        'router_top_k',
        'attention_sparse_budget',
        'specialist_residual_scale',
    )
    # Keywords whose source attribute on omega_cfg has a different name.
    renamed = {'vocab_size': 'new_vocab_size'}

    kwargs = {
        field: getattr(omega_cfg, renamed.get(field, field))
        for field in kernel_fields
    }
    return KernelConfig(**kwargs)


def main():
    """Upgrade an Omega checkpoint to Synthase and write the result to disk.

    Pipeline, in execution order:
      1. Parse CLI arguments and build the kernel config from
         ``omega_rtx2050_config()``.
      2. Build a fresh ``GladiusKernel`` and load the checkpoint weights
         non-strictly, reporting any missing/unexpected keys.
      3. Run ``upgrade_kernel_to_synthase`` on the loaded model.
      4. Run one forward pass on random token ids as a sanity check.
      5. Save the upgraded weights plus metadata (no optimizer state).

    Returns:
        int: 0 on success, 1 if the verification forward pass raised (in
        which case nothing is saved).
    """
    parser = argparse.ArgumentParser(description='Deploy Synthase into GLADIUS Omega')
    parser.add_argument('--checkpoint', type=str, required=True,
                        help='Omega checkpoint to upgrade (e.g. runs/omega/omega_step_1000.pt)')
    parser.add_argument('--output', type=str, default=None,
                        help='Output path for upgraded checkpoint (default: auto-name)')
    parser.add_argument('--bottleneck', action='store_true',
                        help='Use bottleneck variant (41% fewer depth params)')
    parser.add_argument('--depth-k', type=int, default=32,
                        help='Positions per layer in depth cache (default: 32)')
    parser.add_argument('--depth-kv-heads', type=int, default=4,
                        help='Depth KV heads for GQA (default: 4)')
    parser.add_argument('--device', type=str, default='cpu',
                        help='Device for surgery (cpu recommended — saves VRAM for training)')
    args = parser.parse_args()

    device = torch.device(args.device)
    print(f"\n{'='*60}")
    print(f"  GLADIUS SYNTHASE DEPLOYMENT")
    print(f"  ATP Synthase Depth Attention → Omega Kernel")
    print(f"{'='*60}\n")

    # 1. Build config
    omega_cfg = omega_rtx2050_config()
    kernel_config = build_kernel_config(omega_cfg)

    # 2. Build fresh model
    print("[1/5] Building Omega kernel...")
    model = GladiusKernel(kernel_config).to(device)
    total_before = sum(p.numel() for p in model.parameters())
    print(f"  Params (pre-synthase): {total_before:,}")

    # 3. Load checkpoint
    print(f"\n[2/5] Loading checkpoint: {args.checkpoint}")
    # NOTE(security): weights_only=False lets torch.load unpickle arbitrary
    # objects, which can execute code. Only point this at trusted checkpoints.
    cp = torch.load(args.checkpoint, map_location='cpu', weights_only=False)

    if 'model_state_dict' in cp:
        load_result = model.load_state_dict(cp['model_state_dict'], strict=False)
        step = cp.get('step', 0)
        best_loss = cp.get('best_loss', float('inf'))
        print(f"  Step: {step}, Best loss: {best_loss:.4f}")
    else:
        load_result = model.load_state_dict(cp, strict=False)
        step = 0
        best_loss = float('inf')
        print("  Raw state dict (no metadata)")

    # strict=False tolerates key mismatches; surface them so that surgery is
    # never silently performed on a model with unloaded (random) weights.
    # getattr keeps this safe if load_state_dict is overridden to return None.
    mismatches = (
        ('missing', list(getattr(load_result, 'missing_keys', None) or [])),
        ('unexpected', list(getattr(load_result, 'unexpected_keys', None) or [])),
    )
    for label, keys in mismatches:
        if not keys:
            continue
        preview = ', '.join(keys[:5])
        if len(keys) > 5:
            preview += ', ...'
        print(f"  WARNING: {len(keys)} {label} key(s) when loading checkpoint: {preview}")

    # 4. Upgrade to Synthase
    print(f"\n[3/5] Performing Synthase surgery...")
    model = upgrade_kernel_to_synthase(
        model,
        num_depth_kv_heads=args.depth_kv_heads,
        depth_k=args.depth_k,
        use_bottleneck=args.bottleneck,
        init_from_backbone=True,
    )
    total_after = sum(p.numel() for p in model.parameters())
    delta = total_after - total_before
    # Guard the percentage against a parameter-less model.
    overhead_pct = delta / total_before * 100 if total_before else 0.0
    print(f"  Params added: {delta:,} ({overhead_pct:.1f}% overhead)")

    # 5. Verify with forward pass
    print(f"\n[4/5] Verification forward pass...")
    model.eval()
    with torch.no_grad():
        test_ids = torch.randint(0, omega_cfg.new_vocab_size, (1, 64), device=device)
        try:
            result = model(test_ids)
            logits = result['logits']
            print(f"  Output shape: {logits.shape}")
            print(f"  Logits range: [{logits.min().item():.4f}, {logits.max().item():.4f}]")

            # Check depth diagnostics
            diag = get_synthase_diagnostics(model)
            for k, v in sorted(diag.items()):
                if 'scale' in k:
                    print(f"  {k}: {v:.4f}")

            # Check balance_loss returned
            if 'balance_loss' in result:
                print(f"  balance_loss: {result['balance_loss'].item():.6f}")
            if 'router_indices' in result and result['router_indices'] is not None:
                print(f"  router_indices: {result['router_indices']}")

            print("  Forward pass: OK")
        except Exception as e:
            # Top-level boundary: report the failure and exit non-zero without
            # writing a checkpoint for a model that cannot run.
            print(f"  Forward pass FAILED: {e}")
            import traceback
            traceback.print_exc()
            return 1

    # 6. Save upgraded checkpoint
    output_path = args.output
    if output_path is None:
        base = Path(args.checkpoint)
        output_path = str(base.parent / f"synthase_{base.name}")

    print(f"\n[5/5] Saving upgraded checkpoint: {output_path}")

    save_dict = {
        'model_state_dict': model.state_dict(),
        'step': step,
        'best_loss': best_loss,
        'phase': cp.get('phase', 'unknown'),
        'synthase': True,
        'synthase_config': {
            'depth_k': args.depth_k,
            'num_depth_kv_heads': args.depth_kv_heads,
            'use_bottleneck': args.bottleneck,
            'params_added': delta,
        },
    }

    # NOTE: We intentionally do NOT copy optimizer state.
    # The optimizer param groups will change (new depth params need their own group).
    # train_omega.py will create a fresh optimizer with the right groups.
    # The LR warmup will re-ramp from the current step.

    # Create the destination directory up front so a missing folder does not
    # fail the save after all the work above has been done.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    torch.save(save_dict, output_path)
    size_mb = os.path.getsize(output_path) / 1024 / 1024
    print(f"  Size: {size_mb:.1f} MB")

    print(f"\n{'='*60}")
    print(f"  DEPLOYMENT COMPLETE")
    print(f"  ")
    print(f"  Next: Resume training with:")
    print(f"    python train_omega.py --resume {output_path}")
    print(f"  ")
    print(f"  The optimizer will be rebuilt with depth params in a new group.")
    print(f"  LR scheduler will continue from step {step}.")
    print(f"{'='*60}\n")
    return 0


if __name__ == '__main__':
    sys.exit(main())