#!/usr/bin/env python3
"""
End-to-end test: data loading → model forward → backward.
Verifies that the full pipeline works before committing to long training.

Usage:
    python test_pipeline.py
    python test_pipeline.py --dataset active_matter --no-streaming --local_path /data/well
"""
import argparse
import sys
import time
import traceback

import torch
import torch.nn as nn


def fmt_mem():
    """Return a one-line summary of memory usage on the current CUDA device.

    Returns:
        ``"alloc=…GB, reserved=…GB, total=…GB"`` (sizes divided by 1e9), or
        ``"CPU only"`` when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return "CPU only"

    # Read all three figures from the same device. The allocator counters
    # default to the current device, so the total must not be taken from a
    # hard-coded device 0 or the line can mix numbers from two different GPUs.
    device = torch.cuda.current_device()
    alloc = torch.cuda.memory_allocated(device) / 1e9
    res = torch.cuda.memory_reserved(device) / 1e9
    total = torch.cuda.get_device_properties(device).total_memory / 1e9
    return f"alloc={alloc:.2f}GB, reserved={res:.2f}GB, total={total:.1f}GB"


def test_data_loading(args):
    """Test 1: build the training dataloader, describe it, and prepare one batch.

    Args:
        args: Parsed CLI namespace; ``dataset``, ``batch_size``, ``streaming``
            and ``local_path`` are forwarded to ``create_dataloader``.

    Returns:
        ``(ch, x_in, x_out)``: the value of ``get_channel_info(dataset)`` and
        the input/target pair that ``prepare_batch`` built from the first batch.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 1: Data Loading")
    print(rule)

    from data_pipeline import (
        create_dataloader,
        prepare_batch,
        get_channel_info,
        get_data_info,
    )

    # Dataset / loader construction, timed.
    started = time.time()
    loader, dataset = create_dataloader(
        dataset_name=args.dataset,
        split="train",
        batch_size=args.batch_size,
        streaming=args.streaming,
        local_path=args.local_path,
    )
    print(f"  Dataset created in {time.time() - started:.1f}s")
    print(f"  Dataset length: {len(dataset)}")

    # Whatever per-field description the pipeline exposes for a sample.
    field_info = get_data_info(dataset)
    print("  Sample fields:")
    for field_name, field_desc in field_info.items():
        print(f"    {field_name}: {field_desc}")

    channel_info = get_channel_info(dataset)
    print(f"  Channel info: {channel_info}")

    # Pull a single batch, timed, and list its tensor entries.
    started = time.time()
    first_batch = next(iter(loader))
    print(f"  First batch loaded in {time.time() - started:.1f}s")
    print(f"  Batch keys: {list(first_batch.keys())}")
    for key, value in first_batch.items():
        if not isinstance(value, torch.Tensor):
            continue
        print(f"    {key}: {value.shape} ({value.dtype})")

    # Convert the batch into the (input, target) pair the models consume.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    model_input, model_target = prepare_batch(first_batch, device)
    print(f"  Model input:  {model_input.shape} ({model_input.dtype})")
    print(f"  Model target: {model_target.shape} ({model_target.dtype})")
    print(f"  GPU memory: {fmt_mem()}")

    return channel_info, model_input, model_target


def test_diffusion(ch, x_in, x_out):
    """Test 2: Diffusion model forward + backward, then a short sampling run.

    Args:
        ch: Channel info mapping; ``"input_channels"`` and
            ``"output_channels"`` are read from it.
        x_in: Conditioning batch, already on the device the test should run
            on. ``shape[2]`` and ``shape[3]`` are used as the sample's trailing
            dims, so it is assumed to be 4-D (presumably ``(B, C, H, W)`` —
            confirm against ``prepare_batch``).
        x_out: Target batch passed to ``training_loss`` together with ``x_in``.

    Any exception from the model code propagates to the caller.
    """
    print("\n" + "=" * 60)
    print("TEST 2: Diffusion Model")
    print("=" * 60)

    from unet import UNet
    from diffusion import GaussianDiffusion

    c_in = ch["input_channels"]
    c_out = ch["output_channels"]
    device = x_in.device
    on_cuda = device.type == "cuda"
    # Only enable autocast on CUDA. Requesting "cuda" autocast on a CPU-only
    # run merely emits a warning and disables itself, so the CPU path stays in
    # full precision either way — this just avoids the spurious warning.
    amp_device = "cuda" if on_cuda else "cpu"

    unet = UNet(
        in_channels=c_out + c_in,
        out_channels=c_out,
        base_ch=64,
        ch_mults=(1, 2, 4, 8),
        n_res=2,
        attn_levels=(3,),
    )
    model = GaussianDiffusion(unet, timesteps=1000).to(device)

    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"  Parameters: {n_params:,}")
    print(f"  GPU memory after model: {fmt_mem()}")

    # Forward. loss.item() copies the value to the host, which waits for the
    # forward kernels, so the elapsed time read after it covers the full pass.
    t0 = time.time()
    with torch.amp.autocast(amp_device, dtype=torch.bfloat16, enabled=on_cuda):
        loss = model.training_loss(x_out, x_in)
    print(f"  Forward pass: loss={loss.item():.4f} ({time.time() - t0:.3f}s)")
    print(f"  GPU memory after forward: {fmt_mem()}")

    # Backward. CUDA kernels are launched asynchronously; without an explicit
    # synchronize the timer would only measure the launch overhead.
    t0 = time.time()
    loss.backward()
    if on_cuda:
        torch.cuda.synchronize(device)
    print(f"  Backward pass: ({time.time() - t0:.3f}s)")
    print(f"  GPU memory after backward: {fmt_mem()}")

    # Quick sampling test: shrink the step count and slice every per-timestep
    # schedule tensor to match. The model is discarded afterwards, so nothing
    # is restored.
    n_steps = 5
    model.eval()
    model.T = n_steps
    for attr in (
        "betas",
        "alphas",
        "alpha_bar",
        "sqrt_alpha_bar",
        "sqrt_one_minus_alpha_bar",
        "sqrt_recip_alpha",
        "posterior_variance",
    ):
        setattr(model, attr, getattr(model, attr)[:n_steps])

    # Never ask for more samples than the batch holds: with --batch_size 1,
    # x_in[:2] has a single row and a hard-coded B=2 shape would not match it.
    n_samples = min(2, x_in.shape[0])
    t0 = time.time()
    with torch.no_grad():
        sample = model.sample(
            x_in[:n_samples],
            shape=(n_samples, c_out, x_in.shape[2], x_in.shape[3]),
        )
    if on_cuda:
        torch.cuda.synchronize(device)
    print(
        f"  Sampling ({n_steps} steps, B={n_samples}): "
        f"shape={sample.shape} ({time.time() - t0:.3f}s)"
    )

    # empty_cache() can only hand back blocks nothing references any more, so
    # drop every local that keeps parameters, gradients or outputs alive.
    del model, unet, loss, sample
    torch.cuda.empty_cache()
    print("  DIFFUSION OK")


def test_jepa(ch, x_in, x_out):
    """Test 3: JEPA forward + backward, target update, and latent shape check.

    Args:
        ch: Channel info mapping; only ``"input_channels"`` is read.
        x_in: Input batch, already on the device the test should run on.
        x_out: Target batch passed to the model together with ``x_in``.

    Any exception from the model code propagates to the caller.
    """
    print("\n" + "=" * 60)
    print("TEST 3: JEPA Model")
    print("=" * 60)

    from jepa import JEPA

    c_in = ch["input_channels"]
    device = x_in.device
    on_cuda = device.type == "cuda"
    # Only enable autocast on CUDA. Requesting "cuda" autocast on a CPU-only
    # run merely emits a warning and disables itself, so the CPU path stays in
    # full precision either way — this just avoids the spurious warning.
    amp_device = "cuda" if on_cuda else "cpu"

    model = JEPA(
        in_channels=c_in,
        latent_channels=128,
        base_ch=32,
        pred_hidden=256,
    ).to(device)

    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"  Trainable parameters: {n_params:,}")
    print(f"  Total parameters (incl EMA target): {total_params:,}")
    print(f"  GPU memory after model: {fmt_mem()}")

    # Forward. loss.item() copies the value to the host, which waits for the
    # forward kernels, so the elapsed time read after it covers the full pass.
    t0 = time.time()
    with torch.amp.autocast(amp_device, dtype=torch.bfloat16, enabled=on_cuda):
        loss, metrics = model.compute_loss(x_in, x_out)
    print(f"  Forward: loss={loss.item():.4f}, metrics={metrics} ({time.time() - t0:.3f}s)")
    print(f"  GPU memory after forward: {fmt_mem()}")

    # Backward. CUDA kernels are launched asynchronously; without an explicit
    # synchronize the timer would only measure the launch overhead.
    t0 = time.time()
    loss.backward()
    if on_cuda:
        torch.cuda.synchronize(device)
    print(f"  Backward: ({time.time() - t0:.3f}s)")
    print(f"  GPU memory after backward: {fmt_mem()}")

    # EMA update
    model.update_target()
    print("  EMA update: OK")

    # Check latent shapes on (at most) the first two samples.
    model.eval()
    with torch.no_grad():
        z_pred, z_target = model(x_in[:2], x_out[:2])
    print(f"  Latent shapes: pred={z_pred.shape}, target={z_target.shape}")

    # empty_cache() can only hand back blocks nothing references any more, so
    # drop every local that keeps parameters, gradients or outputs alive.
    del model, loss, metrics, z_pred, z_target
    torch.cuda.empty_cache()
    print("  JEPA OK")


def test_training_step(ch, loader):
    """Test 4: Full training step with optimizer and grad scaling.

    Runs up to three optimizer steps of the diffusion model on batches drawn
    from ``loader``, with gradient clipping at norm 1.0.

    Args:
        ch: Channel info mapping; ``"input_channels"`` and
            ``"output_channels"`` are read from it.
        loader: Iterable of batches accepted by ``prepare_batch``.

    Raises:
        RuntimeError: If ``loader`` yields no batches, since in that case no
            training step was exercised and the test must not report success.
    """
    print("\n" + "=" * 60)
    print("TEST 4: Full Training Step")
    print("=" * 60)

    from itertools import islice

    from data_pipeline import prepare_batch
    from unet import UNet
    from diffusion import GaussianDiffusion

    max_steps = 3
    c_in = ch["input_channels"]
    c_out = ch["output_channels"]
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Mixed precision and grad scaling only apply on CUDA. Passing
    # enabled=False on CPU matches what the hard-coded "cuda" variants fall
    # back to anyway, without the "CUDA is not available" warnings.
    use_amp = device == "cuda"

    unet = UNet(in_channels=c_out + c_in, out_channels=c_out, base_ch=64)
    model = GaussianDiffusion(unet, timesteps=1000).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

    model.train()
    losses = []

    # islice stops after max_steps batches without pulling an extra one;
    # an enumerate-and-break loop would load a fourth batch just to drop it.
    for i, batch in enumerate(islice(loader, max_steps)):
        x_in, x_out = prepare_batch(batch, device)
        optimizer.zero_grad(set_to_none=True)

        with torch.amp.autocast(device, dtype=torch.bfloat16, enabled=use_amp):
            loss = model.training_loss(x_out, x_in)

        scaler.scale(loss).backward()
        # Unscale before clipping so the threshold applies to the true grads.
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        # Read the loss once; every .item() is a device-to-host sync.
        loss_value = loss.item()
        losses.append(loss_value)
        print(f"  Step {i}: loss={loss_value:.4f}, mem={fmt_mem()}")

    if not losses:
        raise RuntimeError("loader yielded no batches; no training step was run")

    print(
        f"  {len(losses)} training steps completed. "
        f"Losses: {[f'{v:.4f}' for v in losses]}"
    )
    # empty_cache() can only hand back blocks nothing references any more, so
    # drop every local that keeps parameters, gradients or batches alive.
    del model, unet, optimizer, scaler, loss, x_in, x_out, batch
    torch.cuda.empty_cache()
    print("  TRAINING STEP OK")


def _run_stage(results, name, fn, *args):
    """Run one test stage and record its outcome in ``results[name]``.

    On success stores ``"PASS"`` and returns ``fn(*args)``. On any
    ``Exception`` prints the message and traceback, stores ``"FAIL: <msg>"``
    and returns ``None`` so later stages can still run.
    """
    try:
        value = fn(*args)
    except Exception as e:
        print(f"  FAIL: {e}")
        traceback.print_exc()
        results[name] = f"FAIL: {e}"
        return None
    results[name] = "PASS"
    return value


def main():
    """Parse CLI options, run the four pipeline tests, and print a summary.

    The data-loading test is a prerequisite: if it fails the process exits
    with status 1 immediately. The remaining tests each run regardless of the
    others' outcome. The process exits with status 1 if any test failed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", default="turbulent_radiative_layer_2D")
    # Streaming is on by default, so --streaming is accepted but changes
    # nothing; --no-streaming is the switch that turns it off.
    parser.add_argument("--streaming", action="store_true", default=True)
    parser.add_argument("--no-streaming", dest="streaming", action="store_false")
    parser.add_argument("--local_path", default=None)
    parser.add_argument("--batch_size", type=int, default=4)
    args = parser.parse_args()

    print("=" * 60)
    print("THE WELL - Pipeline End-to-End Test")
    print("=" * 60)
    print(f"Dataset:   {args.dataset}")
    print(f"Streaming: {args.streaming}")
    print(f"Batch:     {args.batch_size}")
    print(f"Device:    {'cuda' if torch.cuda.is_available() else 'cpu'}")
    if torch.cuda.is_available():
        print(f"GPU:       {torch.cuda.get_device_name(0)}")
    print(f"Memory:    {fmt_mem()}")

    results = {}

    # Test 1: Data. Everything else needs its outputs, so abort on failure.
    data = _run_stage(results, "data", test_data_loading, args)
    if results["data"] != "PASS":
        sys.exit(1)
    ch, x_in, x_out = data

    # Test 2: Diffusion
    _run_stage(results, "diffusion", test_diffusion, ch, x_in, x_out)

    # Test 3: JEPA
    _run_stage(results, "jepa", test_jepa, ch, x_in, x_out)

    # Test 4: Training step. It gets its own loader, built inside the stage so
    # a failure to create it is reported as a training_step failure.
    def _training_stage():
        from data_pipeline import create_dataloader

        loader, _ = create_dataloader(
            dataset_name=args.dataset,
            split="train",
            batch_size=args.batch_size,
            streaming=args.streaming,
            local_path=args.local_path,
        )
        test_training_step(ch, loader)

    _run_stage(results, "training_step", _training_stage)

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    for name, status in results.items():
        if status == "PASS":
            print(f"  [PASS] {name}")
        else:
            # status is "FAIL: <message>"; show the message next to the name
            # so the summary is useful without scrolling back.
            reason = status.partition(": ")[2]
            print(f"  [FAIL] {name}: {reason}")

    if all(status == "PASS" for status in results.values()):
        print("\nAll tests passed! Pipeline is ready for training.")
    else:
        print("\nSome tests failed. Check output above.")
        sys.exit(1)


# Run the pipeline checks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()