"""
compare_3modes_speed.py — Head-to-head comparison of original vs optimized
3-mode training for N iterations.

Pipeline selection (--pipeline):
  orig       Run the original pipeline only and save its results to JSON
  opt        Run the optimized pipeline only and save its results to JSON
  both       Run both sequentially, then print the comparison
             (may OOM on large batchsize)
  compare    Run nothing; print the comparison table from saved JSONs

After running orig and opt as separate jobs, use --pipeline compare to print
the comparison table from the saved JSONs.

Usage:
    # Separate jobs (recommended for batchsize>=3):
    python tests/compare_3modes_speed.py --pipeline orig --device xpu --batchsize 3
    python tests/compare_3modes_speed.py --pipeline opt  --device xpu --batchsize 3
    python tests/compare_3modes_speed.py --pipeline compare

    # Single job (only for small batchsize):
    python tests/compare_3modes_speed.py --pipeline both --device xpu --batchsize 1
"""

import os, sys, time, json, random, argparse
import numpy as np
import torch
import torch.nn.functional as F

# Parent of the directory containing this file; put first on sys.path so the
# project-local imports used further down in this file can be resolved.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, ROOT_DIR)

# ========================== Config ==========================

# Defaults for the CLI-overridable settings (IMG_SIZE, BATCHSIZE, NUM_STEPS are
# reassigned from the parsed arguments below) and fixed model/training settings.
IMG_SIZE = 128
BATCHSIZE = 3
# Passed to the network and DDPM as n_steps; also the exclusive upper bound
# when sampling timesteps.
TIMESTEPS = 80
NDIMS = 3
# Passed to the DDPM constructor as v_scale.
V_SCALE = 5e-5
# Spread of the random intensity scale/shift augmentation; also sets the
# upper value handed to thresh_img (2 * NOISE_SCALE).
NOISE_SCALE = 0.05
NET_NAME = "recmulmodmutattnnet"
LR = 1e-5
NUM_STEPS = 10

# Training constants (must match both scripts)
# Small offset added to the registration target mask and inside the
# square-root loss scaling factors.
MSK_EPS = 0.01
# Probability that the text embedding is passed to the model (otherwise None).
TEXT_EMBED_PROB = 0.7
# Probability threshold used for the resample-vs-permute choice and for the
# optional thresholding in the diffusion branch.
AUG_RESAMPLE_PROB = 0.5
# Diffusion loss weights, applied in order to: NCC term, MRSE term, Grad term.
LOSS_WEIGHTS_DIFF = [2.0, 2.0, 4.0]
# Registration loss weights, applied in order to: MSLNCC term, LMSE term, Grad term.
LOSS_WEIGHTS_REGIST = [1.0, 0.05, 128]
LOSS_WEIGHT_CONTRASTIVE = 1.0
# The contrastive update runs on steps where step % CONTRASTIVE_STEP_RATIO == 0.
CONTRASTIVE_STEP_RATIO = 2
# Paired (registration) batches hold max(1, batchsize // DIFF_REG_BATCH_RATIO) samples.
DIFF_REG_BATCH_RATIO = 2

# Output directory for results
RESULTS_DIR = os.path.join(ROOT_DIR, "Logs")

# NOTE: arguments are parsed at module level, so importing this file from
# another module also parses sys.argv.
parser = argparse.ArgumentParser()
parser.add_argument("--pipeline", type=str, default="both",
                    choices=["orig", "opt", "both", "compare"],
                    help="Which pipeline to run (orig/opt/both/compare)")
parser.add_argument("--device", type=str, default="xpu", choices=["cpu", "cuda", "xpu"])
parser.add_argument("--steps", type=int, default=NUM_STEPS)
parser.add_argument("--img-size", type=int, default=IMG_SIZE)
parser.add_argument("--batchsize", type=int, default=BATCHSIZE)
parser.add_argument("--results-dir", type=str, default=RESULTS_DIR,
                    help="Directory to save/load result JSONs")
args = parser.parse_args()

# Command-line values replace the defaults defined above.
DEVICE = args.device
NUM_STEPS = args.steps
IMG_SIZE = args.img_size
BATCHSIZE = args.batchsize
RESULTS_DIR = args.results_dir


def detect_device(device):
    """Return *device* if it is usable, otherwise fall back to "cpu".

    Only "xpu" and "cuda" are probed; any other value is returned unchanged.
    A message is printed when a fallback happens, and the XPU device name is
    printed when XPU is available.
    """
    if device == "cuda":
        if torch.cuda.is_available():
            return device
        print("CUDA not available, falling back to CPU")
        return "cpu"

    if device != "xpu":
        return device

    # Builds of torch without XPU support may not expose the `torch.xpu` attribute.
    xpu_ready = hasattr(torch, 'xpu') and torch.xpu.is_available()
    if xpu_ready:
        print(f"XPU available: {torch.xpu.get_device_name(0)}")
        return device

    print("XPU not available, falling back to CPU")
    return "cpu"


def seed_all(seed=42):
    """Seed Python's `random`, NumPy and PyTorch with the same *seed*.

    When CUDA is available, all CUDA devices are seeded explicitly as well.
    """
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


# ========================== Data Generation ==========================

def generate_dummy_data(num_steps, batchsize, img_size, embd_dim=1024):
    """Build the random CPU batches consumed by every training step.

    The RNGs are reseeded with a fixed seed first, so repeated calls with the
    same arguments produce the same tensors.

    Returns:
        (indiv_batches, pair_batches), each a list of length ``num_steps``:
          * indiv_batches[i] = (x0, embd) with ``batchsize`` samples
          * pair_batches[i]  = (x1, y1, embd_x, embd_y) with
            ``max(1, batchsize // DIFF_REG_BATCH_RATIO)`` samples
        Volumes are float32 of shape (N, 1, img_size, img_size, img_size);
        embeddings are float32 of shape (N, embd_dim).
    """
    seed_all(999)

    pair_count = max(1, batchsize // DIFF_REG_BATCH_RATIO)
    volume_dims = (1, img_size, img_size, img_size)

    def _volume(count):
        return torch.rand(count, *volume_dims, dtype=torch.float32)

    def _embedding(count):
        return torch.randn(count, embd_dim, dtype=torch.float32)

    indiv_batches, pair_batches = [], []
    for _ in range(num_steps):
        # The draw order below fixes the generated values; keep it unchanged.
        indiv_batches.append((_volume(batchsize), _embedding(batchsize)))

        moving = _volume(pair_count)
        target = _volume(pair_count)
        moving_embd = _embedding(pair_count)
        target_embd = _embedding(pair_count)
        pair_batches.append((moving, target, moving_embd, target_embd))

    return indiv_batches, pair_batches


# ========================== Single Pipeline Run ==========================

def _sync_device(device):
    """Block until all queued work on *device* has finished (no-op on CPU)."""
    if device == "xpu":
        torch.xpu.synchronize()
    elif device == "cuda":
        torch.cuda.synchronize()


def _empty_device_cache(device):
    """Release cached allocator memory on *device* (no-op on CPU)."""
    if device == "xpu":
        torch.xpu.empty_cache()
    elif device == "cuda":
        torch.cuda.empty_cache()


def run_pipeline(DDPMClass, LossModule, indiv_batches, pair_batches, device, label,
                 use_opt_net=False):
    """Build one pipeline and time one 3-mode training step per batch.

    One untimed warmup step is executed on the first batch, followed by one
    timed step for every entry of ``indiv_batches`` (the step count is
    ``len(indiv_batches)``, not the module-level NUM_STEPS).

    Args:
        DDPMClass: DDPM wrapper class to instantiate around the network.
        LossModule: module providing Grad, MRSE, NCC, MSLNCC and LMSE.
        indiv_batches: list of (x0, embd) tuples, one per step.
        pair_batches: list of (x1, y1, embd_x, embd_y) tuples, one per step.
        device: device string ("cpu", "cuda" or "xpu").
        label: tag used in progress messages.
        use_opt_net: select the network/STN from Diffusion.networks_opt
            instead of Diffusion.networks.

    Returns:
        (step_losses, step_times, total_time): a list of per-step loss dicts,
        a list of per-step durations in seconds, and the duration of the whole
        timed loop in seconds (which also includes the per-step logging and
        XPU cache flushes).
    """
    import gc

    seed_all(42)
    if use_opt_net:
        from Diffusion.networks_opt import get_net_opt, OptSTN
        Net = get_net_opt(NET_NAME)
        stn_cls = OptSTN
    else:
        from Diffusion.networks import get_net, STN
        Net = get_net(NET_NAME)
        stn_cls = STN

    network = Net(n_steps=TIMESTEPS, ndims=NDIMS, num_input_chn=1, res=IMG_SIZE)
    ddpm = DDPMClass(
        network=network,
        n_steps=TIMESTEPS,
        image_chw=[1] + [IMG_SIZE] * NDIMS,
        device=device,
        batch_size=BATCHSIZE,
        img_pad_mode="zeros",
        v_scale=V_SCALE,
    )
    ddf_stn = stn_cls(img_sz=IMG_SIZE, ndims=NDIMS, padding_mode="border", device=device)
    ddpm.to(device)
    ddf_stn.to(device)

    loss_reg = LossModule.Grad(penalty=['l1', 'negdetj', 'range'], ndims=NDIMS,
                               outrange_thresh=0.2, outrange_weight=1e3)
    loss_reg1 = LossModule.Grad(penalty=['l1', 'negdetj', 'range'], ndims=NDIMS,
                                outrange_thresh=0.6, outrange_weight=1e3)
    loss_dist = LossModule.MRSE(img_sz=IMG_SIZE)
    loss_ang = LossModule.NCC(img_sz=IMG_SIZE)
    loss_imgsim = LossModule.MSLNCC()
    loss_imgmse = LossModule.LMSE()

    # Move buffer-based losses to device
    loss_imgsim.to(device)
    loss_imgmse.to(device)

    optimizer = torch.optim.Adam(ddpm.parameters(), lr=LR)

    ddpm.train()
    step_losses = []
    step_times = []

    # Warmup: 1 step (not timed). Skipped when there is nothing to run.
    if indiv_batches:
        print(f"  [{label}] Warmup step...")
        _run_one_step(0, ddpm, ddf_stn, optimizer,
                      loss_reg, loss_reg1, loss_dist, loss_ang, loss_imgsim, loss_imgmse,
                      indiv_batches[0], pair_batches[0], device, seed_offset=9999)
        # Make sure no warmup work is still in flight when the clock starts.
        _sync_device(device)

    print(f"  [{label}] Running {len(indiv_batches)} timed steps...")
    # perf_counter is monotonic and high-resolution, unlike time.time().
    total_start = time.perf_counter()

    for step, indiv_batch in enumerate(indiv_batches):
        step_start = time.perf_counter()
        losses = _run_one_step(step, ddpm, ddf_stn, optimizer,
                               loss_reg, loss_reg1, loss_dist, loss_ang, loss_imgsim, loss_imgmse,
                               indiv_batch, pair_batches[step], device, seed_offset=step)
        # Wait for the device so the measured time covers the whole step.
        _sync_device(device)
        step_time = time.perf_counter() - step_start
        step_losses.append(losses)
        step_times.append(step_time)
        print(f"  [{label}] step {step}: diff={losses['diff']:.6f} contra={losses['contra']:.6f} regist={losses['regist']:.6f} | {step_time:.2f}s")

        # Free XPU memory between steps to avoid fragmentation-induced OOM
        if device == "xpu":
            torch.xpu.empty_cache()

    total_time = time.perf_counter() - total_start

    # Cleanup: drop every local that can hold device memory (including the raw
    # network and the loss modules), collect garbage, and only then flush the
    # allocator cache so the freed blocks are actually released.
    del ddpm, network, ddf_stn, optimizer
    del loss_reg, loss_reg1, loss_dist, loss_ang, loss_imgsim, loss_imgmse
    gc.collect()
    _empty_device_cache(device)

    return step_losses, step_times, total_time


def _run_one_step(step, ddpm, ddf_stn, optimizer,
                  loss_reg, loss_reg1, loss_dist, loss_ang, loss_imgsim, loss_imgmse,
                  indiv_batch, pair_batch, device, seed_offset=0):
    """Execute one full 3-mode training step. Returns loss dict.

    The step performs up to three optimizer updates in sequence:
      1. diffusion   - always;
      2. contrastive - only when ``step % CONTRASTIVE_STEP_RATIO == 0`` and the
                       network exposes a non-None ``img_embd``;
      3. registration - always.

    Args:
        step: step index; only used to decide whether the contrastive update runs.
        ddpm, ddf_stn: model wrapper and spatial transformer built by the caller.
        optimizer: optimizer shared by all three updates.
        loss_reg, loss_reg1, loss_dist, loss_ang, loss_imgsim, loss_imgmse:
            loss callables built by the caller.
        indiv_batch: (x0, embd) tuple for the diffusion/contrastive updates.
        pair_batch: (x1, y1, embd_x, embd_y) tuple for the registration update;
            embd_x is not used.
        device: device string the tensors are moved to.
        seed_offset: added to 1000 to seed all RNGs at the start of the step.

    Returns:
        dict with float entries 'diff', 'contra' and 'regist'; 'contra' is 0.0
        when the contrastive update did not run.
    """
    import utils
    from Dataloader.dataloader_utils import thresh_img

    # Seed for reproducibility of augmentation/proc_type choices
    # NOTE: the random draws below come from `random`, `numpy` and `torch`;
    # their order determines the results, so statements must not be reordered.
    seed_all(1000 + seed_offset)

    x0, embd = indiv_batch
    x0 = x0.to(device).type(torch.float32)
    embd_dev = embd.to(device).type(torch.float32)
    # Pass the text embedding with probability TEXT_EMBED_PROB, otherwise None.
    if np.random.uniform(0, 1) < TEXT_EMBED_PROB:
        embd_in = embd_dev
    else:
        embd_in = None

    n = x0.size()[0]
    # Mask over the spatial dimensions of x0, produced by a project helper.
    blind_mask = utils.get_random_deformed_mask(x0.shape[2:], apply_possibility=0.6).to(device)

    # Spatial augmentation (3D only): either random resampling or a random
    # permutation of the last three dimensions.
    if NDIMS > 2:
        if np.random.uniform(0, 1) < AUG_RESAMPLE_PROB:
            x0 = utils.random_resample(x0, deform_scale=0)
        else:
            [x0] = utils.random_permute([x0], select_dims=[-1, -2, -3])
    # Intensity augmentation: optional thresholding, then a random scale
    # drawn around 1 and a random shift drawn around 0.
    if NOISE_SCALE > 0:
        if np.random.uniform(0, 1) < AUG_RESAMPLE_PROB:
            x0 = thresh_img(x0, [0, 2 * NOISE_SCALE])
        x0 = x0 * (np.random.normal(1, NOISE_SCALE)) + np.random.normal(0, NOISE_SCALE)

    # One timestep per sample, drawn first and then moved to the device.
    t = torch.randint(0, TIMESTEPS, (n,)).to(device)
    # 'uncon' is listed three times, so it is picked 3 times out of 8.
    proc_type = random.choice(['adding', 'downsample', 'slice', 'slice1', 'none', 'uncon', 'uncon', 'uncon'])
    cond_img, _, cond_ratio = ddpm.proc_cond_img(x0, proc_type=proc_type)

    # --- Diffusion ---
    pre_dvf_I, dvf_I = ddpm(img_org=x0, t=t, cond_imgs=cond_img, mask=blind_mask, proc_type=[], text=embd_in)

    loss_ddf = loss_reg(pre_dvf_I, img=x0)
    trm_pred = ddf_stn(pre_dvf_I, dvf_I)
    loss_gen_d = loss_dist(pred=trm_pred, inv_lab=dvf_I, ddf_stn=None, mask=blind_mask)
    loss_gen_a = loss_ang(pred=trm_pred, inv_lab=dvf_I, ddf_stn=None, mask=blind_mask)

    # Weighted sum of the three terms, scaled by sqrt(1 + MSK_EPS - cond_ratio).
    loss_tot = LOSS_WEIGHTS_DIFF[0] * loss_gen_a + LOSS_WEIGHTS_DIFF[1] * loss_gen_d + LOSS_WEIGHTS_DIFF[2] * loss_ddf
    loss_tot = torch.sqrt(1. + MSK_EPS - cond_ratio) * loss_tot

    optimizer.zero_grad()
    loss_tot.backward()
    optimizer.step()

    diff_val = loss_tot.item()

    # --- Contrastive ---
    contra_val = 0.0
    if step % CONTRASTIVE_STEP_RATIO == 0:
        raw_network = ddpm.network
        t_contra = torch.randint(0, TIMESTEPS, (n,)).to(device)
        # The output is discarded.
        # NOTE(review): presumably this forward pass is what populates
        # raw_network.img_embd - confirm in the network implementation.
        _ = raw_network(x=(x0 * blind_mask).detach(), y=cond_img.detach(), t=t_contra, text=None)
        if hasattr(raw_network, 'img_embd') and raw_network.img_embd is not None:
            img_embd = raw_network.img_embd
            # 1 - mean cosine similarity against the text embedding; embd_dev is
            # used here even when embd_in was set to None above.
            loss_contra = LOSS_WEIGHT_CONTRASTIVE * (1 - F.cosine_similarity(img_embd, embd_dev, dim=-1).mean())
            optimizer.zero_grad()
            loss_contra.backward()
            torch.nn.utils.clip_grad_norm_(ddpm.parameters(), max_norm=0.05)
            optimizer.step()
            contra_val = loss_contra.item()

    # --- Registration ---
    x1, y1, _, embd_y = pair_batch
    # Pass the target text embedding with probability TEXT_EMBED_PROB, otherwise None.
    if np.random.uniform(0, 1) < TEXT_EMBED_PROB:
        embd_y = embd_y.to(device).type(torch.float32)
    else:
        embd_y = None
    x1 = x1.to(device).type(torch.float32)
    y1 = y1.to(device).type(torch.float32)
    [x1, y1] = utils.random_permute([x1, y1], select_dims=[-1, -2, -3])
    if NOISE_SCALE > 0:
        # Thresholding is unconditional here; the same random scale and shift
        # are applied to both images of the pair.
        [x1, y1] = thresh_img([x1, y1], [0, 2 * NOISE_SCALE])
        rs = np.random.normal(1, NOISE_SCALE)
        rsh = np.random.normal(0, NOISE_SCALE)
        x1 = x1 * rs + rsh
        y1 = y1 * rs + rsh

    # Choose up to 16 distinct timesteps from [int(TIMESTEPS * scale_regist), TIMESTEPS),
    # sorted from largest to smallest.
    scale_regist = np.random.uniform(0.0, 0.7)
    select_timestep = 16  # fixed for both
    t_pool = list(range(int(TIMESTEPS * scale_regist), TIMESTEPS))
    select_timestep = min(select_timestep, len(t_pool))
    T_regist = sorted(random.sample(t_pool, select_timestep), reverse=True)
    # Repeat each timestep once per paired sample.
    # NOTE(review): the divisor 2 is hard-coded; it equals DIFF_REG_BATCH_RATIO's
    # current value, which is what sizes the paired batches - confirm they are
    # meant to stay in sync.
    T_regist = [[t_val for _ in range(max(1, BATCHSIZE // 2))] for t_val in T_regist]

    # 'none' is listed twice, so it is picked 2 times out of 5.
    proc_type_r = random.choice(['downsample', 'slice', 'slice1', 'none', 'none'])
    y1_proc, msk_tgt, cond_ratio_r = ddpm.proc_cond_img(y1, proc_type=proc_type_r)
    msk_tgt = msk_tgt + MSK_EPS

    [ddf_comp, _], [img_rec, _, _], _ = ddpm(img_org=x1, cond_imgs=y1_proc, T=[None, T_regist], proc_type=[], text=embd_y)
    # Both image losses are weighted by the target mask, restricted to voxels
    # where y1 > 0.01 (similarity) or y1 >= 0.0 (MSE).
    loss_sim = loss_imgsim(img_rec, y1, label=msk_tgt * (y1 > 0.01))
    loss_mse = loss_imgmse(img_rec, y1, label=msk_tgt * (y1 >= 0.0))
    loss_ddf1 = loss_reg1(ddf_comp, img=y1)

    # Weighted sum of the three terms, scaled by sqrt(cond_ratio_r + MSK_EPS).
    loss_regist = LOSS_WEIGHTS_REGIST[0] * loss_sim + LOSS_WEIGHTS_REGIST[1] * loss_mse + LOSS_WEIGHTS_REGIST[2] * loss_ddf1
    loss_regist = torch.sqrt(cond_ratio_r + MSK_EPS) * loss_regist
    optimizer.zero_grad()
    loss_regist.backward()
    torch.nn.utils.clip_grad_norm_(ddpm.parameters(), max_norm=0.2)
    optimizer.step()

    regist_val = loss_regist.item()

    return {'diff': diff_val, 'contra': contra_val, 'regist': regist_val}


# ========================== Save / Load Results ==========================

def save_results(label, step_losses, step_times, total_time, results_dir):
    """Write one pipeline's results to ``<results_dir>/compare_<label>.json``.

    The run configuration (device, image size, batch size, step count) is read
    from the module-level settings and stored next to the measurements. The
    directory is created if needed. Returns the path of the written file.
    """
    payload = dict(
        label=label,
        device=DEVICE,
        img_size=IMG_SIZE,
        batchsize=BATCHSIZE,
        num_steps=NUM_STEPS,
        step_losses=step_losses,
        step_times=step_times,
        total_time=total_time,
    )

    os.makedirs(results_dir, exist_ok=True)
    out_path = os.path.join(results_dir, f"compare_{label}.json")
    with open(out_path, 'w') as handle:
        json.dump(payload, handle, indent=2)

    print(f"\nResults saved to {out_path}")
    return out_path


def load_results(label, results_dir):
    """Read ``<results_dir>/compare_<label>.json`` and return the parsed data.

    If the file does not exist, an error and a hint are printed and the
    process exits with status 1.
    """
    json_path = os.path.join(results_dir, f"compare_{label}.json")

    if os.path.exists(json_path):
        with open(json_path) as handle:
            return json.load(handle)

    print(f"ERROR: Results file not found: {json_path}")
    print(f"Run with --pipeline {label} first.")
    sys.exit(1)


# ========================== Compare ==========================

def print_comparison(orig_data, opt_data):
    """Print loss and timing comparison table.

    Args:
        orig_data: result dict of the original pipeline, as written by
            save_results (keys: device, img_size, batchsize, num_steps,
            step_losses, step_times, total_time).
        opt_data: result dict of the optimized pipeline, same layout.

    Only the first ``min(len(orig steps), len(opt steps))`` steps are compared.
    Two losses count as matching when they differ by less than 1e-4. Warnings
    are printed when the two runs were produced with different settings or
    step counts, and every speedup ratio reports ``inf`` instead of dividing
    by a non-positive time.
    """
    rule = "=" * 70
    tolerance = 1e-4
    loss_keys = ('diff', 'contra', 'regist')

    def _speedup(orig_time, opt_time):
        # Guard every ratio the same way so a zero time cannot raise.
        return orig_time / opt_time if opt_time > 0 else float('inf')

    print("\n" + rule)
    print("LOSS COMPARISON (Original vs Optimized)")
    print(f"Device={orig_data['device']}, IMG_SIZE={orig_data['img_size']}, "
          f"BATCHSIZE={orig_data['batchsize']}, STEPS={orig_data['num_steps']}")
    print(rule)

    # The header shows the original run's settings only; flag any setting the
    # optimized run does not share, since such a comparison is not like-for-like.
    for key in ('device', 'img_size', 'batchsize', 'num_steps'):
        if orig_data.get(key) != opt_data.get(key):
            print(f"WARNING: '{key}' differs between runs "
                  f"(orig={orig_data.get(key)}, opt={opt_data.get(key)})")

    orig_losses = orig_data['step_losses']
    opt_losses = opt_data['step_losses']
    n = min(len(orig_losses), len(opt_losses))
    if len(orig_losses) != len(opt_losses):
        print(f"WARNING: step counts differ (orig={len(orig_losses)}, "
              f"opt={len(opt_losses)}); comparing the first {n} steps only")
    if n == 0:
        # Nothing to average or match; avoid NaN averages and a vacuous "YES".
        print("No steps to compare.")
        return

    print(f"{'Step':>4}  {'Diff_Orig':>12} {'Diff_Opt':>12} {'Match':>6}  "
          f"{'Contra_Orig':>12} {'Contra_Opt':>12} {'Match':>6}  "
          f"{'Regist_Orig':>12} {'Regist_Opt':>12} {'Match':>6}")

    all_match = True
    for i, (o, p) in enumerate(zip(orig_losses, opt_losses)):
        matches = [abs(o[key] - p[key]) < tolerance for key in loss_keys]
        all_match = all_match and all(matches)
        dm, cm, rm = ("YES" if ok else "NO" for ok in matches)
        print(f"{i:>4}  {o['diff']:>12.6f} {p['diff']:>12.6f} {dm:>6}  "
              f"{o['contra']:>12.6f} {p['contra']:>12.6f} {cm:>6}  "
              f"{o['regist']:>12.6f} {p['regist']:>12.6f} {rm:>6}")

    print("\n" + rule)
    print("TIMING COMPARISON")
    print(rule)
    print(f"{'Step':>4}  {'Orig (s)':>10} {'Opt (s)':>10} {'Speedup':>10}")
    orig_times = orig_data['step_times'][:n]
    opt_times = opt_data['step_times'][:n]
    for i, (ot, pt) in enumerate(zip(orig_times, opt_times)):
        print(f"{i:>4}  {ot:>10.2f} {pt:>10.2f} {_speedup(ot, pt):>9.2f}x")

    avg_orig = np.mean(orig_times)
    avg_opt = np.mean(opt_times)
    avg_speedup = _speedup(avg_orig, avg_opt)
    total_speedup = _speedup(orig_data['total_time'], opt_data['total_time'])

    print(f"\n{'Avg':>4}  {avg_orig:>10.2f} {avg_opt:>10.2f} {avg_speedup:>9.2f}x")
    print(f"Total: ORIG={orig_data['total_time']:.2f}s  OPT={opt_data['total_time']:.2f}s  "
          f"Speedup={total_speedup:.2f}x")

    print("\n" + rule)
    print(f"Losses identical: {'YES' if all_match else 'NO'}")
    print(f"Average speedup:  {avg_speedup:.2f}x")
    print(rule)


# ========================== Main ==========================

# Script entry point: either print a comparison from previously saved JSONs
# ("compare"), or run the requested pipeline(s), save each result to JSON and,
# for "both", print the comparison at the end.
if __name__ == "__main__":
    if args.pipeline == "compare":
        # Just load and compare saved results
        orig_data = load_results("orig", RESULTS_DIR)
        opt_data = load_results("opt", RESULTS_DIR)
        print_comparison(orig_data, opt_data)
        sys.exit(0)

    # Detect device
    # The module-level DEVICE is replaced by the device actually used, so the
    # value stored by save_results reflects any fallback to CPU.
    DEVICE = detect_device(DEVICE)

    print("=" * 70)
    print(f"3-Mode Training: Speed Comparison (pipeline={args.pipeline})")
    print(f"Device={DEVICE}, IMG_SIZE={IMG_SIZE}, BATCHSIZE={BATCHSIZE}, STEPS={NUM_STEPS}")
    print("=" * 70)

    print("\nPre-generating dummy data...")
    indiv_batches, pair_batches = generate_dummy_data(NUM_STEPS, BATCHSIZE, IMG_SIZE)

    if args.pipeline in ("orig", "both"):
        # Project modules are imported lazily so that only the implementation
        # being run needs to be importable.
        from Diffusion.diffuser import DeformDDPM as OrigDeformDDPM
        import Diffusion.losses as orig_losses_mod

        print("\n" + "-" * 70)
        print("Running ORIGINAL pipeline (OM_train_3modes.py logic)")
        print("-" * 70)
        orig_losses_list, orig_times, orig_total = run_pipeline(
            OrigDeformDDPM, orig_losses_mod, indiv_batches, pair_batches, DEVICE, "ORIG",
            use_opt_net=False)
        save_results("orig", orig_losses_list, orig_times, orig_total, RESULTS_DIR)

    if args.pipeline in ("opt", "both"):
        from Diffusion.diffuser_opt import DeformDDPM as OptDeformDDPM
        import Diffusion.losses_opt as opt_losses_mod

        # When running 'both', rebuild the batches so the optimized run starts
        # from freshly generated tensors. generate_dummy_data reseeds with a
        # fixed seed, so the regenerated values are the same as the first set.
        if args.pipeline == "both":
            indiv_batches, pair_batches = generate_dummy_data(NUM_STEPS, BATCHSIZE, IMG_SIZE)

        print("\n" + "-" * 70)
        print("Running OPTIMIZED pipeline (OM_train_3modes_opt.py logic)")
        print("-" * 70)
        opt_losses_list, opt_times, opt_total = run_pipeline(
            OptDeformDDPM, opt_losses_mod, indiv_batches, pair_batches, DEVICE, "OPT",
            use_opt_net=True)
        save_results("opt", opt_losses_list, opt_times, opt_total, RESULTS_DIR)

    if args.pipeline == "both":
        # Compare from the JSON files just written, exactly as "compare" would.
        orig_data = load_results("orig", RESULTS_DIR)
        opt_data = load_results("opt", RESULTS_DIR)
        print_comparison(orig_data, opt_data)