"""DAgger online training loop.

Orchestrates the full DAgger pipeline: collect data via model + oracle,
train on buffer, evaluate periodically, and checkpoint.
"""

from __future__ import annotations

import logging
import random
import time
from pathlib import Path
from types import SimpleNamespace

import numpy as np
import torch
import torch.nn as nn
import yaml

from src.buffer import ReplayBuffer
from src.config import make_run_dir
from src.diffusion.forward import q_sample
from src.diffusion.loss import auxiliary_goal_loss, mdlm_loss
from src.diffusion.schedules import get_schedule
from src.models.denoiser import ModelEMA, make_model, try_compile
from src.planners.collect import DataCollector
from src.planners.inference import Evaluator, save_eval_json
from src.planners.logging import (
    Logger, gpu_memory_mb, reset_gpu_memory_stats,
    compute_param_norm, compute_param_drift,
)
from src.curriculum import DynamicCurriculum
from src.envs.minihack_env import collect_oracle_trajectory

logger = logging.getLogger(__name__)


class Trainer:
    """Full DAgger training loop.

    Args:
        model: Denoising model.
        ema_model: EMA tracker.
        optimizer: Torch optimizer.
        scheduler: Optional LR scheduler.
        buffer: Replay buffer.
        collector: DAgger data collector.
        evaluator: Evaluation runner.
        log: Centralised logger.
        cfg: Config namespace.
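        device: Torch device.
        raw_model: Uncompiled model used for EMA updates, eval copies and
            checkpoint state; defaults to ``model`` when omitted.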
    """

    def __init__(
        self,
        model: nn.Module,
        ema_model: ModelEMA,
        optimizer: torch.optim.Optimizer,
        scheduler: torch.optim.lr_scheduler.LRScheduler | None,
        buffer: ReplayBuffer,
        collector: DataCollector,
        evaluator: Evaluator,
        log: Logger,
        cfg: SimpleNamespace,
        device: torch.device | str,
        raw_model: nn.Module | None = None,
    ) -> None:
        self.model = model
        # raw_model is the uncompiled model used for eval deep-copies.
        # When torch.compile is off, raw_model is the same as model.
        self._raw_model = raw_model if raw_model is not None else model
        self.ema_model = ema_model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.buffer = buffer
        self.collector = collector
        self.evaluator = evaluator
        self.log = log
        self.cfg = cfg
        self.device = device
        self._schedule_fn = get_schedule(cfg.noise_schedule)
        # Snapshot of initial weights for param drift tracking
        self._init_state = {
            k: v.clone() for k, v in self._raw_model.state_dict().items()
            if v.is_floating_point()
        }
        # AMP scaler: enabled only when use_amp=true and on CUDA
        self._use_amp = (
            getattr(cfg, "use_amp", False) and str(device).startswith("cuda")
        )
        self._scaler = torch.amp.GradScaler("cuda", enabled=self._use_amp)

    # ── Main loop ────────────────────────────────────────────────

    def train(
        self, start_iter: int = 0, start_env_steps: int = 0,
    ) -> None:
        """Run the DAgger training loop.

        The budget is ``cfg.total_timesteps`` — total env.step() calls
        across model + oracle rollouts. Iteration count is derived; it
        depends on how many env steps each iteration consumes (which in
        turn depends on episode length and efficiency filter outcomes).

        Args:
            start_iter: Iteration index to resume from (for logging).
            start_env_steps: Cumulative env steps already consumed.
        """
        cfg = self.cfg
        env_steps_total = start_env_steps
        iteration = start_iter
        last_id_eval_step = start_env_steps
        last_ood_eval_step = start_env_steps
        last_ckpt_step = start_env_steps

        while env_steps_total < cfg.total_timesteps:
            reset_gpu_memory_stats()
            iter_start = time.perf_counter()

            # 1. Collect N episodes per iteration
            n_eps = getattr(cfg, "episodes_per_iteration", 1)
            num_workers = getattr(cfg, "num_collection_workers", 0)
            model_wins = 0
            added_total = 0
            # Accumulators across all n_eps episodes — must be summed,
            # NOT taken from a single (last) episode, otherwise the
            # unified env-step budget undercounts by ~n_eps×.
            model_steps_iter = 0
            oracle_steps_iter = 0
            last_env_id: str = ""

            collect_start = time.perf_counter()
            use_gpu_batch = (
                str(self.device).startswith("cuda") and n_eps > 1
            )
            if use_gpu_batch:
                # GPU-batched collection (all envs in lockstep)
                batch_stats = self.collector.collect_batch_gpu(n_eps)
                for s in batch_stats:
                    model_wins += int(s["model_won"])
                    added_total += int(s["added_to_buffer"])
                    model_steps_iter += int(s["model_steps"])
                    oracle_steps_iter += int(s["oracle_steps"])
                    last_env_id = s.get("env_id", last_env_id)
            elif num_workers > 0 and n_eps > 1:
                # Threaded CPU collection (fallback)
                batch_stats = self.collector.collect_batch_parallel(
                    n_eps,
                )
                for s in batch_stats:
                    model_wins += int(s["model_won"])
                    added_total += int(s["added_to_buffer"])
                    model_steps_iter += int(s["model_steps"])
                    oracle_steps_iter += int(s["oracle_steps"])
                    last_env_id = s.get("env_id", last_env_id)
            else:
                # Sequential collection (reference behaviour)
                for _ in range(n_eps):
                    s = self.collector.collect_one_iteration()
                    model_wins += int(s["model_won"])
                    added_total += int(s["added_to_buffer"])
                    model_steps_iter += int(s["model_steps"])
                    oracle_steps_iter += int(s["oracle_steps"])
                    last_env_id = s.get("env_id", last_env_id)
            collect_time = time.perf_counter() - collect_start

            collect_stats = {
                "env_id": last_env_id,
                "model_won": model_wins,
                "added_to_buffer": added_total,
                "model_steps": model_steps_iter,
                "oracle_steps": oracle_steps_iter,
            }

            # Advance the unified env-step budget. Both model and oracle
            # rollouts consume real env.step() calls (the oracle rollout
            # runs in its own env instance in collect_oracle_trajectory),
            # so both contribute to the budget.
            iter_env_steps = model_steps_iter + oracle_steps_iter
            env_steps_total += iter_env_steps

            # 2. Gradient steps (EMA updated after each step)
            self.model.train()
            step_metrics: list[dict[str, float]] = []
            train_start = time.perf_counter()
            for _ in range(cfg.grad_steps_per_iteration):
                m = self._train_step()
                step_metrics.append(m)
                self.ema_model.update(self._raw_model)
            train_time = time.perf_counter() - train_start

            iter_time = time.perf_counter() - iter_start

            # 4. Log
            n_steps = len(step_metrics) or 1
            avg_loss = sum(m["loss"] for m in step_metrics) / n_steps
            avg_loss_diff = sum(m["loss_diff"] for m in step_metrics) / n_steps
            avg_loss_aux = sum(m["loss_aux"] for m in step_metrics) / n_steps
            avg_grad_norm = sum(m["grad_norm"] for m in step_metrics) / n_steps
            current_lr = (
                self.scheduler.get_last_lr()[0]
                if self.scheduler is not None
                else self.cfg.dagger_lr
            )

            # Global gate value (how open is the global stream)
            gate_val = None
            if hasattr(self._raw_model, "global_gate"):
                gate_val = torch.sigmoid(
                    self._raw_model.global_gate
                ).item()

            # Buffer online fraction
            buf_total = len(self.buffer)
            buf_online_frac = (
                (buf_total - self.buffer.offline_size) / max(buf_total, 1)
                if hasattr(self.buffer, "offline_size")
                else 0.0
            )

            # Samples per second
            total_samples = n_steps * cfg.dagger_batch_size
            samples_per_sec = total_samples / max(train_time, 1e-6)

            # Env steps per second (uses the iter-summed total, not a
            # single episode — same bug class as the env-step budget).
            env_steps_per_sec = iter_env_steps / max(collect_time, 1e-6)

            metrics = {
                "diffusion/loss": avg_loss,
                "diffusion/loss_diff": avg_loss_diff,
                "diffusion/loss_aux": avg_loss_aux,
                "train/buffer_size": buf_total,
                "train/buffer_online_frac": buf_online_frac,
                "train/model_won": int(collect_stats["model_won"]),
                "train/added_to_buffer": int(
                    collect_stats["added_to_buffer"]
                ),
                "train/episodes_collected": n_eps,
                "train/model_steps": collect_stats["model_steps"],
                "train/oracle_steps": collect_stats["oracle_steps"],
                "train/efficiency_ratio": (
                    collect_stats["model_steps"]
                    / max(collect_stats["oracle_steps"], 1)
                ),
                "train/lr": current_lr,
                "train/grad_norm": avg_grad_norm,
                "train/env_steps": env_steps_total,
                "train/progress": env_steps_total / cfg.total_timesteps,
                "speed/iter_time_sec": iter_time,
                "speed/collect_time_sec": collect_time,
                "speed/train_step_time_sec": train_time,
                "speed/samples_per_sec": samples_per_sec,
                "speed/env_steps_per_sec": env_steps_per_sec,
                "speed/gpu_memory_mb": gpu_memory_mb(),
                # Keep old perf/ keys for backward compat
                "perf/iter_time_s": iter_time,
                "perf/collect_time_s": collect_time,
                "perf/train_time_s": train_time,
                "perf/grad_steps_per_sec": (
                    cfg.grad_steps_per_iteration / max(train_time, 1e-6)
                ),
            }
            if gate_val is not None:
                metrics["train/global_gate"] = gate_val
                metrics["model/ema_gate_value"] = gate_val

            # Model health (every 10 iters to avoid overhead)
            if iteration % 10 == 0:
                metrics["model/param_norm"] = compute_param_norm(
                    self._raw_model
                )
                metrics["model/param_drift_from_init"] = compute_param_drift(
                    self._raw_model, self._init_state
                )

            # Profile breakdown from GPU-batched collection
            _profile = getattr(self.collector, "_last_profile", {})
            for _pk, _pv in _profile.items():
                metrics[f"profile/{_pk}"] = _pv

            self.log.log(metrics, step=iteration)

            # 5. ID eval — triggered when env-step delta crosses threshold
            if (
                cfg.id_eval_every_timesteps > 0
                and env_steps_total - last_id_eval_step
                >= cfg.id_eval_every_timesteps
            ):
                eval_model = self.ema_model.make_eval_model(self._raw_model)
                results = self.evaluator.evaluate(
                    cfg.id_envs,
                    eval_model,
                    cfg.eval_episodes_per_env,
                    cfg,
                    self.device,
                )
                self.log.log_eval(results, step=iteration, prefix="eval_id")
                mean_id_wr = float(np.mean(
                    [s["win_rate"] for s in results.values()]
                )) if results else 0.0
                self.log.log(
                    {
                        "eval_id/mean_win_rate": mean_id_wr,
                        **{
                            f"curriculum/{env_id}/win_rate":
                                self.collector.curriculum.win_rate(env_id)
                            for env_id in self.cfg.id_envs
                        },
                    },
                    step=iteration,
                )
                last_id_eval_step = env_steps_total

            # 6. OOD eval — env-step-triggered
            if (
                cfg.ood_eval_every_timesteps > 0
                and env_steps_total - last_ood_eval_step
                >= cfg.ood_eval_every_timesteps
            ):
                eval_model = self.ema_model.make_eval_model(self._raw_model)
                results = self.evaluator.evaluate(
                    cfg.ood_envs,
                    eval_model,
                    cfg.eval_episodes_per_env,
                    cfg,
                    self.device,
                )
                self.log.log_eval(results, step=iteration, prefix="eval_ood")
                mean_ood_wr = float(np.mean(
                    [s["win_rate"] for s in results.values()]
                )) if results else 0.0
                self.log.log(
                    {"eval_ood/mean_win_rate": mean_ood_wr}, step=iteration,
                )
                last_ood_eval_step = env_steps_total

            # 7. Checkpoint — env-step-triggered
            if (
                cfg.checkpoint_every_timesteps > 0
                and env_steps_total - last_ckpt_step
                >= cfg.checkpoint_every_timesteps
            ):
                self.save_checkpoint(iteration, env_steps_total)
                last_ckpt_step = env_steps_total

            iteration += 1

        # Final checkpoint
        if cfg.save_policy:
            self.save_checkpoint(iteration, env_steps_total)

    # ── Single gradient step ─────────────────────────────────────

    def _train_step(self) -> dict[str, float]:
        """One gradient step on a buffer sample.

        Uses AMP (mixed precision) when ``cfg.use_amp`` is ``True``
        and training on CUDA.

        Returns:
            Dict with ``"loss"``, ``"loss_diff"``, ``"loss_aux"``,
            and ``"grad_norm"`` scalars.
        """
        cfg = self.cfg
        batch = self.buffer.sample(cfg.dagger_batch_size)
        if batch is None:
            return {"loss": 0.0, "loss_diff": 0.0,
                    "loss_aux": 0.0, "grad_norm": 0.0}
        local_np, global_np, actions_np = batch
        local_t = torch.from_numpy(local_np).long().to(self.device)
        global_t = torch.from_numpy(global_np).long().to(self.device)
        actions_t = torch.from_numpy(actions_np).long().to(self.device)

        B = actions_t.shape[0]
        t = torch.rand(B, device=self.device).clamp(1e-5, 1.0 - 1e-5)

        zt = q_sample(
            actions_t, t, cfg.mask_token, cfg.pad_token,
            self._schedule_fn,
        )
        t_discrete = (t * cfg.num_diffusion_steps).long().clamp(
            0, cfg.num_diffusion_steps - 1,
        )

        self.optimizer.zero_grad()
        with torch.amp.autocast("cuda", enabled=self._use_amp):
            out = self.model(local_t, global_t, zt, t_discrete)

            loss_diff = mdlm_loss(
                out["actions"], actions_t, zt, t,
                cfg.mask_token, cfg.pad_token, self._schedule_fn,
                weight_clip=cfg.loss_weight_clip,
                label_smoothing=cfg.label_smoothing,
                use_importance_weighting=cfg.use_importance_weighting,
            )

            loss_aux = torch.tensor(0.0, device=self.device)
            if "goal_pred" in out:
                loss_aux = auxiliary_goal_loss(out["goal_pred"], global_t)

            loss = loss_diff + cfg.aux_loss_weight * loss_aux

        self._scaler.scale(loss).backward()
        self._scaler.unscale_(self.optimizer)
        grad_norm = nn.utils.clip_grad_norm_(
            self.model.parameters(), cfg.dagger_grad_clip,
        )
        self._scaler.step(self.optimizer)
        self._scaler.update()
        if self.scheduler is not None:
            self.scheduler.step()

        return {
            "loss": loss.item(),
            "loss_diff": loss_diff.item(),
            "loss_aux": loss_aux.item(),
            "grad_norm": grad_norm.item(),
        }

    # ── Checkpointing ────────────────────────────────────────────

    def save_checkpoint(
        self, iteration: int, env_steps: int,
    ) -> None:
        """Save a training checkpoint.

        Args:
            iteration: Current iteration number (for filename + metadata).
            env_steps: Cumulative env.step() count consumed so far.
        """
        ckpt_dir = Path(self.cfg.checkpoint_dir)
        ckpt_dir.mkdir(parents=True, exist_ok=True)
        path = ckpt_dir / f"iter{iteration}.pth"

        # Capture W&B run ID for seamless resumption
        wandb_run_id: str | None = None
        if self.log._use_wandb and self.log._run is not None:
            wandb_run_id = self.log._run.id

        state = {
            "model_state_dict": self._raw_model.state_dict(),
            "ema_state_dict": self.ema_model.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            "scheduler_state_dict": (
                self.scheduler.state_dict()
                if self.scheduler is not None
                else None
            ),
            "curriculum_state": self.collector.curriculum.state_dict(),
            "iteration": iteration,
            "env_steps": env_steps,
            "wandb_run_id": wandb_run_id,
            "rng_states": {
                "torch": torch.get_rng_state(),
                "numpy": np.random.get_state(),
                "python": random.getstate(),
            },
        }

        try:
            torch.save(state, path)
            logger.info(f"Checkpoint saved: {path}")
        except Exception:
            logger.error(
                f"Failed to save checkpoint to {path}", exc_info=True,
            )

        # Save config snapshot alongside checkpoint
        config_path = ckpt_dir / f"config_iter{iteration}.yaml"
        try:
            cfg_dict = {
                k: v for k, v in vars(self.cfg).items()
                if not k.startswith("_")
            }
            with open(config_path, "w") as f:
                yaml.dump(cfg_dict, f, default_flow_style=False)
        except Exception:
            logger.error("Failed to save config snapshot", exc_info=True)
            config_path = None

        # Run eval at checkpoint and save JSON
        try:
            eval_model = self.ema_model.make_eval_model(self._raw_model)
            id_results = self.evaluator.evaluate(
                self.cfg.id_envs, eval_model,
                self.cfg.checkpoint_eval_episodes,
                self.cfg, self.device,
            )
            ood_results = self.evaluator.evaluate(
                self.cfg.ood_envs, eval_model,
                self.cfg.checkpoint_eval_episodes,
                self.cfg, self.device,
            )

            id_winrate = float(np.mean(
                [s["win_rate"] for s in id_results.values()]
            )) if id_results else 0.0
            ood_winrate = float(np.mean(
                [s["win_rate"] for s in ood_results.values()]
            )) if ood_results else 0.0
            current_lr = (
                self.scheduler.get_last_lr()[0]
                if self.scheduler is not None
                else self.cfg.dagger_lr
            )
            training_meta = {
                "iteration": iteration,
                "env_steps": env_steps,
                "total_timesteps": self.cfg.total_timesteps,
                "lr": current_lr,
                "dagger_batch_size": self.cfg.dagger_batch_size,
                "aux_loss_weight": self.cfg.aux_loss_weight,
                "buffer_size": len(self.buffer),
                "buffer_capacity": self.cfg.buffer_capacity,
                "ema_decay": self.cfg.ema_decay,
                "grad_steps_per_iteration": self.cfg.grad_steps_per_iteration,
                "episodes_per_iteration": getattr(
                    self.cfg, "episodes_per_iteration", 1
                ),
                "id_winrate": id_winrate,
                "ood_winrate": ood_winrate,
                "per_env_id": {
                    env_id: {
                        "win_rate": s["win_rate"],
                        "wins": s.get("wins", 0),
                        "avg_reward": s["avg_reward"],
                        "avg_steps": s["avg_steps"],
                        "n_episodes": s["n_episodes"],
                    }
                    for env_id, s in id_results.items()
                },
                "per_env_ood": {
                    env_id: {
                        "win_rate": s["win_rate"],
                        "wins": s.get("wins", 0),
                        "avg_reward": s["avg_reward"],
                        "avg_steps": s["avg_steps"],
                        "n_episodes": s["n_episodes"],
                    }
                    for env_id, s in ood_results.items()
                },
            }

            json_path = ckpt_dir / f"eval_iter{iteration}.json"
            save_eval_json(
                {"id": id_results, "ood": ood_results},
                str(json_path),
                metadata=training_meta,
            )

            # W&B checkpoint log — per-env step metrics + aggregates
            self.log.log_eval(
                id_results, step=iteration, prefix="ckpt_eval_id",
            )
            self.log.log_eval(
                ood_results, step=iteration, prefix="ckpt_eval_ood",
            )
            self.log.log(
                {
                    "ckpt_eval/id_winrate": id_winrate,
                    "ckpt_eval/ood_winrate": ood_winrate,
                },
                step=iteration,
            )
            self.log.log_summary({
                f"ckpt_{iteration}/id_winrate": id_winrate,
                f"ckpt_{iteration}/ood_winrate": ood_winrate,
            })
        except Exception:
            logger.error("Checkpoint eval failed", exc_info=True)

        # HuggingFace Hub upload (no-op if HF_TOKEN or hub_run_id not set)
        try:
            from scripts.hf_upload import maybe_upload_checkpoint
            maybe_upload_checkpoint(
                str(ckpt_dir),
                getattr(self.cfg, "hub_run_id", None),
                getattr(self.cfg, "hub_repo_id", None),
            )
        except Exception:
            logger.error("HF Hub upload failed", exc_info=True)

        # W&B artifact upload
        self.log.log_checkpoint_artifact(
            checkpoint_path=str(path),
            config_path=str(config_path) if config_path else None,
            iteration=iteration,
            metadata={
                "iteration": iteration,
                "buffer_size": len(self.buffer),
            },
        )

    def load_checkpoint(self, path: str) -> tuple[int, int]:
        """Load a training checkpoint.

        Args:
            path: Path to ``.pth`` checkpoint file.

        Returns:
            ``(start_iter, start_env_steps)`` — the iteration and
            cumulative env-step count to resume from.
        """
        ckpt = torch.load(
            path, map_location=self.device, weights_only=False,
        )
        self._raw_model.load_state_dict(ckpt["model_state_dict"])
        self.ema_model.load_state_dict(ckpt["ema_state_dict"])
        self.optimizer.load_state_dict(ckpt["optimizer_state_dict"])

        if (
            self.scheduler is not None
            and ckpt.get("scheduler_state_dict") is not None
        ):
            self.scheduler.load_state_dict(ckpt["scheduler_state_dict"])

        if "curriculum_state" in ckpt:
            self.collector.curriculum.load_state_dict(
                ckpt["curriculum_state"],
            )

        # Restore RNG states (best-effort)
        rng = ckpt.get("rng_states", {})
        try:
            if "torch" in rng:
                torch.set_rng_state(rng["torch"])
            if "numpy" in rng:
                np.random.set_state(rng["numpy"])
            if "python" in rng:
                random.setstate(rng["python"])
        except Exception:
            logger.warning(
                "RNG state restore failed; continuing with fresh state",
            )

        iteration = ckpt.get("iteration", 0)
        env_steps = ckpt.get("env_steps", 0)
        resume_from = iteration + 1
        logger.info(
            f"Resumed from checkpoint: {path} (iter {iteration}, "
            f"env_steps={env_steps}), starting at iter {resume_from}"
        )
        return resume_from, env_steps


def run_dagger(
    cfg: SimpleNamespace,
    checkpoint_path: str | None,
    no_warm_start: bool,
) -> None:
    """Build every training component and run the DAgger online loop.

    Creates the run directory, model (+ EMA, optional compiled wrapper),
    optimizer, replay buffer and curriculum, seeds the buffer with oracle
    trajectories, then constructs the ``Trainer`` and runs it. The logger
    is always finished, even when training raises.

    Args:
        cfg: Config namespace.
        checkpoint_path: Checkpoint to resume from; ignored when falsy.
        no_warm_start: When true, ``checkpoint_path`` is not loaded and
            training starts from iteration 0.
    """
    make_run_dir(cfg, tag="dagger")

    device = cfg.device
    logger.info(f"DAgger training on {device}")

    raw_model = make_model(cfg).to(device)

    # EMA and eval always use the raw (uncompiled) model — deep-copying
    # a compiled model breaks FX tracing.
    ema = ModelEMA(raw_model, decay=cfg.ema_decay)

    # torch.compile: wrap for training only; shares parameters with raw_model
    model = try_compile(raw_model, cfg)

    optimizer = torch.optim.AdamW(
        raw_model.parameters(), lr=cfg.dagger_lr,
        weight_decay=cfg.weight_decay,
    )

    buffer = ReplayBuffer(cfg.buffer_capacity, cfg.seq_len, cfg.pad_token)
    curriculum = DynamicCurriculum(
        cfg.id_envs, cfg.curriculum_queue_size, cfg.curriculum_preseed,
    )

    # Seed buffer with some oracle data: a few fixed seeds per ID env,
    # spaced 100 apart per env so seeds never collide across envs.
    oracle_seeds_per_env = 3
    for env_idx, env_id in enumerate(cfg.id_envs):
        for offset in range(oracle_seeds_per_env):
            traj = collect_oracle_trajectory(
                env_id, seed=env_idx * 100 + offset, cfg=cfg,
            )
            if traj is not None:
                buffer.add(traj)
    logger.info(f"Buffer seeded with {len(buffer)} windows")

    resuming = bool(checkpoint_path) and not no_warm_start

    # If resuming, extract W&B run ID from checkpoint before Logger init
    # so the same W&B run is continued (curve continuity). An explicit
    # cfg.wandb_resume_id takes precedence over the saved one.
    if resuming and not getattr(cfg, "wandb_resume_id", None):
        # NOTE(security): weights_only=False unpickles arbitrary objects;
        # only resume from trusted checkpoints.
        ckpt_peek = torch.load(
            checkpoint_path, map_location="cpu", weights_only=False,
        )
        saved_id = ckpt_peek.get("wandb_run_id")
        if saved_id:
            cfg.wandb_resume_id = saved_id
            logger.info(
                f"W&B run ID from checkpoint: {saved_id}"
            )
        del ckpt_peek

    # DataCollector uses raw_model for eval copies (not compiled)
    collector = DataCollector(ema, raw_model, buffer, curriculum, cfg, device)
    evaluator = Evaluator()
    log = Logger(cfg)

    # From here on the logger is live: make sure it is finished even when
    # trainer construction, checkpoint loading or training raises.
    try:
        trainer = Trainer(
            model, ema, optimizer, None, buffer, collector,
            evaluator, log, cfg, device, raw_model=raw_model,
        )

        start_iter = 0
        start_env_steps = 0
        if resuming:
            start_iter, start_env_steps = trainer.load_checkpoint(
                checkpoint_path,
            )

        trainer.train(
            start_iter=start_iter, start_env_steps=start_env_steps,
        )
    finally:
        log.finish()