Upload COMP0258 demo bundle (code + diffusion/PPO checkpoints + ablation assets)

6140064 verified about 2 months ago

18.9 kB

	"""Shared gradient-step factory, validation rollout, and action diagnostics.

	Both :mod:`src.planners.offline` and :mod:`src.planners.online` use identical
	gradient update and validation logic. Centralising it here eliminates
	duplication.
	"""

	from __future__ import annotations

	from typing import Any, Callable

	import jax
	import jax.numpy as jnp
	import optax

	from src.diffusion.loss import compute_loss
	from src.diffusion.sampling import sample_plan
	from src.diffusion.schedules import ScheduleFn


	def resolve_num_updates(config: dict[str, Any], mode: str) -> None:
	"""Resolve ``NUM_UPDATES`` from env-frame-denominated config keys.

	Mutates ``config`` in place. After this call the runners can read
	``NUM_UPDATES`` (and ``OFFLINE_TOTAL_TIMESTEPS`` /
	``ONLINE_TOTAL_TIMESTEPS`` depending on mode) without worrying about
	whether the user specified the env-frame or update-count form.

	Resolution priority:

	=========== ============================================================
	Mode Priority (highest first)
	=========== ============================================================
	``offline`` ``OFFLINE_TOTAL_TIMESTEPS`` > ``OFFLINE_NUM_UPDATES``
	``online`` ``ONLINE_TOTAL_TIMESTEPS`` > ``ONLINE_NUM_UPDATES``
	=========== ============================================================

	Env-frame keys are preferred because they are invariant under
	``num_envs`` changes — the same value yields the same total environment
	experience regardless of hardware sizing, which makes cross-hardware
	fairness studies (e.g. UCL 4096-env vs QMUL 96-env) trivially fair
	without manual scaling.

	The function is idempotent: calling it twice with the same config has
	the same effect as calling it once.

	Args:
	config: Upper-cased config dict. Must contain ``NUM_STEPS`` and
	``NUM_ENVS``.
	mode: Either ``"offline"`` or ``"online"``.

	Raises:
	ValueError: If neither the env-frame nor the update-count form is
	set for the given mode, or if ``mode`` is unknown.
	"""
	frames_per_update = int(config["NUM_STEPS"]) * int(config["NUM_ENVS"])

	if mode == "offline":
	ts_key, nu_key = "OFFLINE_TOTAL_TIMESTEPS", "OFFLINE_NUM_UPDATES"
	elif mode == "online":
	ts_key, nu_key = "ONLINE_TOTAL_TIMESTEPS", "ONLINE_NUM_UPDATES"
	else:
	raise ValueError(
	f"Unknown mode: {mode!r}; expected 'offline' or 'online'."
	)

	ts = config.get(ts_key)
	nu = config.get(nu_key)
	# float() first to accept YAML scientific notation parsed as string
	# (PyYAML 1.1 only auto-coerces "3.0e+8", not "3e8" or "3.0e8").
	if ts is not None:
	num_updates = max(1, int(float(ts)) // frames_per_update)
	elif nu:
	num_updates = int(float(nu))
	else:
	raise ValueError(
	f"{mode.capitalize()} mode requires either "
	f"{ts_key.lower()!r} (env frames, preferred) or "
	f"{nu_key.lower()!r} to be set."
	)

	config["NUM_UPDATES"] = num_updates
	# Re-snap so downstream consumers (run names, SPS, checkpoint IDs)
	# see the exact integer multiple actually trained.
	config[ts_key] = num_updates * frames_per_update


	def resolve_scaled_hyperparams(config: dict[str, Any], mode: str) -> None:
	"""Resolve env-frame-denominated hyperparameters into update-step form.

	Mutates ``config`` in place. PRIMARY (env-frame) keys override LEGACY
	(update-step) keys when set, mirroring the
	:func:`resolve_num_updates` pattern. When the PRIMARY key is ``None``
	the LEGACY value passes through unchanged, preserving full
	backward compatibility with configs that predate this resolver.

	Resolution table
	================

	+-----------------------------+------------------------+----------+
	\| PRIMARY (env-frame) \| LEGACY (update-step) \| Mode \|
	+=============================+========================+==========+
	\| ``LR_WARMUP_FRAMES`` \| ``LR_WARMUP_STEPS`` \| both \|
	+-----------------------------+------------------------+----------+
	\| ``VAL_INTERVAL_FRAMES`` \| ``VAL_INTERVAL`` \| both \|
	+-----------------------------+------------------------+----------+
	\| ``DAGGER_BETA_FINAL`` \| ``DAGGER_BETA_DECAY`` \| online \|
	+-----------------------------+------------------------+----------+
	\| ``DAGGER_BUFFER_CYCLES`` \| ``DAGGER_BUFFER_MAX`` \| online \|
	+-----------------------------+------------------------+----------+

	Why env-frame units
	-------------------
	Env-frame values are invariant under ``num_envs`` changes, so the
	same config trains the same effective experiment on any GPU. The
	update-step legacy keys had to be hand-derived per hardware tier,
	which was both error-prone and obscured the conceptual quantity
	(e.g. final beta, not per-update decay constant).

	The conversion for ``DAGGER_BETA_FINAL`` requires ``NUM_UPDATES``,
	so this function MUST be called after :func:`resolve_num_updates`
	when resolving online mode.

	Idempotent: calling this twice is equivalent to calling it once.

	Args:
	config: Upper-cased config dict. Must contain ``NUM_STEPS`` and
	``NUM_ENVS``.
	mode: Either ``"offline"`` or ``"online"``.

	Raises:
	ValueError: If ``DAGGER_BETA_FINAL`` is set in online mode but
	``NUM_UPDATES`` has not been resolved yet.
	"""
	fpu = int(config["NUM_STEPS"]) * int(config["NUM_ENVS"])

	# float() first to accept YAML scientific notation parsed as string
	# (PyYAML 1.1 only auto-coerces "3.0e+8", not "3e8" or "3.0e8").
	# ── Mode-agnostic ────────────────────────────────────────────────
	warmup_frames = config.get("LR_WARMUP_FRAMES")
	if warmup_frames is not None:
	config["LR_WARMUP_STEPS"] = int(float(warmup_frames)) // fpu

	val_frames = config.get("VAL_INTERVAL_FRAMES")
	if val_frames is not None:
	config["VAL_INTERVAL"] = max(1, int(float(val_frames)) // fpu)

	# ── Online-only ──────────────────────────────────────────────────
	if mode != "online":
	return

	beta_final = config.get("DAGGER_BETA_FINAL")
	if beta_final is not None:
	num_updates = config.get("NUM_UPDATES")
	if num_updates is None:
	raise ValueError(
	"DAGGER_BETA_FINAL requires NUM_UPDATES to be resolved "
	"first; call resolve_num_updates() before "
	"resolve_scaled_hyperparams()."
	)
	beta_init = float(config.get("DAGGER_BETA_INIT", 1.0))
	# final = init * decay^N => decay = (final / init) ** (1 / N)
	config["DAGGER_BETA_DECAY"] = (
	float(beta_final) / beta_init
	) ** (1.0 / int(num_updates))

	buffer_cycles = config.get("DAGGER_BUFFER_CYCLES")
	if buffer_cycles is not None:
	config["DAGGER_BUFFER_MAX"] = max(1, int(round(float(buffer_cycles) * fpu)))


	def print_config_snapshot(config: dict[str, Any], mode: str) -> None:
	"""Print a structured banner of training-critical hyperparameters.

	Surfaces fairness-critical, schedule, and architecture parameters at
	the start of every offline/online run so cross-hardware comparisons
	can be sanity-checked at a glance. Must be called AFTER
	:func:`resolve_num_updates` and :func:`resolve_scaled_hyperparams`
	so the printed values reflect what training will actually use.

	Args:
	config: Upper-cased config dict (post-resolver).
	mode: Either ``"offline"`` or ``"online"``.
	"""
	fpu = int(config["NUM_STEPS"]) * int(config["NUM_ENVS"])
	num_updates = int(config["NUM_UPDATES"])
	minibatch = fpu // int(config["NUM_MINIBATCHES"])
	ts_key = f"{mode.upper()}_TOTAL_TIMESTEPS"
	total_frames = int(config[ts_key])

	bar = "=" * 72
	title = f"{mode.upper()} training — config snapshot"
	print(f"\n{bar}\n {title}\n{bar}")
	print(f" env_name : {config['ENV_NAME']}")
	print(f" seed : {config['SEED']}")

	print(" -- Rollout / hardware --")
	print(f" num_envs = {config['NUM_ENVS']}")
	print(f" num_steps = {config['NUM_STEPS']}")
	print(f" fpu (envs*steps) = {fpu}")
	print(f" num_minibatches = {config['NUM_MINIBATCHES']} (minibatch={minibatch})")
	print(f" update_epochs = {config['UPDATE_EPOCHS']}")
	print(f" num_repeats = {config.get('NUM_REPEATS', 1)}")

	print(" -- Schedule --")
	print(f" {ts_key.lower():<24} = {total_frames:,} (~{total_frames/1e6:.1f}M frames)")
	print(f" {'num_updates':<24} = {num_updates:,}")
	warmup = int(config.get("LR_WARMUP_STEPS", 0))
	print(f" {'lr':<24} = {float(config['LR']):.2e}")
	print(f" {'lr_warmup_steps':<24} = {warmup} (~{warmup * fpu / 1e6:.2f}M frames)")
	print(f" {'max_grad_norm':<24} = {config.get('MAX_GRAD_NORM', 1.0)}")

	if mode == "online":
	beta_init = float(config.get("DAGGER_BETA_INIT", 1.0))
	beta_decay = float(config["DAGGER_BETA_DECAY"])
	final_beta = beta_init * beta_decay ** num_updates
	buffer_max = int(config["DAGGER_BUFFER_MAX"])
	cycles = buffer_max / fpu
	# Mirrors the n_train_passes default in run_online: drawn fresh per
	# update, capped at samples_per_update for memory.
	plan_h = int(config["PLAN_HORIZON"])
	samples_per_update = int(config["NUM_ENVS"]) * (
	int(config["NUM_STEPS"]) - plan_h + 1
	)
	n_passes = config.get("DAGGER_TRAIN_PASSES") or max(
	1, buffer_max // max(1, samples_per_update)
	)
	expert_det = bool(config.get("DAGGER_EXPERT_DETERMINISTIC", True))
	total_grad_steps = (
	num_updates * int(n_passes)
	* int(config["UPDATE_EPOCHS"]) * int(config["NUM_MINIBATCHES"])
	)
	passes_tag = "auto" if config.get("DAGGER_TRAIN_PASSES") is None else "override"
	print(" -- DAgger --")
	print(f" {'dagger_beta_init':<24} = {beta_init}")
	print(f" {'dagger_beta_decay':<24} = {beta_decay:.10f}")
	print(f" {'final beta':<24} = {final_beta:.4f} (init * decay^N)")
	print(f" {'dagger_buffer_max':<24} = {buffer_max:,} (~{cycles:.2f} update cycles)")
	print(f" {'samples_per_update':<24} = {samples_per_update:,}")
	print(f" {'dagger_train_passes':<24} = {n_passes} ({passes_tag})")
	print(f" {'dagger_expert_determ':<24} = {expert_det}")
	print(f" {'total_grad_steps':<24} = {total_grad_steps:,}")
	else:
	total_grad_steps = (
	num_updates * int(config["UPDATE_EPOCHS"]) * int(config["NUM_MINIBATCHES"])
	)
	print(f" {'total_grad_steps':<24} = {total_grad_steps:,}")

	val_int = int(config.get("VAL_INTERVAL", 0))
	print(" -- Validation --")
	print(f" val_interval = {val_int} updates (~{val_int * fpu / 1e6:.2f}M frames)")
	print(f" val_diffusion_steps = {config.get('VAL_DIFFUSION_STEPS')}")
	print(f" val_replan_every = {config.get('VAL_REPLAN_EVERY')}")
	print(f" val_steps = {config.get('VAL_STEPS')}")

	print(" -- Diffusion model --")
	print(
	f" d_model/n_heads/n_layers/d_ff = "
	f"{config['D_MODEL']}/{config['N_HEADS']}/{config['N_LAYERS']}/{config['D_FF']}"
	)
	print(f" plan_horizon = {config['PLAN_HORIZON']}")
	print(f" diffusion_steps = {config['DIFFUSION_STEPS']}")
	print(f" remask_strategy = {config.get('REMASK_STRATEGY')} eta={config.get('ETA')}")
	print(
	f" sampling: temp={config.get('TEMPERATURE')} top_p={config.get('TOP_P')} "
	f"loop={config.get('USE_LOOP')} t_on/t_off={config.get('T_ON')}/{config.get('T_OFF')}"
	)
	print(f"{bar}\n", flush=True)


	def _action_stats(
	acts: jnp.ndarray,
	num_actions: int,
	valid: jnp.ndarray,
	) -> dict[str, jnp.ndarray]:
	"""Compute action-distribution entropy and unique-action fraction over valid windows.

	Args:
	acts: ``[B, H]`` int32 action sequences.
	num_actions: Size of the real action vocabulary.
	valid: ``[B]`` bool mask; invalid samples are excluded from counts.

	Returns:
	Dict with ``action_entropy`` and ``action_unique_frac``.
	"""
	mask = jnp.broadcast_to(valid[:, None], acts.shape).reshape(-1)
	flat = jnp.where(mask, acts.reshape(-1), num_actions + 1)
	counts = jnp.bincount(flat, length=num_actions).astype(jnp.float32)
	probs = counts / jnp.maximum(counts.sum(), 1.0)
	entropy = -jnp.sum(probs * jnp.log(jnp.where(probs > 0, probs, 1.0)))
	return {
	"action_entropy": entropy,
	"action_unique_frac": jnp.sum(probs > 0).astype(jnp.float32) / num_actions,
	}


	def make_grad_step(
	apply_train: Callable,
	num_actions: int,
	schedule_fn: ScheduleFn,
	schedule_deriv_fn: ScheduleFn,
	sigma_t: float,
	label_smoothing: float,
	) -> Callable:
	"""Return a jittable gradient update function.

	Args:
	apply_train: Model apply function with dropout enabled.
	num_actions: Size of the action vocabulary.
	schedule_fn: alpha(t) noise schedule.
	schedule_deriv_fn: d(alpha)/dt analytic derivative.
	sigma_t: ReMDM remasking strength during training.
	label_smoothing: Cross-entropy label smoothing epsilon.

	Returns:
	A ``step(state, acts, obs, valid, rng, advantages) -> (state, metrics)``
	function ready for use inside ``jax.lax.scan``.
	"""

	def _loss_fn(
	params: Any,
	acts: jnp.ndarray,
	obs: jnp.ndarray,
	valid: jnp.ndarray,
	rng: jax.Array,
	advantages: jnp.ndarray,
	) -> tuple[jnp.ndarray, dict]:
	return compute_loss(
	apply_train, params, rng, acts, obs, valid,
	num_actions, schedule_fn, schedule_deriv_fn,
	sigma_t=sigma_t, label_smoothing=label_smoothing,
	advantages=advantages,
	)

	def step(
	state: Any,
	acts: jnp.ndarray,
	obs: jnp.ndarray,
	valid: jnp.ndarray,
	rng: jax.Array,
	advantages: jnp.ndarray,
	) -> tuple[Any, dict]:
	"""Single gradient update step.

	Args:
	state: Current ``TrainState``.
	acts: ``[B, H]`` int32 action sequences.
	obs: ``[B, obs_dim]`` float32 observations.
	valid: ``[B]`` bool validity mask (episode-boundary filter).
	rng: PRNG key for dropout and noise sampling.
	advantages: ``[B]`` float per-sample weights applied before loss reduction.

	Returns:
	Updated ``TrainState`` and a metrics dict.
	"""
	(_, info), grads = jax.value_and_grad(_loss_fn, has_aux=True)(
	state.params, acts, obs, valid, rng, advantages,
	)
	state = state.apply_gradients(grads=grads)
	info["grad_norm"] = optax.tree.norm(grads)
	info.update(_action_stats(acts, num_actions, valid))
	return state, info

	return step


	def make_validate(
	env: Any,
	env_params: Any,
	apply_eval: Callable,
	num_actions: int,
	plan_horizon: int,
	schedule_fn: ScheduleFn,
	config: dict[str, Any],
	val_replan_every: int,
	n_val_cycles: int,
	) -> Callable:
	"""Return a ``validate(state, rng) -> dict`` closure for periodic eval.

	The closure runs a held-out rollout using the diffusion model's current
	parameters and returns metrics under the ``val/`` namespace.

	Args:
	env: Batched Gymnax environment.
	env_params: Gymnax environment params.
	apply_eval: Model apply function (eval mode, no dropout).
	num_actions: Size of the action vocabulary.
	plan_horizon: Action plan length H.
	schedule_fn: alpha(t) noise schedule.
	config: Training config dict (read-only).
	val_replan_every: Env steps executed per diffusion plan during validation.
	n_val_cycles: Number of plan-execute cycles per validation rollout.

	Returns:
	A ``validate(state, rng) -> {str: jnp.ndarray}`` closure.
	"""

	def validate(state: Any, rng: jax.Array) -> dict[str, jnp.ndarray]:
	"""Run a validation rollout and return ``val/`` metrics.

	Args:
	state: Current ``TrainState`` (only ``.params`` is used).
	rng: PRNG key.

	Returns:
	Dict with ``val/`` prefixed metric keys.
	"""
	rng, val_rng = jax.random.split(rng)
	val_obs, val_env_state = env.reset(val_rng, env_params)

	def _val_cycle(carry, _):
	vs, vo, rng = carry
	rng, p_rng = jax.random.split(rng)
	plan = sample_plan(
	apply_eval,
	state.params,
	p_rng,
	vo,
	num_actions,
	plan_horizon,
	num_steps=config.get("VAL_DIFFUSION_STEPS", 50),
	schedule_fn=schedule_fn,
	remask_strategy=config.get("REMASK_STRATEGY", "rescale"),
	eta=config.get("ETA", 0.5),
	use_loop=config.get("USE_LOOP", True),
	t_on=config.get("T_ON", 0.7),
	t_off=config.get("T_OFF", 0.3),
	temperature=config.get("TEMPERATURE", 0.5),
	top_p=config.get("TOP_P", 0.95),
	) # [num_envs, plan_horizon]

	def _exec_step(inner_carry, step_i):
	vs_i, vo_i, r = inner_carry
	r, s_rng = jax.random.split(r)
	vo_next, vs_next, _, _, info = env.step(
	s_rng, vs_i, plan[:, step_i], env_params,
	)
	return (vs_next, vo_next, r), info

	(vs, vo, rng), step_infos = jax.lax.scan(
	_exec_step, (vs, vo, rng), jnp.arange(val_replan_every),
	)
	return (vs, vo, rng), step_infos

	_, cycle_infos = jax.lax.scan(
	_val_cycle, (val_env_state, val_obs, rng), None, n_val_cycles,
	)
	infos = jax.tree.map(
	lambda x: x.reshape(-1, *x.shape[2:]), cycle_infos,
	)
	returned = infos["returned_episode"]
	metrics = jax.tree.map(
	lambda x: (x * returned).sum() / (returned.sum() + 1e-8),
	infos,
	)
	return {f"val/{k}": v for k, v in metrics.items()}

	return validate