div18

show script

85b91e2 about 1 month ago

54.2 kB

	#!/usr/bin/env python3
	"""
	train.py — AntiAtropos QLoRA Reward-Based Training (HF Jobs Edition)
	=====================================================================

	Training loop: generate → evaluate → reward → update → log → checkpoint

	This is NOT supervised fine-tuning. The model generates actions, the OpenEnv
	environment (running on HF Spaces) evaluates them, and we use the reward
	signal to update the policy via REINFORCE/GRPO/RLOO.

	Architecture (from training.md):
	- GPU = compute only (ephemeral)
	- Hub = source of truth (persistent)
	- Training = reproducible + resumable
	- Metrics = structured + queryable

	Usage:
	# RECOMMENDED: Launch via HF Jobs (auto-provisions GPU, pushes to Hub):
	python training/launch_train.py --run-id my_run

	# Or run directly (requires a running AntiAtropos server):
	ANTIATROPOS_ENV_URL=http://localhost:8000 \
	ANTIATROPOS_HUB_MODEL_REPO=Keshav051/antiatropos-qlora \
	python training/train.py --run-id my_run --num-iterations 15

	# Override defaults:
	python training/train.py --run-id run_007 --num-iterations 500 --num-episodes 6

	# See all options:
	python training/train.py --help
	"""

	from __future__ import annotations

	import argparse
	import json
	import os
	import random
	import sys
	import time
	from pathlib import Path
	from typing import Any, Dict, List, Optional, Tuple

	import torch
	import yaml

	# ────────────────────────────────────────────────────────────
	# Module path setup — make training/ and the project root importable
	# ────────────────────────────────────────────────────────────
	TRAINING_DIR = Path(__file__).resolve().parent
	PROJECT_DIR = TRAINING_DIR.parent

	# Each missing directory is pushed to the front of sys.path; processing
	# training/ first means the project root ends up ahead of it when both
	# are newly added.
	for _import_root in (str(TRAINING_DIR), str(PROJECT_DIR)):
	    if _import_root not in sys.path:
	        sys.path.insert(0, _import_root)
	del _import_root

	from model_utils import (
	attach_lora,
	detect_gpu_tier,
	find_latest_checkpoint,
	gpu_scaled_config,
	load_base_model,
	push_adapter_to_hub,
	push_to_hub,
	save_checkpoint,
	)
	from openenv_loop import (
	OpenEnvClient,
	rollout_batch,
	rollout_episode,
	rollout_heuristic_episode,
	)
	from eval import evaluate
	from plotting import (
	generate_all_plots,
	push_plots_to_hub,
	episodes_to_plot_data,
	)


	# ────────────────────────────────────────────────────────────
	# Config Loading
	# ────────────────────────────────────────────────────────────

	def load_config(config_path: str) -> Dict[str, Any]:
	    """Load the training config from YAML, then apply env overrides and GPU scaling.

	    Environment variables named ``ANTIATROPOS_<KEY>`` override the YAML entry
	    ``<key>`` (lower-cased). Only keys that already exist in the YAML are
	    overridden; variables for unknown keys are ignored. The raw string is
	    coerced to the type of the value it replaces: bool, int, float, JSON for
	    lists and dicts, otherwise kept as a string. The resulting dict is passed
	    through ``gpu_scaled_config`` before being returned.

	    Args:
	        config_path: Path to the YAML config file.

	    Returns:
	        The config dict as returned by ``gpu_scaled_config``.

	    Raises:
	        ValueError: If the YAML top level is not a mapping, or an override
	            cannot be coerced to the type of the value it replaces.
	    """
	    with open(config_path, encoding="utf-8") as f:
	        loaded = yaml.safe_load(f)
	    # An empty YAML document parses to None; treat it as an empty config
	    # instead of failing later on `key in cfg`.
	    cfg = {} if loaded is None else loaded
	    if not isinstance(cfg, dict):
	        raise ValueError(
	            f"[config] {config_path} must contain a YAML mapping at the top level, "
	            f"got {type(cfg).__name__}"
	        )

	    # Collect overrides first (ANTIATROPOS_<KEY>=<value>), keyed by config name.
	    prefix = "ANTIATROPOS_"
	    env_overrides = {
	        name[len(prefix):].lower(): raw
	        for name, raw in os.environ.items()
	        if name.startswith(prefix)
	    }

	    for key, raw in env_overrides.items():
	        if key not in cfg:
	            continue
	        current = cfg[key]
	        try:
	            # bool must be tested before int: bool is a subclass of int.
	            if isinstance(current, bool):
	                cfg[key] = raw.lower() in ("true", "1", "yes")
	            elif isinstance(current, int):
	                cfg[key] = int(raw)
	            elif isinstance(current, float):
	                cfg[key] = float(raw)
	            elif isinstance(current, (list, dict)):
	                cfg[key] = json.loads(raw)
	            else:
	                cfg[key] = raw
	        except ValueError as err:
	            # int()/float() raise ValueError; json.JSONDecodeError subclasses it.
	            raise ValueError(
	                f"[config] Cannot apply env override {prefix}{key.upper()}={raw!r}: "
	                f"expected a value convertible to {type(current).__name__}"
	            ) from err
	        print(f"[config] Env override: {key} = {cfg[key]}")

	    # GPU auto-scaling is delegated entirely to gpu_scaled_config.
	    cfg = gpu_scaled_config(cfg)

	    return cfg


	# ────────────────────────────────────────────────────────────
	# REINFORCE Loss (PyTorch)
	# ────────────────────────────────────────────────────────────

	def compute_returns(rewards: List[float], gamma: float) -> List[float]:
	    """Compute the discounted return G_t = r_t + gamma * G_{t+1} for every step.

	    Args:
	        rewards: Per-step rewards in chronological order.
	        gamma: Discount factor.

	    Returns:
	        A list with one return per reward; an empty input gives ``[]``.
	    """
	    returns: List[float] = []
	    running = 0.0
	    # Accumulate back-to-front with append + one reverse (linear time)
	    # instead of insert(0, ...) on every step (quadratic).
	    for reward in reversed(rewards):
	        running = reward + gamma * running
	        returns.append(running)
	    returns.reverse()
	    return returns


	def _resolve_pad_id(tokenizer) -> int:
	    """Return the tokenizer's pad id, falling back to EOS only when it is None."""
	    pad_id = tokenizer.pad_token_id
	    # `pad_token_id or eos_token_id` would wrongly discard a valid pad id of 0.
	    return tokenizer.eos_token_id if pad_id is None else pad_id


	def _build_padded_batch(transitions: List, max_seq_len_cap: int, pad_id: int):
	    """Truncate, action-mask and left-pad one mini-batch of transitions.

	    Returns three stacked (B, S) tensors: input ids, attention mask, and an
	    action mask that is 1 on generated tokens and 0 on prompt/padding tokens.
	    """
	    prepared = []
	    for trans in transitions:
	        ids = trans.input_ids
	        attn = trans.attention_mask
	        prompt_len = trans.prompt_len

	        # Keep the tail of over-long sequences (the action tokens live there)
	        # and shift prompt_len by the number of tokens dropped on the left.
	        overflow = ids.shape[0] - max_seq_len_cap
	        if overflow > 0:
	            ids = ids[-max_seq_len_cap:]
	            prompt_len = max(0, prompt_len - overflow)
	        if attn.shape[0] > max_seq_len_cap:
	            attn = attn[-max_seq_len_cap:]

	        seq_len = ids.shape[0]
	        action = torch.zeros(seq_len, dtype=torch.long)
	        if prompt_len < seq_len:
	            action[prompt_len:] = 1  # tokens after the prompt are the action
	        prepared.append((ids, attn, action))

	    width = max(ids.shape[0] for ids, _, _ in prepared)
	    id_rows, attn_rows, action_rows = [], [], []
	    for ids, attn, action in prepared:
	        pad_len = width - ids.shape[0]
	        if pad_len > 0:
	            # Left-pad; padding is never attended to and never an action token.
	            ids = torch.cat([torch.full((pad_len,), pad_id, device=ids.device), ids])
	            attn = torch.cat([torch.zeros(pad_len, device=attn.device, dtype=attn.dtype), attn])
	            action = torch.cat([torch.zeros(pad_len, dtype=torch.long), action])
	        id_rows.append(ids)
	        attn_rows.append(attn)
	        action_rows.append(action)

	    return torch.stack(id_rows), torch.stack(attn_rows), torch.stack(action_rows)


	def _chunked_token_entropy(shift_logits: torch.Tensor, chunk_size: int = 4096) -> torch.Tensor:
	    """Per-token softmax entropy of (B, S, V) logits, accumulated over vocab chunks.

	    Only a (B, S, chunk_size) slice of probabilities is built at a time
	    instead of one full (B, S, V) probability tensor.
	    """
	    log_z = shift_logits.logsumexp(dim=-1, keepdim=True)  # (B, S, 1) log-normaliser
	    entropy = torch.zeros(shift_logits.shape[:2], device=shift_logits.device)
	    for v_start in range(0, shift_logits.size(-1), chunk_size):
	        log_p = shift_logits[:, :, v_start:v_start + chunk_size] - log_z
	        entropy += -(log_p.exp() * log_p).sum(dim=-1)
	    return entropy


	def _log_peak_vram(tag: str) -> None:
	    """Print peak allocated / currently free CUDA memory for one mini-batch."""
	    peak = torch.cuda.max_memory_allocated() / 1024**3
	    free, _ = torch.cuda.mem_get_info()
	    print(f" [loss_fwd {tag}] "
	          f"post-fwd peak={peak:.2f}GiB free={free/1024**3:.1f}GiB", flush=True)


	def _backward_policy_loss(
	    model,
	    tokenizer,
	    transitions: List,
	    advantages: torch.Tensor,
	    cfg: Dict[str, Any],
	    *,
	    verbose: bool = False,
	) -> torch.Tensor:
	    """Run forward + backward over all transitions, one mini-batch at a time.

	    For every mini-batch the loss is
	    ``-(advantage * mean action-token log-prob).mean() - entropy_coef * entropy``,
	    divided by the number of mini-batches, and ``backward()`` is called
	    immediately so gradients accumulate in the parameters while only one
	    mini-batch graph is alive at a time.

	    Args:
	        model: Causal LM exposing ``.device`` and returning ``.logits``.
	        tokenizer: Provides ``pad_token_id`` / ``eos_token_id``.
	        transitions: Non-empty list of objects with ``input_ids``,
	            ``attention_mask`` and ``prompt_len``.
	        advantages: 1-D tensor aligned with ``transitions``.
	        cfg: Reads ``loss_batch_size``, ``max_seq_length``, ``entropy_coef``.
	        verbose: Print per-batch entropy (and VRAM when CUDA is available).

	    Returns:
	        Detached scalar tensor: the mean of the per-mini-batch losses.
	    """
	    import math as _math

	    loss_batch_size = cfg.get("loss_batch_size", 1)
	    max_seq_len_cap = cfg.get("max_seq_length", 512)
	    ent_coef = cfg.get("entropy_coef", 0.001)
	    pad_id = _resolve_pad_id(tokenizer)
	    device = model.device
	    cuda_stats = verbose and torch.cuda.is_available()

	    n_batches = _math.ceil(len(transitions) / loss_batch_size)
	    total_loss_val = 0.0

	    for batch_idx, batch_start in enumerate(range(0, len(transitions), loss_batch_size)):
	        tag = f"b{batch_idx+1}/{n_batches}"
	        batch_end = batch_start + loss_batch_size
	        batch_advs = advantages[batch_start:batch_end].to(device)
	        input_ids, attention_mask, action_mask = _build_padded_batch(
	            transitions[batch_start:batch_end], max_seq_len_cap, pad_id
	        )
	        input_ids = input_ids.to(device)
	        attention_mask = attention_mask.to(device)

	        # ── Forward pass ────────────────────────────────────────────────
	        torch.cuda.empty_cache()
	        if cuda_stats:
	            alloc = torch.cuda.memory_allocated() / 1024**3
	            free, total_mem = torch.cuda.mem_get_info()
	            torch.cuda.reset_peak_memory_stats()
	            print(f" [loss_fwd {tag}] "
	                  f"shape={input_ids.shape} alloc={alloc:.2f}GiB "
	                  f"free={free/1024**3:.1f}/{total_mem/1024**3:.1f}GiB", flush=True)

	        outputs = model(
	            input_ids=input_ids,
	            attention_mask=attention_mask,
	            use_cache=False,  # single-pass loss forward needs no KV-cache
	        )
	        if cuda_stats:
	            _log_peak_vram(tag)

	        # ── Per-token NLL of the next token ─────────────────────────────
	        shift_logits = outputs.logits[:, :-1, :].contiguous()  # (B, S-1, V)
	        shift_labels = input_ids[:, 1:].contiguous()           # (B, S-1)
	        shift_mask = attention_mask[:, 1:]                     # (B, S-1)
	        token_nll = torch.nn.functional.cross_entropy(
	            shift_logits.view(-1, shift_logits.size(-1)),
	            shift_labels.clamp(min=0).view(-1),
	            reduction="none",
	        ).view(shift_labels.shape)

	        # ── Action-only log-probs ───────────────────────────────────────
	        # log π(action | prompt) ≠ log π(prompt + action): only generated,
	        # attended tokens contribute. The same mask is used for the sum, the
	        # length normaliser and the entropy average so they stay consistent.
	        action_valid = action_mask[:, 1:].to(device) * shift_mask  # (B, S-1)
	        n_action = action_valid.sum(dim=1).clamp(min=1)            # (B,)
	        seq_log_probs = -(token_nll * action_valid).sum(dim=1)     # (B,)

	        entropy_per_token = _chunked_token_entropy(shift_logits)

	        # Drop local references to the logits before backward.
	        del outputs, shift_logits, token_nll
	        torch.cuda.empty_cache()
	        if cuda_stats:
	            _log_peak_vram(tag)

	        # ── Mini-batch loss: policy gradient + entropy bonus ────────────
	        avg_token_entropy = ((entropy_per_token * action_valid).sum(dim=1) / n_action).mean()
	        # Dividing by the action length keeps long generations from
	        # dominating the gradient.
	        policy_term = -(batch_advs * (seq_log_probs / n_action)).mean()

	        if verbose:
	            # `reinforce=` reports the term that is actually optimised.
	            print(f" [entropy {tag}] "
	                  f"avg_token_entropy={avg_token_entropy.item():.3f}nats "
	                  f"ent_coef={ent_coef} "
	                  f"reinforce={policy_term.item():.4f} "
	                  f"ent_bonus={ent_coef * avg_token_entropy.item():.4f}", flush=True)

	        batch_loss = (policy_term - ent_coef * avg_token_entropy) / n_batches

	        # Backward immediately so this mini-batch's graph can be freed.
	        batch_loss.backward()

	        total_loss_val += batch_loss.item() * n_batches
	        del batch_loss, policy_term, seq_log_probs, batch_advs, avg_token_entropy, entropy_per_token
	        torch.cuda.empty_cache()

	    # Detached scalar for logging; gradients already sit in the parameters.
	    return torch.tensor(total_loss_val / n_batches, device=device)


	def reinforce_baseline_loss_fn(
	    model,
	    tokenizer,
	    episodes: List,
	    cfg: Dict[str, Any],
	) -> torch.Tensor:
	    """Compute the REINFORCE-with-baseline loss and accumulate its gradients.

	    Per-step discounted returns are collected for every transition that has
	    ``input_ids``, optionally normalised across ALL transitions
	    (``advantage_normalize``, default True), and then pushed through the
	    model mini-batch by mini-batch with an immediate ``backward()`` each.

	    Args:
	        model: Policy model (must expose ``.device``).
	        tokenizer: Tokenizer supplying the pad / EOS token ids.
	        episodes: Objects with a ``transitions`` list; each transition has
	            ``reward``, ``input_ids``, ``attention_mask``, ``prompt_len``.
	        cfg: Reads ``reward_gamma``, ``advantage_normalize``,
	            ``loss_batch_size``, ``max_seq_length``, ``entropy_coef``.

	    Returns:
	        A detached scalar (``requires_grad=False``). The caller must NOT call
	        ``backward()`` on it; gradients are already in ``.grad``. Returns 0.0
	        when no transition carries ``input_ids``.
	    """
	    gamma = cfg.get("reward_gamma", 0.99)
	    normalize_adv = cfg.get("advantage_normalize", True)

	    # ── Phase 1: collect (transition, return) on CPU ────────────────────
	    transitions: List = []
	    step_returns: List[float] = []
	    for ep in episodes:
	        if not ep.transitions:
	            continue
	        returns = compute_returns([t.reward for t in ep.transitions], gamma)
	        for trans, ret in zip(ep.transitions, returns):
	            if trans.input_ids is not None:
	                transitions.append(trans)
	                step_returns.append(ret)

	    if not transitions:
	        return torch.tensor(0.0, device=model.device)

	    # ── Phase 2: globally normalised advantages (CPU) ───────────────────
	    raw_returns = torch.tensor(step_returns, dtype=torch.float32)
	    if normalize_adv and len(raw_returns) > 1:
	        advantages = (raw_returns - raw_returns.mean()) / (raw_returns.std() + 1e-8)
	    else:
	        advantages = raw_returns

	    # ── Phase 3: mini-batch forward/backward ────────────────────────────
	    return _backward_policy_loss(
	        model, tokenizer, transitions, advantages, cfg, verbose=True
	    )


	def grpo_loss_fn(
	    model,
	    tokenizer,
	    episodes: List,
	    cfg: Dict[str, Any],
	) -> torch.Tensor:
	    """GRPO (Group Relative Policy Optimization) loss with accumulated gradients.

	    Episodes are grouped by ``(task_id, seed)``. Each episode's advantage is
	    its discounted return from step 0, standardised with its group's mean and
	    Bessel-corrected std, and is shared by all of that episode's transitions.
	    Uses the same per-mini-batch ``backward()`` as
	    ``reinforce_baseline_loss_fn``.

	    Args:
	        model: Policy model (must expose ``.device``).
	        tokenizer: Tokenizer supplying the pad / EOS token ids.
	        episodes: Objects with ``task_id``, ``seed`` and ``transitions``.
	        cfg: Reads ``reward_gamma``, ``loss_batch_size``, ``max_seq_length``,
	            ``entropy_coef``.

	    Returns:
	        A detached scalar (``requires_grad=False``); 0.0 when no transition
	        carries ``input_ids``.
	    """
	    gamma = cfg.get("reward_gamma", 0.99)

	    # ── Phase 1: group by (task_id, seed) and compute group advantages ──
	    groups: Dict[tuple, List] = {}
	    for ep in episodes:
	        groups.setdefault((ep.task_id, ep.seed), []).append(ep)

	    transitions: List = []
	    per_transition_adv: List[float] = []

	    for key, group in groups.items():
	        if len(group) == 1:
	            # A lone episode equals its group mean, so its advantage is 0.
	            print(f" [grpo] WARNING: group {key} has only 1 episode — "
	                  f"num_episodes must be grpo_k × num_tasks", flush=True)

	        # Episode-level return: discounted sum from step 0 (0.0 if empty).
	        group_returns = []
	        for ep in group:
	            returns = compute_returns([t.reward for t in ep.transitions], gamma)
	            group_returns.append(returns[0] if returns else 0.0)

	        group_mean = sum(group_returns) / len(group_returns)
	        group_std = (sum((r - group_mean) ** 2 for r in group_returns)
	                     / max(len(group_returns) - 1, 1)) ** 0.5 + 1e-8  # Bessel-corrected

	        for ep, ep_return in zip(group, group_returns):
	            advantage = (ep_return - group_mean) / group_std
	            for trans in ep.transitions:
	                if trans.input_ids is not None:
	                    transitions.append(trans)
	                    per_transition_adv.append(advantage)

	    if not transitions:
	        return torch.tensor(0.0, device=model.device)

	    # ── Phase 2: advantage tensor on CPU ────────────────────────────────
	    advantages = torch.tensor(per_transition_adv, dtype=torch.float32)

	    # ── Phase 3: mini-batch forward/backward ────────────────────────────
	    return _backward_policy_loss(
	        model, tokenizer, transitions, advantages, cfg, verbose=False
	    )



	# ────────────────────────────────────────────────────────────
	# Run Files Push (to hub_model_repo/<run_id>/)
	# ────────────────────────────────────────────────────────────


	def push_run_files_to_hub(
	    run_id: str,
	    output_dir: Path,
	    hub_model_repo: str,
	    iteration: int,
	) -> None:
	    """Best-effort upload of the run's metrics, log, manifest and eval results.

	    Every file that exists under ``output_dir`` is uploaded to
	    ``<run_id>/`` inside the model repo ``hub_model_repo``, one commit per
	    file. Failures are printed and never raised. Does nothing when
	    ``hub_model_repo`` is empty.

	    Args:
	        run_id: Run identifier; used as the folder name inside the repo.
	        output_dir: Local run directory holding the files.
	        hub_model_repo: Target Hub model repo id.
	        iteration: Iteration number, recorded in the commit message.
	    """
	    if not hub_model_repo:
	        return

	    # (path relative to output_dir, destination path inside the repo)
	    uploads = [
	        (name, f"{run_id}/{name}")
	        for name in (
	            "step_metrics.jsonl",
	            "iter_metrics.jsonl",
	            "training.log",
	            "run_info.json",
	        )
	    ]

	    # Eval results are only queued when they are already on disk.
	    optional_results = (
	        ("eval/eval_results.json", f"{run_id}/eval_results.json"),
	        ("final_eval/eval_results.json", f"{run_id}/final_eval_results.json"),
	    )
	    for entry in optional_results:
	        if (output_dir / entry[0]).exists():
	            uploads.append(entry)

	    try:
	        from huggingface_hub import HfApi
	        api = HfApi()
	        uploaded = []
	        for local_name, hub_path in uploads:
	            source = output_dir / local_name
	            if not source.exists():
	                continue
	            try:
	                api.upload_file(
	                    path_or_fileobj=str(source),
	                    path_in_repo=hub_path,
	                    repo_id=hub_model_repo,
	                    repo_type="model",
	                    commit_message=f"[{run_id}] iter {iteration}: {local_name}",
	                )
	            except Exception as e:
	                # One failed file must not block the remaining uploads.
	                print(f" [push] Failed to push {local_name}: {e}")
	            else:
	                uploaded.append(local_name)
	        if uploaded:
	            print(f" [push] \u2192 HF model {hub_model_repo}/{run_id}/: {', '.join(uploaded)}",
	                  flush=True)
	    except Exception as e:
	        print(f"[train] Hub file push failed: {e}")


	# ────────────────────────────────────────────────────────────
	# Local JSONL Metrics Writers
	# ────────────────────────────────────────────────────────────

	def write_step_metrics(
	    run_id: str,
	    iteration: int,
	    episode_idx: int,
	    task_id: str,
	    step: int,
	    transition,  # Transition dataclass from openenv_loop
	    output_dir: Path,
	) -> None:
	    """Append one per-step row to ``<output_dir>/step_metrics.jsonl``.

	    The row records the identity of the step, the action taken, the reward,
	    cluster-level metrics read from ``transition.obs_dict`` and one set of
	    flat columns per entry of ``obs_dict["nodes"]``. Missing observation
	    fields fall back to zero / empty values.

	    Args:
	        run_id: Run identifier.
	        iteration: Training iteration index.
	        episode_idx: Index of the episode within the iteration.
	        task_id: Task the episode was run on.
	        step: Step index within the episode.
	        transition: Object with ``obs_dict`` (dict or None), ``action``
	            (``action_type``, ``target_node_id``, ``parameter``,
	            ``is_valid``) and ``reward``.
	        output_dir: Directory that holds ``step_metrics.jsonl``.
	    """
	    from datetime import datetime, timezone

	    obs = transition.obs_dict or {}
	    action = transition.action

	    # Naive-UTC ISO timestamp with a "Z" suffix (same format as before),
	    # obtained without the deprecated datetime.utcnow().
	    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

	    row: Dict[str, Any] = {
	        # ── Identity ──────────────────────────────────────────────
	        "run_id": run_id,
	        "iteration": iteration,
	        "episode_idx": episode_idx,
	        "task_id": task_id,
	        "step": step,
	        "ts": timestamp,

	        # ── Action ────────────────────────────────────────────────
	        "action_type": action.action_type,
	        "target_node": action.target_node_id,
	        "parameter": round(action.parameter, 4),
	        "is_valid": action.is_valid,

	        # ── Reward ────────────────────────────────────────────────
	        "reward": round(transition.reward, 6),

	        # ── Cluster-level metrics ─────────────────────────────────
	        "avg_latency_ms": round(obs.get("average_latency_ms", 0.0), 3),
	        "error_rate": round(obs.get("error_rate", 0.0), 6),
	        "total_queue_backlog": round(obs.get("total_queue_backlog", 0.0), 4),
	        "cost_per_hour": round(obs.get("current_cost_per_hour", 0.0), 4),
	        "sla_violations": obs.get("sla_violations", 0),
	    }

	    # ── Per-node metrics (flat columns: node0_status, node0_queue, ...) ──
	    for node in obs.get("nodes", []):
	        nid = node.get("node_id", "")
	        key = nid.replace("-", "")  # "node-0" → "node0"
	        row[f"{key}_status"] = node.get("status", "")[:1]  # first letter only
	        row[f"{key}_queue"] = round(node.get("queue_depth", 0.0), 4)
	        row[f"{key}_latency"] = round(node.get("latency_ms", 0.0), 2)
	        row[f"{key}_inflow"] = round(node.get("incoming_request_rate", 0.0), 2)
	        row[f"{key}_outflow"] = round(node.get("outflow_rate", 0.0), 2)
	        row[f"{key}_capacity"] = round(node.get("capacity", 0.0), 4)
	        row[f"{key}_pending"] = round(node.get("pending_capacity", 0.0), 4)

	    path = output_dir / "step_metrics.jsonl"
	    with open(path, "a", encoding="utf-8") as f:
	        f.write(json.dumps(row) + "\n")


	def write_iter_metrics(
	    run_id: str,
	    iteration: int,
	    loss: float,
	    avg_reward: float,
	    grad_norm: float,
	    total_invalid: int,
	    num_episodes: int,
	    iter_time_s: float,
	    output_dir: Path,
	) -> None:
	    """Append one per-iteration row to ``<output_dir>/iter_metrics.jsonl``.

	    Args:
	        run_id: Run identifier.
	        iteration: Training iteration index.
	        loss: Loss value for the iteration (rounded to 6 decimals).
	        avg_reward: Average reward (rounded to 6 decimals).
	        grad_norm: Gradient norm (rounded to 4 decimals).
	        total_invalid: Count written as ``invalid_actions``.
	        num_episodes: Number of episodes in the iteration.
	        iter_time_s: Iteration wall time in seconds (rounded to 2 decimals).
	        output_dir: Directory that holds ``iter_metrics.jsonl``.
	    """
	    from datetime import datetime, timezone

	    row = {
	        "run_id": run_id,
	        "iteration": iteration,
	        # Naive-UTC ISO timestamp + "Z", without the deprecated utcnow().
	        "ts": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
	        "loss": round(loss, 6),
	        "avg_reward": round(avg_reward, 6),
	        "grad_norm": round(grad_norm, 4),
	        "invalid_actions": total_invalid,
	        "num_episodes": num_episodes,
	        "iter_time_s": round(iter_time_s, 2),
	    }
	    path = output_dir / "iter_metrics.jsonl"
	    with open(path, "a", encoding="utf-8") as f:
	        f.write(json.dumps(row) + "\n")



	class _TeeLogger:
	    """Stream wrapper that duplicates every write into a log file.

	    Intended as a drop-in replacement for ``sys.stdout`` / ``sys.stderr``:
	    everything written goes to the wrapped stream and is appended to
	    ``log_path``. Attributes not defined here (``encoding``, ``errors``,
	    ``buffer``, ...) are forwarded to the wrapped stream so code that
	    inspects the stream keeps working.
	    """

	    def __init__(self, stream, log_path: Path):
	        """Wrap ``stream`` and open ``log_path`` for line-buffered appending."""
	        self._stream = stream
	        self._file = open(log_path, "a", buffering=1, encoding="utf-8")  # line-buffered

	    def __getattr__(self, name: str):
	        """Forward unknown attributes to the wrapped stream."""
	        # Only reached when normal lookup fails. Guard our own slots so a
	        # half-initialised instance cannot recurse forever.
	        if name in ("_stream", "_file"):
	            raise AttributeError(name)
	        return getattr(self._stream, name)

	    def write(self, data: str) -> int:
	        """Write ``data`` to both sinks and return the number of characters."""
	        self._stream.write(data)
	        # After close() keep the wrapped stream usable instead of raising
	        # ValueError on the closed log file.
	        if not self._file.closed:
	            self._file.write(data)
	        return len(data)

	    def flush(self) -> None:
	        """Flush the wrapped stream and, if still open, the log file."""
	        self._stream.flush()
	        if not self._file.closed:
	            self._file.flush()

	    def fileno(self) -> int:
	        """Return the wrapped stream's descriptor (subprocess / os compatibility)."""
	        return self._stream.fileno()

	    def isatty(self) -> bool:
	        """Always False: output is also going to a file."""
	        return False

	    def close(self) -> None:
	        """Close the log file only; the wrapped stream is left open. Idempotent."""
	        if self._file.closed:
	            return
	        try:
	            self._file.flush()
	            self._file.close()
	        except OSError:
	            # Best-effort: a failing disk must not take down shutdown.
	            pass

	    @property
	    def original_stream(self):
	        """The stream this logger wraps."""
	        return self._stream


	def _log_vram(where: str) -> None:
	    """Print CUDA memory usage at a diagnostic point; no-op without CUDA.

	    Args:
	        where: Label identifying the call site in the printed line.
	    """
	    if not torch.cuda.is_available():
	        return
	    gib = 1024 ** 3
	    free, total = torch.cuda.mem_get_info()
	    alloc = torch.cuda.memory_allocated() / gib
	    reserved = torch.cuda.memory_reserved() / gib
	    peak = torch.cuda.max_memory_allocated() / gib
	    # free/total come back in bytes and must be scaled by 1024**3 like the
	    # other figures (the previous divisor 10243 was a garbled 1024**3).
	    print(f" [VRAM @{where}] "
	          f"alloc={alloc:6.2f}GiB reserved={reserved:6.2f}GiB "
	          f"peak={peak:6.2f}GiB free={free/gib:.1f}/{total/gib:.1f}GiB",
	          flush=True)


	def train(cfg: Dict[str, Any]) -> None:
	"""Main training loop."""

	# ---- Reproducibility ----
	seed = cfg.get("seed", 42)
	random.seed(seed)
	torch.manual_seed(seed)
	if torch.cuda.is_available():
	torch.cuda.manual_seed_all(seed)

	run_id = cfg.get("run_id", "exp_001")

	# ── Run-specific output directory ───────────────────────────────────────
	# Structure: <base_output_dir>/<run_id>/
	# checkpoint-0010/ ← saved every checkpoint_interval iters
	# checkpoint-0020/
	# ...
	# final_adapter/ ← saved at end of training
	# run_info.json ← written at startup; identifies this run
	#
	# Using run_id as a subfolder means multiple runs never overwrite each other.
	base_output_dir = Path(cfg.get("output_dir", "/workspace/antiatropos_checkpoints"))
	output_dir = base_output_dir / run_id
	output_dir.mkdir(parents=True, exist_ok=True)

	# ── Activate full-run logging to disk ────────────────────────────────────
	# Every print() from here on is tee'd to training.log (line-buffered).
	# This mirrors the HF job terminal stream to a persistent file so you
	# can inspect the full log even after the job completes or crashes.
	log_path = output_dir / "training.log"
	_orig_stdout = sys.stdout
	_orig_stderr = sys.stderr
	sys.stdout = _TeeLogger(sys.stdout, log_path)
	sys.stderr = _TeeLogger(sys.stderr, log_path)
	print(f"[train] Full log: {log_path}")
	# Write run manifest so checkpoints are always identifiable
	import json as _json
	run_info = {
	"run_id": run_id,
	"started_at": __import__("datetime").datetime.utcnow().isoformat() + "Z",
	"config": {k: v for k, v in cfg.items() if not k.startswith("_")},
	}
	run_info_path = output_dir / "run_info.json"
	run_info_path.write_text(_json.dumps(run_info, indent=2, default=str))
	print(f"[train] Run directory: {output_dir}")
	print(f"[train] Run manifest: {run_info_path}")

	hub_model_repo = cfg.get("hub_model_repo", "")
	push_to_hub_flag = cfg.get("push_to_hub", True)

	# ---- Verify environment ----
	env_url = cfg.get("env_url", "https://pranavkk-antiatropos.hf.space")
	client = OpenEnvClient(env_url)
	if not client.verify():
	print("[train] FATAL: Cannot reach environment. Aborting.")
	sys.exit(1)

	# ---- Load model ----
	print("\n[train] Loading model...")
	model, tokenizer = load_base_model(cfg)
	_log_vram("model_loaded")

	# ---- Check for resume ----
	start_iteration = 0
	ckpt_path = find_latest_checkpoint(hub_model_repo) if hub_model_repo else None
	if ckpt_path:
	local_ckpt = download_checkpoint(hub_model_repo, ckpt_path)
	model = load_from_checkpoint(model, tokenizer, local_ckpt)
	try:
	start_iteration = int(ckpt_path.split("-")[1])
	except (ValueError, IndexError):
	start_iteration = 0
	print(f"[train] Resuming from iteration {start_iteration}")
	else:
	model = attach_lora(model, cfg, seed=seed)
	# Unsloth's attach_lora already enables gradient checkpointing via
	# use_gradient_checkpointing="unsloth" in get_peft_model().
	# Do NOT call gradient_checkpointing_enable() again — it conflicts
	# with Unsloth's custom implementation and can increase VRAM usage.

	# ---- Optimizer ----
	lr = cfg.get("learning_rate", 2e-4)
	weight_decay = cfg.get("weight_decay", 0.01)
	optim_name = cfg.get("optim", "adamw_8bit")

	optimizer = torch.optim.AdamW(
	filter(lambda p: p.requires_grad, model.parameters()),
	lr=lr,
	weight_decay=weight_decay,
	)

	# ---- Loss function ----
	loss_type = cfg.get("loss_type", "reinforce_baseline")
	loss_fns = {
	"reinforce_baseline": reinforce_baseline_loss_fn,
	"grpo": grpo_loss_fn,
	}
	loss_fn = loss_fns.get(loss_type, reinforce_baseline_loss_fn)
	print(f"[train] Loss function: {loss_type}")

	# ---- Config ----
	num_iterations = cfg.get("num_iterations", 500)
	num_episodes = cfg.get("num_episodes_per_iteration", 4)
	max_steps = cfg.get("max_steps_per_episode", 60)
	tasks = cfg.get("tasks", ["task-1", "task-2", "task-3"])
	max_grad_norm = cfg.get("max_grad_norm", 1.0)
	checkpoint_interval = cfg.get("checkpoint_interval", 10) # default: every 10 iters
	eval_interval = cfg.get("eval_interval", 50)
	plot_interval = cfg.get("plot_interval", 25)

	# ---- Training loop ----
	print(f"\n{'='*70}")
	print(f"ANTIATROPOS QLORA TRAINING")
	print(f"{'='*70}")
	print(f" Run ID: {run_id}")
	print(f" Loss type: {loss_type}")
	print(f" Iterations: {num_iterations}")
	print(f" Episodes/iter: {num_episodes}")
	print(f" Tasks: {tasks}")
	print(f" Max steps: {max_steps}")
	print(f" Learning rate: {lr}")
	print(f" Hub model: {hub_model_repo or '(not configured)'}")
	print(f" Output dir: {output_dir}")
	print(f"{'='*70}\n")

	# Keep model in eval mode during rollout to minimise VRAM pressure.
	# for_training() is called only right before the loss forward pass.
	model.eval()
	_log_vram("eval_after_attach")

	metrics_buffer: List[Dict] = []
	eval_metrics_history: List[Dict] = []
	recent_episodes_data: List[Dict] = [] # For plotting action distributions

	for iteration in range(start_iteration, num_iterations):
	iter_start = time.time()

	# ---- Collect rollouts (parallel batch) ----
	# GRPO requires K episodes per task from the SAME seed so grpo_loss_fn
	# can group by (task_id, seed) and compute within-group advantages.
	# REINFORCE uses unique seeds per episode for diversity.
	if loss_type == "grpo":
	k = cfg.get("grpo_k", 2)
	# Validate: num_episodes must be k * num_tasks
	expected = k * len(tasks)
	if num_episodes != expected:
	print(f" [grpo] WARNING: num_episodes={num_episodes} ≠ "
	f"grpo_k({k}) × num_tasks({len(tasks)})={expected}. "
	f"Forcing to {expected}.", flush=True)
	num_episodes = expected
	# Each task gets k copies with the same per-task seed
	task_ids = [tasks[t] for t in range(len(tasks)) for _ in range(k)]
	task_seeds = [seed + iteration * 100 + t for t in range(len(tasks))]
	seeds = [task_seeds[t] for t in range(len(tasks)) for _ in range(k)]
	else:
	task_ids = [tasks[ep_idx % len(tasks)] for ep_idx in range(num_episodes)]
	seeds = [seed + iteration * 1000 + ep_idx for ep_idx in range(num_episodes)]

	_log_vram(f"i{iteration}_pre_rollout")

	try:
	use_parallel = cfg.get("parallel_episodes", True)
	if use_parallel and num_episodes > 1:
	episodes = rollout_batch(
	env_url, model, tokenizer, task_ids,
	max_steps, cfg, seeds,
	)
	else:
	# Fallback: sequential rollout (for debugging)
	episodes = []
	for ep_idx in range(num_episodes):
	task_id = tasks[ep_idx % len(tasks)]
	seed_ep = seed + iteration * 1000 + ep_idx
	ep = rollout_episode(
	client, model, tokenizer, task_id,
	max_steps, cfg, seed=seed_ep,
	)
	episodes.append(ep)
	except Exception as e:
	print(f" [iter {iteration}] Batch rollout failed: {e}")
	continue

	# ---- Clear VRAM before loss (generation KV-cache on GPU) ----
	torch.cuda.empty_cache()
	import gc
	gc.collect()
	# Move rollout tensors to CPU — loss will move them back in batches
	for ep in episodes:
	for t in ep.transitions:
	if t.input_ids is not None:
	t.input_ids = t.input_ids.cpu()
	if t.attention_mask is not None:
	t.attention_mask = t.attention_mask.cpu()
	_log_vram(f"i{iteration}_after_offload")

	# ---- Compute loss (standard train mode — base 4-bit stays frozen, only LoRA needs gradients) ----
	model.train()
	_log_vram(f"i{iteration}_after_train")
	loss = loss_fn(model, tokenizer, episodes, cfg)

	# ---- Optimizer step (loss_fn already called .backward() per mini-batch) ----
	grad_norm = torch.nn.utils.clip_grad_norm_(
	filter(lambda p: p.requires_grad, model.parameters()),
	max_grad_norm,
	)
	optimizer.step()
	optimizer.zero_grad()

	# Clear training intermediates and return to eval for next rollout
	torch.cuda.empty_cache()
	model.eval()
	_log_vram(f"i{iteration}_post_grad")

	# ---- Compute iteration metrics ----
	avg_reward = sum(ep.avg_reward for ep in episodes) / len(episodes)
	total_invalid = sum(ep.num_invalid for ep in episodes)
	iter_time = time.time() - iter_start

	# ---- Write per-step metrics (one row per episode step) ----
	# Done here (post-training) because training may mutate episode objects.
	for ep_idx, ep in enumerate(episodes):
	for step_idx, t in enumerate(ep.transitions):
	write_step_metrics(
	run_id=run_id,
	iteration=iteration,
	episode_idx=ep_idx,
	task_id=ep.task_id,
	step=step_idx + 1,
	transition=t,
	output_dir=output_dir,
	)

	# ---- Write per-iteration metrics ----
	_grad_norm_val = (
	grad_norm.item() if torch.is_tensor(grad_norm) else float(grad_norm)
	)
	write_iter_metrics(
	run_id=run_id,
	iteration=iteration,
	loss=loss.item(),
	avg_reward=avg_reward,
	grad_norm=_grad_norm_val,
	total_invalid=total_invalid,
	num_episodes=len(episodes),
	iter_time_s=iter_time,
	output_dir=output_dir,
	)

	print(f" [iter {iteration:4d}] loss={loss.item():.4f} "
	f"avg_reward={avg_reward:.4f} "
	f"invalid={total_invalid} "
	f"grad_norm={_grad_norm_val:.4f} "
	f"time={iter_time:.1f}s")

	# Store episode data for plotting (keep recent window)
	ep_data = episodes_to_plot_data(episodes)
	recent_episodes_data.extend(ep_data)
	if len(recent_episodes_data) > 200: # Keep last ~200 episodes
	recent_episodes_data = recent_episodes_data[-200:]

	# ---- Checkpoint + push run files ----
	if (iteration + 1) % checkpoint_interval == 0:
	# Pad iteration number so ls sorts correctly: checkpoint-0010, checkpoint-0050, ...
	ckpt_name = f"checkpoint-{iteration + 1:04d}"
	ckpt_dir = output_dir / ckpt_name
	ckpt_dir.mkdir(parents=True, exist_ok=True)
	model.save_pretrained(str(ckpt_dir))
	tokenizer.save_pretrained(str(ckpt_dir))
	# Write a small metadata file so you know exactly what's in each checkpoint
	ckpt_meta = {
	"run_id": run_id,
	"iteration": iteration + 1,
	"avg_reward": avg_reward,
	"loss": loss.item(),
	"saved_at": __import__("datetime").datetime.utcnow().isoformat() + "Z",
	}
	(ckpt_dir / "checkpoint_meta.json").write_text(
	_json.dumps(ckpt_meta, indent=2)
	)
	print(f" [ckpt] Saved \u2192 {ckpt_dir} "
	f"(reward={avg_reward:.4f} loss={loss.item():.4f})", flush=True)
	if push_to_hub_flag and hub_model_repo:
	push_to_hub(
	str(ckpt_dir),
	hub_model_repo,
	commit_message=f"[{run_id}] {ckpt_name}",
	path_in_repo=f"{run_id}/{ckpt_name}",
	)
	# Push run files (metrics, logs) alongside checkpoint
	push_run_files_to_hub(run_id, output_dir, hub_model_repo, iteration + 1)

	# ---- Evaluation ----
	if (iteration + 1) % eval_interval == 0:
	eval_results = evaluate(
	client, model, tokenizer, cfg,
	output_dir=str(output_dir / "eval"),
	)
	eval_row = {
	"run_id": run_id,
	"step": iteration,
	"type": "eval",
	}
	# Flatten eval results for plotting
	for k, v in eval_results.items():
	if not isinstance(v, dict):
	eval_row[f"eval_{k}"] = v
	for tid, tv in eval_results.get("per_task", {}).items():
	for mk, mv in tv.items():
	eval_row[f"eval_{tid}_{mk}"] = mv
	eval_metrics_history.append(eval_row)

	# Re-enable training mode
	model.train()

	# ---- Plotting ----
	if (iteration + 1) % plot_interval == 0:
	try:
	plot_paths = generate_all_plots(
	train_metrics=metrics_buffer,
	eval_metrics=eval_metrics_history,
	episodes_data=recent_episodes_data,
	output_dir=str(output_dir),
	cfg=cfg,
	)
	if push_to_hub_flag and hub_model_repo:
	push_plots_to_hub(plot_paths, hub_model_repo, iteration, run_id=run_id)
	except Exception as e:
	print(f" [iter {iteration}] Plotting failed: {e}")

	# ────────────────────────────────────────────────────────
	# Final save + push
	# ────────────────────────────────────────────────────────

	print(f"\n{'='*70}")
	print(f"TRAINING COMPLETE")
	print(f"{'='*70}")

	# Save final adapter
	final_dir = str(output_dir / "final_adapter")
	Path(final_dir).mkdir(parents=True, exist_ok=True)
	model.save_pretrained(final_dir)
	tokenizer.save_pretrained(final_dir)
	print(f"[train] Final adapter saved to {final_dir}")

	# Push to Hub — scoped under run_id/final_adapter/ so it never overwrites other runs
	if push_to_hub_flag and hub_model_repo:
	push_to_hub(
	final_dir,
	hub_model_repo,
	commit_message=f"[{run_id}] final_adapter",
	path_in_repo=f"{run_id}/final_adapter",
	)


	# Final evaluation
	final_eval = evaluate(
	client, model, tokenizer, cfg,
	output_dir=str(output_dir / "final_eval"),
	)
	# Final plots (full training history)
	try:
	final_eval_row = {
	"run_id": run_id,
	"step": num_iterations,
	"type": "eval",
	}
	for k, v in final_eval.items():
	if not isinstance(v, dict):
	final_eval_row[f"eval_{k}"] = v
	for tid, tv in final_eval.get("per_task", {}).items():
	for mk, mv in tv.items():
	final_eval_row[f"eval_{tid}_{mk}"] = mv
	eval_metrics_history.append(final_eval_row)

	plot_paths = generate_all_plots(
	train_metrics=metrics_buffer,
	eval_metrics=eval_metrics_history,
	episodes_data=recent_episodes_data,
	output_dir=str(output_dir),
	cfg=cfg,
	)
	if push_to_hub_flag and hub_model_repo:
	push_plots_to_hub(plot_paths, hub_model_repo, num_iterations, run_id=run_id)
	except Exception as e:
	print(f"[train] Final plotting failed: {e}")

	print(f"\n[train] All done. Final adapter: {final_dir}")
	if hub_model_repo:
	print(f"[train] Hub repo: https://huggingface.co/{hub_model_repo}")

	# ── Final push of all run files
	if hub_model_repo:
	push_run_files_to_hub(run_id, output_dir, hub_model_repo, num_iterations)

	# ── Flush and close the TeeLogger ────────────────────────────────────────
	# Restore original stdout/stderr so any code after train() works normally.
	print(f"[train] Full training log saved to: {log_path}", flush=True)
	if isinstance(sys.stdout, _TeeLogger):
	sys.stdout.close()
	sys.stdout = _orig_stdout
	if isinstance(sys.stderr, _TeeLogger):
	sys.stderr.close()
	sys.stderr = _orig_stderr


	# ────────────────────────────────────────────────────────────
	# CLI Entry Point
	# ────────────────────────────────────────────────────────────

	def main(argv: Optional[List[str]] = None) -> None:
	    """Parse CLI arguments, assemble the run config, and start training.

	    The config is built in layers, later layers winning:

	    1. values loaded from the ``--config`` YAML file,
	    2. the ``--smoke`` preset (only when ``--smoke`` is given),
	    3. explicit command-line overrides (only for flags actually passed),
	    4. ``--no-push``, which turns off ``push_to_hub`` and clears the Hub repo.

	    Any ``HF_TOKEN`` / ``HUGGING_FACE_HUB_TOKEN`` found in the environment is
	    mirrored to both variable names before ``train(cfg)`` is called.

	    Args:
	        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
	            (the default) keeps the normal command-line behaviour.
	    """
	    # Single source of truth for the smoke preset, so the --smoke help text,
	    # the values written into cfg and the banner cannot drift apart.
	    smoke_iters = 10
	    smoke_episodes = 2
	    smoke_steps = 40
	    smoke_interval = 5

	    parser = argparse.ArgumentParser(
	        description="AntiAtropos QLoRA Training — HF Jobs Edition"
	    )
	    parser.add_argument(
	        "--config", type=str,
	        default=str(TRAINING_DIR / "config.yaml"),
	        help="Path to config.yaml (default: training/config.yaml)",
	    )

	    # ---- Quick overrides for smoke runs ----
	    parser.add_argument("--num-iterations", type=int, default=None,
	                        help="Total training iterations (default: from config)")
	    parser.add_argument("--num-episodes", type=int, default=None,
	                        help="Episodes per iteration (default: from config)")
	    parser.add_argument("--max-steps", type=int, default=None,
	                        help="Max steps per episode (default: from config)")
	    parser.add_argument("--loss-type", type=str, default=None,
	                        choices=["reinforce_baseline", "grpo"],
	                        help="Loss function type")
	    parser.add_argument("--env-mode", type=str, default=None,
	                        choices=["simulated", "hybrid", "live"],
	                        help="Environment mode (default: from config)")
	    parser.add_argument("--eval-interval", type=int, default=None,
	                        help="Evaluate every N iterations")
	    parser.add_argument("--checkpoint-interval", type=int, default=None,
	                        help="Checkpoint every N iterations")
	    parser.add_argument("--plot-interval", type=int, default=None,
	                        help="Generate plots every N iterations")
	    parser.add_argument("--run-id", type=str, default=None,
	                        help="Unique run identifier")
	    parser.add_argument("--output-dir", type=str, default=None,
	                        help="Local output directory")
	    parser.add_argument("--no-push", action="store_true",
	                        help="Disable all Hub pushes (model + metrics + plots)")
	    parser.add_argument("--smoke", action="store_true",
	                        help=f"Quick smoke run: {smoke_iters} iters, "
	                             f"{smoke_episodes} episodes, {smoke_steps} steps, "
	                             f"no push, eval/ckpt/plot every {smoke_interval}")

	    args = parser.parse_args(argv)

	    # Load config
	    cfg = load_config(args.config)

	    # ---- Smoke run preset ----
	    if args.smoke:
	        cfg["num_iterations"] = smoke_iters
	        cfg["num_episodes_per_iteration"] = smoke_episodes
	        cfg["max_steps_per_episode"] = smoke_steps
	        cfg["eval_interval"] = smoke_interval
	        cfg["checkpoint_interval"] = smoke_interval
	        cfg["plot_interval"] = smoke_interval
	        cfg["push_to_hub"] = False
	        # Mirror --no-push: the final run-file push in train() is gated on
	        # the Hub repo being set, not on push_to_hub, so the repo has to be
	        # cleared for the "no Hub push" promise below to actually hold.
	        cfg["hub_model_repo"] = ""
	        cfg["eval_episodes"] = 1
	        if not args.run_id:
	            cfg["run_id"] = "smoke_test"
	        if not args.output_dir:
	            cfg["output_dir"] = "/tmp/antiatropos_smoke"
	        print(f"[SMOKE MODE] {smoke_iters} iters x {smoke_episodes} episodes "
	              f"x {smoke_steps} steps — no Hub push")

	    # ---- CLI overrides (explicit args beat smoke preset) ----
	    # Every flag defaults to None, so only flags the user actually passed
	    # replace the value coming from the config file or the smoke preset.
	    cli_overrides = (
	        ("num_iterations", args.num_iterations),
	        ("num_episodes_per_iteration", args.num_episodes),
	        ("max_steps_per_episode", args.max_steps),
	        ("loss_type", args.loss_type),
	        ("env_mode", args.env_mode),
	        ("eval_interval", args.eval_interval),
	        ("checkpoint_interval", args.checkpoint_interval),
	        ("plot_interval", args.plot_interval),
	        ("run_id", args.run_id),
	        ("output_dir", args.output_dir),
	    )
	    for cfg_key, cli_value in cli_overrides:
	        if cli_value is not None:
	            cfg[cfg_key] = cli_value

	    if args.no_push:
	        cfg["push_to_hub"] = False
	        cfg["hub_model_repo"] = ""

	    # Allow HF_TOKEN from env: whichever of the two names is set is copied
	    # to both, so code reading either variable finds the token.
	    hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")
	    if hf_token:
	        os.environ["HF_TOKEN"] = hf_token
	        os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token

	    train(cfg)


	# Script entry point: run the CLI only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	main()