Spaces:

Freakdivi
/

dispatch_arena_v0

Sleeping

App Files Files Community

dispatch_arena_v0 / scripts /train_grpo.py

Freakdivi

Upload folder using huggingface_hub

c71bf62 verified about 1 month ago

raw

history blame contribute delete

15.9 kB

	"""GRPO training against Dispatch Arena (normal mode, catalog-driven).

	End-to-end pipeline:
	1. Load `catalog.json` -> stratified 70/30 train/eval split.
	2. Spin up the FastAPI server in-process; one DispatchToolEnv per rollout.
	3. Each env is configured per-row from the catalog spec (mode=normal,
	plus rolling_arrivals / traffic_noise / scenario_bucket / etc.).
	4. Tool methods exposed to the LLM:
	view_dashboard, assign, reposition, hold, prioritize, finish_shift
	5. Reward function: reward_total (sum of env's per-step
	RewardBreakdown.total_reward over the rollout). Single function — the
	env already decomposes the reward; we report the scalar to GRPO.
	6. Training: TRL GRPOTrainer + LoRA (r=16). Smoke-friendly defaults for
	a Tesla T4 (16 GB) with grad checkpointing on.

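	Hyperparameters as currently set in main():
	max_steps=80, num_generations=4, max_completion_length=384,
	max_tool_calling_iterations=20, LoRA on, beta=0.0, fp16,
	per_device_train_batch_size=1, gradient_accumulation_steps=4, lr=1e-5.
	(The values originally confirmed by the user before the first run were
	max_steps=50, num_generations=2, max_completion_length=512,
	per_device_train_batch_size=2; they were changed to fix OOM and to get
	better advantage variance.)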
	"""

	from __future__ import annotations

	import os
	import time
	from pathlib import Path
	from typing import Any, Dict, List, Optional

	# Silence the experimental-feature warning before importing TRL.
	os.environ.setdefault("TRL_EXPERIMENTAL_SILENCE", "1")


	# NOTE: Qwen2.5-Instruct required a chat-template patch (TRL 1.2.0 ships the
	# template but never wired it into add_response_schema). We dropped that path
	# in favor of the Qwen3 family, which TRL recognizes natively — simpler stack,
	# no monkey-patching. The original smoke script ran Qwen3-0.6B successfully;
	# MODEL_NAME below now points at Qwen3-1.7B. The git history of
	# train_grpo_smoke.py shows the patch logic if it's needed again later.

	import torch
	from peft import LoraConfig
	from trl import GRPOConfig, GRPOTrainer

	from dispatch_arena.catalog.dataset import load_catalog_datasets
	from dispatch_arena.client import DispatchArenaClient
	from dispatch_arena.server.app import run_local_server_in_thread

	# ---------------------------------------------------------------------------
	# Model / paths
	# ---------------------------------------------------------------------------

	MODEL_NAME = "Qwen/Qwen3-1.7B" # Natively recognized by TRL 1.2.0 (no patch needed). Pre-flight verified: tools render into prompt, envelope identical to 0.6B, ~6.7 GB total VRAM on T4.
	CATALOG_PATH = Path(__file__).resolve().parents[1] / "catalog" / "catalog.json"
	OUTPUT_DIR = Path(__file__).resolve().parents[1] / "scripts" / "_grpo_normal_out"

	# ---------------------------------------------------------------------------
	# System prompt for normal-mode dispatcher
	# ---------------------------------------------------------------------------

	# Prompt text handed to load_catalog_datasets(system_prompt=...) in main().
	# The six tool names listed inside it (view_dashboard, assign, reposition,
	# hold, prioritize, finish_shift) mirror the public tool methods of
	# DispatchToolEnv; keep the two in sync when adding or renaming a tool.
	# Everything between the triple quotes is runtime text — the lines starting
	# with "#" in there are Markdown headings for the model, not Python comments.
	SYSTEM_PROMPT = """You are a real-time delivery dispatcher running one shift over a small fleet of couriers. Your job is to dispatch each order to the right courier and keep the shift moving so orders are delivered before their deadlines.

	# Tool calling

	Always reply with EXACTLY ONE tool call per turn, in this format (no other text):
	<tool_call>
	{"name": "<tool_name>", "arguments": {<args-json>}}
	</tool_call>

	# Available tools

	```json
	[
	{
	"name": "view_dashboard",
	"description": "Refresh the dashboard. Returns courier statuses, order list, deadlines, and travel times.",
	"parameters": {"type": "object", "properties": {}, "required": []}
	},
	{
	"name": "assign",
	"description": "Dispatch an idle courier to an unassigned order whose status is queued or ready.",
	"parameters": {
	"type": "object",
	"properties": {
	"courier_id": {"type": "string", "description": "e.g. courier_0, courier_1, ..."},
	"order_id": {"type": "string", "description": "e.g. order_0, order_1, ..."}
	},
	"required": ["courier_id", "order_id"]
	}
	},
	{
	"name": "reposition",
	"description": "Pre-stage an idle courier near a busy store or upcoming dropoff.",
	"parameters": {
	"type": "object",
	"properties": {
	"courier_id": {"type": "string"},
	"node_id": {"type": "string", "description": "hub, store_0..3, or customer_0..N"}
	},
	"required": ["courier_id", "node_id"]
	}
	},
	{
	"name": "hold",
	"description": "Wait one tick. Use when prep is not done and no good action exists.",
	"parameters": {"type": "object", "properties": {}, "required": []}
	},
	{
	"name": "prioritize",
	"description": "Mark an order as priority. Safe even if not yet assigned.",
	"parameters": {
	"type": "object",
	"properties": {"order_id": {"type": "string"}},
	"required": ["order_id"]
	}
	},
	{
	"name": "finish_shift",
	"description": "End the shift early once all visible orders are delivered.",
	"parameters": {"type": "object", "properties": {}, "required": []}
	}
	]
	```

	# Examples

	Refresh the dashboard:
	<tool_call>
	{"name": "view_dashboard", "arguments": {}}
	</tool_call>

	Dispatch courier_0 to order_1:
	<tool_call>
	{"name": "assign", "arguments": {"courier_id": "courier_0", "order_id": "order_1"}}
	</tool_call>

	# Rules

	- Prep time is hidden. Queued orders flip to "ready" when prep completes; the courier you dispatch may have to wait briefly at the store.
	- Travel times shown are BASE estimates. With traffic noise, real ETAs can run longer.
	- The shift ends automatically at max_ticks. Maximize on-time deliveries.
	- One tool per turn. Output the tool call in the format above and nothing else."""

	# Opening user message for a rollout.
	# NOTE(review): not referenced anywhere else in the visible file — presumably
	# consumed by the dataset builder or a sibling script; confirm before removing.
	USER_KICKOFF = "Begin the shift. Call view_dashboard first to see the state, then dispatch."


	# ---------------------------------------------------------------------------
	# Server boot + helpers
	# ---------------------------------------------------------------------------


	def _start_shared_server() -> str:
	server, _thread = run_local_server_in_thread(port=0, max_concurrent_envs=64)
	host, port = server.server_address
	time.sleep(0.2) # let uvicorn bind
	return f"http://{host}:{port}"


	SERVER_URL = _start_shared_server()


	def _render_dashboard(obs) -> str:
	"""Compact textual dashboard rendered for the LLM.

	Lists couriers, orders (with deadline + status), an excerpt of the travel
	matrix, last events, and the legal action shape. Designed to fit inside
	~300 tokens so the agent has room for tool-call output too.
	"""
	state = obs.state
	parts = [
	f"tick={state.tick}/{state.max_ticks} verdict={obs.verifier_status.value} "
	f"backlog={state.backlog} sla_pressure={state.sla_pressure:.2f}",
	]

	# Couriers
	parts.append("couriers:")
	for c in state.couriers:
	load = c.load or "none"
	target = f" -> {c.target_node_id}(eta {c.eta_remaining})" if c.target_node_id else ""
	parts.append(f" {c.id} @ {c.node_id} {c.status.value}{target} carrying={load}")

	# Orders
	parts.append("orders:")
	for o in state.orders:
	assigned = o.assigned_courier_id or "-"
	parts.append(
	f" {o.id} {o.kind} {o.pickup_node_id}->{o.dropoff_node_id} "
	f"status={o.status.value} deadline=t{o.deadline_tick} assigned={assigned}"
	)

	# Travel times — compact: one line per node showing top-K nearest
	parts.append("travel_times (base, may run longer with traffic):")
	for src in [n.id for n in state.nodes]:
	row = state.travel_time_matrix.get(src, {})
	# Show all destinations in a compact format
	edges = ", ".join(f"{dst}={t}" for dst, t in row.items() if dst != src)
	parts.append(f" {src}: {edges}")

	# Last events
	if obs.info.get("events"):
	parts.append("last_events: " + " \| ".join(obs.info["events"][-4:]))

	if obs.done:
	parts.append("DONE")
	return "\n".join(parts)


	# ---------------------------------------------------------------------------
	# Tool-calling environment (one per rollout via environment_factory)
	# ---------------------------------------------------------------------------


	class DispatchToolEnv:
	"""Normal-mode dispatcher wrapper exposing 6 tools to the LLM.

	The TRL trainer instantiates one DispatchToolEnv per generation. Public
	methods become the LLM's callable tools (per TRL OpenEnv integration).

	`metrics` is read by the reward functions after the rollout finishes.
	"""

	def __init__(self) -> None:
	self.client = DispatchArenaClient(base_url=SERVER_URL, timeout_seconds=30)
	self.metrics: Dict[str, Any] = {
	"step_total": 0.0,
	"invalid_count": 0,
	"delivered": 0,
	"ticks": 0,
	"verdict": "in_progress",
	"rollout_done": False,
	}

	# The trainer passes dataset row fields here (seed + _config + ...).
	# We accept **kwargs to ignore _difficulty / _skill_focus / _name without
	# leaking them into env state.
	def reset(
	self,
	seed: Optional[int] = None,
	_config: Optional[Dict[str, Any]] = None,
	**_: Any,
	) -> str:
	seed_int = int(seed) if seed is not None else 0
	config = _config or {"mode": "normal", "max_ticks": 16, "num_couriers": 3, "num_orders": 5}
	obs = self.client.reset(seed=seed_int, config=config)
	self.metrics = {
	"step_total": float(obs.reward),
	"invalid_count": 0,
	"delivered": 0,
	"ticks": int(obs.state.tick),
	"verdict": obs.verifier_status.value,
	"rollout_done": False,
	}
	return "Initial dashboard:\n" + _render_dashboard(obs)

	def _step(self, action: Dict[str, Any]) -> str:
	if self.metrics.get("rollout_done"):
	return "Shift already finished — call finish_shift to stop or stop calling tools."
	obs = self.client.step(action)
	self.metrics["step_total"] += float(obs.reward)
	self.metrics["ticks"] = int(obs.state.tick)
	self.metrics["verdict"] = obs.verifier_status.value
	self.metrics["delivered"] = sum(
	1 for o in obs.state.orders if o.status.value == "delivered"
	)
	if obs.info.get("invalid_action"):
	self.metrics["invalid_count"] += 1
	if obs.done:
	self.metrics["rollout_done"] = True
	return _render_dashboard(obs)

	# ---- Tools (each is exposed to the LLM as a callable) -----------------

	def view_dashboard(self) -> str:
	"""Refresh the dashboard with the latest courier/order state."""
	# No-op step semantically — but our env doesn't separate "look" from
	# "act", so we issue a hold instead to advance one tick.
	return self._step({"action_type": "hold"})

	def assign(self, courier_id: str, order_id: str) -> str:
	"""Dispatch a courier to an order. Both must be valid + free.

	Args:
	courier_id: e.g. "courier_0".
	order_id: e.g. "order_3".
	"""
	return self._step(
	{"action_type": "assign", "courier_id": courier_id, "order_id": order_id}
	)

	def reposition(self, courier_id: str, node_id: str) -> str:
	"""Move an idle courier to a node to pre-stage near a busy store.

	Args:
	courier_id: e.g. "courier_1".
	node_id: e.g. "store_0", "hub", "customer_2".
	"""
	return self._step(
	{"action_type": "reposition", "courier_id": courier_id, "node_id": node_id}
	)

	def hold(self) -> str:
	"""Wait one tick. Use when prep is unfinished and no good move exists."""
	return self._step({"action_type": "hold"})

	def prioritize(self, order_id: str) -> str:
	"""Signal that an order is priority. Safe even if not assigned.

	Args:
	order_id: e.g. "order_2".
	"""
	return self._step({"action_type": "prioritize", "order_id": order_id})

	def finish_shift(self) -> str:
	"""End the shift early. Returns the final summary."""
	# Mark rollout done; TRL will stop tool-calling once the next
	# iteration sees the rollout flag. We also issue a hold to advance
	# the tick so the env can finalize.
	if not self.metrics.get("rollout_done"):
	self._step({"action_type": "hold"})
	self.metrics["rollout_done"] = True
	return (
	f"Shift finished. tick={self.metrics['ticks']} delivered={self.metrics['delivered']} "
	f"verdict={self.metrics['verdict']} reward={self.metrics['step_total']:.2f}"
	)


	# ---------------------------------------------------------------------------
	# Reward functions
	# ---------------------------------------------------------------------------


	def reward_total(environments: List[DispatchToolEnv], **_: Any) -> List[float]:
	"""Sum of env's per-step RewardBreakdown.total_reward across the rollout.

	Already includes step_cost, progress, success, invalid_penalty, on-time
	bonus, late penalty, timeout penalty, idle penalty, churn, fairness — so
	additional reward functions would be double-counts of components inside
	this scalar.
	"""
	return [float(env.metrics.get("step_total", 0.0)) for env in environments]


	# ---------------------------------------------------------------------------
	# Main
	# ---------------------------------------------------------------------------


	def main() -> None:
	if not torch.cuda.is_available():
	raise RuntimeError("CUDA not available; this trainer requires a GPU.")

	train_ds, eval_ds, train_specs, eval_specs = load_catalog_datasets(
	catalog_path=CATALOG_PATH,
	system_prompt=SYSTEM_PROMPT,
	eval_fraction=0.30,
	master_seed=0,
	)
	print(f"Catalog loaded: train={len(train_ds)} eval={len(eval_ds)}")

	lora_config = LoraConfig(
	r=16,
	lora_alpha=32,
	lora_dropout=0.05,
	bias="none",
	task_type="CAUSAL_LM",
	target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
	"gate_proj", "up_proj", "down_proj"],
	)

	config = GRPOConfig(
	output_dir=str(OUTPUT_DIR),
	per_device_train_batch_size=1, # OOM fix: only 1 prompt per micro-batch
	gradient_accumulation_steps=4, # generation_batch_size = 141 = 4, divisible by G=4
	num_generations=4, # bump from 2 for better advantage variance
	max_completion_length=384, # OOM fix: was 512, less KV cache + caps rambling earlier
	max_tool_calling_iterations=20,
	learning_rate=1e-5,
	max_steps=80, # longer horizon for the policy to actually move
	beta=0.0, # no KL -> skips reference model
	log_completions=True,
	report_to=["tensorboard"],
	logging_dir=str(OUTPUT_DIR / "tb"),
	save_strategy="no", # smoke-friendly; no checkpoints to disk
	eval_strategy="no", # post-training eval is a separate script
	logging_steps=1,
	bf16=False,
	fp16=True,
	gradient_checkpointing=True,
	model_init_kwargs={"torch_dtype": "float16"},
	)

	trainer = GRPOTrainer(
	model=MODEL_NAME,
	reward_funcs=[reward_total],
	args=config,
	train_dataset=train_ds,
	environment_factory=DispatchToolEnv,
	peft_config=lora_config,
	)

	print("Starting training...")
	train_output = trainer.train()
	print("\n=== TRAIN DONE ===")
	print("metrics:", train_output.metrics)

	# Persist the trained LoRA adapter so eval can load it later.
	# GRPOConfig was set with save_strategy="no" to avoid mid-run checkpoints,
	# but we explicitly save the final state here. trainer.save_model() writes
	# the adapter (since peft is in use) — base model weights are not duplicated.
	final_dir = OUTPUT_DIR / "final_lora"
	trainer.save_model(str(final_dir))
	print(f"LoRA adapter saved -> {final_dir}")


	if __name__ == "__main__":
	main()