Spaces:

Godreign
/

Policy2Logic

Sleeping

Policy2Logic / training /trajectory_optimizer.py

Godreign-Y

Fix encoding issue in Colab notebook

e59d389 24 days ago

26.7 kB

	"""
	Reward-Guided Trajectory Optimization Loop
	==========================================
	Optimizes agent behavior across episodes by accumulating high-reward
	trajectories as few-shot examples. Uses environment reward signal to
	drive improvement — no weight updates required.

	This implements a policy improvement loop where:
	- reward_signal → trajectory_selection → context_construction → improved_policy
	"""

	import json
	import os
	import time
	import requests
	import logging
	import wandb
	from dataclasses import dataclass, field
	from typing import Optional
	from openai import OpenAI

	# ── Configuration ────────────────────────────────────────────────────────────

	# Endpoint and credentials come from the process environment, with local
	# defaults so the module can be imported without any setup.
	ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:7860")
	HF_TOKEN = os.environ.get("HF_TOKEN", "")

	# Chat-completion settings sent with every model request.
	MODEL = "Qwen/Qwen2.5-72B-Instruct"
	TEMPERATURE = 0.3
	MAX_TOKENS = 1024

	# Training hyperparameters
	TASKS = ["data_access", "resource_access", "transaction_approval"]
	NUM_EPISODES_PER_TASK = 8  # Episodes to run per task
	TOP_K_TRAJECTORIES = 3  # Max few-shot examples to keep
	MIN_REWARD_THRESHOLD = 0.3  # Minimum reward to store trajectory

	# ── Data Structures ──────────────────────────────────────────────────────────

	@dataclass
	class Step:
	    """Record of a single agent action and what the environment returned for it."""

	    # 1-based position of the action within its episode.
	    step_number: int
	    # Name of the action that was sent to the environment.
	    action_type: str
	    # Payload sent with the action (the training loop stores a truncated copy).
	    action_content: str
	    # Reward reported by the environment for this action.
	    reward: float
	    # Accuracy reported in the observation that followed this action.
	    accuracy: float
	    # Feedback text from the observation; empty string when there was none.
	    feedback: str
	    # Clarification answer from the observation, if any.
	    clarification_response: Optional[str] = None

	@dataclass
	class Trajectory:
	    """Everything recorded about one episode of one task.

	    A new trajectory starts with no steps, zero reward, zero accuracy and
	    ``success`` set to False; the caller fills these in as the episode runs.
	    """

	    task_name: str
	    episode_id: int
	    steps: list[Step] = field(default_factory=list)
	    total_reward: float = 0.0
	    final_accuracy: float = 0.0
	    success: bool = False

	    def to_few_shot_string(self) -> str:
	        """Render the episode as plain text suitable for a few-shot prompt.

	        The first line is a header with the total reward and final accuracy.
	        Each step then contributes its action, its content cut to 200
	        characters, and its accuracy and reward. Non-empty feedback is added
	        on its own line, cut to 150 characters.
	        """
	        header = f"=== Example Episode (reward={self.total_reward:.2f}, accuracy={self.final_accuracy:.2f}) ==="
	        rendered = [header]
	        for step in self.steps:
	            rendered.extend((
	                f"Step {step.step_number}: action={step.action_type}",
	                f" Content: {step.action_content[:200]}",
	                f" Result: accuracy={step.accuracy:.2f}, reward={step.reward:.2f}",
	            ))
	            if step.feedback:
	                rendered.append(f" Feedback: {step.feedback[:150]}")
	        return "\n".join(rendered)

	# ── Environment Client ────────────────────────────────────────────────────────

	class EnvClient:
	    """Small HTTP client for the environment's ``/reset``, ``/step`` and ``/health`` routes.

	    All requests go through one ``requests.Session`` so connections are reused.
	    """

	    def __init__(self, base_url: str, timeout: float = 30.0):
	        """
	        Args:
	            base_url: Root URL of the environment server; a trailing slash is removed.
	            timeout: Seconds to wait for ``/reset`` and ``/step`` before giving up.
	                Without it a stalled server would block the training run forever.
	        """
	        self.base_url = base_url.rstrip("/")
	        self.timeout = timeout
	        self.session = requests.Session()

	    def reset(self, task_name: str) -> dict:
	        """Start a new episode for ``task_name`` and return the decoded JSON reply.

	        Raises:
	            requests.HTTPError: if the server answers with an error status.
	            requests.Timeout: if no answer arrives within ``self.timeout`` seconds.
	        """
	        r = self.session.post(
	            f"{self.base_url}/reset",
	            json={"task_name": task_name},
	            timeout=self.timeout,
	        )
	        r.raise_for_status()
	        return r.json()

	    def step(self, action_type: str, content: str) -> dict:
	        """Send one action to the environment and return the decoded JSON reply.

	        Raises:
	            requests.HTTPError: if the server answers with an error status.
	            requests.Timeout: if no answer arrives within ``self.timeout`` seconds.
	        """
	        r = self.session.post(
	            f"{self.base_url}/step",
	            json={
	                "action_type": action_type,
	                "content": content,
	            },
	            timeout=self.timeout,
	        )
	        r.raise_for_status()
	        return r.json()

	    def health(self) -> bool:
	        """Return True only when ``/health`` answers with HTTP 200 within 5 seconds."""
	        try:
	            r = self.session.get(f"{self.base_url}/health", timeout=5)
	        except Exception:
	            # Deliberately broad: any failure to reach the server means "not healthy".
	            return False
	        return r.status_code == 200

	# ── LLM Agent ────────────────────────────────────────────────────────────────

	class Agent:
	    """LLM-backed policy that chooses the next environment action.

	    The agent builds a system prompt (fixed instructions, optional
	    task-specific guidance, optional few-shot trajectories) and a user prompt
	    (the current observation), sends both to the chat-completions endpoint,
	    and parses the reply into an ``(action_type, content)`` pair.
	    """

	    # Fixed instructions that open every system prompt.
	    _BASE_PROMPT = """You are a policy-to-logic agent. Your job is to convert natural language policies into executable rules.

	AVAILABLE ACTIONS:
	1. ask_clarification: {"type": "clarification", "question": "your question"}
	2. propose_rules: {"rules": [...], "default": "DECISION"}
	3. refine_rules: {"rules": [...], "default": "DECISION"}

	DSL FORMAT for rules:
	{
	"rules": [
	{
	"if": [
	{"field": "FIELD_NAME", "op": "OPERATOR", "value": VALUE}
	],
	"then": "DECISION"
	}
	],
	"default": "FALLBACK_DECISION"
	}

	Operators: >, <, >=, <=, ==, !=
	Rules execute top-to-bottom. First match wins. Default applies if no rule matches.

	STRATEGY:
	- Step 1: Ask 1-2 targeted clarification questions about ambiguous terms
	- Step 2: Propose initial rules based on policy + clarifications
	- Step 3+: Refine rules based on failure feedback

	OUTPUT FORMAT: Respond ONLY with valid JSON. No markdown. No explanation.
	{"action_type": "propose_rules", "content": "{...escaped json string...}"}
	"""

	    # Task-specific guidance for complex tasks, appended after the base prompt.
	    _TASK_GUIDANCE = {
	        "transaction_approval": """
	IMPORTANT — TRANSACTION APPROVAL TASK:
	This task has 4 possible decisions: APPROVE, REQUIRE_APPROVAL, COMPLIANCE_REVIEW, HOLD
	Rules are evaluated TOP-TO-BOTTOM. Order matters critically. You MUST order rules by priority:
	1. FIRST: Check if transfer_type == "international" → then COMPLIANCE_REVIEW (always, overrides everything)
	2. SECOND: Check if amount >= 10000 AND time is outside business hours (hour < 9 or hour >= 17) → then HOLD
	3. THIRD: Check if amount > 5000 AND initiator_role != "manager" → then REQUIRE_APPROVAL
	4. DEFAULT: APPROVE

	Key details:
	- Standard limit is $5,000 (amount > 5000 triggers approval, NOT >=)
	- High-value threshold is $10,000 (amount >= 10000)
	- Business hours: hour >= 9 AND hour < 17
	- Manager exemption ONLY applies to the standard $5,000 limit, NOT to international or high-value HOLD rules
	- "system" role follows the same rules as "employee"

	Here is a working example of valid rules for this task:
	{"rules": [{"if": [{"field": "transfer_type", "op": "==", "value": "international"}], "then": "COMPLIANCE_REVIEW"}, {"if": [{"field": "amount", "op": ">=", "value": 10000}, {"field": "time", "op": ">=", "value": 17}], "then": "HOLD"}, {"if": [{"field": "amount", "op": ">=", "value": 10000}, {"field": "time", "op": "<", "value": 9}], "then": "HOLD"}, {"if": [{"field": "amount", "op": ">", "value": 5000}, {"field": "initiator_role", "op": "!=", "value": "manager"}], "then": "REQUIRE_APPROVAL"}], "default": "APPROVE"}
	""",
	        "resource_access": """
	IMPORTANT — RESOURCE ACCESS TASK:
	This task has roles: junior, senior, contractor. Document types: public, internal, confidential.
	- Senior employees: ALLOW everything always
	- Contractors: ALLOW only public, DENY everything else
	- Junior + confidential: ALWAYS DENY (regardless of time — the policy is misleading about this)
	- Junior + internal: ALLOW only during business hours (hour >= 8 AND hour < 17)
	- Junior + public: ALLOW always
	- Business hours: hour >= 8 AND hour < 17
	""",
	    }

	    def __init__(self, hf_token: str):
	        """Create the OpenAI-compatible client pointed at the Hugging Face router."""
	        self.client = OpenAI(
	            base_url="https://router.huggingface.co/v1",
	            api_key=hf_token
	        )

	    @staticmethod
	    def _fallback_action() -> tuple[str, str]:
	        """Safe default used whenever the model call or its parsing fails."""
	        return "propose_rules", json.dumps({"rules": [], "default": "DENY"})

	    def get_action(
	        self,
	        observation: dict,
	        step_number: int,
	        episode_history: list[str],
	        few_shot_examples: list[Trajectory],
	        task_name: str = ""
	    ) -> tuple[str, str]:
	        """
	        Ask the model for the next action.

	        Returns (action_type, content_json_string).
	        action_type: one of ask_clarification | propose_rules | refine_rules
	        content: JSON string appropriate for that action

	        Any exception raised while calling the model or parsing its reply is
	        printed and replaced by an empty DENY-by-default rule proposal, so a
	        single bad completion never aborts the episode.
	        """
	        system_prompt = self._build_system_prompt(few_shot_examples, task_name)
	        user_prompt = self._build_user_prompt(observation, step_number, episode_history)

	        try:
	            response = self.client.chat.completions.create(
	                model=MODEL,
	                messages=[
	                    {"role": "system", "content": system_prompt},
	                    {"role": "user", "content": user_prompt}
	                ],
	                temperature=TEMPERATURE,
	                max_tokens=MAX_TOKENS
	            )
	            raw = response.choices[0].message.content.strip()
	            return self._parse_response(raw, observation)
	        except Exception as e:
	            print(f" [LLM ERROR] {e}")
	            return self._fallback_action()

	    def _build_system_prompt(self, few_shot_examples: list[Trajectory], task_name: str = "") -> str:
	        """Assemble base instructions, task guidance and few-shot examples.

	        Guidance is added only for task names present in ``_TASK_GUIDANCE``.
	        At most the last ``TOP_K_TRAJECTORIES`` entries of
	        ``few_shot_examples`` are rendered.
	        """
	        parts = [self._BASE_PROMPT, self._TASK_GUIDANCE.get(task_name, "")]

	        if few_shot_examples:
	            parts.append("\n\nLEARNED FROM PREVIOUS EPISODES (high-reward strategies):\n")
	            parts.extend(
	                "\n" + traj.to_few_shot_string() + "\n"
	                for traj in few_shot_examples[-TOP_K_TRAJECTORIES:]
	            )
	        return "".join(parts)

	    def _build_user_prompt(self, obs: dict, step: int, history: list[str]) -> str:
	        """Describe the current observation to the model.

	        Optional sections (clarification answer, test results, feedback,
	        action history) are included only when the observation has them.
	        """
	        lines = [
	            f"TASK: {obs.get('task_name', 'unknown')}",
	            f"STEP: {step} of {obs.get('max_steps', 7)}",
	            f"\nPOLICY:\n{obs.get('policy_text', '')}",
	        ]
	        if obs.get("clarification_response"):
	            lines.append(f"\nLAST CLARIFICATION ANSWER:\n{obs['clarification_response']}")
	        if obs.get("test_results"):
	            tr = obs["test_results"]
	            lines.append(f"\nTEST RESULTS: {tr.get('passed', 0)}/{tr.get('total', 0)} passed (accuracy={obs.get('current_accuracy', 0):.2f})")
	            if tr.get("sample_failures"):
	                lines.append("SAMPLE FAILURES:")
	                # Only the first three failures are shown to the model.
	                lines.extend(f" - {failure}" for failure in tr["sample_failures"][:3])
	        if obs.get("feedback"):
	            lines.append(f"\nFEEDBACK: {obs['feedback']}")
	        if history:
	            lines.append("\nACTION HISTORY (last 3):\n" + "\n".join(history[-3:]))
	        lines.append(f"\nAVAILABLE ACTIONS: {obs.get('available_actions', [])}")
	        lines.append("\nRespond with JSON only: {\"action_type\": \"...\", \"content\": \"...\"}")
	        return "\n".join(lines)

	    def _parse_response(self, raw: str, obs: dict) -> tuple[str, str]:
	        """Turn the model's raw reply into ``(action_type, content_string)``.

	        Markdown code fences are stripped first. A reply that is not a JSON
	        object yields the fallback action. An action the environment does not
	        currently offer is replaced by ``propose_rules`` when that is
	        available, otherwise by the first available action. ``content`` is
	        always returned as a string: any non-string value (object, list,
	        null, number) is JSON-encoded.
	        """
	        # Strip markdown code fences if present
	        if "```" in raw:
	            raw = raw.split("```")[1]
	            if raw.startswith("json"):
	                raw = raw[4:]
	        raw = raw.strip()

	        try:
	            parsed = json.loads(raw)
	        except ValueError:
	            return self._fallback_action()
	        if not isinstance(parsed, dict):
	            return self._fallback_action()

	        action_type = parsed.get("action_type", "propose_rules")
	        content = parsed.get("content", "{}")

	        # Validate action_type. An empty or null list from the environment is
	        # treated like a missing one so a usable reply is not thrown away.
	        valid_actions = obs.get("available_actions") or ["propose_rules", "ask_clarification"]
	        if action_type not in valid_actions:
	            action_type = "propose_rules" if "propose_rules" in valid_actions else valid_actions[0]

	        # Ensure content is a string; callers slice it and send it as text.
	        if not isinstance(content, str):
	            content = json.dumps(content)
	        return action_type, content

	# ── Trajectory Bank ───────────────────────────────────────────────────────────

	class TrajectoryBank:
	    """Stores and retrieves high-reward trajectories per task.

	    Each task keeps at most ``TOP_K_TRAJECTORIES`` trajectories, ordered from
	    highest to lowest total reward.
	    """

	    def __init__(self):
	        """Start with an empty list for every task named in ``TASKS``."""
	        self.bank: dict[str, list[Trajectory]] = {task: [] for task in TASKS}

	    def store(self, trajectory: Trajectory):
	        """Keep ``trajectory`` if it is good enough to serve as an example.

	        Trajectories whose total reward is below ``MIN_REWARD_THRESHOLD`` are
	        ignored. A task name that is not yet in the bank gets a new entry
	        rather than raising ``KeyError``, matching ``get_examples``, which
	        also tolerates unknown tasks.
	        """
	        if trajectory.total_reward >= MIN_REWARD_THRESHOLD:
	            current = self.bank.get(trajectory.task_name, [])
	            # Build a fresh list instead of sorting in place, so a list that
	            # get_examples() handed out earlier is never changed behind the
	            # caller's back. sorted() is stable: on equal reward the older
	            # trajectory stays ahead of the new one.
	            ranked = sorted(current + [trajectory], key=lambda t: t.total_reward, reverse=True)
	            # Keep only top-K by reward
	            self.bank[trajectory.task_name] = ranked[:TOP_K_TRAJECTORIES]

	    def get_examples(self, task_name: str) -> list[Trajectory]:
	        """Return the stored trajectories for ``task_name`` (empty list if none)."""
	        return self.bank.get(task_name, [])

	    def summary(self) -> dict:
	        """Return, per task, the stored count and the best reward and accuracy.

	        Both "best" values are 0 for a task with nothing stored.
	        """
	        return {
	            task: {
	                "stored": len(trajs),
	                "best_reward": max((t.total_reward for t in trajs), default=0),
	                "best_accuracy": max((t.final_accuracy for t in trajs), default=0)
	            }
	            for task, trajs in self.bank.items()
	        }

	# ── Training Loop ─────────────────────────────────────────────────────────────

	class TrainingLoop:
	    """Runs episodes for every task and feeds the best trajectories back as examples.

	    No model weights are updated: improvement comes from the trajectory bank,
	    whose contents are passed to the agent as few-shot examples on later
	    episodes of the same task.
	    """

	    def __init__(self, env_url: str, hf_token: str):
	        """Create the environment client, agent and bank, and set up logging.

	        Logging goes both to the console and to a timestamped file under
	        ``training/logs``.
	        """
	        self.env = EnvClient(env_url)
	        self.agent = Agent(hf_token)
	        self.bank = TrajectoryBank()
	        self.metrics = []  # List of per-episode metric dicts, filled by run()
	        self._wandb_active = False  # True only while a W&B run is open

	        os.makedirs("training/logs", exist_ok=True)
	        log_filename = f"training/logs/run_{int(time.time())}.log"
	        logging.basicConfig(
	            level=logging.INFO,
	            format="%(asctime)s [%(levelname)s] %(message)s",
	            handlers=[
	                logging.FileHandler(log_filename),
	                logging.StreamHandler()  # also print to console
	            ]
	        )
	        self.logger = logging.getLogger("TrainingLoop")
	        self.log_file = log_filename

	    def run_episode(self, task_name: str, episode_id: int) -> Trajectory:
	        """Run a single episode and return the trajectory.

	        The episode ends when the environment reports ``done`` or when the
	        step count reaches the observation's ``max_steps`` (7 if absent).
	        If the step cap is hit first, the accuracy of the last step is used
	        as the final accuracy instead of leaving it at zero.
	        """
	        few_shots = self.bank.get_examples(task_name)
	        trajectory = Trajectory(task_name=task_name, episode_id=episode_id)

	        # Reset environment
	        result = self.env.reset(task_name)
	        obs = result.get("observation", {})
	        done = result.get("done", False)
	        history = []

	        self.logger.info(f"START episode={episode_id} task={task_name} few_shots_available={len(few_shots)}")

	        step_num = 0
	        while not done and step_num < obs.get("max_steps", 7):
	            step_num += 1

	            # Get action from agent
	            action_type, content = self.agent.get_action(
	                observation=obs,
	                step_number=step_num,
	                episode_history=history,
	                few_shot_examples=few_shots,
	                task_name=task_name
	            )

	            # Execute action
	            result = self.env.step(action_type, content)
	            # "or 0.0" also covers an explicit null in the reply.
	            reward = result.get("reward") or 0.0
	            done = result.get("done", False)
	            obs = result.get("observation", {})
	            info = result.get("info", {})
	            accuracy = obs.get("current_accuracy") or 0.0

	            # Record step
	            step = Step(
	                step_number=step_num,
	                action_type=action_type,
	                action_content=content[:300],
	                reward=reward,
	                accuracy=accuracy,
	                feedback=obs.get("feedback", "") or "",
	                clarification_response=obs.get("clarification_response")
	            )
	            trajectory.steps.append(step)
	            trajectory.total_reward += reward

	            # Update history
	            history.append(f"Step {step_num}: {action_type} → reward={reward:.2f} acc={step.accuracy:.2f}")

	            self.logger.info(f"STEP episode={episode_id} step={step_num} action={action_type} reward={reward:.4f} accuracy={step.accuracy:.4f}")

	            if done:
	                # Prefer the environment's episode score when it provides one.
	                trajectory.final_accuracy = info.get("episode_score", accuracy)
	                trajectory.success = accuracy >= 0.9

	        if not done and trajectory.steps:
	            # Step cap reached without a "done" signal: report the progress
	            # actually made rather than a misleading 0.0.
	            last_accuracy = trajectory.steps[-1].accuracy
	            trajectory.final_accuracy = last_accuracy
	            trajectory.success = last_accuracy >= 0.9

	        self.logger.info(f"END episode={episode_id} task={task_name} total_reward={trajectory.total_reward:.4f} final_accuracy={trajectory.final_accuracy:.4f} success={trajectory.success} steps={len(trajectory.steps)}")

	        return trajectory

	    def _init_wandb(self, env_url: str) -> None:
	        """Open a W&B run; on failure log a warning and carry on without it."""
	        try:
	            wandb.init(
	                project="policy-to-logic-rl",
	                name=f"trajectory-opt-{int(time.time())}",
	                config={
	                    "num_episodes_per_task": NUM_EPISODES_PER_TASK,
	                    "top_k_trajectories": TOP_K_TRAJECTORIES,
	                    "min_reward_threshold": MIN_REWARD_THRESHOLD,
	                    "model": MODEL,
	                    "temperature": TEMPERATURE,
	                    "tasks": TASKS,
	                    "env_url": env_url,
	                }
	            )
	            self._wandb_active = True
	        except Exception as e:
	            self._wandb_active = False
	            self.logger.warning(f"Wandb init failed: {e}. Continuing without W&B.")

	    def _log_to_wandb(self, task: str, trajectory: Trajectory, global_episode: int, few_shots_used: int) -> None:
	        """Send one episode's numbers to W&B; never lets a W&B error stop training."""
	        if not self._wandb_active:
	            return
	        try:
	            wandb.log({
	                f"{task}/total_reward": trajectory.total_reward,
	                f"{task}/final_accuracy": trajectory.final_accuracy,
	                f"{task}/num_steps": len(trajectory.steps),
	                f"{task}/success": int(trajectory.success),
	                f"{task}/few_shots_used": few_shots_used,
	                "global/total_reward": trajectory.total_reward,
	                "global/final_accuracy": trajectory.final_accuracy,
	                "episode": global_episode,
	            })
	        except Exception as e:
	            self.logger.warning(f"Wandb log failed: {e}")

	    def _finish_wandb(self) -> None:
	        """Close the W&B run if one was opened."""
	        if not self._wandb_active:
	            return
	        try:
	            wandb.finish()
	        except Exception as e:
	            self.logger.warning(f"Wandb finish failed: {e}")
	        finally:
	            self._wandb_active = False

	    def run(self):
	        """Run full training loop across all tasks.

	        Returns:
	            The list of per-episode metric dicts (also kept in ``self.metrics``).

	        Raises:
	            RuntimeError: if the environment's health check fails.
	        """
	        self.logger.info("=" * 60)
	        self.logger.info("REWARD-GUIDED TRAJECTORY OPTIMIZATION")
	        self.logger.info(f"Tasks: {TASKS}")
	        self.logger.info(f"Episodes per task: {NUM_EPISODES_PER_TASK}")
	        self.logger.info(f"Top-K trajectories: {TOP_K_TRAJECTORIES}")
	        self.logger.info("=" * 60)
	        self.logger.info(f"Log file: {self.log_file}")

	        # Report the URL this loop was actually constructed with, which may
	        # differ from the module-level default.
	        env_url = self.env.base_url

	        # Health check comes before W&B so a dead environment never leaves an
	        # open run behind.
	        if not self.env.health():
	            raise RuntimeError(f"Environment not reachable at {env_url}")
	        self.logger.info(f"Environment: OK ({env_url})\n")

	        self._init_wandb(env_url)
	        try:
	            global_episode = 0

	            for task in TASKS:
	                self.logger.info(f"\n{'-'*40}")
	                self.logger.info(f"TASK: {task}")
	                self.logger.info(f"{'-'*40}")

	                task_rewards = []

	                for ep in range(1, NUM_EPISODES_PER_TASK + 1):
	                    global_episode += 1

	                    # Count the examples before the episode: this is what the
	                    # agent really sees, whatever store() does afterwards.
	                    few_shots_used = len(self.bank.get_examples(task))
	                    trajectory = self.run_episode(task, ep)

	                    # Store in bank
	                    self.bank.store(trajectory)

	                    self._log_to_wandb(task, trajectory, global_episode, few_shots_used)

	                    # Record metrics
	                    self.metrics.append({
	                        "global_episode": global_episode,
	                        "task": task,
	                        "episode_in_task": ep,
	                        "total_reward": trajectory.total_reward,
	                        "final_accuracy": trajectory.final_accuracy,
	                        "success": trajectory.success,
	                        "num_steps": len(trajectory.steps),
	                        "few_shots_used": few_shots_used
	                    })

	                    task_rewards.append(trajectory.total_reward)

	                    self.logger.info(f" → Episode {ep} complete: reward={trajectory.total_reward:.3f} accuracy={trajectory.final_accuracy:.2f} success={trajectory.success}")
	                    time.sleep(0.5)  # Rate limiting

	                if task_rewards:
	                    self.logger.info(f"\n Task summary:")
	                    self.logger.info(f" First episode reward: {task_rewards[0]:.3f}")
	                    self.logger.info(f" Last episode reward: {task_rewards[-1]:.3f}")
	                    self.logger.info(f" Improvement: {task_rewards[-1] - task_rewards[0]:+.3f}")

	            self.logger.info("\n" + "=" * 60)
	            self.logger.info("TRAINING COMPLETE")
	            self.logger.info(f"Bank summary: {self.bank.summary()}")
	            self.logger.info("=" * 60)
	        finally:
	            # Runs on success and on error alike, so the W&B run is closed.
	            self._finish_wandb()

	        return self.metrics

	# ── Plot Generation ───────────────────────────────────────────────────────────

	def save_plots(metrics: list[dict]):
	"""
	Save reward curve and accuracy curve as PNG files.
	These are REQUIRED for hackathon submission — must be committed to repo.
	"""
	try:
	import matplotlib
	matplotlib.use("Agg") # Non-interactive backend
	import matplotlib.pyplot as plt
	import numpy as np
	except ImportError:
	print("matplotlib not installed. Run: pip install matplotlib")
	return

	os.makedirs("training/plots", exist_ok=True)

	episodes = [m["global_episode"] for m in metrics]
	rewards = [m["total_reward"] for m in metrics]
	accuracies = [m["final_accuracy"] for m in metrics]
	tasks = [m["task"] for m in metrics]

	colors = {
	"data_access": "#2196F3",
	"resource_access": "#FF9800",
	"transaction_approval": "#4CAF50"
	}

	# ── Plot 1: Reward Curve (per-task trend lines) ────────────────────────────
	fig, ax = plt.subplots(figsize=(10, 5))

	for task in TASKS:
	task_eps = [m["global_episode"] for m in metrics if m["task"] == task]
	task_rews = [m["total_reward"] for m in metrics if m["task"] == task]
	ax.plot(task_eps, task_rews, marker="o", label=task,
	color=colors.get(task, "gray"), linewidth=2, markersize=5)
	# Per-task trend line
	if len(task_eps) >= 2:
	z = np.polyfit(task_eps, task_rews, 1)
	p = np.poly1d(z)
	ax.plot(task_eps, p(task_eps), "--",
	color=colors.get(task, "gray"), alpha=0.4, linewidth=1.5)

	ax.set_xlabel("Episode")
	ax.set_ylabel("Total Reward")
	ax.set_title("Reward Curve — Reward-Guided Trajectory Optimization")
	ax.legend()
	ax.grid(True, alpha=0.3)
	ax.set_ylim(bottom=0)

	plt.tight_layout()
	plt.savefig("training/plots/reward_curve.png", dpi=150, bbox_inches="tight")
	plt.close()
	print("Saved: training/plots/reward_curve.png")

	# ── Plot 2: Accuracy Curve ────────────────────────────────────────────────
	fig, ax = plt.subplots(figsize=(10, 5))

	for task in TASKS:
	task_eps = [m["global_episode"] for m in metrics if m["task"] == task]
	task_accs = [m["final_accuracy"] for m in metrics if m["task"] == task]
	ax.plot(task_eps, task_accs, marker="s", label=task,
	color=colors.get(task, "gray"), linewidth=2, markersize=5)

	ax.axhline(y=0.9, color="red", linestyle="--", alpha=0.7, label="success threshold (0.9)")

	ax.set_xlabel("Episode")
	ax.set_ylabel("Final Accuracy")
	ax.set_title("Accuracy Curve — Policy-to-Logic Agent")
	ax.legend()
	ax.grid(True, alpha=0.3)
	ax.set_ylim(0, 1.05)

	plt.tight_layout()
	plt.savefig("training/plots/accuracy_curve.png", dpi=150, bbox_inches="tight")
	plt.close()
	print("Saved: training/plots/accuracy_curve.png")

	# ── Plot 3: Per-Task Summary (Accuracy + Efficiency) ──────────────────────
	fig, axes = plt.subplots(1, 2, figsize=(14, 5))

	task_labels = []
	acc_improvements = []
	eff_improvements = []
	best_accuracies = []

	for task in TASKS:
	task_data = [m for m in metrics if m["task"] == task]
	if len(task_data) >= 2:
	task_labels.append(task.replace("_", "\n"))
	acc_improvements.append(task_data[-1]["final_accuracy"] - task_data[0]["final_accuracy"])
	# Efficiency: steps saved (first vs best)
	first_steps = task_data[0]["num_steps"]
	best_steps = min(m["num_steps"] for m in task_data)
	eff_pct = ((first_steps - best_steps) / first_steps * 100) if first_steps > 0 else 0
	eff_improvements.append(eff_pct)
	best_accuracies.append(max(m["final_accuracy"] for m in task_data))

	# Left: Best accuracy per task
	bars1 = axes[0].bar(task_labels, best_accuracies,
	color=["#2196F3", "#FF9800", "#4CAF50"][:len(task_labels)],
	edgecolor="white", linewidth=1.5)
	axes[0].axhline(y=0.9, color="red", linestyle="--", alpha=0.7, label="success threshold")
	axes[0].set_ylabel("Best Accuracy Achieved")
	axes[0].set_title("Best Accuracy Per Task")
	axes[0].set_ylim(0, 1.1)
	axes[0].grid(True, axis="y", alpha=0.3)
	axes[0].legend()
	for bar, val in zip(bars1, best_accuracies):
	axes[0].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02,
	f"{val:.0%}", ha="center", va="bottom", fontweight="bold")

	# Right: Efficiency improvement (% steps saved)
	bars2 = axes[1].bar(task_labels, eff_improvements,
	color=["#2196F3", "#FF9800", "#4CAF50"][:len(task_labels)],
	edgecolor="white", linewidth=1.5)
	axes[1].axhline(y=0, color="black", linewidth=0.8)
	axes[1].set_ylabel("Steps Saved (%)")
	axes[1].set_title("Efficiency Improvement (First → Best Episode)")
	axes[1].grid(True, axis="y", alpha=0.3)
	for bar, val in zip(bars2, eff_improvements):
	y_pos = max(bar.get_height() + 1, 2)
	axes[1].text(bar.get_x() + bar.get_width() / 2, y_pos,
	f"{val:.0f}%", ha="center", va="bottom", fontweight="bold")

	plt.tight_layout()
	plt.savefig("training/plots/improvement_chart.png", dpi=150, bbox_inches="tight")
	plt.close()
	print("Saved: training/plots/improvement_chart.png")

	# ── Save raw metrics as JSON ──────────────────────────────────────────────
	timestamp = int(time.time())
	with open(f"training/plots/metrics_{timestamp}.json", "w") as f:
	json.dump(metrics, f, indent=2)
	with open("training/plots/metrics_latest.json", "w") as f:
	json.dump(metrics, f, indent=2)
	print(f"Saved: training/plots/metrics_{timestamp}.json and metrics_latest.json")

	# ── Entry Point ───────────────────────────────────────────────────────────────

	if __name__ == "__main__":
	    # Stop right away when no token is configured, before anything is created.
	    hf_token = os.environ.get("HF_TOKEN", "")
	    if not hf_token:
	        raise ValueError("HF_TOKEN environment variable not set")

	    # Train across all tasks, then write the charts and raw metrics.
	    metrics = TrainingLoop(ENV_BASE_URL, hf_token).run()
	    save_plots(metrics)

	    print("\nNext step: commit training/plots/*.png to repo for submission.")