# scripts/train_rl.py
"""
VERGIL Real RL Training — REINFORCE with Learned Baseline
============================================================

This script implements a proper RL training loop:
- Neural network policy (MLP) that maps state features → action distribution
- Value network baseline for variance reduction
- REINFORCE with baseline algorithm
- Learning curves showing before vs after improvement
- Uses the VERGIL environment with all Phase 2 modules wired in

This is the MISSING piece for the hackathon: an actual model that learns
from environment feedback through gradient descent.

Usage:
    python3 scripts/train_rl.py --episodes 1000 --lr 3e-4
    python3 scripts/train_rl.py --smoke-test
"""

import argparse
import json
import math
import sys
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

sys.path.insert(0, str(Path(__file__).parent.parent))

from vergil.core.env import VERGILEnv
from vergil.core.types import (
    AgentAction, ActionType, CommitmentStatus, CommitmentNode
)
from vergil.core.pomdp import POMDPWrapper
from vergil.core.execution_model import ProbabilisticExecutionEngine
from vergil.curriculum.failure_db import FailureTopologyDatabase
from vergil.curriculum.scenario_generator import ScenarioGenerator
from vergil.curriculum.curriculum_engine import CurriculumEngine
from vergil.training.evaluation import EvaluationSuite, EvaluationMetrics


# ═══════════════════════════════════════════════════════════════════════════
# State Encoder: converts VERGIL state → flat feature vector
# ═══════════════════════════════════════════════════════════════════════════

def encode_state(state, env) -> torch.Tensor:
    """Flatten a VERGIL state (plus env-side extras) into a float32 vector.

    The result always has 28 entries, laid out as:

        [0-3]   sat score, cognitive load, energy, available hours / 16
        [4-8]   fraction of nodes pending / accepted / completed / failed /
                at-risk
        [9]     node count / 10, capped at 1
        [10-13] first pending node: urgency, hours-to-deadline / 48,
                duration / 10, commitment-type weight
        [14-17] trust scores: mean, min, max, variance
        [18-21] multi-dim trust means: reliability, competence, benevolence,
                composite
        [22-24] uncertainty, epistemic risk, step ratio
        [25-27] committed hours / 10, remaining capacity / 16,
                schedule density (capped at 2, then halved)

    Args:
        state: Object exposing ``cdg_nodes``, ``trust_entries``,
            ``satisfiability_score``, ``cognitive_load``, ``energy_level``,
            ``available_hours_next_48h``, ``current_time`` and
            ``step_number``.
        env: Environment; ``_max_steps`` is read directly, while
            ``multidim_trust`` and ``_pomdp_belief`` are optional.

    Returns:
        A 1-D ``torch.float32`` tensor with 28 elements.
    """
    nodes = state.cdg_nodes
    trust_entries = state.trust_entries
    n_total = max(len(nodes), 1)  # avoids division by zero on an empty graph

    def share(status):
        # Fraction of all nodes currently in `status`.
        return sum(1 for n in nodes if n.status == status) / n_total

    # Only the pending nodes are needed as a list (its head is featurised).
    pending = [n for n in nodes if n.status == CommitmentStatus.PENDING]

    global_block = [
        state.satisfiability_score,
        state.cognitive_load,
        state.energy_level,
        state.available_hours_next_48h / 16.0,  # scaled, not clamped
    ]

    count_block = [
        len(pending) / n_total,
        share(CommitmentStatus.ACCEPTED),
        share(CommitmentStatus.COMPLETED),
        share(CommitmentStatus.FAILED),
        share(CommitmentStatus.AT_RISK),
        min(n_total / 10.0, 1.0),
    ]

    # "Top" pending node is simply the first pending node in graph order.
    if not pending:
        top_block = [0.0, 1.0, 0.0, 0.0]
    else:
        head = pending[0]
        head_urgency = head.urgency
        head_deadline = head.deadline_proximity_hours(state.current_time) / 48.0
        head_duration = head.estimated_duration_hours / 10.0
        type_weights = {
            'explicit_hard': 1.0,
            'explicit_soft': 0.7,
            'implicit': 0.4,
            'social': 0.2,
        }
        head_type = type_weights.get(head.commitment_type.value, 0.5)
        top_block = [head_urgency, head_deadline, head_duration, head_type]

    # Scalar trust statistics; neutral defaults when nobody is tracked yet.
    scores = [entry.trust_score for entry in trust_entries.values()]
    if scores:
        trust_block = [np.mean(scores), np.min(scores),
                       np.max(scores), np.var(scores)]
    else:
        trust_block = [0.5, 0.5, 0.5, 0.0]
    trust_var = trust_block[3]

    # Multi-dimensional trust is optional on the environment.
    md_trust = getattr(env, 'multidim_trust', {})
    if md_trust:
        md_block = [
            np.mean([mt.reliability for mt in md_trust.values()]),
            np.mean([mt.competence for mt in md_trust.values()]),
            np.mean([mt.benevolence for mt in md_trust.values()]),
            np.mean([mt.composite_trust for mt in md_trust.values()]),
        ]
    else:
        md_block = [0.5, 0.5, 0.5, 0.5]

    step_ratio = state.step_number / max(env._max_steps, 1)

    # Prefer the real POMDP belief; otherwise fall back to heuristics.
    belief = getattr(env, '_pomdp_belief', None)
    if belief is None:
        uncertainty = min(1.0, trust_var * 5 + 0.3)
        epistemic_risk = 1.0 - min(1.0, step_ratio * 2)  # shrinks over time
    else:
        uncertainty = belief.overall_uncertainty
        epistemic_risk = belief.epistemic_risk

    # Capacity: only ACCEPTED nodes count as committed hours.
    committed_hours = sum(
        n.estimated_duration_hours
        for n in nodes
        if n.status == CommitmentStatus.ACCEPTED
    )
    remaining_cap = max(0, state.available_hours_next_48h - committed_hours) / 16.0
    density = committed_hours / max(state.available_hours_next_48h, 0.1)

    features = [
        *global_block,
        *count_block,
        *top_block,
        *trust_block,
        *md_block,
        uncertainty, epistemic_risk, step_ratio,
        committed_hours / 10.0, remaining_cap, min(density, 2.0) / 2.0,
    ]
    return torch.tensor(features, dtype=torch.float32)


# ═══════════════════════════════════════════════════════════════════════════
# Policy Network & Value Network
# ═══════════════════════════════════════════════════════════════════════════

STATE_DIM = 28
N_ACTIONS = 4  # accept, decline, counter_propose, do_nothing
# Index i of the policy's output distribution corresponds to ACTION_MAP[i].
ACTION_MAP = [
    ActionType.ACCEPT,
    ActionType.DECLINE,
    ActionType.COUNTER_PROPOSE,
    ActionType.DO_NOTHING,
]


class PolicyNetwork(nn.Module):
    """Two-hidden-layer MLP that turns a state vector into action probabilities.

    Each hidden stage is Linear → ReLU → LayerNorm; the final Linear produces
    one logit per action and ``forward`` applies a softmax over the last
    dimension.
    """

    def __init__(self, state_dim=STATE_DIM, n_actions=N_ACTIONS, hidden=128):
        super().__init__()
        stages = []
        width_in = state_dim
        for _ in range(2):
            stages += [nn.Linear(width_in, hidden), nn.ReLU(), nn.LayerNorm(hidden)]
            width_in = hidden
        head = nn.Linear(hidden, n_actions)
        # Module indices 0..6 match the original layout, so saved
        # state_dicts keep the same keys.
        self.net = nn.Sequential(*stages, head)

    def forward(self, x):
        """Return softmax probabilities over the last dimension."""
        return F.softmax(self.net(x), dim=-1)


class ValueNetwork(nn.Module):
    """MLP baseline that maps a state vector to a single scalar estimate.

    Layers: Linear(state_dim, hidden) → ReLU → Linear(hidden, hidden // 2)
    → ReLU → Linear(hidden // 2, 1). ``forward`` drops the trailing
    size-1 dimension.
    """

    def __init__(self, state_dim=STATE_DIM, hidden=128):
        super().__init__()
        half = hidden // 2
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, half),
            nn.ReLU(),
            nn.Linear(half, 1),
        )

    def forward(self, x):
        """Return the value estimate with the last (size-1) axis squeezed."""
        out = self.net(x)
        return out.squeeze(-1)


# ═══════════════════════════════════════════════════════════════════════════
# Task Completion Simulator (same as heuristic trainer)
# ═══════════════════════════════════════════════════════════════════════════

def simulate_task_progress(env):
    """Advance work on every ACCEPTED node and complete those that are done.

    For each accepted node, accumulated work (stored in ``env._hidden`` under
    ``work_done_<node id>``) grows by ``step_hours * efficiency * 0.85``,
    where efficiency is ``max(0.3, 1 - cognitive_load * 0.4)``. Once the
    accumulated work reaches the node's true duration (from
    ``env._hidden['true_durations']``, falling back to the node's estimate),
    the node is marked COMPLETED and its ``actual_duration_hours`` is set to
    the accumulated work.

    Does nothing when the environment has no graph or no state yet.
    """
    if env.cdg is None or env._state is None:
        return

    now = env._state.current_time
    hours_per_step = env.config.get('step_hours', 2)

    for node_id, node in env.cdg._nodes.items():
        if node.status == CommitmentStatus.ACCEPTED:
            work_key = f"work_done_{node_id}"
            done_so_far = env._hidden.get(work_key, 0.0)
            required = env._hidden.get('true_durations', {}).get(
                node_id, node.estimated_duration_hours)

            # Higher cognitive load slows progress, floored at 30% efficiency.
            efficiency = max(0.3, 1.0 - env._state.cognitive_load * 0.4)
            done_so_far += hours_per_step * efficiency * 0.85
            env._hidden[work_key] = done_so_far

            if done_so_far >= required:
                env.cdg.update_node_status(node_id, CommitmentStatus.COMPLETED, now)
                node.actual_duration_hours = done_so_far


# ═══════════════════════════════════════════════════════════════════════════
# REINFORCE Episode Runner
# ═══════════════════════════════════════════════════════════════════════════
def run_rl_episode(env, pomdp, policy, value_net, scenario, gamma=0.99):
    """Roll out one episode with the current policy and record the trajectory.

    The rollout itself runs without autograd: the networks are evaluated
    under ``torch.no_grad()``, so the returned ``log_probs`` and ``values``
    are plain numbers for logging/inspection and carry NO gradient. Training
    code must recompute them from ``states`` and ``actions`` (see
    ``compute_reinforce_loss``).

    Args:
        env: VERGIL environment wrapped by ``pomdp``; its ``_pomdp_belief``
            attribute is overwritten with the latest belief on every step.
        pomdp: Wrapper providing ``reset(scenario=...)`` and ``step(action)``.
        policy: Module mapping a ``(1, STATE_DIM)`` batch to action
            probabilities.
        value_net: Module mapping a ``(1, STATE_DIM)`` batch to a value.
        scenario: Scenario passed to ``pomdp.reset``.
        gamma: Unused here; kept for call-site compatibility.

    Returns:
        Dict with keys ``states``, ``actions``, ``rewards``, ``log_probs``,
        ``values``, ``total_reward``, ``fulfillment``, ``trust_index`` and
        ``steps``.
    """
    state, belief, _info = pomdp.reset(scenario=scenario)
    env._pomdp_belief = belief  # Store belief so encode_state() can read it

    states, actions, rewards, log_probs, values = [], [], [], [], []
    total_reward = 0.0

    while True:
        simulate_task_progress(env)

        # Encode state
        state_vec = encode_state(state, env)
        states.append(state_vec)

        # Policy forward pass. No graph is needed during the rollout because
        # the training step re-evaluates the networks on the stored states.
        with torch.no_grad():
            batch = state_vec.unsqueeze(0)
            probs = policy(batch).squeeze(0)
            value = value_net(batch).squeeze(0)

        dist = Categorical(probs)
        action_idx = dist.sample()
        action_type = ACTION_MAP[action_idx.item()]

        log_probs.append(dist.log_prob(action_idx))
        values.append(value)
        # The *sampled* index is recorded even if the action is downgraded to
        # DO_NOTHING below: from the policy's point of view the environment
        # simply treats an invalid choice as a no-op.
        actions.append(action_idx.item())

        # Action masking: accept/decline/counter ONLY work on pending nodes.
        nodes = state.cdg_nodes
        pending = [n for n in nodes if n.status == CommitmentStatus.PENDING]
        target_node_id = None
        if action_type in (ActionType.ACCEPT, ActionType.DECLINE,
                           ActionType.COUNTER_PROPOSE):
            if pending:
                target_node_id = pending[0].node_id
            else:
                # No pending nodes → cannot do this action, fall back.
                action_type = ActionType.DO_NOTHING

        action = AgentAction(
            action_type=action_type,
            target_node_id=target_node_id,
            feasibility_prediction=0.5 + float(probs[0]) * 0.4,
        )

        if action_type == ActionType.COUNTER_PROPOSE and target_node_id:
            node = next((n for n in nodes if n.node_id == target_node_id), None)
            if node:
                # Counter-offer: twice the estimated duration from now.
                action.proposed_deadline = state.current_time + timedelta(
                    hours=node.estimated_duration_hours * 2.0)

        state, belief, reward, terminated, truncated, _step_info = pomdp.step(action)
        env._pomdp_belief = belief  # Update belief for next encode_state()
        # NOTE(review): progress is simulated both here and at the top of the
        # loop, i.e. twice per environment step — confirm this is intended.
        simulate_task_progress(env)

        rewards.append(reward)
        total_reward += reward

        if terminated or truncated:
            break

    # Compute metrics
    final_nodes = state.cdg_nodes
    n_completed = sum(
        1 for n in final_nodes if n.status == CommitmentStatus.COMPLETED)
    n_total = sum(
        1 for n in final_nodes
        if n.status in (CommitmentStatus.COMPLETED,
                        CommitmentStatus.FAILED,
                        CommitmentStatus.ACCEPTED))
    fulfillment = n_completed / max(1, n_total)

    # np.mean([]) is NaN (and would end up as invalid JSON); use the same
    # neutral 0.5 that encode_state() uses when no trust entries exist.
    trust_scores = [te.trust_score for te in state.trust_entries.values()]
    trust_index = float(np.mean(trust_scores)) if trust_scores else 0.5

    return {
        'states': states,
        'actions': actions,
        'rewards': rewards,
        'log_probs': log_probs,
        'values': values,
        'total_reward': total_reward,
        'fulfillment': fulfillment,
        'trust_index': trust_index,
        'steps': len(rewards),
    }


def compute_returns(rewards, gamma=0.99):
    """Compute discounted returns, standardised when there are 2+ steps.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        gamma: Discount factor.

    Returns:
        1-D float32 tensor ``G`` with ``G[t] = r[t] + gamma * G[t+1]``. When
        more than one step is present the tensor is shifted to zero mean and
        divided by its standard deviation (plus 1e-8).
    """
    backwards = []
    running = 0
    for r in reversed(rewards):
        running = r + gamma * running
        backwards.append(running)
    backwards.reverse()  # append + reverse instead of O(n²) insert(0, ...)

    returns = torch.tensor(backwards, dtype=torch.float32)
    if len(returns) > 1:
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return returns


def compute_reinforce_loss(policy, value_net, states, actions, returns,
                           value_coef=0.5, entropy_coef=0.01):
    """Build the differentiable REINFORCE-with-baseline loss for one episode.

    The networks are re-evaluated here on the stored states so that the
    log-probabilities and value estimates are attached to the autograd graph.
    (Values captured during the rollout were produced under
    ``torch.no_grad()`` and cannot be back-propagated through.)

    Args:
        policy: Module mapping ``(T, state_dim)`` to ``(T, n_actions)``
            probabilities.
        value_net: Module mapping ``(T, state_dim)`` to ``(T,)`` values.
        states: List of ``T`` 1-D state tensors.
        actions: List of ``T`` sampled action indices.
        returns: ``(T,)`` tensor of return targets.
        value_coef: Weight of the value (MSE) loss.
        entropy_coef: Weight of the entropy bonus.

    Returns:
        Tuple ``(loss, policy_loss, value_loss)`` of scalar tensors, where
        ``loss = policy_loss + value_coef * value_loss - entropy_coef * H``.
    """
    states_t = torch.stack(states)
    actions_t = torch.as_tensor(actions, dtype=torch.long)

    probs = policy(states_t)
    log_probs = Categorical(probs).log_prob(actions_t)
    values = value_net(states_t)

    # The baseline only reduces variance; it must not receive policy gradients.
    advantages = returns - values.detach()
    policy_loss = -(log_probs * advantages).mean()
    value_loss = F.mse_loss(values, returns)
    entropy = -(probs * torch.log(probs + 1e-8)).sum(dim=-1).mean()

    loss = policy_loss + value_coef * value_loss - entropy_coef * entropy
    return loss, policy_loss, value_loss


def _evaluate_policy(env, pomdp, policy, value_net, curriculum, n_episodes=20):
    """Run ``n_episodes`` stage-1 episodes and return (rewards, fulfillments)."""
    rewards, fulfillments = [], []
    with torch.no_grad():
        for _ in range(n_episodes):
            env.curriculum_stage = 1
            scenario = curriculum.generate_next_episode()
            ep_data = run_rl_episode(env, pomdp, policy, value_net, scenario)
            rewards.append(ep_data['total_reward'])
            fulfillments.append(ep_data['fulfillment'])
    return rewards, fulfillments


# ═══════════════════════════════════════════════════════════════════════════
# Main Training Loop
# ═══════════════════════════════════════════════════════════════════════════

def main():
    """Train the policy with REINFORCE + baseline and save results.

    Parses CLI arguments, measures a pre-training baseline, trains for the
    requested number of episodes under the curriculum, re-evaluates, prints a
    before/after summary and writes the results JSON, the learning-curve JSON
    and the model checkpoint into ``--log-dir``.
    """
    parser = argparse.ArgumentParser(description='VERGIL RL Training')
    parser.add_argument('--episodes', type=int, default=1000)
    parser.add_argument('--stage', type=int, default=1)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--smoke-test', action='store_true')
    parser.add_argument('--log-dir', type=str, default='/tmp/vergil_rl_training')
    args = parser.parse_args()

    if args.smoke_test:
        args.episodes = 200
        print("🔥 Smoke test: 200 RL episodes")

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    print()
    print("╔══════════════════════════════════════════════════╗")
    print("║ VERGIL RL Training — REINFORCE + Baseline ║")
    print("╠══════════════════════════════════════════════════╣")
    print(f"║ Episodes: {args.episodes:<6} │ LR: {args.lr:<12} ║")
    print(f"║ Stage: {args.stage:<9} │ γ: {args.gamma:<12} ║")
    print("╚══════════════════════════════════════════════════╝")
    print()

    # ── Initialize ────────────────────────────────────────────────────────
    env = VERGILEnv(seed=args.seed, config={
        'max_steps_per_episode': 30,
        'step_hours': 2,
        'log_dir': args.log_dir,
    })
    pomdp = POMDPWrapper(env)

    failure_db = FailureTopologyDatabase(db_path='/tmp/vergil_ftd_rl.sqlite')
    scenario_gen = ScenarioGenerator(seed=args.seed)
    curriculum = CurriculumEngine(
        failure_db=failure_db,
        scenario_generator=scenario_gen,
        initial_stage=args.stage,
    )

    policy = PolicyNetwork()
    value_net = ValueNetwork()
    trainable = list(policy.parameters()) + list(value_net.parameters())
    optimizer = optim.Adam(trainable, lr=args.lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=200, gamma=0.9)

    # ── Metrics Tracking ──────────────────────────────────────────────────
    training_curve = []
    all_rewards = []
    all_fulfillments = []
    all_trusts = []
    policy_losses = []
    value_losses = []

    # ── Before training baseline ──────────────────────────────────────────
    print("📊 Running PRE-TRAINING baseline (random policy)...")
    pre_rewards, pre_fulfill = _evaluate_policy(
        env, pomdp, policy, value_net, curriculum)
    print(f" Pre-training: reward={np.mean(pre_rewards):+.3f} "
          f"fulfill={np.mean(pre_fulfill):.1%}")
    print()

    # ── Training ──────────────────────────────────────────────────────────
    start_time = time.time()
    print(f"{'─'*65}")
    print(f" {'Ep':>5} {'Stage':>5} {'Reward':>8} {'Fulfill':>8} "
          f"{'Trust':>7} {'PLoss':>8} {'VLoss':>8}")
    print(f"{'─'*65}")

    for ep in range(1, args.episodes + 1):
        env.curriculum_stage = curriculum.current_stage
        scenario = curriculum.generate_next_episode()

        # Run episode with current policy
        ep_data = run_rl_episode(
            env, pomdp, policy, value_net, scenario, gamma=args.gamma)

        all_rewards.append(ep_data['total_reward'])
        all_fulfillments.append(ep_data['fulfillment'])
        all_trusts.append(ep_data['trust_index'])

        # Returns, then a loss rebuilt WITH gradients from stored states and
        # actions (the rollout's own log-probs/values are graph-free).
        returns = compute_returns(ep_data['rewards'], args.gamma)
        loss, policy_loss, value_loss = compute_reinforce_loss(
            policy, value_net, ep_data['states'], ep_data['actions'], returns)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(trainable, 1.0)
        optimizer.step()
        scheduler.step()

        policy_losses.append(policy_loss.item())
        value_losses.append(value_loss.item())

        # Record curriculum
        curriculum.record_episode_reward(
            ep_data['total_reward'], curriculum.current_stage)
        promoted = curriculum.check_promotion()
        if promoted:
            print(f"\n 🎓 PROMOTED TO STAGE {curriculum.current_stage}!\n")

        # Logging
        if ep % 20 == 0 or ep == 1 or promoted:
            recent_r = all_rewards[-20:]
            recent_f = all_fulfillments[-20:]
            recent_t = all_trusts[-20:]
            recent_pl = policy_losses[-20:]
            recent_vl = value_losses[-20:]

            elapsed = time.time() - start_time
            # Guard against a zero reading from a coarse clock.
            eps_per_sec = ep / max(elapsed, 1e-9)

            print(f" {ep:5d} {curriculum.current_stage:5d} "
                  f"{np.mean(recent_r):+8.3f} {np.mean(recent_f):7.1%} "
                  f"{np.mean(recent_t):6.3f} "
                  f"{np.mean(recent_pl):8.4f} {np.mean(recent_vl):8.4f}")

            training_curve.append({
                'episode': ep,
                'stage': curriculum.current_stage,
                'avg_reward': round(float(np.mean(recent_r)), 4),
                'avg_fulfillment': round(float(np.mean(recent_f)), 4),
                'avg_trust': round(float(np.mean(recent_t)), 4),
                'policy_loss': round(float(np.mean(recent_pl)), 4),
                'value_loss': round(float(np.mean(recent_vl)), 4),
                'eps_per_sec': round(eps_per_sec, 1),
            })

    # ── Post-training evaluation ──────────────────────────────────────────
    print(f"\n📊 Running POST-TRAINING evaluation...")
    post_rewards, post_fulfill = _evaluate_policy(
        env, pomdp, policy, value_net, curriculum)
    print(f" Post-training: reward={np.mean(post_rewards):+.3f} "
          f"fulfill={np.mean(post_fulfill):.1%}")

    # ── Final Summary ─────────────────────────────────────────────────────
    elapsed = time.time() - start_time
    print(f"\n{'═'*65}")
    print(f" TRAINING COMPLETE: {args.episodes} episodes in {elapsed:.1f}s")
    print(f"{'═'*65}")

    pre_reward_avg = np.mean(pre_rewards)
    post_reward_avg = np.mean(post_rewards)
    pre_fulfill_avg = np.mean(pre_fulfill)
    post_fulfill_avg = np.mean(post_fulfill)
    reward_improvement = post_reward_avg - pre_reward_avg
    fulfill_improvement = post_fulfill_avg - pre_fulfill_avg
    n_policy_params = sum(p.numel() for p in policy.parameters())
    n_value_params = sum(p.numel() for p in value_net.parameters())

    print("\n ┌─── Before vs After ───────────────────────────┐")
    print(" │ BEFORE AFTER DELTA │")
    print(f" │ Reward: {pre_reward_avg:+7.3f} {post_reward_avg:+7.3f} {reward_improvement:+6.3f} │")
    print(f" │ Fulfillment: {pre_fulfill_avg:7.1%} {post_fulfill_avg:7.1%} {fulfill_improvement:+6.1%} │")
    print(" └────────────────────────────────────────────────┘")
    print(f"\n Final Stage Reached: {curriculum.current_stage}")
    print(f" Policy Parameters: {n_policy_params:,}")
    print(f" Value Net Parameters: {n_value_params:,}")
    print(f"{'═'*65}")

    # ── Save Results ──────────────────────────────────────────────────────
    results_dir = Path(args.log_dir)
    results_dir.mkdir(parents=True, exist_ok=True)
    results_path = results_dir / 'rl_training_results.json'
    model_path = results_dir / 'vergil_rl_model.pt'
    curve_path = results_dir / 'rl_training_curve.json'

    results = {
        'algorithm': 'REINFORCE_with_baseline',
        'total_episodes': args.episodes,
        'elapsed_seconds': round(elapsed, 2),
        'learning_rate': args.lr,
        'gamma': args.gamma,
        'final_stage': curriculum.current_stage,
        'before_after': {
            'pre_training_reward': round(float(pre_reward_avg), 4),
            'post_training_reward': round(float(post_reward_avg), 4),
            'reward_improvement': round(float(reward_improvement), 4),
            'pre_training_fulfillment': round(float(pre_fulfill_avg), 4),
            'post_training_fulfillment': round(float(post_fulfill_avg), 4),
            'fulfillment_improvement': round(float(fulfill_improvement), 4),
        },
        'training_curve': training_curve,
        'model_params': {
            'policy': n_policy_params,
            'value_net': n_value_params,
        },
    }

    with open(results_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\n Results saved to: {results_path}")

    # Save trained model
    torch.save({
        'policy_state_dict': policy.state_dict(),
        'value_net_state_dict': value_net.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, model_path)
    print(f" Model saved to: {model_path}")

    with open(curve_path, 'w') as f:
        json.dump(training_curve, f, indent=2)
    print(f" Learning curve saved to: {curve_path}")
    print()


if __name__ == '__main__':
    main()