# Source: Hugging Face Space "Godreign/Policy2Logic" (Space status: Sleeping).
# File size: 26,658 bytes.

"""
Reward-Guided Trajectory Optimization Loop
==========================================
Optimizes agent behavior across episodes by accumulating high-reward
trajectories as few-shot examples. Uses environment reward signal to
drive improvement — no weight updates required.

This implements a policy improvement loop where:
  - reward_signal → trajectory_selection → context_construction → improved_policy
"""

import json
import os
import time
import requests
import logging
import wandb
from dataclasses import dataclass, field
from typing import Optional
from openai import OpenAI

# ── Configuration ────────────────────────────────────────────────────────────

# Root URL of the environment server; EnvClient appends /reset, /step and
# /health to it. Overridable through the ENV_BASE_URL environment variable.
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:7860")
# Hugging Face token read from the environment at import time.
# NOTE(review): the entry point re-reads HF_TOKEN itself rather than using
# this constant — confirm whether anything else depends on it.
HF_TOKEN = os.getenv("HF_TOKEN", "")
# Model name and sampling settings passed to every chat-completion request.
MODEL = "Qwen/Qwen2.5-72B-Instruct"
TEMPERATURE = 0.3
MAX_TOKENS = 1024

# Training hyperparameters
NUM_EPISODES_PER_TASK = 8        # Episodes to run per task
TOP_K_TRAJECTORIES = 3           # Max few-shot examples to keep
MIN_REWARD_THRESHOLD = 0.3       # Minimum reward to store trajectory
# Task names, in the order the training loop runs them; each is sent to the
# environment's /reset endpoint as "task_name".
TASKS = ["data_access", "resource_access", "transaction_approval"]

# ── Data Structures ──────────────────────────────────────────────────────────

@dataclass
class Step:
    """One agent action within an episode together with the environment's reply."""
    step_number: int  # 1-based position of the step inside its episode
    action_type: str  # action name that was sent to the environment
    action_content: str  # action payload (the training loop stores at most 300 chars)
    reward: float  # reward returned by the environment for this step
    accuracy: float  # observation's "current_accuracy" after the step
    feedback: str  # observation's "feedback" text, "" when absent
    clarification_response: Optional[str] = None  # observation's "clarification_response", if any

@dataclass
class Trajectory:
    """Record of a single episode: its steps plus aggregate outcome figures."""
    task_name: str
    episode_id: int
    steps: list[Step] = field(default_factory=list)
    total_reward: float = 0.0
    final_accuracy: float = 0.0
    success: bool = False

    def to_few_shot_string(self) -> str:
        """Render the episode as a text block suitable for a few-shot prompt.

        The block starts with a header line carrying the total reward and
        final accuracy. Each step then contributes an action line, a content
        line (first 200 characters) and a result line, followed by a feedback
        line (first 150 characters) when the step has non-empty feedback.
        """
        header = f"=== Example Episode (reward={self.total_reward:.2f}, accuracy={self.final_accuracy:.2f}) ==="
        rendered = [header]
        for step in self.steps:
            rendered += [
                f"Step {step.step_number}: action={step.action_type}",
                f"  Content: {step.action_content[:200]}",
                f"  Result: accuracy={step.accuracy:.2f}, reward={step.reward:.2f}",
            ]
            if step.feedback:
                rendered.append(f"  Feedback: {step.feedback[:150]}")
        return "\n".join(rendered)

# ── Environment Client ────────────────────────────────────────────────────────

class EnvClient:
    """Thin HTTP client for the environment server's /reset, /step and /health endpoints."""

    def __init__(self, base_url: str, timeout: float = 60.0):
        """
        Args:
            base_url: Root URL of the environment server; a trailing slash is dropped.
            timeout: Seconds to wait on /reset and /step before requests raises.
                Without a timeout a stalled server would block the run forever.
        """
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.session = requests.Session()

    def reset(self, task_name: str) -> dict:
        """Start a new episode for ``task_name`` and return the decoded JSON reply.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no reply arrives within ``self.timeout`` seconds.
        """
        r = self.session.post(
            f"{self.base_url}/reset",
            json={"task_name": task_name},
            timeout=self.timeout,
        )
        r.raise_for_status()
        return r.json()

    def step(self, action_type: str, content: str) -> dict:
        """Send one action to the environment and return the decoded JSON reply.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no reply arrives within ``self.timeout`` seconds.
        """
        r = self.session.post(
            f"{self.base_url}/step",
            json={
                "action_type": action_type,
                "content": content
            },
            timeout=self.timeout,
        )
        r.raise_for_status()
        return r.json()

    def health(self) -> bool:
        """Return True when GET /health answers 200 within 5 seconds, else False."""
        try:
            r = self.session.get(f"{self.base_url}/health", timeout=5)
            return r.status_code == 200
        except Exception:
            # Deliberately broad: any failure to reach the server means "not healthy".
            return False

# ── LLM Agent ────────────────────────────────────────────────────────────────

class Agent:
    """LLM-backed policy that chooses the next environment action.

    Each call builds a system prompt (static instructions, optional
    task-specific guidance and stored few-shot trajectories) and a user prompt
    (the current observation), sends both to the chat model, and parses the
    reply into an ``(action_type, content)`` pair.
    """

    def __init__(self, hf_token: str):
        """Create an OpenAI-compatible client that talks to the Hugging Face router."""
        self.client = OpenAI(
            base_url="https://router.huggingface.co/v1",
            api_key=hf_token
        )

    def get_action(
        self,
        observation: dict,
        step_number: int,
        episode_history: list[str],
        few_shot_examples: list[Trajectory],
        task_name: str = ""
    ) -> tuple[str, str]:
        """
        Returns (action_type, content_json_string).
        action_type: one of ask_clarification | propose_rules | refine_rules
        content: JSON string appropriate for that action

        Any failure while calling the model is reported on stdout and replaced
        by an empty ``propose_rules`` action so the episode can continue.
        """
        system_prompt = self._build_system_prompt(few_shot_examples, task_name)
        user_prompt = self._build_user_prompt(observation, step_number, episode_history)

        try:
            response = self.client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=TEMPERATURE,
                max_tokens=MAX_TOKENS
            )
            raw = response.choices[0].message.content.strip()
            return self._parse_response(raw, observation)
        except Exception as e:
            # Boundary with a remote service: keep the loop alive on any error.
            print(f"    [LLM ERROR] {e}")
            return "propose_rules", json.dumps({"rules": [], "default": "DENY"})

    def _build_system_prompt(self, few_shot_examples: list[Trajectory], task_name: str = "") -> str:
        """Assemble the system prompt.

        The prompt is the static instructions, plus extra guidance for the
        ``transaction_approval`` and ``resource_access`` tasks, plus the last
        ``TOP_K_TRAJECTORIES`` few-shot examples rendered as text.
        """
        base = """You are a policy-to-logic agent. Your job is to convert natural language policies into executable rules.

AVAILABLE ACTIONS:
1. ask_clarification: {"type": "clarification", "question": "your question"}
2. propose_rules: {"rules": [...], "default": "DECISION"}
3. refine_rules: {"rules": [...], "default": "DECISION"}

DSL FORMAT for rules:
{
  "rules": [
    {
      "if": [
        {"field": "FIELD_NAME", "op": "OPERATOR", "value": VALUE}
      ],
      "then": "DECISION"
    }
  ],
  "default": "FALLBACK_DECISION"
}

Operators: >, <, >=, <=, ==, !=
Rules execute top-to-bottom. First match wins. Default applies if no rule matches.

STRATEGY:
- Step 1: Ask 1-2 targeted clarification questions about ambiguous terms
- Step 2: Propose initial rules based on policy + clarifications  
- Step 3+: Refine rules based on failure feedback

OUTPUT FORMAT: Respond ONLY with valid JSON. No markdown. No explanation.
{"action_type": "propose_rules", "content": "{...escaped json string...}"}
"""
        # Task-specific guidance for complex tasks
        if task_name == "transaction_approval":
            base += """
IMPORTANT — TRANSACTION APPROVAL TASK:
This task has 4 possible decisions: APPROVE, REQUIRE_APPROVAL, COMPLIANCE_REVIEW, HOLD
Rules are evaluated TOP-TO-BOTTOM. Order matters critically. You MUST order rules by priority:
  1. FIRST: Check if transfer_type == "international" → then COMPLIANCE_REVIEW (always, overrides everything)
  2. SECOND: Check if amount >= 10000 AND time is outside business hours (hour < 9 or hour >= 17) → then HOLD
  3. THIRD: Check if amount > 5000 AND initiator_role != "manager" → then REQUIRE_APPROVAL
  4. DEFAULT: APPROVE

Key details:
- Standard limit is $5,000 (amount > 5000 triggers approval, NOT >=)
- High-value threshold is $10,000 (amount >= 10000)
- Business hours: hour >= 9 AND hour < 17
- Manager exemption ONLY applies to the standard $5,000 limit, NOT to international or high-value HOLD rules
- "system" role follows the same rules as "employee"

Here is a working example of valid rules for this task:
{"rules": [{"if": [{"field": "transfer_type", "op": "==", "value": "international"}], "then": "COMPLIANCE_REVIEW"}, {"if": [{"field": "amount", "op": ">=", "value": 10000}, {"field": "time", "op": ">=", "value": 17}], "then": "HOLD"}, {"if": [{"field": "amount", "op": ">=", "value": 10000}, {"field": "time", "op": "<", "value": 9}], "then": "HOLD"}, {"if": [{"field": "amount", "op": ">", "value": 5000}, {"field": "initiator_role", "op": "!=", "value": "manager"}], "then": "REQUIRE_APPROVAL"}], "default": "APPROVE"}
"""
        elif task_name == "resource_access":
            base += """
IMPORTANT — RESOURCE ACCESS TASK:
This task has roles: junior, senior, contractor. Document types: public, internal, confidential.
- Senior employees: ALLOW everything always
- Contractors: ALLOW only public, DENY everything else
- Junior + confidential: ALWAYS DENY (regardless of time — the policy is misleading about this)
- Junior + internal: ALLOW only during business hours (hour >= 8 AND hour < 17)
- Junior + public: ALLOW always
- Business hours: hour >= 8 AND hour < 17
"""

        if few_shot_examples:
            base += "\n\nLEARNED FROM PREVIOUS EPISODES (high-reward strategies):\n"
            for traj in few_shot_examples[-TOP_K_TRAJECTORIES:]:
                base += "\n" + traj.to_few_shot_string() + "\n"
        return base

    def _build_user_prompt(self, obs: dict, step: int, history: list[str]) -> str:
        """Render the current observation as the user message.

        Optional sections (clarification answer, test results with up to three
        sample failures, feedback, last three history entries) are included
        only when the observation or history provides them.
        """
        lines = [
            f"TASK: {obs.get('task_name', 'unknown')}",
            f"STEP: {step} of {obs.get('max_steps', 7)}",
            f"\nPOLICY:\n{obs.get('policy_text', '')}",
        ]
        if obs.get("clarification_response"):
            lines.append(f"\nLAST CLARIFICATION ANSWER:\n{obs['clarification_response']}")
        if obs.get("test_results"):
            tr = obs["test_results"]
            # A present-but-null accuracy would break the ":.2f" format; treat it as 0.
            accuracy = obs.get("current_accuracy") or 0
            lines.append(f"\nTEST RESULTS: {tr.get('passed', 0)}/{tr.get('total', 0)} passed (accuracy={accuracy:.2f})")
            if tr.get("sample_failures"):
                lines.append("SAMPLE FAILURES:")
                for failure in tr["sample_failures"][:3]:
                    lines.append(f"  - {failure}")
        if obs.get("feedback"):
            lines.append(f"\nFEEDBACK: {obs['feedback']}")
        if history:
            lines.append("\nACTION HISTORY (last 3):\n" + "\n".join(history[-3:]))
        lines.append(f"\nAVAILABLE ACTIONS: {obs.get('available_actions', [])}")
        lines.append("\nRespond with JSON only: {\"action_type\": \"...\", \"content\": \"...\"}")
        return "\n".join(lines)

    def _parse_response(self, raw: str, obs: dict) -> tuple[str, str]:
        """Turn the model's raw reply into ``(action_type, content_string)``.

        Markdown code fences are stripped, the remainder is parsed as a JSON
        object, the action type is coerced to one the observation allows, and
        the content is always returned as a string. Unparseable replies yield
        an empty ``propose_rules`` action with default ``DENY``.
        """
        fallback = ("propose_rules", json.dumps({"rules": [], "default": "DENY"}))

        # Strip markdown code fences if present
        if "```" in raw:
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        raw = raw.strip()

        try:
            parsed = json.loads(raw)
            if not isinstance(parsed, dict):
                return fallback
            action_type = parsed.get("action_type", "propose_rules")
            content = parsed.get("content", "{}")

            # Validate action_type. An empty or missing list falls back to the
            # defaults instead of raising IndexError and discarding the reply.
            valid_actions = list(obs.get("available_actions") or ["propose_rules", "ask_clarification"])
            if action_type not in valid_actions:
                action_type = "propose_rules" if "propose_rules" in valid_actions else valid_actions[0]

            # Ensure content is a string: the model may return an object, a
            # list, a number or null where an escaped JSON string was requested.
            if not isinstance(content, str):
                content = json.dumps(content)
            return action_type, content
        except (ValueError, TypeError, AttributeError, IndexError):
            # ValueError covers json.JSONDecodeError; the others cover malformed observations.
            return fallback

# ── Trajectory Bank ───────────────────────────────────────────────────────────

class TrajectoryBank:
    """Stores and retrieves high-reward trajectories per task."""

    def __init__(
        self,
        tasks: Optional[list[str]] = None,
        top_k: Optional[int] = None,
        min_reward: Optional[float] = None,
    ):
        """
        Args:
            tasks: Task names to pre-create empty slots for; defaults to ``TASKS``.
            top_k: Maximum trajectories kept per task; defaults to ``TOP_K_TRAJECTORIES``.
            min_reward: Minimum total reward a trajectory needs to be stored;
                defaults to ``MIN_REWARD_THRESHOLD``.
        """
        self.top_k = TOP_K_TRAJECTORIES if top_k is None else top_k
        self.min_reward = MIN_REWARD_THRESHOLD if min_reward is None else min_reward
        task_names = TASKS if tasks is None else tasks
        self.bank: dict[str, list[Trajectory]] = {task: [] for task in task_names}

    def store(self, trajectory: Trajectory):
        """Keep ``trajectory`` if it meets the reward threshold and ranks in its task's top-K.

        Trajectories for a task that has no slot yet get one created on the
        fly rather than raising KeyError.
        """
        if trajectory.total_reward >= self.min_reward:
            kept = self.bank.setdefault(trajectory.task_name, [])
            kept.append(trajectory)
            # Keep only top-K by reward
            kept.sort(key=lambda t: t.total_reward, reverse=True)
            self.bank[trajectory.task_name] = kept[:self.top_k]

    def get_examples(self, task_name: str) -> list[Trajectory]:
        """Return the stored trajectories for ``task_name``, best reward first.

        A copy is returned so callers never observe later mutations of the bank;
        unknown tasks yield an empty list.
        """
        return list(self.bank.get(task_name, []))

    def summary(self) -> dict:
        """Return per-task counts plus the best reward and accuracy stored (0 when empty)."""
        return {
            task: {
                "stored": len(trajs),
                "best_reward": max((t.total_reward for t in trajs), default=0),
                "best_accuracy": max((t.final_accuracy for t in trajs), default=0)
            }
            for task, trajs in self.bank.items()
        }

# ── Training Loop ─────────────────────────────────────────────────────────────

class TrainingLoop:
    """Runs episodes for every task, feeding the best past trajectories back as few-shot examples."""

    def __init__(self, env_url: str, hf_token: str):
        """Wire up the environment client, agent and trajectory bank, and configure logging.

        Creates ``training/logs`` if needed and logs to both a timestamped file
        there and the console.
        """
        self.env = EnvClient(env_url)
        self.agent = Agent(hf_token)
        self.bank = TrajectoryBank()
        self.metrics = []  # List of per-episode metric dicts (see run())

        os.makedirs("training/logs", exist_ok=True)
        log_filename = f"training/logs/run_{int(time.time())}.log"
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s [%(levelname)s] %(message)s",
            handlers=[
                logging.FileHandler(log_filename),
                logging.StreamHandler()  # also print to console
            ]
        )
        self.logger = logging.getLogger("TrainingLoop")
        self.log_file = log_filename

    def run_episode(self, task_name: str, episode_id: int) -> Trajectory:
        """Run a single episode and return the trajectory.

        The episode ends when the environment reports ``done`` or when the
        observation's ``max_steps`` (default 7) is reached. In both cases the
        trajectory's final accuracy and success flag are filled in from the
        last step, so an episode cut off by the step cap is not recorded as 0.
        """
        few_shots = self.bank.get_examples(task_name)
        trajectory = Trajectory(task_name=task_name, episode_id=episode_id)

        # Reset environment
        result = self.env.reset(task_name)
        obs = result.get("observation", {})
        done = result.get("done", False)
        history = []
        info = {}

        self.logger.info(f"START episode={episode_id} task={task_name} few_shots_available={len(few_shots)}")

        step_num = 0
        while not done and step_num < obs.get("max_steps", 7):
            step_num += 1

            # Get action from agent
            action_type, content = self.agent.get_action(
                observation=obs,
                step_number=step_num,
                episode_history=history,
                few_shot_examples=few_shots,
                task_name=task_name
            )

            # Execute action
            result = self.env.step(action_type, content)
            reward = result.get("reward", 0.0)
            done = result.get("done", False)
            obs = result.get("observation", {})
            info = result.get("info") or {}

            # Record step
            step = Step(
                step_number=step_num,
                action_type=action_type,
                # str() guards the slice in case the agent ever returns non-string content.
                action_content=str(content)[:300],
                reward=reward,
                accuracy=obs.get("current_accuracy") or 0.0,
                feedback=obs.get("feedback", "") or "",
                clarification_response=obs.get("clarification_response")
            )
            trajectory.steps.append(step)
            trajectory.total_reward += reward

            # Update history
            history.append(f"Step {step_num}: {action_type} → reward={reward:.2f} acc={step.accuracy:.2f}")

            self.logger.info(f"STEP episode={episode_id} step={step_num} action={action_type} reward={reward:.4f} accuracy={step.accuracy:.4f}")

        if trajectory.steps:
            # Finalize from the last step whether the episode ended via `done`
            # or by exhausting the step budget.
            last_accuracy = trajectory.steps[-1].accuracy
            trajectory.final_accuracy = info.get("episode_score", last_accuracy)
            trajectory.success = last_accuracy >= 0.9

        self.logger.info(f"END episode={episode_id} task={task_name} total_reward={trajectory.total_reward:.4f} final_accuracy={trajectory.final_accuracy:.4f} success={trajectory.success} steps={len(trajectory.steps)}")

        return trajectory

    def run(self):
        """Run full training loop across all tasks.

        Returns:
            The list of per-episode metric dicts (also kept on ``self.metrics``).

        Raises:
            RuntimeError: If the environment health check fails.
        """
        self.logger.info("=" * 60)
        self.logger.info("REWARD-GUIDED TRAJECTORY OPTIMIZATION")
        self.logger.info(f"Tasks: {TASKS}")
        self.logger.info(f"Episodes per task: {NUM_EPISODES_PER_TASK}")
        self.logger.info(f"Top-K trajectories: {TOP_K_TRAJECTORIES}")
        self.logger.info("=" * 60)
        self.logger.info(f"Log file: {self.log_file}")

        # Health check comes first so an unreachable environment does not
        # leave an empty W&B run behind.
        if not self.env.health():
            raise RuntimeError(f"Environment not reachable at {ENV_BASE_URL}")
        self.logger.info(f"Environment: OK ({ENV_BASE_URL})\n")

        try:
            wandb.init(
                project="policy-to-logic-rl",
                name=f"trajectory-opt-{int(time.time())}",
                config={
                    "num_episodes_per_task": NUM_EPISODES_PER_TASK,
                    "top_k_trajectories": TOP_K_TRAJECTORIES,
                    "min_reward_threshold": MIN_REWARD_THRESHOLD,
                    "model": MODEL,
                    "temperature": TEMPERATURE,
                    "tasks": TASKS,
                    "env_url": ENV_BASE_URL,
                }
            )
        except Exception as e:
            self.logger.warning(f"Wandb init failed: {e}. Continuing without W&B.")

        global_episode = 0

        try:
            for task in TASKS:
                self.logger.info(f"\n{'-'*40}")
                self.logger.info(f"TASK: {task}")
                self.logger.info(f"{'-'*40}")

                task_rewards = []
                task_accuracies = []

                for ep in range(1, NUM_EPISODES_PER_TASK + 1):
                    global_episode += 1
                    # Count the examples the agent will actually see, before
                    # this episode's own trajectory can enter (or reshuffle) the bank.
                    few_shots_used = len(self.bank.get_examples(task))
                    trajectory = self.run_episode(task, ep)

                    # Store in bank
                    self.bank.store(trajectory)

                    try:
                        wandb.log({
                            f"{task}/total_reward": trajectory.total_reward,
                            f"{task}/final_accuracy": trajectory.final_accuracy,
                            f"{task}/num_steps": len(trajectory.steps),
                            f"{task}/success": int(trajectory.success),
                            f"{task}/few_shots_used": few_shots_used,
                            "global/total_reward": trajectory.total_reward,
                            "global/final_accuracy": trajectory.final_accuracy,
                            "episode": global_episode,
                        })
                    except Exception as e:
                        # W&B is best-effort; leave a trace instead of failing silently.
                        self.logger.debug(f"Wandb log failed: {e}")

                    # Record metrics
                    self.metrics.append({
                        "global_episode": global_episode,
                        "task": task,
                        "episode_in_task": ep,
                        "total_reward": trajectory.total_reward,
                        "final_accuracy": trajectory.final_accuracy,
                        "success": trajectory.success,
                        "num_steps": len(trajectory.steps),
                        "few_shots_used": few_shots_used
                    })

                    task_rewards.append(trajectory.total_reward)
                    task_accuracies.append(trajectory.final_accuracy)

                    self.logger.info(f"  → Episode {ep} complete: reward={trajectory.total_reward:.3f} accuracy={trajectory.final_accuracy:.2f} success={trajectory.success}")
                    time.sleep(0.5)  # Rate limiting

                if task_rewards:  # nothing to summarize when zero episodes were configured
                    self.logger.info("\n  Task summary:")
                    self.logger.info(f"    First episode reward: {task_rewards[0]:.3f}")
                    self.logger.info(f"    Last episode reward:  {task_rewards[-1]:.3f}")
                    self.logger.info(f"    Improvement: {task_rewards[-1] - task_rewards[0]:+.3f}")

            self.logger.info("\n" + "=" * 60)
            self.logger.info("TRAINING COMPLETE")
            self.logger.info(f"Bank summary: {self.bank.summary()}")
            self.logger.info("=" * 60)
        finally:
            # Close the W&B run even when an episode raised.
            try:
                wandb.finish()
            except Exception as e:
                self.logger.debug(f"Wandb finish failed: {e}")

        return self.metrics

# ── Plot Generation ───────────────────────────────────────────────────────────

def _write_metrics_json(metrics: list[dict]) -> None:
    """Dump raw metrics to a timestamped JSON file and to metrics_latest.json."""
    timestamp = int(time.time())
    for path in (f"training/plots/metrics_{timestamp}.json", "training/plots/metrics_latest.json"):
        with open(path, "w", encoding="utf-8") as f:
            json.dump(metrics, f, indent=2)
    print(f"Saved: training/plots/metrics_{timestamp}.json and metrics_latest.json")


def _plot_reward_curve(plt, np, metrics: list[dict], colors: dict) -> None:
    """Plot total reward per episode for each task, with a dashed linear trend line."""
    fig, ax = plt.subplots(figsize=(10, 5))

    for task in TASKS:
        task_eps = [m["global_episode"] for m in metrics if m["task"] == task]
        task_rews = [m["total_reward"] for m in metrics if m["task"] == task]
        ax.plot(task_eps, task_rews, marker="o", label=task,
                color=colors.get(task, "gray"), linewidth=2, markersize=5)
        # Per-task trend line (a degree-1 fit needs at least two points)
        if len(task_eps) >= 2:
            trend = np.poly1d(np.polyfit(task_eps, task_rews, 1))
            ax.plot(task_eps, trend(task_eps), "--",
                    color=colors.get(task, "gray"), alpha=0.4, linewidth=1.5)

    ax.set_xlabel("Episode")
    ax.set_ylabel("Total Reward")
    ax.set_title("Reward Curve — Reward-Guided Trajectory Optimization")
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_ylim(bottom=0)

    plt.tight_layout()
    plt.savefig("training/plots/reward_curve.png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    print("Saved: training/plots/reward_curve.png")


def _plot_accuracy_curve(plt, metrics: list[dict], colors: dict) -> None:
    """Plot final accuracy per episode for each task against the 0.9 success threshold."""
    fig, ax = plt.subplots(figsize=(10, 5))

    for task in TASKS:
        task_eps = [m["global_episode"] for m in metrics if m["task"] == task]
        task_accs = [m["final_accuracy"] for m in metrics if m["task"] == task]
        ax.plot(task_eps, task_accs, marker="s", label=task,
                color=colors.get(task, "gray"), linewidth=2, markersize=5)

    ax.axhline(y=0.9, color="red", linestyle="--", alpha=0.7, label="success threshold (0.9)")

    ax.set_xlabel("Episode")
    ax.set_ylabel("Final Accuracy")
    ax.set_title("Accuracy Curve — Policy-to-Logic Agent")
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_ylim(0, 1.05)

    plt.tight_layout()
    plt.savefig("training/plots/accuracy_curve.png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    print("Saved: training/plots/accuracy_curve.png")


def _plot_improvement_chart(plt, metrics: list[dict], colors: dict) -> None:
    """Plot per-task best accuracy and percentage of steps saved (first vs. shortest episode)."""
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    task_labels = []
    bar_colors = []
    eff_improvements = []
    best_accuracies = []

    for task in TASKS:
        task_data = [m for m in metrics if m["task"] == task]
        if len(task_data) < 2:
            continue
        task_labels.append(task.replace("_", "\n"))
        # Colour is looked up per task so bars stay aligned when a task is skipped.
        bar_colors.append(colors.get(task, "gray"))
        # Efficiency: steps saved (first vs best)
        first_steps = task_data[0]["num_steps"]
        best_steps = min(m["num_steps"] for m in task_data)
        eff_improvements.append(
            ((first_steps - best_steps) / first_steps * 100) if first_steps > 0 else 0
        )
        best_accuracies.append(max(m["final_accuracy"] for m in task_data))

    # Left: Best accuracy per task
    bars1 = axes[0].bar(task_labels, best_accuracies, color=bar_colors,
                        edgecolor="white", linewidth=1.5)
    axes[0].axhline(y=0.9, color="red", linestyle="--", alpha=0.7, label="success threshold")
    axes[0].set_ylabel("Best Accuracy Achieved")
    axes[0].set_title("Best Accuracy Per Task")
    axes[0].set_ylim(0, 1.1)
    axes[0].grid(True, axis="y", alpha=0.3)
    axes[0].legend()
    for bar, val in zip(bars1, best_accuracies):
        axes[0].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02,
                     f"{val:.0%}", ha="center", va="bottom", fontweight="bold")

    # Right: Efficiency improvement (% steps saved)
    bars2 = axes[1].bar(task_labels, eff_improvements, color=bar_colors,
                        edgecolor="white", linewidth=1.5)
    axes[1].axhline(y=0, color="black", linewidth=0.8)
    axes[1].set_ylabel("Steps Saved (%)")
    axes[1].set_title("Efficiency Improvement (First → Best Episode)")
    axes[1].grid(True, axis="y", alpha=0.3)
    for bar, val in zip(bars2, eff_improvements):
        y_pos = max(bar.get_height() + 1, 2)
        axes[1].text(bar.get_x() + bar.get_width() / 2, y_pos,
                     f"{val:.0f}%", ha="center", va="bottom", fontweight="bold")

    plt.tight_layout()
    plt.savefig("training/plots/improvement_chart.png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    print("Saved: training/plots/improvement_chart.png")


def save_plots(metrics: list[dict]):
    """
    Save reward curve, accuracy curve and improvement chart as PNG files,
    plus the raw metrics as JSON, under training/plots.
    These are REQUIRED for hackathon submission — must be committed to repo.

    The raw metrics are written even when matplotlib/numpy cannot be imported
    or a plot fails, so a finished training run never loses its results.

    Args:
        metrics: Per-episode dicts with at least the keys "global_episode",
            "task", "total_reward", "final_accuracy" and "num_steps".
    """
    os.makedirs("training/plots", exist_ok=True)

    try:
        import matplotlib
        matplotlib.use("Agg")  # Non-interactive backend
        import matplotlib.pyplot as plt
        import numpy as np
    except ImportError:
        print("matplotlib not installed. Run: pip install matplotlib")
        _write_metrics_json(metrics)
        return

    colors = {
        "data_access": "#2196F3",
        "resource_access": "#FF9800",
        "transaction_approval": "#4CAF50"
    }

    try:
        _plot_reward_curve(plt, np, metrics, colors)
        _plot_accuracy_curve(plt, metrics, colors)
        _plot_improvement_chart(plt, metrics, colors)
    finally:
        # Save raw metrics as JSON regardless of plotting success.
        _write_metrics_json(metrics)

# ── Entry Point ───────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # The agent cannot reach the model router without a token, so stop early.
    token = os.getenv("HF_TOKEN", "")
    if not token:
        raise ValueError("HF_TOKEN environment variable not set")

    # Train, then turn the collected per-episode metrics into plots.
    trainer = TrainingLoop(ENV_BASE_URL, token)
    collected = trainer.run()
    save_plots(collected)

    print("\nNext step: commit training/plots/*.png to repo for submission.")