Spaces:

luciferai-devil
/

devil-policyevolverenv

Sleeping

File size: 16,409 Bytes

4c68ece
 
 
 
 
 
 
 
 
 
 
 
9cdb062
4c68ece
 
 
8cd3fa7
89d39f7
6aa8acb
8cd3fa7
4c68ece
511f04a
4c68ece
 
8cd3fa7
4c68ece
 
899c12a
09a9c72
 
4c68ece
79fb14b
 
 
 
 
 
 
 
 
 
 
 
9c3ced0
 
 
 
 
 
 
79fb14b
 
 
 
 
 
4c68ece
 
 
 
 
 
 
 
8cd3fa7
4c68ece
 
9cdb062
4c68ece
 
 
 
9cdb062
4c68ece
9cdb062
4c68ece
 
 
8cd3fa7
4c68ece
 
 
 
 
 
 
 
 
 
89d39f7
 
 
4c68ece
 
6aa8acb
 
511f04a
 
8cd3fa7
4c68ece
 
8cd3fa7
4c68ece
 
8cd3fa7
4c68ece
 
8cd3fa7
511f04a
4c68ece
 
8cd3fa7
 
4c68ece
 
6aa8acb
 
 
 
4c68ece
511f04a
 
 
 
 
 
4c68ece
511f04a
89d39f7
 
 
 
292424c
511f04a
4c68ece
 
511f04a
 
 
4c68ece
511f04a
 
 
 
 
 
4c68ece
 
 
 
 
 
 
 
511f04a
4c68ece
511f04a
 
4c68ece
 
 
 
511f04a
 
 
 
 
4c68ece
511f04a
4c68ece
 
 
 
 
 
 
 
 
89d39f7
 
 
 
 
 
 
 
 
 
 
 
 
 
4c68ece
8cd3fa7
4c68ece
 
511f04a
 
 
4c68ece
 
8cd3fa7
933baa6
4c68ece
 
 
 
 
 
933baa6
8cd3fa7
933baa6
4c68ece
 
 
 
 
 
933baa6
8cd3fa7
933baa6
4c68ece
 
 
 
 
 
 
 
933baa6
4c68ece
 
 
 
 
 
9cdb062
4c68ece
9cdb062
 
 
899c12a
9cdb062
 
899c12a
9cdb062
 
899c12a
9cdb062
 
899c12a
9cdb062
6aa8acb
4c68ece
 
 
6aa8acb
4c68ece
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b4f91f0
 
 
 
 
 
 
 
 
 
4c68ece
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
899c12a
4c68ece
899c12a
 
4c68ece
 
899c12a
 
 
4c68ece
 
 
 
9cdb062
4c68ece
9cdb062
4c68ece
 
9cdb062
 
09a9c72
 
 
9cdb062
 
 
 
 
 
 
6a19dc6
 
 
 
 
 
 
 
 
 
9cdb062
47a298a
9cdb062
 
 
 
4c68ece
 
9cdb062
4c68ece
9cdb062
 
 
 
 
 
 
 
 
 
 
4c68ece
511f04a
8cd3fa7
4c68ece

"""
PolicyEvolverEnv — Hackathon Inference Script
=============================================
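MANDATORY ENV VARS:
    API_BASE_URL   The API endpoint for the LLM.
    MODEL_NAME     The model identifier to use for inference. If unset, the script
                   tries to discover one from API_BASE_URL/models and otherwise
                   falls back to "gpt-4o-mini".
    HF_TOKEN       Your Hugging Face / API key (API_KEY takes precedence when set).
    IMAGE_NAME     The Docker image name (set by validator). If unset, the script
                   connects to ENV_BASE_URL (default http://127.0.0.1:8000).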

STDOUT FORMAT:
    [START] task=<task_name> env=policy_evolver_env model=<model_name>
    [STEP]  step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
    [END]   task=<task_name> success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
"""

import asyncio
import os
import sys
import json
from typing import Dict, List, Optional

from openai import OpenAI
from client import PolicyEvolverEnv
from models import Action

# ─── Environment Variables (Hackathon Mandatory) ───
# Docker image to launch for the environment; when unset, main() connects to
# an already-running server at ENV_BASE_URL instead.
IMAGE_NAME = os.getenv("IMAGE_NAME")
# API_KEY wins when both are set; HF_TOKEN is the fallback credential.
API_KEY = os.environ.get("API_KEY") or os.environ.get("HF_TOKEN")
API_BASE_URL = os.environ.get("API_BASE_URL")
MODEL_NAME = os.environ.get("MODEL_NAME")
# Value printed as `env=` in the [START] log line.
BENCHMARK = "policy_evolver_env"

# ─── Auto-discover model if MODEL_NAME is not set ───
# NOTE: this runs at import time and performs a network request (10 s timeout).
# Any failure is logged and ignored so the default below can take over.
if not MODEL_NAME and API_BASE_URL and API_KEY:
    try:
        # Imported lazily so httpx is only needed on the discovery path.
        import httpx
        resp = httpx.get(
            f"{API_BASE_URL.rstrip('/')}/models",
            headers={"Authorization": f"Bearer {API_KEY}"},
            timeout=10,
        )
        if resp.status_code == 200:
            models_data = resp.json().get("data", [])
            # Filter out wildcards and pick a real model name
            for m in models_data:
                mid = m.get("id", "")
                if mid and mid != "*" and not mid.startswith("*"):
                    # First non-wildcard id wins.
                    MODEL_NAME = mid
                    print(f"[DEBUG] Auto-discovered model: {MODEL_NAME}", flush=True)
                    break
    except Exception as e:
        print(f"[DEBUG] Model discovery failed: {e}", flush=True)

# Last-resort default when neither the env var nor discovery produced a name.
if not MODEL_NAME:
    MODEL_NAME = "gpt-4o-mini"
    print(f"[DEBUG] Using default MODEL_NAME: {MODEL_NAME}", flush=True)
# Upper bound on agent steps per task episode.
MAX_STEPS = 5
# Sampling temperature passed to the chat-completions call.
TEMPERATURE = 0.0
# An episode counts as a success when its final reward is >= this value.
SUCCESS_THRESHOLD = 0.70


# ─── Logging Helpers (Hackathon Mandatory Format) ───
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` record that opens a task episode.

    Args:
        task: Task identifier written as ``task=``.
        env: Benchmark/environment name written as ``env=``.
        model: Model identifier written as ``model=``.
    """
    record = f"[START] task={task} env={env} model={model}"
    # Flush immediately so the line is visible even if the process dies later.
    print(record, flush=True)


def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print one ``[STEP]`` record on a single stdout line.

    Args:
        step: 1-based step number within the episode.
        action: Short action label written as ``action=``.
        reward: Step reward, formatted with two decimals.
        done: Whether the episode ended on this step (printed lowercase).
        error: Error message, or ``None``/empty for ``error=null``.
    """
    if error:
        # Exception text (e.g. validation errors) is often multi-line; embedding
        # it verbatim would split this record across several stdout lines.
        single_line = " ".join(error.splitlines())
        error_val = f'"{single_line}"'
    else:
        error_val = "null"
    done_val = str(done).lower()
    print(f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}", flush=True)


def log_end(task: str, success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` record that closes a task episode.

    Args:
        task: Task identifier written as ``task=``.
        success: Episode outcome, printed as lowercase ``true``/``false``.
        steps: Number of steps taken.
        score: Final score, formatted with three decimals.
        rewards: Per-step rewards, each formatted with two decimals and
            joined by commas (empty list yields an empty value).
    """
    formatted_rewards = [f"{value:.2f}" for value in rewards]
    rewards_str = ",".join(formatted_rewards)
    success_val = str(success).lower()
    print(f"[END] task={task} success={success_val} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)


# ─── LLM Agent ───
class PolicyEvolverAgent:
    """Strategic Policy Agent — maximizes governance scores via in-context adaptation.

    The agent builds a task-specific prompt from the current observation,
    optionally prefixed with feedback derived from the previous step, sends it
    to a chat-completions endpoint and returns the parsed JSON action.

    Attributes:
        model: Model identifier passed to the chat-completions call.
        action_history: Actions appended by the episode runner, oldest first.
        score_history: Rewards appended by the episode runner, oldest first.
    """

    SYSTEM_PROMPT = (
        "You are a Strategic Policy Engineer. Your goal is to maximize governance outcomes through verifiable "
        "precision. STYLISTIC RULES:\n"
        "1. NO VAGUENESS: Never use words like 'maybe', 'perhaps', 'sometimes', 'usually'.\n"
        "2. COMMAND LANGUAGE: Use 'must', 'shall', 'prohibited', 'required', 'mandatory'.\n"
        "3. MEASURABLE CRITERIA: Define terms with 'if-then' and metrics.\n"
        "4. ANALYTICAL COT: Your 'think' field MUST be 150-250 words and include terms: 'tradeoff', 'precision', "
        "'recall', 'threshold', 'impact', 'evidence'.\n"
        "5. JSON ONLY: Output ONLY the JSON object. No preamble.\n"
        "6. INCREMENTALISM: If your previous score was high (>0.80), focus on surgical precision rather than holistic rewriting. "
        "DO NOT add words that create ambiguity."
    )

    def __init__(self, model: str):
        self.model = model
        self.action_history: list = []
        self.score_history: list = []

    @staticmethod
    def _parse_json_response(raw: str) -> dict:
        """Extract the JSON object from raw model output.

        Markdown code fences are stripped first. If the remaining text is not
        valid JSON, the span from the first ``{`` to the last ``}`` is tried.

        Raises:
            json.JSONDecodeError: No parseable JSON could be found.
            ValueError: The parsed JSON is not an object (e.g. a list).
        """
        text = raw.strip()

        # Strip markdown fences
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()

        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            # Models sometimes wrap the object in prose; retry on the outermost braces.
            start = text.find("{")
            end = text.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(text[start : end + 1])

        # Callers use dict methods on the action, so reject lists/scalars here
        # where the failure can be reported as an LLM error.
        if not isinstance(parsed, dict):
            raise ValueError(f"Expected a JSON object from the LLM, got {type(parsed).__name__}")
        return parsed

    def _call_llm(self, client: "OpenAI", prompt: str) -> Optional[dict]:
        """Call the LLM and robustly parse the JSON response.

        Args:
            client: OpenAI-compatible client used for the chat-completions call.
            prompt: User message; ``SYSTEM_PROMPT`` is sent as the system message.

        Returns:
            The parsed JSON object.

        Raises:
            Exception: Any request or parsing failure is logged to stderr and
                re-raised unchanged.
        """
        raw: Optional[str] = None
        try:
            resp = client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": self.SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=1024,
                temperature=TEMPERATURE,
                seed=42,
            )
            raw = resp.choices[0].message.content
            if raw is None:
                raise ValueError("LLM returned a message with no content")
            return self._parse_json_response(raw)
        except Exception as e:
            print(f"[DEBUG] LLM Call Error: {e}", file=sys.stderr)
            if raw is not None:
                print(f"[DEBUG] Raw content: {raw}", file=sys.stderr)
            raise

    def _build_feedback(self, step: int, last_score: float, last_action: dict, task_id: str) -> str:
        """Build diagnostic feedback from previous step for in-context learning.

        Args:
            step: Step counter from the observation; ``0`` yields no feedback.
            last_score: Reward of the previous action.
            last_action: Previous action as a dict; empty yields no feedback.
            task_id: One of ``task_easy``, ``task_medium``, ``task_hard``.

        Returns:
            A multi-line feedback string, or ``""`` when there is nothing to
            report. Scores of 0.80 or above produce a fixed "surgical
            refinement" message instead of per-task diagnostics.
        """
        if step == 0 or not last_action:
            return ""

        # Surgical Refinement Guard: a high score replaces all diagnostics, so
        # return before computing them.
        if last_score >= 0.80:
            return "\n".join([
                f"\n=== SURGICAL REFINEMENT (Step {step}) ===",
                f"Current Score: {last_score:.3f} — EXCELLENT.",
                "CRITICAL: Do NOT rewrite the policy. Only perform 'surgical' removals or additions.",
                "1. CHECK: Remove 'might', 'could', 'perhaps', 'sometimes', 'often' if present.",
                "2. CHECK: Ensure words count >= 12. Add one more specific metric (%, hours, $) if needed.",
                "Do NOT add any words that could be seen as vague. Aim for 0.95+."
            ])

        lines = [
            f"\n=== STRATEGIC FEEDBACK (Step {step}) ===",
            f"Previous score: {last_score:.3f} / 1.000",
        ]

        if task_id == "task_easy":
            # `or ""` tolerates an explicit null in the previous action.
            defn = last_action.get("suggested_definition") or ""
            lowered = defn.lower()
            vague = ["might", "could", "perhaps", "sometimes", "often", "generally", "usually", "typically", "may", "possibly"]
            # Substring match (not whole-word) — kept as in the original heuristic.
            found = [w for w in vague if w in lowered]
            meas = ["threshold", "verify", "days", "$", "%", "reports", "hours", "within", "exceed", "minimum", "must", "shall"]
            mfound = [w for w in meas if w in lowered]
            if found:
                lines.append(f"FAILURE: Vague words detected: {found}. Remove them entirely.")
            if len(mfound) < 2:
                lines.append("FAILURE: Missing measurable criteria. Add numbers, hours, percentages.")
            if len(defn.split()) < 15:
                lines.append("FAILURE: Definition too short. Minimum 15 words.")

        elif task_id == "task_medium":
            if not (last_action.get("rule_domain") or "").strip():
                lines.append("FAILURE: rule_domain was empty.")
            if len((last_action.get("new_rule") or "").split()) < 10:
                lines.append("FAILURE: New rule too short.")

        elif task_id == "task_hard":
            outcomes = last_action.get("expected_outcomes", {})
            if isinstance(outcomes, dict) and len(outcomes) >= 2:
                vals = [v for v in outcomes.values() if isinstance(v, (int, float))]
                # Values above 1 are treated as percentages and rescaled to 0-1.
                vals = [v / 100 if v > 1 else v for v in vals]
                if vals and all(v > 0.70 for v in vals):
                    lines.append("FAILURE: Unrealistic tradeoff — all metrics > 0.70. Model friction.")
            mods = last_action.get("policy_modifications") or []
            if len(mods) < 2:
                lines.append("FAILURE: Need >= 2 policy_modifications.")

        # Append history summaries
        for act, sc in zip(self.action_history[-3:], self.score_history[-3:]):
            lines.append(f"  [{sc:.2f}] {act.get('action_type', '?')}")

        target = min(last_score + 0.20, 0.95)
        lines.append(f"\nYour next proposal MUST score above {target:.2f}. Be more specific.")

        return "\n".join(lines)

    def get_action(self, client: "OpenAI", task_id: str, obs: dict) -> dict:
        """Generate the next strategic action for the given task.

        Args:
            client: OpenAI-compatible client forwarded to ``_call_llm``.
            task_id: ``task_easy`` or ``task_medium``; any other value uses the
                policy-evolution prompt.
            obs: Observation dict. Missing or ``None`` fields fall back to
                empty defaults.

        Returns:
            The action dict parsed from the model response.
        """
        # A dumped model may carry explicit None for optional fields, so use
        # `or` fallbacks rather than .get() defaults.
        info = obs.get("info") or {}
        step = obs.get("step_count") or 0
        last_score = info.get("last_reward") or 0.0
        last_action = info.get("last_action") or {}
        feedback = self._build_feedback(step, last_score, last_action, task_id)

        if task_id == "task_easy":
            policies = obs.get("current_policies") or []
            corpus = obs.get("data_corpus") or []
            prompt = (
                f"POLICIES: {policies}\n"
                f"DATA: {corpus[:5]}\n{feedback}\n"
                "TASK: Propose clarification for an ambiguous term with a measurable definition.\n"
                "JSON: {\"action_type\": \"propose_clarification\", \"ambiguous_term\": \"...\", "
                "\"suggested_definition\": \"...\", \"affected_policy_ids\": [\"str\"], "
                "\"justification\": \"...\", \"think\": \"...\"}"
            )
        elif task_id == "task_medium":
            policies = obs.get("current_policies") or []
            corpus = obs.get("data_corpus") or []
            prompt = (
                f"POLICIES: {policies}\n"
                f"DATA: {corpus}\n{feedback}\n"
                "TASK: Propose a new rule for a coverage gap. Use mandatory language.\n"
                "JSON: {\"action_type\": \"propose_new_rule\", \"rule_domain\": \"...\", "
                "\"new_rule\": \"...\", \"scope\": [\"str\"], \"integration_points\": [\"str\"], "
                "\"justification\": \"...\", \"think\": \"...\"}"
            )
        else:
            metrics = obs.get("system_metrics") or {}
            issues = obs.get("identified_issues") or []
            prompt = (
                f"METRICS: {metrics}\n"
                f"ISSUES: {issues}\n{feedback}\n"
                "TASK: Evolve policies with realistic tradeoffs.\n"
                "JSON: {\"action_type\": \"evolve_policy\", \"policy_modifications\": "
                "[{\"policy_id\": \"...\", \"change_type\": \"enhance|restrict|add|remove\", "
                "\"new_text\": \"...\", \"reason\": \"...\"}], \"expected_outcomes\": "
                "{\"fraud_rate\": 0.8, \"revenue_velocity\": 0.4}, "
                "\"rollback_conditions\": [\"...\"], \"justification\": \"...\", \"think\": \"...\"}"
            )

        return self._call_llm(client, prompt)


# ─── Episode Runner ───
async def run_episode(client: Optional[OpenAI], env: Optional[PolicyEvolverEnv], task_id: str, setup_error: Optional[Exception] = None) -> dict:
    """Run a single task episode following the hackathon format.

    Emits one ``[START]`` line, one ``[STEP]`` line per step and one ``[END]``
    line. The episode score is the reward of the last step taken.

    Args:
        client: OpenAI-compatible client, or ``None`` if setup failed.
        env: Connected environment, or ``None`` if setup failed.
        task_id: Task to reset the environment with.
        setup_error: Error captured during setup; when set, the episode is
            logged as failed and the process exits.

    Returns:
        A summary dict with keys ``task_id``, ``success``, ``steps``,
        ``score`` and ``rewards``.

    Raises:
        SystemExit: With code 1 on a setup error, a missing client/env, or an
            unexpected runtime error (after the failure has been logged).
    """
    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

    if setup_error:
        print(f"[FATAL] Setup Error: {setup_error}", file=sys.stderr)
        log_step(step=1, action="setup", reward=0.0, done=True, error=str(setup_error))
        log_end(task=task_id, success=False, steps=0, score=0.0, rewards=[])
        sys.exit(1)

    if not client or not env:
        print("[FATAL] Client or Environment not initialized", file=sys.stderr)
        log_step(step=1, action="setup", reward=0.0, done=True, error="Client or Environment not initialized")
        log_end(task=task_id, success=False, steps=0, score=0.0, rewards=[])
        sys.exit(1)

    agent = PolicyEvolverAgent(MODEL_NAME)
    rewards: List[float] = []
    steps_taken = 0

    try:
        result = await env.reset(task_id=task_id)

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                break

            # Get observation as dict
            obs_dict = result.observation
            if hasattr(obs_dict, "model_dump"):
                obs_dict = obs_dict.model_dump()
            elif not isinstance(obs_dict, dict):
                obs_dict = dict(obs_dict)

            # Agent decides action (graceful failure per step)
            try:
                action_dict = agent.get_action(client, task_id, obs_dict)
            except Exception as e:
                # LLM call failed — log error for this step and move to next task
                print(f"[DEBUG] LLM error on step {step}: {e}", file=sys.stderr)
                log_step(step=step, action="llm_error", reward=0.0, done=True, error=str(e))
                rewards.append(0.0)
                steps_taken = step
                break
            agent.action_history.append(action_dict)

            # Validate and step
            error = None
            try:
                action_obj = Action.model_validate(action_dict)
                result = await env.step(action_obj)
                reward = result.reward or 0.0
                done = result.done
            except Exception as e:
                # Invalid action or env failure ends the episode with zero reward.
                reward = 0.0
                done = True
                error = str(e)

            rewards.append(reward)
            agent.score_history.append(reward)
            steps_taken = step

            # A non-dict action has already failed validation above; do not let
            # the label lookup turn that into a process-wide fatal error.
            if isinstance(action_dict, dict):
                act_name = action_dict.get("action_type", "unknown")
            else:
                act_name = "unknown"
            log_step(step=step, action=act_name, reward=reward, done=done, error=error)

            if done:
                break

    except Exception as e:
        print(f"[FATAL] Runtime Error: {e}", file=sys.stderr)
        log_step(step=steps_taken + 1, action="error", reward=0.0, done=True, error=str(e))
        log_end(task=task_id, success=False, steps=steps_taken, score=0.0, rewards=rewards)
        sys.exit(1)

    # Reached only when the episode finished without a fatal error: the handler
    # above always exits, so [END] is emitted exactly once per episode.
    score = rewards[-1] if rewards else 0.0
    success = score >= SUCCESS_THRESHOLD
    log_end(task=task_id, success=success, steps=steps_taken, score=score, rewards=rewards)

    return {
        "task_id": task_id,
        "success": success,
        "steps": steps_taken,
        "score": score,
        "rewards": rewards,
    }


# ─── Main Entry Point ───
async def main() -> None:
    """Set up the LLM client and environment, then run every task episode.

    Setup failures are captured rather than raised so that ``run_episode`` can
    report them in the structured log format. The environment is closed in a
    ``finally`` block because ``run_episode`` terminates via ``sys.exit`` on
    failure, which would otherwise skip the cleanup.
    """
    client = None
    env = None
    setup_error = None

    try:
        # 1. Initialize OpenAI Client
        try:
            if not API_KEY or not API_BASE_URL:
                raise Exception("Missing mandatory environment variables: API_KEY and/or API_BASE_URL")
            client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
        except Exception as e:
            setup_error = Exception(f"OpenAI client initialization failed: {e}")

        # 2. Initialize Environment
        if not setup_error:
            try:
                if IMAGE_NAME:
                    # Manually handle Docker startup to override the 30s library default
                    from openenv.core.containers.runtime.providers import LocalDockerProvider
                    provider = LocalDockerProvider()
                    base_url = provider.start_container(IMAGE_NAME)

                    print(f"[DEBUG] Waiting for container {IMAGE_NAME} at {base_url} (Extended Timeout 120s)...", flush=True)
                    provider.wait_for_ready(base_url, timeout_s=120.0)

                    env = PolicyEvolverEnv(base_url=base_url, provider=provider)
                    await env.connect()
                else:
                    local_url = os.environ.get("ENV_BASE_URL", "http://127.0.0.1:8000")
                    env = PolicyEvolverEnv(base_url=local_url)
                    # For local testing, we might want to check connection immediately or let run_episode handle it
            except Exception as e:
                setup_error = Exception(f"Environment initialization failed: {e}")

    except Exception as e:
        setup_error = e

    try:
        # 3. Always loop over tasks to ensure structured logs
        tasks = ["task_easy", "task_medium", "task_hard"]
        for task in tasks:
            await run_episode(client, env, task, setup_error=setup_error)
    finally:
        # 4. Final Cleanup — runs even when run_episode exits the process.
        if env:
            try:
                await env.close()
            except Exception as e:
                print(f"[DEBUG] env.close() error: {e}", flush=True)


# Script entry point: run the async main() to completion when executed directly.
if __name__ == "__main__":
    asyncio.run(main())