File size: 16,409 Bytes
4c68ece
 
 
 
 
 
 
 
 
 
 
 
9cdb062
4c68ece
 
 
8cd3fa7
89d39f7
6aa8acb
8cd3fa7
4c68ece
511f04a
4c68ece
 
8cd3fa7
4c68ece
 
899c12a
09a9c72
 
4c68ece
79fb14b
 
 
 
 
 
 
 
 
 
 
 
9c3ced0
 
 
 
 
 
 
79fb14b
 
 
 
 
 
4c68ece
 
 
 
 
 
 
 
8cd3fa7
4c68ece
 
9cdb062
4c68ece
 
 
 
9cdb062
4c68ece
9cdb062
4c68ece
 
 
8cd3fa7
4c68ece
 
 
 
 
 
 
 
 
 
89d39f7
 
 
4c68ece
 
6aa8acb
 
511f04a
 
8cd3fa7
4c68ece
 
8cd3fa7
4c68ece
 
8cd3fa7
4c68ece
 
8cd3fa7
511f04a
4c68ece
 
8cd3fa7
 
4c68ece
 
6aa8acb
 
 
 
4c68ece
511f04a
 
 
 
 
 
4c68ece
511f04a
89d39f7
 
 
 
292424c
511f04a
4c68ece
 
511f04a
 
 
4c68ece
511f04a
 
 
 
 
 
4c68ece
 
 
 
 
 
 
 
511f04a
4c68ece
511f04a
 
4c68ece
 
 
 
511f04a
 
 
 
 
4c68ece
511f04a
4c68ece
 
 
 
 
 
 
 
 
89d39f7
 
 
 
 
 
 
 
 
 
 
 
 
 
4c68ece
8cd3fa7
4c68ece
 
511f04a
 
 
4c68ece
 
8cd3fa7
933baa6
4c68ece
 
 
 
 
 
933baa6
8cd3fa7
933baa6
4c68ece
 
 
 
 
 
933baa6
8cd3fa7
933baa6
4c68ece
 
 
 
 
 
 
 
933baa6
4c68ece
 
 
 
 
 
9cdb062
4c68ece
9cdb062
 
 
899c12a
9cdb062
 
899c12a
9cdb062
 
899c12a
9cdb062
 
899c12a
9cdb062
6aa8acb
4c68ece
 
 
6aa8acb
4c68ece
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b4f91f0
 
 
 
 
 
 
 
 
 
4c68ece
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
899c12a
4c68ece
899c12a
 
4c68ece
 
899c12a
 
 
4c68ece
 
 
 
9cdb062
4c68ece
9cdb062
4c68ece
 
9cdb062
 
09a9c72
 
 
9cdb062
 
 
 
 
 
 
6a19dc6
 
 
 
 
 
 
 
 
 
9cdb062
47a298a
9cdb062
 
 
 
4c68ece
 
9cdb062
4c68ece
9cdb062
 
 
 
 
 
 
 
 
 
 
4c68ece
511f04a
8cd3fa7
4c68ece
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
"""
PolicyEvolverEnv — Hackathon Inference Script
=============================================
MANDATORY ENV VARS:
    API_BASE_URL   The API endpoint for the LLM.
    MODEL_NAME     The model identifier to use for inference.
    HF_TOKEN       Your Hugging Face / API key.
    IMAGE_NAME     The Docker image name (set by validator).

STDOUT FORMAT:
    [START] task=<task_name> env=policy_evolver_env model=<model_name>
    [STEP]  step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
    [END]   task=<task_name> success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
"""

import asyncio
import os
import sys
import json
from typing import Dict, List, Optional

from openai import OpenAI
from client import PolicyEvolverEnv
from models import Action

# ─── Environment Variables (Hackathon Mandatory) ───
# Fixed run parameters.
BENCHMARK = "policy_evolver_env"
MAX_STEPS = 5
TEMPERATURE = 0.0
SUCCESS_THRESHOLD = 0.70

# Values supplied through the process environment; API_KEY falls back to HF_TOKEN.
IMAGE_NAME = os.environ.get("IMAGE_NAME")
API_BASE_URL = os.getenv("API_BASE_URL")
API_KEY = os.getenv("API_KEY") or os.getenv("HF_TOKEN")
MODEL_NAME = os.getenv("MODEL_NAME")

# ─── Auto-discover model if MODEL_NAME is not set ───
if API_BASE_URL and API_KEY and not MODEL_NAME:
    try:
        import httpx

        models_url = f"{API_BASE_URL.rstrip('/')}/models"
        auth_headers = {"Authorization": f"Bearer {API_KEY}"}
        listing = httpx.get(models_url, headers=auth_headers, timeout=10)
        if listing.status_code == 200:
            # Take the first entry whose id is a concrete name, skipping wildcards.
            for entry in listing.json().get("data", []):
                candidate = entry.get("id", "")
                if not candidate or candidate == "*" or candidate.startswith("*"):
                    continue
                MODEL_NAME = candidate
                print(f"[DEBUG] Auto-discovered model: {MODEL_NAME}", flush=True)
                break
    except Exception as exc:
        # Discovery is best-effort; the default below covers any failure.
        print(f"[DEBUG] Model discovery failed: {exc}", flush=True)

# Last resort when neither the environment nor discovery produced a model name.
if not MODEL_NAME:
    MODEL_NAME = "gpt-4o-mini"
    print(f"[DEBUG] Using default MODEL_NAME: {MODEL_NAME}", flush=True)


# ─── Logging Helpers (Hackathon Mandatory Format) ───
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` record that opens a task episode on stdout."""
    fields = (("task", task), ("env", env), ("model", model))
    record = "[START] " + " ".join(f"{key}={value}" for key, value in fields)
    print(record, flush=True)


def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print one ``[STEP]`` record on a single stdout line.

    Args:
        step: 1-based step number within the episode.
        action: Short label for the action taken at this step.
        reward: Reward for the step, printed with two decimals.
        done: Whether the episode ended at this step (printed lowercase).
        error: Error message for the step, or ``None``/empty for ``error=null``.
    """
    # Exception text is frequently multi-line; collapsing whitespace keeps the
    # record on exactly one line so line-oriented consumers of stdout see one
    # [STEP] entry per step.
    action_val = " ".join(str(action).split())
    if error:
        error_val = '"' + " ".join(str(error).split()) + '"'
    else:
        error_val = "null"
    done_val = str(done).lower()
    print(f"[STEP] step={step} action={action_val} reward={reward:.2f} done={done_val} error={error_val}", flush=True)


def log_end(task: str, success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` record that closes a task episode on stdout.

    Rewards are rendered as a comma-separated list with two decimals each;
    the score is rendered with three decimals.
    """
    reward_csv = ",".join(format(value, ".2f") for value in rewards)
    outcome = str(success).lower()
    record = f"[END] task={task} success={outcome} steps={steps} score={score:.3f} rewards={reward_csv}"
    print(record, flush=True)


# ─── LLM Agent ───
class PolicyEvolverAgent:
    """Strategic Policy Agent — maximizes governance scores via in-context adaptation.

    The agent builds a task-specific prompt from the latest observation,
    appends diagnostic feedback derived from the previous step, and asks the
    LLM for the next action as a JSON object.

    Attributes:
        model: Model identifier passed to the chat-completions call.
        action_history: Actions taken so far; appended to by the caller.
        score_history: Rewards received so far; appended to by the caller.
    """

    SYSTEM_PROMPT = (
        "You are a Strategic Policy Engineer. Your goal is to maximize governance outcomes through verifiable "
        "precision. STYLISTIC RULES:\n"
        "1. NO VAGUENESS: Never use words like 'maybe', 'perhaps', 'sometimes', 'usually'.\n"
        "2. COMMAND LANGUAGE: Use 'must', 'shall', 'prohibited', 'required', 'mandatory'.\n"
        "3. MEASURABLE CRITERIA: Define terms with 'if-then' and metrics.\n"
        "4. ANALYTICAL COT: Your 'think' field MUST be 150-250 words and include terms: 'tradeoff', 'precision', "
        "'recall', 'threshold', 'impact', 'evidence'.\n"
        "5. JSON ONLY: Output ONLY the JSON object. No preamble.\n"
        "6. INCREMENTALISM: If your previous score was high (>0.80), focus on surgical precision rather than holistic rewriting. "
        "DO NOT add words that create ambiguity."
    )

    def __init__(self, model: str):
        self.model = model
        self.action_history: list = []
        self.score_history: list = []

    def _call_llm(self, client: "OpenAI", prompt: str) -> dict:
        """Call the LLM and robustly parse the JSON response.

        Args:
            client: Chat-completions client used for the request.
            prompt: User-message content; the system prompt is added here.

        Returns:
            The decoded JSON object.

        Raises:
            ValueError: If the response has no text content or the decoded
                JSON is not an object.
            Exception: Any client or JSON decoding error, re-raised after
                being logged to stderr.
        """
        raw: Optional[str] = None
        try:
            resp = client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": self.SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=1024,
                temperature=TEMPERATURE,
                seed=42,
            )
            content = resp.choices[0].message.content
            if content is None:
                # Report a missing body explicitly instead of failing on None.strip().
                raise ValueError("LLM response contained no message content")
            raw = content.strip()

            # Strip markdown fences
            if "```json" in raw:
                raw = raw.split("```json")[1].split("```")[0].strip()
            elif "```" in raw:
                raw = raw.split("```")[1].split("```")[0].strip()

            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                # Fall back to the outermost {...} span when extra text surrounds the JSON.
                start = raw.find("{")
                end = raw.rfind("}")
                if start == -1 or end == -1:
                    raise
                parsed = json.loads(raw[start : end + 1])

            # Callers index the result with .get(); anything but an object is an error.
            if not isinstance(parsed, dict):
                raise ValueError(f"LLM returned JSON {type(parsed).__name__}, expected an object")
            return parsed
        except Exception as e:
            print(f"[DEBUG] LLM Call Error: {e}", file=sys.stderr)
            if raw is not None:
                print(f"[DEBUG] Raw content: {raw}", file=sys.stderr)
            raise

    def _build_feedback(self, step: int, last_score: float, last_action: dict, task_id: str) -> str:
        """Build diagnostic feedback from previous step for in-context learning.

        Args:
            step: Step count reported by the observation; 0 yields no feedback.
            last_score: Reward of the previous action.
            last_action: Previous action as a dict; empty yields no feedback.
            task_id: One of "task_easy", "task_medium", "task_hard"; other
                values get only the generic score lines.

        Returns:
            A newline-joined feedback block, or "" when there is nothing to
            report. Scores >= 0.80 replace the diagnostics with a short
            "surgical refinement" instruction block.
        """
        if step == 0 or not last_action:
            return ""

        lines = [
            f"\n=== STRATEGIC FEEDBACK (Step {step}) ===",
            f"Previous score: {last_score:.3f} / 1.000",
        ]

        if task_id == "task_easy":
            # `or ""` also covers an explicit null emitted by the LLM.
            defn = str(last_action.get("suggested_definition") or "")
            vague = ["might", "could", "perhaps", "sometimes", "often", "generally", "usually", "typically", "may", "possibly"]
            # NOTE(review): substring match, so "may" also fires on "payment";
            # confirm how the grader matches before tightening to whole words.
            found = [w for w in vague if w in defn.lower()]
            meas = ["threshold", "verify", "days", "$", "%", "reports", "hours", "within", "exceed", "minimum", "must", "shall"]
            mfound = [w for w in meas if w in defn.lower()]
            if found:
                lines.append(f"FAILURE: Vague words detected: {found}. Remove them entirely.")
            if len(mfound) < 2:
                lines.append("FAILURE: Missing measurable criteria. Add numbers, hours, percentages.")
            if len(defn.split()) < 15:
                lines.append("FAILURE: Definition too short. Minimum 15 words.")

        elif task_id == "task_medium":
            if not str(last_action.get("rule_domain") or "").strip():
                lines.append("FAILURE: rule_domain was empty.")
            if len(str(last_action.get("new_rule") or "").split()) < 10:
                lines.append("FAILURE: New rule too short.")

        elif task_id == "task_hard":
            outcomes = last_action.get("expected_outcomes", {})
            if isinstance(outcomes, dict) and len(outcomes) >= 2:
                vals = [v for v in outcomes.values() if isinstance(v, (int, float))]
                # Values above 1 are treated as percentages and rescaled to 0..1.
                vals = [v / 100 if v > 1 else v for v in vals]
                if vals and all(v > 0.70 for v in vals):
                    lines.append("FAILURE: Unrealistic tradeoff — all metrics > 0.70. Model friction.")
            mods = last_action.get("policy_modifications") or []
            if len(mods) < 2:
                lines.append("FAILURE: Need >= 2 policy_modifications.")

        # Append history summaries
        for act, sc in zip(self.action_history[-3:], self.score_history[-3:]):
            lines.append(f"  [{sc:.2f}] {act.get('action_type', '?')}")

        # Surgical Refinement Guard: a high score discards the diagnostics
        # above and sends only the refinement instructions.
        if last_score >= 0.80:
            lines = [
                f"\n=== SURGICAL REFINEMENT (Step {step}) ===",
                f"Current Score: {last_score:.3f} — EXCELLENT.",
                "CRITICAL: Do NOT rewrite the policy. Only perform 'surgical' removals or additions.",
                "1. CHECK: Remove 'might', 'could', 'perhaps', 'sometimes', 'often' if present.",
                "2. CHECK: Ensure words count >= 12. Add one more specific metric (%, hours, $) if needed.",
                "Do NOT add any words that could be seen as vague. Aim for 0.95+."
            ]
        else:
            target = min(last_score + 0.20, 0.95)
            lines.append(f"\nYour next proposal MUST score above {target:.2f}. Be more specific.")

        return "\n".join(lines)

    def get_action(self, client: "OpenAI", task_id: str, obs: dict) -> dict:
        """Generate the next strategic action for the given task.

        Args:
            client: Chat-completions client forwarded to the LLM call.
            task_id: "task_easy" or "task_medium" select their own prompt;
                any other value uses the policy-evolution prompt.
            obs: Observation dict; reads "step_count", "info" and the
                task-specific keys interpolated into the prompt.

        Returns:
            The action dict decoded from the LLM response.

        Raises:
            Exception: Whatever the LLM call raises.
        """
        # `or` fallbacks cover keys that are present but null in the observation.
        info = obs.get("info") or {}
        step = obs.get("step_count", 0)
        last_score = info.get("last_reward") or 0.0
        last_action = info.get("last_action") or {}
        feedback = self._build_feedback(step, last_score, last_action, task_id)

        if task_id == "task_easy":
            prompt = (
                f"POLICIES: {obs.get('current_policies', [])}\n"
                f"DATA: {obs.get('data_corpus', [])[:5]}\n{feedback}\n"
                "TASK: Propose clarification for an ambiguous term with a measurable definition.\n"
                "JSON: {\"action_type\": \"propose_clarification\", \"ambiguous_term\": \"...\", "
                "\"suggested_definition\": \"...\", \"affected_policy_ids\": [\"str\"], "
                "\"justification\": \"...\", \"think\": \"...\"}"
            )
        elif task_id == "task_medium":
            prompt = (
                f"POLICIES: {obs.get('current_policies', [])}\n"
                f"DATA: {obs.get('data_corpus', [])}\n{feedback}\n"
                "TASK: Propose a new rule for a coverage gap. Use mandatory language.\n"
                "JSON: {\"action_type\": \"propose_new_rule\", \"rule_domain\": \"...\", "
                "\"new_rule\": \"...\", \"scope\": [\"str\"], \"integration_points\": [\"str\"], "
                "\"justification\": \"...\", \"think\": \"...\"}"
            )
        else:
            prompt = (
                f"METRICS: {obs.get('system_metrics', {})}\n"
                f"ISSUES: {obs.get('identified_issues', [])}\n{feedback}\n"
                "TASK: Evolve policies with realistic tradeoffs.\n"
                "JSON: {\"action_type\": \"evolve_policy\", \"policy_modifications\": "
                "[{\"policy_id\": \"...\", \"change_type\": \"enhance|restrict|add|remove\", "
                "\"new_text\": \"...\", \"reason\": \"...\"}], \"expected_outcomes\": "
                "{\"fraud_rate\": 0.8, \"revenue_velocity\": 0.4}, "
                "\"rollback_conditions\": [\"...\"], \"justification\": \"...\", \"think\": \"...\"}"
            )

        return self._call_llm(client, prompt)


# ─── Episode Runner ───
async def run_episode(client: Optional[OpenAI], env: Optional[PolicyEvolverEnv], task_id: str, setup_error: Optional[Exception] = None) -> dict:
    """Run a single task episode following the hackathon format.

    Emits one [START] record, one [STEP] record per step and one [END]
    record on stdout.

    Args:
        client: LLM client, or None when setup failed.
        env: Connected environment client, or None when setup failed.
        task_id: Identifier of the task to reset the environment with.
        setup_error: Error captured during setup; when set, the episode is
            logged as failed and the process exits.

    Returns:
        Summary of the episode with keys "task", "success", "steps",
        "score" and "rewards". The score is the reward of the last step.

    Raises:
        SystemExit: With code 1 on a setup error, a missing client or
            environment, or an unexpected runtime error. The [STEP] and
            [END] records are printed before exiting.
    """
    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

    if setup_error:
        print(f"[FATAL] Setup Error: {setup_error}", file=sys.stderr)
        log_step(step=1, action="setup", reward=0.0, done=True, error=str(setup_error))
        log_end(task=task_id, success=False, steps=0, score=0.0, rewards=[])
        sys.exit(1)

    if not client or not env:
        print("[FATAL] Client or Environment not initialized", file=sys.stderr)
        log_step(step=1, action="setup", reward=0.0, done=True, error="Client or Environment not initialized")
        log_end(task=task_id, success=False, steps=0, score=0.0, rewards=[])
        sys.exit(1)

    agent = PolicyEvolverAgent(MODEL_NAME)
    rewards: List[float] = []
    steps_taken = 0

    try:
        result = await env.reset(task_id=task_id)

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                break

            # Get observation as dict
            obs_dict = result.observation
            if hasattr(obs_dict, "model_dump"):
                obs_dict = obs_dict.model_dump()
            elif not isinstance(obs_dict, dict):
                obs_dict = dict(obs_dict)

            # Agent decides action (graceful failure per step)
            try:
                action_dict = agent.get_action(client, task_id, obs_dict)
                # A non-object reply would crash on .get() below and abort the
                # whole run; treat it as an ordinary LLM failure instead.
                if not isinstance(action_dict, dict):
                    raise ValueError(
                        f"LLM returned {type(action_dict).__name__}, expected a JSON object"
                    )
            except Exception as e:
                # LLM call failed — log error for this step and move to next task
                print(f"[DEBUG] LLM error on step {step}: {e}", file=sys.stderr)
                log_step(step=step, action="llm_error", reward=0.0, done=True, error=str(e))
                rewards.append(0.0)
                steps_taken = step
                break
            agent.action_history.append(action_dict)

            # Validate and step; a rejected action ends the episode with zero reward.
            error = None
            try:
                action_obj = Action.model_validate(action_dict)
                result = await env.step(action_obj)
                reward = result.reward or 0.0
                done = result.done
            except Exception as e:
                reward = 0.0
                done = True
                error = str(e)

            rewards.append(reward)
            agent.score_history.append(reward)
            steps_taken = step

            act_name = action_dict.get("action_type", "unknown")
            log_step(step=step, action=act_name, reward=reward, done=done, error=error)

            if done:
                break

        score = rewards[-1] if rewards else 0.0
        success = score >= SUCCESS_THRESHOLD

    except Exception as e:
        print(f"[FATAL] Runtime Error: {e}", file=sys.stderr)
        log_step(step=steps_taken + 1, action="error", reward=0.0, done=True, error=str(e))
        log_end(task=task_id, success=False, steps=steps_taken, score=0.0, rewards=rewards)
        sys.exit(1)

    else:
        # Reached only when the episode finished without raising, so the
        # failure path above never prints a second [END] record.
        log_end(task=task_id, success=success, steps=steps_taken, score=score, rewards=rewards)
        return {
            "task": task_id,
            "success": success,
            "steps": steps_taken,
            "score": score,
            "rewards": rewards,
        }


# ─── Main Entry Point ───
async def main() -> None:
    """Set up the LLM client and environment, then run every task episode.

    Setup failures are not raised here: they are captured and handed to
    ``run_episode`` so the structured log records are still printed. The
    environment is closed in a ``finally`` block because ``run_episode``
    terminates the process with ``sys.exit`` on failure, which would
    otherwise skip the cleanup.
    """
    client = None
    env = None
    setup_error = None

    # 1. Initialize OpenAI Client
    try:
        if not API_KEY or not API_BASE_URL:
            raise RuntimeError("Missing mandatory environment variables: API_KEY and/or API_BASE_URL")
        client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    except Exception as e:
        setup_error = Exception(f"OpenAI client initialization failed: {e}")

    # 2. Initialize Environment
    if not setup_error:
        try:
            if IMAGE_NAME:
                # Manually handle Docker startup to override the 30s library default
                from openenv.core.containers.runtime.providers import LocalDockerProvider
                provider = LocalDockerProvider()
                base_url = provider.start_container(IMAGE_NAME)

                print(f"[DEBUG] Waiting for container {IMAGE_NAME} at {base_url} (Extended Timeout 120s)...", flush=True)
                provider.wait_for_ready(base_url, timeout_s=120.0)

                env = PolicyEvolverEnv(base_url=base_url, provider=provider)
                await env.connect()
            else:
                local_url = os.environ.get("ENV_BASE_URL", "http://127.0.0.1:8000")
                env = PolicyEvolverEnv(base_url=local_url)
                # For local testing, we might want to check connection immediately or let run_episode handle it
        except Exception as e:
            setup_error = Exception(f"Environment initialization failed: {e}")

    try:
        # 3. Always loop over tasks to ensure structured logs
        tasks = ["task_easy", "task_medium", "task_hard"]
        for task in tasks:
            await run_episode(client, env, task, setup_error=setup_error)
    finally:
        # 4. Final Cleanup — also runs when an episode exits the process.
        if env:
            try:
                await env.close()
            except Exception as e:
                print(f"[DEBUG] env.close() error: {e}", flush=True)


# Run the async entry point only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())