amaljoe88's picture
deploy: sync 712e5bc -> HF
cf6c0e0
"""Developer and Critic agents for VisionCoder OpenEnv.
All agent logic (tool-call handling, TODO-list critique, episode loop)
lives here. Prompts are in openenv.prompts.
Usage:
from openenv.agents import run_episode, AgentConfig
config = AgentConfig(api_key=..., api_base=..., model=...)
result = run_episode(env_client, config, difficulty="hard", session=obs, dbg=dbg)
"""
from __future__ import annotations
import json
import logging
import os
import re
from dataclasses import dataclass, field
from typing import List, Optional, Tuple
from openai import OpenAI
from openenv.prompts import (
DEVELOPER_SYSTEM,
FIRST_CRITIC_SYSTEM,
SUBSEQUENT_CRITIC_SYSTEM,
FALLBACK_HTML,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# TODO list tracker
# ---------------------------------------------------------------------------
_PRIORITY_ORDER = {"HIGH": 0, "MEDIUM": 1, "LOW": 2}
@dataclass
class TodoItem:
text: str # full item text including "PRIORITY | DIMENSION — description"
done: bool = False
priority: str = "MEDIUM" # HIGH / MEDIUM / LOW
@dataclass
class TodoList:
items: List[TodoItem] = field(default_factory=list)
def all_done(self) -> bool:
return bool(self.items) and all(item.done for item in self.items)
def pending_count(self) -> int:
return sum(1 for item in self.items if not item.done)
def format_for_critic(self) -> str:
"""Previous TODO list passed to Critic — includes priority tag for exact copying."""
if not self.items:
return "(No previous TODO list — this is the first review.)"
lines = ["Previous TODO list (copy with EXACT priority and text, update only the markers):"]
for item in self.items:
marker = "[✓]" if item.done else "[ ]"
lines.append(f"{marker} {item.priority} | {item.text}")
return "\n".join(lines)
def format_for_developer(self) -> str:
"""Pending items sorted by priority, formatted as actionable critique."""
_NOISE_PHRASES = (
"matches the reference", "which matches",
"is present and correct", "is correct", "matches reference",
)
pending = [
item for item in self.items
if not item.done
and not any(p in item.text.lower() for p in _NOISE_PHRASES)
]
if not pending:
return (
"The Critic found no remaining issues. Look carefully at the reference "
"screenshot for fine details (spacing, colors, missing elements) and refine."
)
# Sort by priority: HIGH first, cap at 8 so Developer gets focused feedback
pending.sort(key=lambda it: _PRIORITY_ORDER.get(it.priority, 1))
pending = pending[:8]
lines = ["Fix these issues in priority order (Critic feedback):"]
for item in pending:
lines.append(f"- [{item.priority}] {item.text}")
return "\n".join(lines)
@classmethod
def parse(cls, text: str) -> "TodoList":
"""Parse a TODO list from Critic output text.
Expected item format: [✓/[ ]/[+]] PRIORITY | DIMENSION — description
Priority tag (HIGH/MEDIUM/LOW) is optional — defaults to MEDIUM if absent.
[+] items are always kept pending (can't resolve same step they're discovered).
Duplicate and truncated items are dropped.
"""
_TRUNCATION_ENDINGS = (
" in", " on", " at", " to", " of", " for", " and", " the",
" a", " an", " with", " by", " from", " as", " or", " but",
)
_VALID_PRIORITIES = {"HIGH", "MEDIUM", "LOW"}
result = cls()
seen: set = set()
for line in text.split("\n"):
line = line.strip()
if line.startswith("[✓]"):
item_text = line[3:].strip()
done = True
elif line.startswith("[ ]"):
item_text = line[3:].strip()
done = False
elif line.startswith("[+]"):
item_text = line[3:].strip()
done = False
else:
continue
if len(item_text) < 10:
continue
if any(item_text.lower().endswith(e) for e in _TRUNCATION_ENDINGS):
continue
# Extract priority if present: "HIGH | LAYOUT — ..."
priority = "MEDIUM"
parts = item_text.split("|", 1)
if len(parts) == 2:
candidate = parts[0].strip().upper()
if candidate in _VALID_PRIORITIES:
priority = candidate
item_text = parts[1].strip()
key = item_text.lower()[:60]
if key not in seen:
seen.add(key)
result.items.append(TodoItem(text=item_text, done=done, priority=priority))
return result
@classmethod
def merge(cls, prev: "TodoList", updated: "TodoList") -> "TodoList":
"""Merge updated list back — re-adds any pending prev items the Critic forgot.
Uses 40-char prefix matching so paraphrased items count as the same issue.
Resolved prev items (done=True) are never re-added.
New [+] items introduced in this step are capped at 3 by priority so the
list doesn't balloon when the model ignores the per-step limit.
"""
prev_prefixes = {item.text.lower()[:40] for item in prev.items}
# Separate carried items (also in prev) from genuinely new [+] items
carried: list = []
new_items: list = []
for item in updated.items:
if item.text.lower()[:40] in prev_prefixes:
carried.append(item)
else:
new_items.append(item)
# Keep at most 3 new items (highest priority first)
new_items.sort(key=lambda it: _PRIORITY_ORDER.get(it.priority, 1))
new_items = new_items[:3]
result = cls(items=carried + new_items)
updated_prefixes = {item.text.lower()[:40] for item in result.items}
# Re-add any pending prev items the Critic dropped entirely
for prev_item in prev.items:
if prev_item.done:
continue
if prev_item.text.lower()[:40] not in updated_prefixes:
result.items.append(prev_item)
return result
# ---------------------------------------------------------------------------
# HTML helpers
# ---------------------------------------------------------------------------
def _looks_like_html(text: str) -> bool:
t = text.strip().lower()
return t.startswith("<!doctype") or t.startswith("<html")
def _parse_qwen_xml_tool_call(content: str) -> Optional[Tuple[str, dict]]:
"""Fallback parser for Qwen3's XML tool call format when vllm hermes parser misses it."""
if "<tool_call>" not in content:
return None
fn_m = re.search(r"<function=(\w+)>", content)
if not fn_m:
return None
func_name = fn_m.group(1)
args = {
m.group(1): m.group(2).strip()
for m in re.finditer(r"<parameter=(\w+)>(.*?)(?:</parameter>|\Z)", content, re.DOTALL)
}
return (func_name, args) if args else None
def _clean_html_output(content: str) -> str:
"""Strip residual <tool_call> wrapper or markdown fences from model output."""
parsed = _parse_qwen_xml_tool_call(content)
if parsed:
_, args = parsed
if "html" in args:
return args["html"]
fence = re.match(r"```(?:html)?\s*(.*?)\s*```", content, re.DOTALL)
if fence:
return fence.group(1)
return content
# ---------------------------------------------------------------------------
# Developer agent
# ---------------------------------------------------------------------------
def developer_turn(
client: OpenAI,
env_client, # unused — kept for signature compatibility
model: str,
ref_b64: str,
current_html: str,
todo: Optional[TodoList] = None,
dbg=None,
) -> str:
"""Developer generates HTML from the reference screenshot in a single LLM call.
No tools — rendering is the environment's responsibility after step().
On subsequent steps the Critic's TODO list is included so the Developer
knows exactly what to fix.
"""
if dbg:
dbg.log_developer_input(current_html, todo.format_for_developer() if todo else None)
messages = [{"role": "system", "content": DEVELOPER_SYSTEM}]
user_content: list = [
{"type": "text", "text": "Reference screenshot (reproduce this UI):"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{ref_b64}"}},
]
if current_html and todo and todo.items:
user_content.append({
"type": "text",
"text": (
f"\n\nYour previous HTML:\n```html\n{current_html[:5000]}\n```\n\n"
f"{todo.format_for_developer()}\n\n"
"Output the revised HTML only."
),
})
else:
user_content.append({
"type": "text",
"text": "\n\nGenerate complete HTML with inline CSS. Output the HTML only.",
})
messages.append({"role": "user", "content": user_content})
response = client.chat.completions.create(
model=model,
messages=messages,
max_tokens=4096,
temperature=0.7,
)
content = response.choices[0].message.content or ""
html_out = _clean_html_output(content)
if not _looks_like_html(html_out):
html_out = FALLBACK_HTML
if dbg:
dbg.log_developer_output(html_out)
return html_out
# ---------------------------------------------------------------------------
# Critic agent
# ---------------------------------------------------------------------------
def critic_turn(
client: OpenAI,
model: str,
ref_b64: str,
render_curr_b64: str,
prev_todo: Optional[TodoList],
render_prev_b64: Optional[str] = None,
current_html: str = "",
dbg=None,
) -> Tuple[str, TodoList]:
"""Critic reviews current render vs reference and returns (raw_text, updated TodoList).
Receives the Developer's HTML source so it can write selector-specific CSS fixes
instead of abstract visual observations.
"""
is_first = prev_todo is None
if dbg:
prev_critique_text = prev_todo.format_for_developer() if prev_todo else None
dbg.log_critic_input(ref_b64, render_prev_b64, prev_critique_text, render_curr_b64)
system = FIRST_CRITIC_SYSTEM if is_first else SUBSEQUENT_CRITIC_SYSTEM
critic_messages = [{"role": "system", "content": system}]
content: list = [
{"type": "text", "text": "Reference screenshot:"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{ref_b64}"}},
]
if render_prev_b64 and prev_todo:
content += [
{"type": "text", "text": "Previous render (before this step's revision):"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{render_prev_b64}"}},
]
content += [
{"type": "text", "text": "Current render:"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{render_curr_b64}"}},
]
if current_html:
content.append({
"type": "text",
"text": (
f"\nDeveloper's current HTML source (use exact selectors in your FIX instructions):\n"
f"```html\n{current_html[:5000]}\n```"
),
})
if is_first:
content.append({
"type": "text",
"text": (
"\nThis is the first review. Perform a comprehensive visual audit covering "
"LAYOUT, STRUCTURE, COLOR, TYPOGRAPHY, SPACING, and TEXT dimensions. "
"Output your initial TODO LIST with [+] items only. "
"Each item MUST include a → FIX: instruction with exact CSS."
),
})
else:
content.append({
"type": "text",
"text": (
f"\n{prev_todo.format_for_critic()}\n\n"
"Update the TODO list based on what you see in the CURRENT RENDER and HTML. "
"Mark fixed items [✓], keep unresolved items [ ] (update FIX selector if HTML changed), "
"add new issues with [+]. Each item must have a → FIX: instruction. "
"Stop after the last item — no STATUS or summary line."
),
})
critic_messages.append({"role": "user", "content": content})
response = client.chat.completions.create(
model=model,
messages=critic_messages,
max_tokens=2048,
temperature=0.1,
)
critique_text = response.choices[0].message.content or ""
updated_todo = TodoList.parse(critique_text)
if prev_todo:
if updated_todo.all_done():
# Critic explicitly marked every visible item [✓] — trust that signal.
# Skipping merge avoids re-adding items the Critic intentionally resolved.
pass
else:
updated_todo = TodoList.merge(prev_todo, updated_todo)
if dbg:
dbg.log_critic_output(critique_text, updated_todo)
return critique_text, updated_todo
# ---------------------------------------------------------------------------
# Episode config
# ---------------------------------------------------------------------------
@dataclass
class AgentConfig:
api_key: str
api_base: str
model: str
max_steps: int = 5
# ---------------------------------------------------------------------------
# Episode runner
# ---------------------------------------------------------------------------
@dataclass
class StepResult:
step: int
html: str
reward: float
done: bool
critique: str
todo: Optional[TodoList]
render_full_b64: Optional[str]
sub_rewards: Optional[dict]
error: Optional[str] = None
def run_episode(
env_client,
config: AgentConfig,
session_id: str,
ref_b64: str,
dbg=None,
on_step=None, # optional callback(StepResult) → None, called immediately after env step
) -> List[StepResult]:
"""Run one full episode (Developer↔Critic loop) and return per-step results.
Terminates when:
- max_steps reached (env done=True)
- Critic marks all TODO items resolved
- No reward improvement for 2 consecutive steps (plateau)
Monotonic reward guarantee: Developer always receives the best-seen HTML as
its base, so regressions don't compound. If a step produces lower reward the
Developer retries from the best-known state on the next step.
"""
client = OpenAI(api_key=config.api_key, base_url=config.api_base)
current_html = ""
best_html = ""
best_reward = 0.0
no_improve_streak = 0
_MAX_NO_IMPROVE = 2
todo: Optional[TodoList] = None
render_prev: Optional[str] = None
results: List[StepResult] = []
for step_i in range(config.max_steps):
# Guard: Critic resolved everything
if todo is not None and todo.pending_count() == 0:
break
# Guard: plateau — no improvement for N consecutive steps
if no_improve_streak >= _MAX_NO_IMPROVE:
print(
f"[CRITIC] No improvement for {_MAX_NO_IMPROVE} consecutive steps "
f"(best={best_reward:.3f}) — stopping early.",
flush=True,
)
break
error: Optional[str] = None
# Developer always starts from the best-seen HTML to avoid compounding regressions
try:
current_html = developer_turn(
client, env_client, config.model,
ref_b64, best_html, todo, dbg,
)
except Exception as exc:
error = str(exc)[:120]
current_html = FALLBACK_HTML
# Step the environment
step_resp = env_client.post(
"/step",
json={"html": current_html, "session_id": session_id},
)
step_resp.raise_for_status()
result = step_resp.json()
reward = float(result.get("reward", 0.0))
env_done = bool(result.get("done", False))
render_full = result.get("render_full")
sub_rewards = result.get("metadata", {}).get("rewards")
# Monotonic tracking — update best only on genuine improvement
if reward > best_reward:
best_reward = reward
best_html = current_html
no_improve_streak = 0
else:
no_improve_streak += 1
if dbg:
dbg.log_step_result(reward, env_done, render_full, sub_rewards)
step_n = step_i + 1
sr = StepResult(
step=step_n,
html=current_html,
reward=reward,
done=env_done,
critique="",
todo=todo,
render_full_b64=render_full,
sub_rewards=sub_rewards,
error=error,
)
# Notify caller immediately so [STEP] prints before [CRITIC]
if on_step:
on_step(sr)
# Critic turn (skip on final env step)
if not env_done:
try:
critique_text, todo = critic_turn(
client, config.model,
ref_b64, render_full,
prev_todo=todo,
render_prev_b64=render_prev,
current_html=current_html,
dbg=dbg,
)
sr.critique = critique_text
sr.todo = todo
preview = critique_text.replace("\n", " ")[:200]
print(
f"[CRITIC] step={step_n} reward={reward:.3f} best={best_reward:.3f}{preview}",
flush=True,
)
except Exception as exc:
logger.warning("Critic failed: %s", exc)
todo = None
results.append(sr)
render_prev = render_full
if env_done:
break
if todo is not None and todo.pending_count() == 0:
print(
f"[CRITIC] All items resolved at step={step_n} reward={reward:.3f} — stopping.",
flush=True,
)
break
return results
# ---------------------------------------------------------------------------
# Approach B: Long-horizon Developer (no Critic, sees full history)
# ---------------------------------------------------------------------------
def developer_turn_long_horizon(
client: OpenAI,
model: str,
ref_b64: str,
history: List[Tuple[str, str]], # list of (render_full_b64, html)
dbg=None,
) -> str:
"""Developer with full history: reference + all previous renders + all previous HTML."""
messages = [{"role": "system", "content": DEVELOPER_SYSTEM}]
user_content: list = [
{"type": "text", "text": "Reference screenshot (reproduce this UI):"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{ref_b64}"}},
]
if history:
for i, (render_b64, prev_html) in enumerate(history, 1):
user_content.append({
"type": "text",
"text": f"\n\nStep {i} render:",
})
user_content.append({
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{render_b64}"},
})
user_content.append({
"type": "text",
"text": f"Step {i} HTML:\n```html\n{prev_html[:2000]}\n```",
})
user_content.append({
"type": "text",
"text": (
"\n\nAll your previous attempts are shown above. "
"Generate improved HTML that better matches the reference. "
"Output the HTML only."
),
})
else:
user_content.append({
"type": "text",
"text": "\n\nGenerate complete HTML with inline CSS. Output the HTML only.",
})
messages.append({"role": "user", "content": user_content})
response = client.chat.completions.create(
model=model,
messages=messages,
max_tokens=4096,
temperature=0.7,
)
content = response.choices[0].message.content or ""
html_out = _clean_html_output(content)
return html_out if _looks_like_html(html_out) else FALLBACK_HTML
def run_episode_long_dev(
env_client,
config: AgentConfig,
session_id: str,
ref_b64: str,
dbg=None,
on_step=None,
) -> List[StepResult]:
"""Approach B: Long-horizon Developer only — full history, no Critic."""
client = OpenAI(api_key=config.api_key, base_url=config.api_base)
current_html = ""
history: List[Tuple[str, str]] = []
results: List[StepResult] = []
for step_i in range(config.max_steps):
error: Optional[str] = None
try:
current_html = developer_turn_long_horizon(
client, config.model, ref_b64, history, dbg
)
except Exception as exc:
error = str(exc)[:120]
current_html = FALLBACK_HTML
step_resp = env_client.post(
"/step",
json={"html": current_html, "session_id": session_id},
)
step_resp.raise_for_status()
result = step_resp.json()
reward = float(result.get("reward", 0.0))
env_done = bool(result.get("done", False))
render_full = result.get("render_full")
sub_rewards = result.get("metadata", {}).get("rewards")
step_n = step_i + 1
sr = StepResult(
step=step_n,
html=current_html,
reward=reward,
done=env_done,
critique="",
todo=None,
render_full_b64=render_full,
sub_rewards=sub_rewards,
error=error,
)
if on_step:
on_step(sr)
if render_full:
history.append((render_full, current_html))
results.append(sr)
if env_done:
break
return results
# ---------------------------------------------------------------------------
# Approach C: Short-horizon Developer (no Critic, sees only last render)
# ---------------------------------------------------------------------------
def developer_turn_short_horizon(
client: OpenAI,
model: str,
ref_b64: str,
prev_render_b64: Optional[str],
prev_html: Optional[str],
dbg=None,
) -> str:
"""Developer with short horizon: reference + only last render + only last HTML."""
messages = [{"role": "system", "content": DEVELOPER_SYSTEM}]
user_content: list = [
{"type": "text", "text": "Reference screenshot (reproduce this UI):"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{ref_b64}"}},
]
if prev_render_b64 and prev_html:
user_content += [
{"type": "text", "text": "\n\nYour previous render:"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{prev_render_b64}"}},
{
"type": "text",
"text": (
f"\n\nYour previous HTML:\n```html\n{prev_html[:3000]}\n```\n\n"
"Compare the renders and output improved HTML only."
),
},
]
else:
user_content.append({
"type": "text",
"text": "\n\nGenerate complete HTML with inline CSS. Output the HTML only.",
})
messages.append({"role": "user", "content": user_content})
response = client.chat.completions.create(
model=model,
messages=messages,
max_tokens=4096,
temperature=0.7,
)
content = response.choices[0].message.content or ""
html_out = _clean_html_output(content)
return html_out if _looks_like_html(html_out) else FALLBACK_HTML
def run_episode_short_dev(
env_client,
config: AgentConfig,
session_id: str,
ref_b64: str,
dbg=None,
on_step=None,
) -> List[StepResult]:
"""Approach C: Short-horizon Developer only — sees only last render each step, no Critic."""
client = OpenAI(api_key=config.api_key, base_url=config.api_base)
current_html = ""
prev_render: Optional[str] = None
results: List[StepResult] = []
for step_i in range(config.max_steps):
error: Optional[str] = None
try:
current_html = developer_turn_short_horizon(
client, config.model, ref_b64,
prev_render,
current_html if step_i > 0 else None,
dbg,
)
except Exception as exc:
error = str(exc)[:120]
current_html = FALLBACK_HTML
step_resp = env_client.post(
"/step",
json={"html": current_html, "session_id": session_id},
)
step_resp.raise_for_status()
result = step_resp.json()
reward = float(result.get("reward", 0.0))
env_done = bool(result.get("done", False))
render_full = result.get("render_full")
sub_rewards = result.get("metadata", {}).get("rewards")
step_n = step_i + 1
sr = StepResult(
step=step_n,
html=current_html,
reward=reward,
done=env_done,
critique="",
todo=None,
render_full_b64=render_full,
sub_rewards=sub_rewards,
error=error,
)
if on_step:
on_step(sr)
prev_render = render_full # only keep the latest render
results.append(sr)
if env_done:
break
return results
# ---------------------------------------------------------------------------
# Approach D: Long-horizon Developer (low-res renders) + simple free-form Critic
# ---------------------------------------------------------------------------
_SIMPLE_CRITIC_SYSTEM = (
"You are a UI reviewer. You will be shown a reference screenshot and a current render "
"of HTML that is meant to reproduce it.\n\n"
"Describe what needs to change the most to make the render match the reference. "
"Be concise and specific — mention exact colors, sizes, or elements where helpful. "
"You can write a short paragraph or a bullet list. No structured format required."
)
_SIMPLE_DEV_SYSTEM = (
"You are a UI-to-code expert. Given a reference screenshot of a web page, "
"generate complete HTML with inline CSS that reproduces the layout as accurately as possible.\n\n"
"Critical layout rules:\n"
"- Always use `* { box-sizing: border-box; margin: 0; padding: 0; }` reset.\n"
"- Page and all top-level sections must be full-width: `width: 100%; min-height: 100vh`.\n"
"- Never center-constrain the overall page — only constrain inner content containers if the reference does.\n"
"- Match background colors, section colors, and typography as precisely as possible.\n\n"
"Output ONLY the raw HTML code starting with <!DOCTYPE html>. "
"No explanations, no markdown fences — just the HTML."
)
def _simple_critic_turn(
client: OpenAI,
model: str,
ref_b64: str,
render_full_b64: str,
) -> str:
"""Simple free-form critic: compare ref vs render, say what needs to change most."""
messages = [{"role": "system", "content": _SIMPLE_CRITIC_SYSTEM}]
messages.append({"role": "user", "content": [
{"type": "text", "text": "Reference:"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{ref_b64}"}},
{"type": "text", "text": "Current render:"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{render_full_b64}"}},
{"type": "text", "text": "What needs to change the most?"},
]})
response = client.chat.completions.create(
model=model,
messages=messages,
max_tokens=512,
temperature=0.1,
)
return response.choices[0].message.content or ""
def _developer_turn_d(
client: OpenAI,
model: str,
ref_b64: str,
history: List[Tuple[str, str]], # (render_low_b64, html)
critique: Optional[str],
) -> str:
"""Approach D developer: full-res ref + all previous low-res renders + all HTML + critic feedback."""
messages = [{"role": "system", "content": _SIMPLE_DEV_SYSTEM}]
user_content: list = [
{"type": "text", "text": "Reference screenshot (reproduce this UI):"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{ref_b64}"}},
]
if history:
for i, (render_low_b64, prev_html) in enumerate(history, 1):
user_content.append({"type": "text", "text": f"\n\nStep {i} render (low-res preview):"})
user_content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{render_low_b64}"}})
user_content.append({"type": "text", "text": f"Step {i} HTML:\n```html\n{prev_html[:2000]}\n```"})
if critique:
user_content.append({
"type": "text",
"text": f"\n\nReviewer feedback on your last render:\n{critique}\n\nGenerate improved HTML addressing this feedback. Output the HTML only.",
})
elif history:
user_content.append({
"type": "text",
"text": "\n\nGenerate improved HTML that better matches the reference. Output the HTML only.",
})
else:
user_content.append({
"type": "text",
"text": "\n\nGenerate complete HTML with inline CSS. Output the HTML only.",
})
messages.append({"role": "user", "content": user_content})
response = client.chat.completions.create(
model=model,
messages=messages,
max_tokens=4096,
temperature=0.7,
)
content = response.choices[0].message.content or ""
html_out = _clean_html_output(content)
return html_out if _looks_like_html(html_out) else FALLBACK_HTML
def run_episode_d(
env_client,
config: AgentConfig,
session_id: str,
ref_b64: str,
dbg=None,
on_step=None,
) -> List[StepResult]:
"""Approach D: long-horizon dev (low-res renders) + simple free-form critic."""
client = OpenAI(api_key=config.api_key, base_url=config.api_base)
current_html = ""
history: List[Tuple[str, str]] = [] # (render_low_b64, html)
critique: Optional[str] = None
results: List[StepResult] = []
for step_i in range(config.max_steps):
error: Optional[str] = None
try:
current_html = _developer_turn_d(
client, config.model, ref_b64, history, critique
)
except Exception as exc:
error = str(exc)[:120]
current_html = FALLBACK_HTML
step_resp = env_client.post(
"/step",
json={"html": current_html, "session_id": session_id},
)
step_resp.raise_for_status()
result = step_resp.json()
reward = float(result.get("reward", 0.0))
env_done = bool(result.get("done", False))
render_full = result.get("render_full")
render_low = result.get("render_low")
sub_rewards = result.get("metadata", {}).get("rewards")
step_n = step_i + 1
sr = StepResult(
step=step_n,
html=current_html,
reward=reward,
done=env_done,
critique=critique or "",
todo=None,
render_full_b64=render_full,
sub_rewards=sub_rewards,
error=error,
)
if on_step:
on_step(sr)
if render_low:
history.append((render_low, current_html))
# Critic turn (skip on final env step)
if not env_done and render_full:
try:
critique = _simple_critic_turn(client, config.model, ref_b64, render_full)
preview = critique.replace("\n", " ")[:200]
print(f"[CRITIC-D] step={step_n} reward={reward:.2f}{preview}", flush=True)
except Exception as exc:
logger.warning("Critic-D failed: %s", exc)
critique = None
results.append(sr)
if env_done:
break
return results