File size: 19,936 Bytes
acf77ab | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 | from __future__ import 
annotations
import logging
import re
import uuid
from pathlib import Path
from typing import Any
from openenv.core.env_server.interfaces import Environment
from codeforge.audit.ledger import AuditLedger
from codeforge.grader import compute_reward
from codeforge.grounder import ground
from codeforge.interrogator.interrogator import Interrogator
from codeforge.kb.cluster import build_clusters
from codeforge.kb.indexer import SkillsIndex
from codeforge.models import AuditEntry, CodeForgeAction, CodeForgeActionType, CodeForgeObservation
from codeforge.observation import build_observation
from codeforge.ralph.loop import run_loop
from codeforge.ralph.models import LoopConfig
from codeforge.ralph.synthesizer import StubSynthesizer, Synthesizer
from codeforge.sandbox.sandbox import run_sandbox
from codeforge.shaping import citation_shaping_bonus
from codeforge.tasks import Task, get_task
# Module-level logger, named after this module's import path.
_log = logging.getLogger(__name__)
# Corpus used when the environment is constructed without ``corpus_path``:
# ``kb/skills_corpus.jsonl`` in the directory that contains this file.
_DEFAULT_CORPUS = Path(__file__).resolve().parent / "kb" / "skills_corpus.jsonl"
# ---------------------------------------------------------------------------
# Filename validation (SYSTEM_DESIGN §14.2, §14.3)
# ---------------------------------------------------------------------------
_FILENAME_RE = re.compile(r"^[a-z][a-z0-9_]*\.py$")
_FORBIDDEN_FILENAMES = frozenset({
"conftest.py", "pytest.ini", "setup.cfg", "pyproject.toml", "tox.ini",
})
_MAX_FILES = 10
_MAX_FILE_SIZE = 50 * 1024 # 50 KB
_MAX_TOTAL_SIZE = 200 * 1024 # 200 KB
def _validate_files(files: dict[str, str]) -> str | None:
"""Return an error message if *files* violates submission constraints, else None."""
if not files:
return "files dict is empty"
if len(files) > _MAX_FILES:
return f"too many files ({len(files)} > {_MAX_FILES})"
total_size = 0
for name, content in files.items():
if name in _FORBIDDEN_FILENAMES:
return f"filename '{name}' is not allowed"
if not _FILENAME_RE.match(name):
return f"filename '{name}' must match [a-z][a-z0-9_]*.py"
size = len(content.encode("utf-8"))
if size > _MAX_FILE_SIZE:
return f"file '{name}' exceeds {_MAX_FILE_SIZE} bytes"
total_size += size
if total_size > _MAX_TOTAL_SIZE:
return f"total size ({total_size}) exceeds {_MAX_TOTAL_SIZE} bytes"
return None
# ---------------------------------------------------------------------------
# Valid action types (for fast membership check)
# ---------------------------------------------------------------------------
# The ``.value`` of every CodeForgeActionType member, collected once at import
# time; step() tests the stringified action type against this set.
_VALID_ACTION_TYPES = frozenset(member.value for member in CodeForgeActionType)
# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
class CodeForgeEnvironment(Environment): # type: ignore[type-arg]
"""OpenEnv-compliant RL environment with all 6 CodeForge actions.
Implements SYSTEM_DESIGN §4.9, §5.2, §17.
"""
SUPPORTS_CONCURRENT_SESSIONS = True
def __init__(
    self,
    *,
    corpus_path: Path | None = None,
    synthesizer: Synthesizer | None = None,
) -> None:
    """Create an environment with no active episode.

    Args:
        corpus_path: Skills corpus file to index; falls back to
            ``_DEFAULT_CORPUS`` when omitted (or falsy).
        synthesizer: Synthesizer stored for later use; may be ``None``.
    """
    super().__init__()

    # Construction-time configuration; the index itself is built lazily.
    self._synthesizer = synthesizer
    self._corpus_path = corpus_path or _DEFAULT_CORPUS
    self._index: SkillsIndex | None = None

    # Episode identity and progress.
    self._episode_id: str = ""
    self._task: Task | None = None
    self._is_done: bool = False
    self._step_index: int = 0
    self._budget_remaining: int = 0
    self._previous_score: float = 0.0
    self._current_files: dict[str, str] = {}
    self._ledger: AuditLedger | None = None

    # Results of the most recent step.
    self._last_reward: float = 0.0
    self._last_grounding: dict[str, object] | None = None
    self._last_citations: tuple[dict[str, object], ...] = ()
    self._last_cluster_hits: tuple[str, ...] = ()
    self._last_interrogation_questions: tuple[str, ...] = ()
    self._last_ralph_iterations: tuple[dict[str, object], ...] = ()
    self._last_ralph_run_id: str | None = None

    # Calibration figures copied into audit entries.
    self._last_quality: float = 0.0
    self._last_brier_penalty: float | None = None

    # Everything retrieved so far in the current episode.
    self._all_episode_cluster_hits: list[str] = []
    self._all_episode_citations: list[dict[str, object]] = []
# ------------------------------------------------------------------
# Index management
# ------------------------------------------------------------------
def _ensure_index(self) -> SkillsIndex:
    """Return the skills index, building and caching it on first use.

    The first call builds a ``SkillsIndex`` over the corpus file, parses every
    non-blank corpus line as JSON, clusters the parsed nodes and attaches the
    resulting manifest to the index.

    Raises:
        FileNotFoundError: If the corpus path is not an existing file.
    """
    if self._index is not None:
        return self._index

    corpus = self._corpus_path
    if not corpus.is_file():
        raise FileNotFoundError(
            f"corpus not found: {corpus}. "
            f"Run the skills scraper first."
        )

    index = SkillsIndex(corpus_path=corpus)
    index.build()

    # Second pass over the corpus: the raw nodes feed the cluster builder.
    import json

    with corpus.open(encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        records: list[dict[str, Any]] = [
            json.loads(text) for text in stripped_lines if text
        ]

    index.attach_cluster_manifest(build_clusters(records))

    # Cache only after every step above succeeded.
    self._index = index
    return index
# ------------------------------------------------------------------
# OpenEnv interface
# ------------------------------------------------------------------
def reset(
    self,
    seed: int | None = None,
    episode_id: str | None = None,
    **kwargs: Any,
) -> CodeForgeObservation:
    """Start a new episode and return its initial observation.

    Args:
        seed: Accepted for interface compatibility; not used by this method.
        episode_id: Identifier for the episode; a random 12-character hex
            string is generated when omitted (or empty).
        **kwargs: ``task_level`` selects the task (default ``"easy"``);
            other keys are ignored here.

    Returns:
        The observation built from the freshly reset state.
    """
    task_level: str = kwargs.get("task_level", "easy")
    task = get_task(task_level)
    self._task = task
    self._episode_id = episode_id or uuid.uuid4().hex[:12]
    self._budget_remaining = task.max_budget
    # Copy so the episode never mutates the task's own file mapping.
    self._current_files = dict(task.initial_files)
    self._previous_score = 0.0
    self._is_done = False
    # Reset per-step
    self._last_citations = ()
    self._last_grounding = None
    self._last_reward = 0.0
    self._last_cluster_hits = ()
    self._last_interrogation_questions = ()
    self._last_ralph_run_id = None
    self._last_ralph_iterations = ()
    # Calibration figures feed audit entries; restore their __init__ values so
    # nothing from a previous episode can leak into this one.
    self._last_brier_penalty = None
    self._last_quality = 0.0
    # Reset episode accumulators
    self._all_episode_citations = []
    self._all_episode_cluster_hits = []
    self._ledger = AuditLedger()
    self._step_index = 0
    _log.info(
        "reset id=%s task=%s budget=%s",
        self._episode_id, task.task_id, task.max_budget,
    )
    return self._build_obs()
def step(
    self,
    action: CodeForgeAction,
    timeout_s: float | None = None,
    **kwargs: Any,
) -> CodeForgeObservation:
    """Apply one action to the active episode and return the observation.

    Rejected actions (no episode, unknown type, insufficient budget) yield an
    error observation and are neither charged nor audited.  A finished episode
    yields the current observation unchanged.  Otherwise the cost is charged,
    the handler runs (exceptions become an ``Internal error`` message), an
    audit entry is appended, and the episode ends once the budget is spent.
    """
    # --- Guards: nothing is charged or recorded on these paths -----------
    if self._task is None:
        return self._error_obs("No active episode — call reset() first")
    if self._is_done:
        return self._build_obs()

    action_type_str = str(action.action_type)
    if action_type_str not in _VALID_ACTION_TYPES:
        return self._error_obs(f"Unknown action_type: {action_type_str!r}")

    # --- Charge the variable cost up front --------------------------------
    cost = self._action_cost(action)
    if cost > self._budget_remaining:
        return self._error_obs(
            f"Insufficient budget: need {cost}, have {self._budget_remaining}"
        )
    self._budget_remaining -= cost

    # --- Forget the previous step's results -------------------------------
    self._last_reward = 0.0
    self._last_grounding = None
    self._last_citations = ()
    self._last_cluster_hits = ()
    self._last_interrogation_questions = ()
    self._last_ralph_iterations = ()
    self._last_ralph_run_id = None

    # --- Dispatch: first route whose type equals the action type ----------
    error: str | None = None
    try:
        routes = (
            (CodeForgeActionType.QUERY_KB, self._handle_query_kb),
            (CodeForgeActionType.QUERY_CLUSTER, self._handle_query_cluster),
            (CodeForgeActionType.INTERROGATE, self._handle_interrogate),
            (CodeForgeActionType.SUBMIT, self._handle_submit),
            (CodeForgeActionType.RUN_RALPH, self._handle_run_ralph),
            (CodeForgeActionType.GET_AUDIT, self._handle_get_audit),
        )
        for kind, handler in routes:
            if action_type_str == kind:
                error = handler(action)
                break
    except Exception as exc:
        _log.exception("handler error: %s", exc)
        error = f"Internal error: {exc}"

    # --- Record the step in the audit ledger ------------------------------
    assert self._ledger is not None
    cited_ids: tuple[str, ...] = tuple(
        str(citation.get("node_id", "")) for citation in self._last_citations
    )
    # Calibration fields are only reported for submit actions.
    is_submit = action_type_str == CodeForgeActionType.SUBMIT
    entry = AuditEntry(
        step_index=self._step_index,
        action_type=action_type_str,
        cited_skill_ids=cited_ids,
        cited_clusters=self._last_cluster_hits,
        grounding_report=self._last_grounding or None,
        reward=self._last_reward,
        brier_penalty=self._last_brier_penalty if is_submit else None,
        confidence_declared=action.confidence if is_submit else None,
        quality=self._last_quality if is_submit else self._previous_score,
    )
    self._ledger.append(entry)
    self._step_index += 1

    # --- A spent budget ends the episode -----------------------------------
    if self._budget_remaining <= 0:
        self._is_done = True
    return self._build_obs(error=error)
@property
def state(self) -> CodeForgeObservation:
    """Current observation, or an error observation when no episode is active."""
    if self._task is not None:
        return self._build_obs()
    return self._error_obs("No active episode — call reset() first")
# ------------------------------------------------------------------
# Cost computation
# ------------------------------------------------------------------
@staticmethod
def _action_cost(action: CodeForgeAction) -> int:
    """Variable-cost budget accounting (SYSTEM_DESIGN §17.2).

    ``GET_AUDIT`` is free, ``RUN_RALPH`` costs its requested ``max_iters``,
    and every other action costs 1.

    Returns:
        The non-negative number of budget units to charge for *action*.
    """
    kind = str(action.action_type)
    if kind == CodeForgeActionType.GET_AUDIT:
        return 0
    if kind == CodeForgeActionType.RUN_RALPH:
        # Never return a negative cost: step() subtracts the cost from the
        # budget, so a negative max_iters would otherwise refund budget.
        # A missing max_iters counts as 0 instead of breaking the comparison
        # in step().
        # NOTE(review): max_iters == 0 still makes the action free; confirm
        # whether the action model already enforces a minimum of 1.
        return max(0, int(action.max_iters or 0))
    return 1
# ------------------------------------------------------------------
# Action handlers (each returns an error string or None)
# ------------------------------------------------------------------
def _handle_query_kb(self, action: CodeForgeAction) -> str | None:
    """Search the skills index and record the hits as citations.

    A missing corpus is logged and treated as "no citations" rather than as
    an error.  Always returns ``None``.
    """
    try:
        index = self._ensure_index()
    except FileNotFoundError as e:
        _log.warning("query_kb: no corpus: %s", e)
        self._last_citations = ()
        return None

    def _as_citation(hit: Any) -> dict[str, object]:
        # Plain-dict view of one search hit.
        return {
            "node_id": hit.node_id,
            "skill_name": hit.skill_name,
            "section_path": list(hit.section_path),
            "section_body": hit.section_body,
            "score": hit.score,
            "rank": hit.rank,
        }

    # An empty or absent tag list means "no tag filter".
    tag_filter = set(action.required_tags) if action.required_tags else None
    hits = index.search(
        action.claim or "",
        top_k=action.top_k,
        required_tags=tag_filter,
    )

    citations = tuple([_as_citation(hit) for hit in hits])
    self._last_citations = citations
    self._all_episode_citations.extend(citations)
    return None
def _handle_query_cluster(self, action: CodeForgeAction) -> str | None:
    """Record the node ids belonging to the requested cluster.

    A missing corpus is logged and treated as "no hits" rather than as an
    error.  Always returns ``None``.
    """
    try:
        index = self._ensure_index()
    except FileNotFoundError as e:
        _log.warning("query_cluster: no corpus: %s", e)
        self._last_cluster_hits = ()
        return None

    members = index.nodes_in_cluster(action.cluster_label or "")
    if members:
        hit_ids = tuple(member.node_id for member in members)
        self._last_cluster_hits = hit_ids
        self._all_episode_cluster_hits.extend(hit_ids)
    else:
        self._last_cluster_hits = ()
    return None
def _handle_interrogate(self, action: CodeForgeAction) -> str | None:
    """Generate interrogation questions for the current task brief.

    The interrogator receives the skills index, or ``None`` when the corpus
    file is missing.  Always returns ``None``.
    """
    index: SkillsIndex | None = None
    try:
        index = self._ensure_index()
    except FileNotFoundError:
        # No corpus: the interrogator is built without an index.
        index = None

    questioner = Interrogator(index)
    assert self._task is not None
    outcome = questioner.generate(self._task.brief)
    self._last_interrogation_questions = outcome.questions
    return None
def _handle_submit(self, action: CodeForgeAction) -> str | None:
    """Score a submission and update reward, quality and calibration state.

    The submitted files (plus the task's hidden tests, if any) are run in the
    sandbox, the concatenated submitted sources are grounded, and the reward
    is computed from both.  A positive reward additionally receives the
    citation shaping bonus, capped at 1.0.  The episode is marked done when
    the reward reaches the task's target score.

    Returns:
        An error string when ``action.files`` is missing or fails validation,
        otherwise ``None``.
    """
    # Clear the calibration figures first: step() copies them into the audit
    # entry of every submit, so a rejected submission must not inherit the
    # values left behind by an earlier, scored one.
    self._last_brier_penalty = None
    self._last_quality = 0.0

    if action.files is None:
        return "files required for submit"
    file_err = _validate_files(action.files)
    if file_err is not None:
        return file_err
    self._current_files = dict(action.files)
    assert self._task is not None
    # Merge hidden correctness tests into sandbox files (agent cannot see these)
    sandbox_files = dict(action.files)
    if self._task.hidden_tests:
        # update() lets a hidden test replace a submitted file of the same name.
        sandbox_files.update(self._task.hidden_tests)
    # Run sandbox; a sandbox failure is logged and scored as 0.0.
    try:
        sandbox_result = run_sandbox(
            files=sandbox_files,
            tools=self._task.tools,
            timeout_per_tool=30.0,
        )
        sandbox_score = sandbox_result.composite_score
    except Exception as e:
        _log.exception("sandbox error: %s", e)
        sandbox_score = 0.0
    # Run grounder (pass local module names so they're not penalized)
    local_modules = frozenset(
        f.removesuffix(".py") for f in action.files if f.endswith(".py")
    )
    concatenated = "\n".join(action.files.values())
    grounding_report = ground(concatenated, local_modules=local_modules)
    self._last_grounding = grounding_report.model_dump()
    # Compute reward with Brier calibration.  An undeclared confidence is
    # treated as 0.5 for the penalty, and the penalty is capped at 0.5.
    quality = 0.6 * sandbox_score + 0.4 * grounding_report.groundedness
    effective_conf = action.confidence if action.confidence is not None else 0.5
    brier_penalty: float | None = min((effective_conf - quality) ** 2, 0.5)
    self._last_brier_penalty = brier_penalty
    self._last_quality = quality
    reward = compute_reward(
        sandbox_score=sandbox_score,
        groundedness=grounding_report.groundedness,
        confidence=action.confidence,
    )
    # Apply citation shaping bonus only on successful submits (§4.8.4)
    if reward > 0:
        shaping = citation_shaping_bonus(
            submit_files=action.files,
            prior_citations=self._all_episode_citations,
            prior_cluster_hits=self._all_episode_cluster_hits,
        )
        reward = round(min(1.0, reward + shaping), 3)
    self._last_reward = reward
    self._previous_score = reward
    # Check target score
    if reward >= self._task.target_score:
        self._is_done = True
    return None
def _handle_run_ralph(self, action: CodeForgeAction) -> str | None:
    """Run the ralph loop on the current files and score its final output.

    The loop starts from ``self._current_files`` and its final files replace
    them.  The reward is the base reward for the final score and groundedness
    (with a fixed confidence of 0.75) minus 0.05 for every iteration whose
    reason is ``score_regressed`` or ``score_plateau``, clamped to [0, 1].

    Returns:
        An error string when the corpus is unavailable, otherwise ``None``.
    """
    assert self._task is not None
    try:
        idx = self._ensure_index()
    except FileNotFoundError as e:
        return f"corpus not available: {e}"
    config = LoopConfig(
        max_iters=action.max_iters,
        target_score=self._task.target_score,
        tools=self._task.tools,
    )
    # Fall back to the stub when no synthesizer was injected.
    synthesizer = self._synthesizer or StubSynthesizer()
    result = run_loop(
        spec=self._task.brief,
        initial_files=self._current_files,
        index=idx,
        synthesizer=synthesizer,
        config=config,
    )
    self._last_ralph_run_id = result.run_id
    self._last_ralph_iterations = tuple(
        it.model_dump() for it in result.iterations
    )
    final_files = result.final_files
    self._current_files = dict(final_files)
    # Compute ralph reward (SYSTEM_DESIGN §4.8.5)
    # Pass the generated files' own module names to the grounder, exactly as
    # the submit path does, so imports between them are not penalized.
    local_modules = frozenset(
        name.removesuffix(".py") for name in final_files if name.endswith(".py")
    )
    concatenated = "\n".join(final_files.values())
    grounding_report = ground(concatenated, local_modules=local_modules)
    self._last_grounding = grounding_report.model_dump()
    wasted = sum(
        1 for it in result.iterations if it.reason in ("score_regressed", "score_plateau")
    )
    base = compute_reward(
        sandbox_score=result.final_score,
        groundedness=grounding_report.groundedness,
        confidence=0.75,
    )
    waste_penalty = wasted * 0.05
    ralph_reward = round(max(0.0, min(1.0, base - waste_penalty)), 3)
    self._last_reward = ralph_reward
    self._previous_score = ralph_reward
    return None
def _handle_get_audit(self, action: CodeForgeAction) -> str | None:
    """Do nothing and report no error.

    The audit data itself is attached to every observation by ``_build_obs``
    (as ``cumulative_audit_summary``), so this handler has no work of its own.
    """
    _ = action  # part of the common handler signature; intentionally unused
    return None
# ------------------------------------------------------------------
# Observation helpers
# ------------------------------------------------------------------
def _build_obs(self, *, error: str | None = None) -> CodeForgeObservation:
    """Build an observation from the current episode state.

    Args:
        error: Optional error message to attach to the observation.

    The serialized audit ledger is included whenever a ledger exists.
    """
    assert self._task is not None
    audit_summary: dict[str, object] | None = (
        self._ledger.serialize() if self._ledger is not None else None
    )
    fields: dict[str, Any] = {
        "episode_id": self._episode_id,
        "task": self._task,
        "current_files": self._current_files,
        "budget_remaining": self._budget_remaining,
        "previous_score": self._previous_score,
        "last_citations": self._last_citations,
        "last_grounding": self._last_grounding,
        "is_done": self._is_done,
        "last_reward": self._last_reward,
        "last_cluster_hits": self._last_cluster_hits,
        "last_interrogation_questions": self._last_interrogation_questions,
        "last_ralph_run_id": self._last_ralph_run_id,
        "last_ralph_iterations": self._last_ralph_iterations,
        "cumulative_audit_summary": audit_summary,
        "error": error,
    }
    return build_observation(**fields)
def _error_obs(self, msg: str) -> CodeForgeObservation:
    """Return an observation carrying *msg* as its error, leaving state untouched.

    With an active task this is the regular observation plus the error.
    Without one, the "easy" task is fetched purely to give the observation a
    task to describe.
    """
    if self._task is not None:
        return self._build_obs(error=msg)

    # No episode yet: borrow a task so the observation can still be built.
    placeholder_task = get_task("easy")
    episode_label = self._episode_id or "none"
    return build_observation(
        episode_id=episode_label,
        task=placeholder_task,
        current_files=self._current_files,
        budget_remaining=self._budget_remaining,
        previous_score=self._previous_score,
        is_done=self._is_done,
        error=msg,
    )
|