diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..149a453e2c06ad87de858c984702ed5f87027c15
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+__pycache__/
+*.pyc
+*.pyo
+.env
+*.egg-info/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..f47f7e010e9d11ae9807dac8b0445505137b485c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,27 @@
+# Hugging Face Space Dockerfile.
+# Mirrors the root Dockerfile, exists separately because HF Spaces looks for
+# the Dockerfile inside the Space root by default.
+
+FROM python:3.11-slim
+
+WORKDIR /app
+
+COPY pyproject.toml ./
+COPY graphforge ./graphforge
+COPY env ./env
+COPY openenv.yaml ./
+
+RUN pip install --no-cache-dir \
+ "pydantic>=2.6" \
+ "fastapi>=0.110" \
+ "uvicorn[standard]>=0.27" \
+ "httpx>=0.27" \
+ "openenv-core>=0.1.0" \
+ "pyyaml>=6.0"
+
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONPATH=/app
+
+EXPOSE 7860
+
+CMD ["uvicorn", "env.server:app", "--host", "0.0.0.0", "--port", "7860"]
diff --git a/README.md b/README.md
index 3cc7917620928b0e79b294f3799caa4552158a70..70a6c752fecbdcd5192a10d33cc76d8fef1975de 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,38 @@
---
-title: Graphforge Openenv
-emoji: ๐ป
-colorFrom: green
+title: GraphForge OpenEnv
+emoji: 🧱
+colorFrom: indigo
colorTo: purple
sdk: docker
+app_port: 7860
pinned: false
license: mit
-short_description: A graph-first code-editing RL environment for Python repos.
---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# GraphForge — OpenEnv server
+
+Live deployment of the GraphForge environment for the Meta PyTorch OpenEnv
+Hackathon. The server hosts the OpenEnv-compliant `/reset`, `/step`, `/state`
+endpoints over HTTP. Anything that speaks the OpenEnv client protocol (or
+plain JSON) can drive episodes.
+
+See the main project repo for the architecture overview, training notebook,
+plots, and writeup.
+
+## Endpoints
+
+```
+POST /reset                        → RepoEditObservation
+POST /step  { "kind": ..., ... }   → { observation, reward, done }
+GET  /state                        → RepoEditState
+GET /healthz
+```
+
+## Quick smoke test
+
+```bash
+EID=$(curl -s -X POST $SPACE_URL/reset | python3 -c "import sys,json; print(json.load(sys.stdin)['episode_id'])")
+curl -s -X POST $SPACE_URL/step -H 'content-type: application/json' \
+  -d '{"kind": "query", "keywords": "validate", "node_type": "function"}' \
+ | python3 -m json.tool
+```
diff --git a/env/__init__.py b/env/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..22926036d1f56a4b6f462040d73b683b1d1d6a85
--- /dev/null
+++ b/env/__init__.py
@@ -0,0 +1,34 @@
+"""Multi-turn repo-editing OpenEnv environment.
+
+Public surface:
+    RepoEditAction, RepoEditObservation, RepoEditState  — wire models
+    RepoEditEnvironment                                 — OpenEnv environment
+    RepoEditEnv                                         — HTTP client
+"""
+
+from env.actions import (
+ AddNodeAction,
+ InspectAction,
+ QueryAction,
+ RemoveNodeAction,
+ RepoEditAction,
+ SubmitAction,
+ UpdateNodeAction,
+)
+from env.client import RepoEditEnv
+from env.environment import RepoEditEnvironment
+from env.models import RepoEditObservation, RepoEditState
+
+__all__ = [
+ "AddNodeAction",
+ "InspectAction",
+ "QueryAction",
+ "RemoveNodeAction",
+ "RepoEditAction",
+ "RepoEditEnv",
+ "RepoEditEnvironment",
+ "RepoEditObservation",
+ "RepoEditState",
+ "SubmitAction",
+ "UpdateNodeAction",
+]
diff --git a/env/actions.py b/env/actions.py
new file mode 100644
index 0000000000000000000000000000000000000000..82717404d0204eeaf29651c569c6c28037972a8d
--- /dev/null
+++ b/env/actions.py
@@ -0,0 +1,90 @@
+"""Action schema for the multi-turn repo-editing environment.
+
+All actions are expressed as JSON dicts with a "kind" discriminator.
+The agent emits one action per turn inside ... XML tags.
+
+Actions
+-------
+query Search the knowledge graph for relevant nodes.
+inspect View the full source of a specific node.
+add_node Insert a new function or class into a module/class.
+update_node Replace the source of an existing node.
+remove_node Delete a node from the graph.
+submit Apply all pending changes, run tests, end the episode.
+"""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict
+
+
+_cfg = ConfigDict(extra="forbid")
+
+
+class QueryAction(BaseModel):
+ model_config = _cfg
+ kind: Literal["query"] = "query"
+ keywords: str
+ node_type: str = "all" # "all" | "function" | "class" | "module" | "method"
+
+
+class InspectAction(BaseModel):
+ model_config = _cfg
+ kind: Literal["inspect"] = "inspect"
+ node_id: str
+
+
+class AddNodeAction(BaseModel):
+ model_config = _cfg
+ kind: Literal["add_node"] = "add_node"
+ parent_id: str # node_id of the parent (module or class)
+ name: str # name of the new function/class
+ node_type: str # "function" | "class"
+ code: str # full source of the new node (incl. def/class line)
+
+
+class UpdateNodeAction(BaseModel):
+ model_config = _cfg
+ kind: Literal["update_node"] = "update_node"
+ node_id: str # which node to replace
+ new_code: str # full replacement source (incl. def/class line)
+
+
+class RemoveNodeAction(BaseModel):
+ model_config = _cfg
+ kind: Literal["remove_node"] = "remove_node"
+ node_id: str
+
+
+class SubmitAction(BaseModel):
+ model_config = _cfg
+ kind: Literal["submit"] = "submit"
+
+
+RepoEditAction = (
+ QueryAction
+ | InspectAction
+ | AddNodeAction
+ | UpdateNodeAction
+ | RemoveNodeAction
+ | SubmitAction
+)
+
+
+def parse_action(raw: dict) -> RepoEditAction:
+    """Validate raw against the action model named by raw["kind"]; ValueError if the kind is unknown."""
+    kind = raw.get("kind", "")
+    mapping = {
+        "query": QueryAction,
+        "inspect": InspectAction,
+        "add_node": AddNodeAction,
+        "update_node": UpdateNodeAction,
+        "remove_node": RemoveNodeAction,
+        "submit": SubmitAction,
+    }
+    cls = mapping.get(kind) if isinstance(kind, str) else None  # a list/dict kind is unhashable
+    if cls is None:
+        raise ValueError(f"Unknown action kind: {kind!r}. Valid: {list(mapping)}")
+    return cls.model_validate(raw)
diff --git a/env/ast_parser.py b/env/ast_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae8a7e51d8f06020d084fa0558bdefd9c05aee3f
--- /dev/null
+++ b/env/ast_parser.py
@@ -0,0 +1,249 @@
+"""AST-based DAG parser and code injection utilities.
+
+parse_source(source, module_name) -> CodeDAG
+ Parses a Python source string and returns a structured DAG with nodes
+    (module, function, class, imported_module) and typed edges (contains, calls, imports).
+
+inject_function_body(source, func_name, new_body) -> str
+ Replaces the body of func_name in source with new_body, preserving the
+ def line and any docstring. Used by the environment's step() method.
+"""
+
+from __future__ import annotations
+
+import ast
+from dataclasses import dataclass, field
+
+
+# ── DAG data model ────────────────────────────────────────────────────────────
+
+@dataclass
+class DAGNode:
+ name: str
+ node_type: str # "module" | "function" | "class" | "imported_module"
+ signature: str = ""
+ is_stub: bool = False
+ body_summary: str = ""
+
+
+@dataclass
+class DAGEdge:
+ edge_type: str # "contains" | "calls" | "imports"
+ source: str
+ target: str
+
+
+@dataclass
+class FunctionInfo:
+ name: str
+ signature: str
+ is_stub: bool
+ start_line: int # 1-indexed
+ end_line: int # 1-indexed, inclusive
+ has_docstring: bool
+ docstring_end_line: int # 1-indexed; == start_line when no docstring
+
+
+@dataclass
+class CodeDAG:
+ module_name: str
+ nodes: list[DAGNode] = field(default_factory=list)
+ edges: list[DAGEdge] = field(default_factory=list)
+ function_infos: dict[str, FunctionInfo] = field(default_factory=dict)
+
+ def callers_of(self, func_name: str) -> list[str]:
+ return [e.source for e in self.edges if e.edge_type == "calls" and e.target == func_name]
+
+ def callees_of(self, func_name: str) -> list[str]:
+ return [e.target for e in self.edges if e.edge_type == "calls" and e.source == func_name]
+
+ def stub_functions(self) -> list[str]:
+ return [n.name for n in self.nodes if n.node_type == "function" and n.is_stub]
+
+
+# ── helpers ───────────────────────────────────────────────────────────────────
+
+def _signature(node: ast.FunctionDef | ast.AsyncFunctionDef) -> str:
+ parts = []
+ for arg in node.args.args:
+ ann = f": {ast.unparse(arg.annotation)}" if arg.annotation else ""
+ parts.append(f"{arg.arg}{ann}")
+ ret = f" -> {ast.unparse(node.returns)}" if node.returns else ""
+ return f"({', '.join(parts)}){ret}"
+
+
+def _is_stub(node: ast.FunctionDef | ast.AsyncFunctionDef, source: str) -> bool:
+ func_src = "\n".join(source.splitlines()[node.lineno - 1:node.end_lineno])
+ if "# STUB" in func_src:
+ return True
+ # body that is just "raise NotImplementedError"
+ stmts = [s for s in node.body
+ if not (isinstance(s, ast.Expr) and isinstance(s.value, ast.Constant))]
+ if len(stmts) == 1 and isinstance(stmts[0], ast.Raise):
+ exc = stmts[0].exc
+ if isinstance(exc, ast.Name) and exc.id == "NotImplementedError":
+ return True
+ if isinstance(exc, ast.Call) and isinstance(exc.func, ast.Name) and exc.func.id == "NotImplementedError":
+ return True
+ return False
+
+
+def _extract_calls(node: ast.FunctionDef | ast.AsyncFunctionDef) -> set[str]:
+ calls: set[str] = set()
+ for child in ast.walk(node):
+ if isinstance(child, ast.Call):
+ if isinstance(child.func, ast.Name):
+ calls.add(child.func.id)
+ return calls
+
+
+# ── main parser ───────────────────────────────────────────────────────────────
+
+def parse_source(source: str, module_name: str = "module") -> CodeDAG:
+ """Parse Python source into a CodeDAG."""
+ tree = ast.parse(source)
+ dag = CodeDAG(module_name=module_name)
+ dag.nodes.append(DAGNode(name=module_name, node_type="module"))
+
+ func_names: set[str] = set()
+
+ # imports
+ for node in ast.walk(tree):
+ if isinstance(node, ast.Import):
+ for alias in node.names:
+ imp = alias.asname or alias.name
+ dag.nodes.append(DAGNode(name=imp, node_type="imported_module"))
+ dag.edges.append(DAGEdge("imports", module_name, imp))
+ elif isinstance(node, ast.ImportFrom) and node.module:
+ dag.nodes.append(DAGNode(name=node.module, node_type="imported_module"))
+ dag.edges.append(DAGEdge("imports", module_name, node.module))
+
+ # top-level functions and classes
+ for node in tree.body:
+ if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+ sig = _signature(node)
+ stub = _is_stub(node, source)
+ has_doc = (
+ bool(node.body)
+ and isinstance(node.body[0], ast.Expr)
+ and isinstance(node.body[0].value, ast.Constant)
+ )
+ doc_end = node.body[0].end_lineno if has_doc else node.lineno
+
+ dag.nodes.append(DAGNode(
+ name=node.name,
+ node_type="function",
+ signature=sig,
+ is_stub=stub,
+ body_summary="STUB โ needs implementation" if stub else "(implemented)",
+ ))
+ dag.edges.append(DAGEdge("contains", module_name, node.name))
+ dag.function_infos[node.name] = FunctionInfo(
+ name=node.name,
+ signature=sig,
+ is_stub=stub,
+ start_line=node.lineno,
+ end_line=node.end_lineno,
+ has_docstring=has_doc,
+ docstring_end_line=doc_end,
+ )
+ func_names.add(node.name)
+
+ elif isinstance(node, ast.ClassDef):
+ dag.nodes.append(DAGNode(name=node.name, node_type="class"))
+ dag.edges.append(DAGEdge("contains", module_name, node.name))
+ for item in node.body:
+ if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
+ qname = f"{node.name}.{item.name}"
+ dag.nodes.append(DAGNode(
+ name=qname,
+ node_type="function",
+ signature=_signature(item),
+ is_stub=_is_stub(item, source),
+ ))
+ dag.edges.append(DAGEdge("contains", node.name, qname))
+ func_names.add(qname)
+
+ # call edges (same-module only)
+ for node in tree.body:
+ if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+ for callee in _extract_calls(node):
+ if callee in func_names and callee != node.name:
+ dag.edges.append(DAGEdge("calls", node.name, callee))
+
+ return dag
+
+
+# ── code injection ────────────────────────────────────────────────────────────
+
+def inject_function_body(source: str, func_name: str, new_body: str) -> str:
+    """Replace the body of top-level function func_name in source with new_body.
+
+    Keeps the signature (even when it spans several lines) and a leading string
+    docstring; new_body may use any indentation and is re-indented to 4 spaces.
+    An empty or whitespace-only body becomes ``pass``; ValueError if not found.
+    """
+    tree = ast.parse(source)
+    lines = source.splitlines(keepends=True)
+
+    for node in tree.body:
+        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) or node.name != func_name:
+            continue
+
+        first = node.body[0]
+        # Only a leading *string* constant is a docstring; `...` or `0` is ordinary body.
+        has_doc = (
+            isinstance(first, ast.Expr)
+            and isinstance(first.value, ast.Constant)
+            and isinstance(first.value.value, str)
+        )
+        if has_doc:
+            before = lines[:first.end_lineno]  # end_lineno is 1-indexed, inclusive
+        else:
+            # Keep everything up to the first body statement (or its decorators),
+            # including the header part of a one-line `def f(): stmt`.
+            start = min([first.lineno] + [d.lineno for d in getattr(first, "decorator_list", [])])
+            head = lines[start - 1].encode("utf-8")[:first.col_offset].decode("utf-8")
+            before = lines[:start - 1] + ([head.rstrip() + "\n"] if head.strip() else [])
+        after = lines[node.end_lineno:]  # everything after the function
+
+        # Normalise body indent: strip common leading whitespace, then re-add 4 spaces.
+        raw_lines = new_body.splitlines()
+        min_indent = min(
+            (len(ln) - len(ln.lstrip()) for ln in raw_lines if ln.strip()),
+            default=0,
+        )
+        body_lines = [
+            "    " + ln[min_indent:] + "\n" if ln.strip() else "\n" for ln in raw_lines
+        ]
+
+        if not any(ln.strip() for ln in body_lines):  # empty or whitespace-only body
+            body_lines = ["    pass\n"]
+
+        return "".join(before + body_lines + after)
+
+    raise ValueError(f"Function {func_name!r} not found in source")
+
+
+# ── DAG → text description (for prompts) ──────────────────────────────────────
+
+def dag_to_text(dag: CodeDAG) -> str:
+ """Render the DAG as a concise human-readable block for the agent prompt."""
+ lines: list[str] = [f"## Module: {dag.module_name}", "", "### Nodes"]
+
+ for n in dag.nodes:
+ if n.node_type == "module":
+ lines.append(f"- [MODULE] {n.name}")
+ elif n.node_type == "function":
+ status = "[ STUB ]" if n.is_stub else "[ready ]"
+ lines.append(f"- [FUNC] {status} {n.name}{n.signature}")
+ elif n.node_type == "class":
+ lines.append(f"- [CLASS] {n.name}")
+ elif n.node_type == "imported_module":
+ lines.append(f"- [IMPORT] {n.name}")
+
+ lines += ["", "### Edges"]
+ for e in dag.edges:
+ lines.append(f"- {e.source} --{e.edge_type}--> {e.target}")
+
+ return "\n".join(lines)
diff --git a/env/client.py b/env/client.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d69d4c658cc5af002b73492e04ccacfdcbdbf37
--- /dev/null
+++ b/env/client.py
@@ -0,0 +1,36 @@
+"""HTTP client for the repo-editing environment."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import httpx
+
+from env.models import RepoEditObservation, RepoEditState
+
+
+class RepoEditEnv:
+ def __init__(self, base_url: str = "http://localhost:8000", timeout: float = 60.0) -> None:
+ self._client = httpx.Client(base_url=base_url.rstrip("/"), timeout=timeout)
+
+ def reset(self, task_id: str | None = None) -> RepoEditObservation:
+ params = {"task_id": task_id} if task_id else {}
+ r = self._client.post("/reset", params=params)
+ r.raise_for_status()
+ return RepoEditObservation.model_validate(r.json())
+
+ def step(self, action_dict: dict[str, Any]) -> dict[str, Any]:
+ r = self._client.post("/step", json=action_dict)
+ r.raise_for_status()
+ return r.json()
+
+ def state(self) -> RepoEditState:
+ r = self._client.get("/state")
+ r.raise_for_status()
+ return RepoEditState.model_validate(r.json())
+
+ def __enter__(self) -> "RepoEditEnv":
+ return self
+
+ def __exit__(self, *_: object) -> None:
+ self._client.close()
diff --git a/env/environment.py b/env/environment.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f389ec7109b6497bb93792b14b0ee78781c1742
--- /dev/null
+++ b/env/environment.py
@@ -0,0 +1,467 @@
+"""Multi-turn repo-editing OpenEnv environment.
+
+Episode flow
+------------
+reset() Parse the target repo into a KnowledgeGraph. Return an observation
+ containing the full graph overview and the task description.
+
+step()   The agent emits one RepoEditAction per turn:
+           - query        → search results (information, no graph mutation)
+           - inspect      → full node source (information)
+           - add_node     → insert new function/class into the live graph
+           - update_node  → replace a node's source in the live graph
+           - remove_node  → delete a node
+           - submit       → materialise all changes back to disk (temp), run tests,
+                            compute reward, end episode
+
+Reward structure (sparse — designed for long-horizon RL)
+---------------------------------------------------------
+  Per-turn cost      : -0.05  (forces efficiency)
+  Malformed action   : -0.2   (added on top of the per-turn cost)
+  On submit
+    all tests pass   : +1.0
+    any test fails   :  0.0   (pass/fail only; no partial credit is computed)
+    compile error    :  0.0
+  Episode cap hit    :  0.0
+
+This sparse reward deliberately requires the agent to plan, navigate, and
+execute across many turns — it cannot succeed by guessing on the first turn.
+"""
+
+from __future__ import annotations
+
+import ast
+import json
+import os
+import re
+import sys
+import tempfile
+import textwrap
+import traceback
+import uuid
+from pathlib import Path
+from typing import Any
+
+from env.actions import (
+ AddNodeAction,
+ InspectAction,
+ QueryAction,
+ RemoveNodeAction,
+ RepoEditAction,
+ SubmitAction,
+ UpdateNodeAction,
+ parse_action,
+)
+from env.models import RepoEditObservation, RepoEditState
+from env.tasks import SAMPLE_REPOS_DIR, TASK_BANK, RepoTask, all_task_ids, get_task
+from graphforge.knowledge_graph import KGEdge, KGNode, KnowledgeGraph
+from graphforge.repo_parser import parse_repo, _node_id
+
+try:
+ from openenv.core import Environment # type: ignore
+ _HAS_OPENENV = True
+except Exception:
+ _HAS_OPENENV = False
+ from typing import Generic, TypeVar
+ A = TypeVar("A")
+ O = TypeVar("O")
+ S = TypeVar("S")
+
+ class Environment(Generic[A, O, S]): # type: ignore[no-redef]
+ def reset(self) -> O: ...
+ def step(self, action: A) -> tuple[O, float, bool]: ...
+ def get_state(self) -> S: ...
+
+
+# ── constants ─────────────────────────────────────────────────────────────────
+
+PER_TURN_COST = -0.05
+MALFORMED_PENALTY = -0.2
+
+
+# ── materialiser (graph → disk) ───────────────────────────────────────────────
+
+def _materialise_changes(
+ kg: KnowledgeGraph,
+ repo_src_path: Path,
+ tmp_dir: str,
+) -> dict[str, str]:
+ """Write mutated module sources to tmp_dir. Returns {rel_path: source}."""
+ files: dict[str, str] = {}
+ for node in kg.all_nodes("module"):
+ if not node.file_path:
+ continue
+ # Re-assemble module source from its children's current sources
+ # For simplicity: use the node.source field (which we keep in sync)
+ files[node.file_path] = node.source
+ dest = Path(tmp_dir) / node.file_path
+ dest.parent.mkdir(parents=True, exist_ok=True)
+ dest.write_text(node.source, encoding="utf-8")
+ # Copy non-py files (like __init__.py markers) from original
+ for root, _, fnames in os.walk(str(repo_src_path)):
+ for fname in fnames:
+ if fname.endswith(".py"):
+ continue
+ src = Path(root) / fname
+ rel = src.relative_to(repo_src_path)
+ dst = Path(tmp_dir) / rel
+ dst.parent.mkdir(parents=True, exist_ok=True)
+ dst.write_bytes(src.read_bytes())
+ return files
+
+
+# ── code injection into module source ─────────────────────────────────────────
+
+def _apply_add_node(
+    module_source: str,
+    code: str,
+    class_name: str | None = None,
+) -> str:
+    """Return module_source with code inserted (the input string is not modified).
+
+    With class_name, the dedented code is indented four spaces and spliced in
+    after the last line of that top-level class; otherwise, or when the class
+    cannot be located, it is appended dedented at module level.
+    """
+    new_code = textwrap.dedent(code).strip()
+    module_level = module_source.rstrip() + "\n\n\n" + new_code + "\n"
+    if class_name is None:
+        return module_level
+
+    indented = "\n".join("    " + line for line in new_code.splitlines())
+    try:
+        tree = ast.parse(module_source)
+    except (SyntaxError, ValueError, RecursionError):
+        # Unparseable module: nothing to splice into, so append at module level.
+        return module_level
+    lines = module_source.splitlines(keepends=True)
+    for node in tree.body:
+        if isinstance(node, ast.ClassDef) and node.name == class_name:
+            # end_lineno is 1-indexed and inclusive, so it doubles as the 0-based slice point.
+            before, after = "".join(lines[:node.end_lineno]), "".join(lines[node.end_lineno:])
+            return before.rstrip() + "\n\n" + indented + "\n" + after
+    # Class not found: append dedented -- indented code at module level is an IndentationError.
+    return module_level
+
+
+def _apply_update_node(
+ module_source: str,
+ old_source: str,
+ new_code: str,
+) -> str:
+ """Replace old_source verbatim in module_source with new_code."""
+ new_code_clean = textwrap.dedent(new_code).strip()
+ if old_source in module_source:
+ return module_source.replace(old_source, new_code_clean, 1)
+ # Fallback: try stripping indentation differences
+ return module_source + "\n\n# PATCHED\n" + new_code_clean + "\n"
+
+
+def _apply_remove_node(module_source: str, old_source: str) -> str:
+ if old_source in module_source:
+ return module_source.replace(old_source, "", 1)
+ return module_source
+
+
+def _validate_python(source: str) -> tuple[bool, str]:
+ try:
+ ast.parse(source)
+ return True, ""
+ except SyntaxError as exc:
+ return False, str(exc)
+
+
+# ── environment ───────────────────────────────────────────────────────────────
+
+class RepoEditEnvironment(
+ Environment[RepoEditAction, RepoEditObservation, RepoEditState]
+):
+ """Multi-turn OpenEnv environment for repository-level code editing.
+
+ The agent receives a Knowledge Graph of a real Python repo and must
+ navigate it to find the right location, then apply the correct edit.
+ Reward is sparse: only granted on a passing submit().
+ """
+
+ def __init__(self, task_id: str | None = None) -> None:
+ self._configured_task_id = task_id
+ self._task: RepoTask | None = None
+ self._kg: KnowledgeGraph | None = None
+ self._episode_id: str | None = None
+ self._turn: int = 0
+ self._done: bool = False
+ self._total_reward: float = 0.0
+ self._history: list[dict[str, Any]] = []
+
+ # ----- OpenEnv contract ---------------------------------------------------
+
+ def reset(self, task_id: str | None = None, task: Any = None) -> RepoEditObservation:
+ """Reset the environment.
+
+ Pass either task_id (looks up TASK_BANK) or a task object directly
+ (supports AutoTask from graphforge.task_generator).
+ """
+ if task is not None:
+ tid = task.task_id
+ else:
+ tid = task_id or self._configured_task_id or _pick_random_task()
+ task = TASK_BANK.get(tid)
+ if task is None:
+ raise ValueError(f"Unknown task_id: {tid!r}. Available: {all_task_ids()}")
+
+ # Resolve the repo path: use task.repo_path if set, else fall back to sample_repos/
+ if getattr(task, "repo_path", None):
+ repo_path = task.repo_path
+ else:
+ repo_path = str(SAMPLE_REPOS_DIR / task.repo_name)
+
+ self._task = task
+ self._kg = parse_repo(repo_path)
+ self._episode_id = str(uuid.uuid4())[:8]
+ self._turn = 0
+ self._done = False
+ self._total_reward = 0.0
+ self._history = []
+
+ return RepoEditObservation(
+ episode_id=self._episode_id,
+ task_id=tid,
+ turn=0,
+ max_turns=task.max_turns,
+ graph_overview=self._kg.overview(),
+ task_description=task.description,
+ action_result="Episode started. Use query/inspect to navigate, then add_node/update_node to edit, then submit.",
+ done=False,
+ )
+
+ def step(self, action: RepoEditAction) -> tuple[RepoEditObservation, float, bool]:
+ if self._task is None or self._kg is None:
+ raise RuntimeError("step() called before reset()")
+ if self._done:
+ return self._terminal_obs("Episode already done."), 0.0, True
+
+ self._turn += 1
+ turn_reward = PER_TURN_COST
+
+ # Dispatch
+ try:
+ result_text, extra_reward, done = self._dispatch(action)
+ turn_reward += extra_reward
+ except Exception as exc:
+ result_text = f"[ERROR] {exc}"
+ turn_reward += MALFORMED_PENALTY
+ done = False
+
+ self._total_reward += turn_reward
+
+ # Episode cap
+ if not done and self._turn >= self._task.max_turns:
+ done = True
+ result_text += f"\n[Episode cap reached: {self._task.max_turns} turns]"
+
+ self._done = done
+ self._history.append({
+ "turn": self._turn,
+ "action_kind": getattr(action, "kind", "unknown"),
+ "reward": turn_reward,
+ })
+
+ obs = RepoEditObservation(
+ episode_id=self._episode_id,
+ task_id=self._task.task_id,
+ turn=self._turn,
+ max_turns=self._task.max_turns,
+ graph_overview=self._kg.overview(),
+ task_description=self._task.description,
+ action_result=result_text,
+ turn_reward=turn_reward,
+ total_reward=self._total_reward,
+ done=done,
+ )
+ return obs, turn_reward, done
+
+ def get_state(self) -> RepoEditState:
+ return RepoEditState(
+ episode_id=self._episode_id,
+ task_id=self._task.task_id if self._task else None,
+ turn=self._turn,
+ done=self._done,
+ total_reward=self._total_reward,
+ )
+
+ @property
+ def state(self) -> RepoEditState:
+ return self.get_state()
+
+ # ----- action dispatch ----------------------------------------------------
+
+ def _dispatch(
+ self, action: RepoEditAction
+ ) -> tuple[str, float, bool]:
+ """Returns (result_text, extra_reward, done)."""
+ kg = self._kg
+ assert kg is not None
+
+ if isinstance(action, QueryAction):
+ nt = None if action.node_type == "all" else action.node_type
+ results = kg.search(action.keywords, node_type=nt)
+ if not results:
+ return f"No nodes found for query: {action.keywords!r}", 0.0, False
+ lines = [f"Found {len(results)} node(s) matching {action.keywords!r}:"]
+ for n in results[:10]:
+ lines.append(f" {n.node_id} ({n.file_path}:{n.line_start})")
+ return "\n".join(lines), 0.0, False
+
+ if isinstance(action, InspectAction):
+ detail = kg.node_detail(action.node_id)
+ return detail, 0.0, False
+
+ if isinstance(action, AddNodeAction):
+ parent = kg.get_node(action.parent_id)
+ if parent is None:
+ return f"[ERROR] parent_id {action.parent_id!r} not found.", MALFORMED_PENALTY, False
+ ok, err = _validate_python(action.code)
+ if not ok:
+ return f"[SYNTAX ERROR in your code] {err}", MALFORMED_PENALTY, False
+
+ # Append to parent module's source
+ module_node = _find_module_for(kg, action.parent_id)
+ if module_node is None:
+ return f"[ERROR] could not find module for parent {action.parent_id!r}", MALFORMED_PENALTY, False
+
+ parent_node = kg.get_node(action.parent_id)
+ class_name = parent_node.name if parent_node and parent_node.node_type == "class" else None
+ module_node.source = _apply_add_node(module_node.source, action.code, class_name=class_name)
+
+ # Register the new node in the KG
+ ntype = action.node_type if action.node_type in ("function", "class", "method") else "function"
+ new_id = _node_id(ntype, module_node.file_path, action.name)
+ new_node = KGNode(
+ node_id=new_id,
+ node_type=ntype,
+ name=action.name,
+ file_path=module_node.file_path,
+ line_start=module_node.line_end,
+ line_end=module_node.line_end + action.code.count("\n") + 1,
+ source=textwrap.dedent(action.code).strip(),
+ )
+ kg.insert_node(action.parent_id, new_node)
+ return f"Added {ntype} `{action.name}` to `{module_node.file_path}`.\nNew node_id: {new_id}", 0.0, False
+
+ if isinstance(action, UpdateNodeAction):
+ target = kg.get_node(action.node_id)
+ if target is None:
+ return f"[ERROR] node_id {action.node_id!r} not found.", MALFORMED_PENALTY, False
+ ok, err = _validate_python(action.new_code)
+ if not ok:
+ return f"[SYNTAX ERROR in your code] {err}", MALFORMED_PENALTY, False
+
+ module_node = _find_module_for(kg, action.node_id)
+ if module_node is None:
+ return f"[ERROR] could not find module for {action.node_id!r}", MALFORMED_PENALTY, False
+
+ old_source = target.source
+ module_node.source = _apply_update_node(module_node.source, old_source, action.new_code)
+ target.source = textwrap.dedent(action.new_code).strip()
+ return f"Updated `{action.node_id}`.", 0.0, False
+
+ if isinstance(action, RemoveNodeAction):
+ target = kg.get_node(action.node_id)
+ if target is None:
+ return f"[ERROR] node_id {action.node_id!r} not found.", MALFORMED_PENALTY, False
+ module_node = _find_module_for(kg, action.node_id)
+ if module_node:
+ module_node.source = _apply_remove_node(module_node.source, target.source)
+ kg.remove_node(action.node_id)
+ return f"Removed `{action.node_id}`.", 0.0, False
+
+ if isinstance(action, SubmitAction):
+ return self._run_submit()
+
+ return f"[ERROR] unrecognised action type: {type(action)}", MALFORMED_PENALTY, False
+
+ def _run_submit(self) -> tuple[str, float, bool]:
+ """Write modified sources to a temp dir, run tests there, clean up."""
+ kg = self._kg
+ task = self._task
+ assert kg is not None and task is not None
+
+ reward, msg = _run_tests_in_tempdir(kg, task.test_code, task.repo_name)
+ return f"[SUBMIT RESULT]\n{msg}", reward, True
+
+ def _terminal_obs(self, msg: str) -> RepoEditObservation:
+ return RepoEditObservation(
+ episode_id=self._episode_id,
+ task_id=self._task.task_id if self._task else None,
+ turn=self._turn,
+ max_turns=self._task.max_turns if self._task else 0,
+ graph_overview="",
+ task_description="",
+ action_result=msg,
+ done=True,
+ total_reward=self._total_reward,
+ )
+
+
+# ── helpers ───────────────────────────────────────────────────────────────────
+
+def _find_module_for(kg: KnowledgeGraph, node_id: str) -> KGNode | None:
+ """Walk up the parent chain until we hit a module node."""
+ current_id = node_id
+ seen: set[str] = set()
+ while current_id and current_id not in seen:
+ seen.add(current_id)
+ node = kg.get_node(current_id)
+ if node and node.node_type == "module":
+ return node
+ parent = kg.parent_of(current_id)
+ if parent is None:
+ break
+ current_id = parent.node_id
+ return None
+
+
+def _run_tests_in_tempdir(
+    kg: KnowledgeGraph, test_code: str, pkg_name: str
+) -> tuple[float, str]:
+    """Write mutated module sources to a temp package, exec test_code, return (reward, message).
+
+    The sources are written under <tmpdir>/<pkg_name>/ and tmpdir is put first on sys.path,
+    so test_code must use absolute imports of the form `from <pkg_name>.<module> import ...`.
+    """
+    with tempfile.TemporaryDirectory() as tmpdir:
+        pkg_dir = Path(tmpdir) / pkg_name
+        pkg_dir.mkdir(parents=True)
+        (pkg_dir / "__init__.py").write_text("")
+
+        # Write each module's current (potentially mutated) source
+        for node in kg.all_nodes("module"):
+            if not node.file_path or node.file_path == "__init__.py":
+                continue
+            dest = pkg_dir / node.file_path
+            dest.parent.mkdir(parents=True, exist_ok=True)
+            dest.write_text(node.source, encoding="utf-8")
+
+        # Remove any stale cached copies of this package
+        for k in [k for k in sys.modules if k == pkg_name or k.startswith(pkg_name + ".")]:
+            del sys.modules[k]
+
+        sys.path.insert(0, tmpdir)
+        try:
+            # SECURITY NOTE: this runs test_code and the agent-edited modules in-process, unsandboxed.
+            # A non-empty filename keeps the traceback in the failure message readable.
+            exec(compile(test_code, "<test_code>", "exec"), {})  # noqa: S102
+            return 1.0, "โ All tests passed!"
+        except AssertionError as exc:
+            return 0.0, f"โ Test failed: {exc}"
+        except Exception:
+            return 0.0, f"โ Exception during tests:\n{traceback.format_exc(limit=5)}"
+        finally:
+            sys.path.remove(tmpdir)
+            for k in [k for k in sys.modules if k == pkg_name or k.startswith(pkg_name + ".")]:
+                del sys.modules[k]
+
+
+def _pick_random_task() -> str:
+ import random
+ return random.choice(all_task_ids())
diff --git a/env/models.py b/env/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d5fd9fe80a50acae0ce5d41c5301ae5c7a0f69f
--- /dev/null
+++ b/env/models.py
@@ -0,0 +1,46 @@
+"""Pydantic wire models for the multi-turn repo-editing environment."""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from pydantic import BaseModel, ConfigDict, Field
+
+_cfg = ConfigDict(extra="ignore")
+
+
+class RepoEditObservation(BaseModel):
+ """What the env returns after reset() or step().
+
+ Contains the current graph overview + the result of the last action.
+ The agent should read action_result carefully before deciding the next step.
+ """
+
+ model_config = _cfg
+
+ episode_id: Optional[str] = None
+ task_id: Optional[str] = None
+ turn: int = 0
+ max_turns: int = 15
+
+ graph_overview: str = "" # compact text view of the entire repo KG
+ task_description: str = "" # what the agent needs to accomplish
+ action_result: str = "" # feedback from the last action
+
+ turn_reward: float = 0.0
+ total_reward: float = 0.0
+ done: bool = False
+
+ info: dict[str, Any] = Field(default_factory=dict)
+
+
+class RepoEditState(BaseModel):
+ """Episode-level state snapshot."""
+
+ model_config = _cfg
+
+ episode_id: Optional[str] = None
+ task_id: Optional[str] = None
+ turn: int = 0
+ done: bool = False
+ total_reward: float = 0.0
diff --git a/env/server.py b/env/server.py
new file mode 100644
index 0000000000000000000000000000000000000000..60361013b7257977ef44d6de2e3471758fb59af1
--- /dev/null
+++ b/env/server.py
@@ -0,0 +1,44 @@
+"""FastAPI server for the multi-turn repo-editing environment."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from fastapi import FastAPI, HTTPException
+
+from env.actions import RepoEditAction, parse_action
+from env.environment import RepoEditEnvironment
+from env.models import RepoEditObservation, RepoEditState
+
+_env = RepoEditEnvironment() # single module-level instance; every route of the app delegates to it
+
+
+def _make_app() -> FastAPI:
+    """Build the FastAPI application; each route delegates to the module-level ``_env``."""
+    api = FastAPI(title="Repo-Edit OpenEnv", version="0.3.0")
+
+    @api.post("/reset", response_model=RepoEditObservation)
+    def reset(task_id: str | None = None) -> RepoEditObservation:
+        return _env.reset(task_id=task_id)
+
+    @api.post("/step")
+    def step(action_dict: dict[str, Any]) -> dict[str, Any]:
+        try:
+            obs, reward, done = _env.step(parse_action(action_dict))
+        except (ValueError, RuntimeError) as err:  # both surface to the client as HTTP 400
+            raise HTTPException(status_code=400, detail=str(err)) from err
+        return {"observation": obs.model_dump(), "reward": reward, "done": done}
+
+    @api.get("/state", response_model=RepoEditState)
+    def state() -> RepoEditState:
+        return _env.get_state()
+
+    @api.get("/healthz")
+    def healthz() -> dict[str, Any]:
+        return {"status": "ok"}
+
+    return api
+
+
+app = _make_app()
+__all__ = ["app"]
diff --git a/env/tasks.py b/env/tasks.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3ef484d5d0ae069d487081c7bd0c0bc73c8eb48
--- /dev/null
+++ b/env/tasks.py
@@ -0,0 +1,363 @@
+"""Multi-turn repo-editing tasks.
+
+Each Task specifies:
+ - A target repo to work on (points to a sample_repos/ subdir)
+ - A natural-language description of the change to make
+ - Test code (one Python source string of assertions) that verifies the change
+ - The maximum number of turns allowed
+
+Training tasks are deliberately structured to require multi-step navigation:
+ 1. The agent must QUERY the graph to find relevant nodes
+ 2. INSPECT nodes to understand the existing code
+ 3. ADD or UPDATE nodes to implement the change
+ 4. SUBMIT to trigger compilation + test execution
+
+This sparse reward structure forces the agent to develop structured planning
+and state tracking across long trajectories — the core theme of this project.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import sys
+import textwrap
+import traceback
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+SAMPLE_REPOS_DIR = Path(__file__).resolve().parent.parent / "graphforge" / "sample_repos"
+
+
+@dataclass
+class RepoTask:
+ task_id: str # key under which _reg stores the task in TASK_BANK
+ repo_name: str # package name (used as tempdir subdir)
+ description: str # natural-language task for the agent
+ test_code: str # Python assertions using short imports
+ max_turns: int = 15 # maximum number of turns allowed for the task
+ difficulty: int = 0 # 0=easy, 1=medium, 2=hard
+ hints: list[str] = field(default_factory=list) # optional hint strings; each task gets its own list
+ repo_path: str | None = None # if set, full path to repo source dir
+
+
+TASK_BANK: dict[str, RepoTask] = {}
+
+
+def _reg(task: RepoTask) -> RepoTask:
+    TASK_BANK.update({task.task_id: task})  # keyed by task_id; a repeated id replaces the earlier entry
+    return task
+
+
+# ── Task 0: add validate_due_date ────────────────────────────────────────────
+
+_reg(RepoTask(
+ task_id="t0.validate_due_date",
+ repo_name="task_manager",
+ description=textwrap.dedent("""\
+ Add a function `validate_due_date(due_date) -> bool` to `validators.py`.
+
+ The function should return True if:
+ - due_date is None (no deadline), OR
+ - due_date is a datetime.date instance
+
+ It should return False for any other type (strings, integers, etc.).
+ """).strip(),
+ test_code=textwrap.dedent("""\
+ from datetime import date
+ from task_manager.validators import validate_due_date
+ assert validate_due_date(None) is True, "None is valid (no deadline)"
+ assert validate_due_date(date(2025, 1, 1)) is True, "date object is valid"
+ assert validate_due_date("2025-01-01") is False, "string is not valid"
+ assert validate_due_date(20250101) is False, "int is not valid"
+ assert validate_due_date([]) is False, "list is not valid"
+ """).strip(),
+ max_turns=12, # below the RepoTask default of 15; difficulty stays at its default 0 (easy)
+ hints=[
+ "Look in validators.py to see the style of existing validators.",
+ "The function signature should be: def validate_due_date(due_date) -> bool",
+ "Import datetime.date inside the function or at the top of validators.py.",
+ ],
+))
+
+# ── Task 1: add Task.is_overdue ──────────────────────────────────────────────
+
+_reg(RepoTask(
+    task_id="t1.is_overdue",
+    repo_name="task_manager",
+    description=textwrap.dedent("""\
+        Add a method `is_overdue(self, today: date) -> bool` to the `Task`
+        class in `models.py`.
+
+        The method should return True if:
+        - the task has a due_date AND
+        - today is strictly after the due_date AND
+        - the task is not yet done
+
+        It should return False if there is no due_date, or if the task is done,
+        or if today <= due_date.
+    """).strip(),
+    test_code=textwrap.dedent("""\
+        from datetime import date
+        from task_manager.models import Task
+
+        t_past = Task("x", "low", [], due_date=date(2020, 1, 1))
+        t_future = Task("y", "low", [], due_date=date(2099, 1, 1))
+        t_none = Task("z", "low", [], due_date=None)
+        t_done = Task("d", "low", [], due_date=date(2020, 1, 1))
+        t_done.complete()
+
+        today = date.today()
+        assert t_past.is_overdue(today) is True, "past due date → overdue"
+        assert t_future.is_overdue(today) is False, "future due date → not overdue"
+        assert t_none.is_overdue(today) is False, "no due date → not overdue"
+        assert t_done.is_overdue(today) is False, "done task → not overdue"
+    """).strip(),
+    max_turns=15,
+    difficulty=1,  # medium
+    hints=[
+        "The Task class is in models.py.",
+        "The method should check self.due_date, today, and self.done.",
+    ],
+))
+
+# ── Task 2: add TaskStore.find_by_tag ────────────────────────────────────────
+
+_reg(RepoTask(
+ task_id="t2.find_by_tag",
+ repo_name="task_manager",
+ description=textwrap.dedent("""\
+ Add a method `find_by_tag(self, tag: str) -> list[Task]` to the
+ `TaskStore` class in `storage.py`.
+
+ The method should return a list of all tasks that have `tag` in
+ their `tags` list. Return an empty list if no tasks match.
+ """).strip(),
+ test_code=textwrap.dedent("""\
+ from task_manager.models import Task
+ from task_manager.storage import TaskStore
+
+ store = TaskStore()
+ store.add(Task("t1", "high", ["python", "backend"], None))
+ store.add(Task("t2", "low", ["frontend"], None))
+ store.add(Task("t3", "medium", ["python"], None))
+
+ result = store.find_by_tag("python")
+ assert len(result) == 2, f"Expected 2, got {len(result)}"
+ titles = {t.title for t in result}
+ assert titles == {"t1", "t3"}, f"Wrong titles: {titles}"
+
+ empty = store.find_by_tag("devops")
+ assert empty == [], f"Expected [], got {empty}"
+ """).strip(),
+ max_turns=15, # same as the RepoTask default
+ difficulty=1, # medium
+)) # no hints passed, so RepoTask.hints is the default empty list
+
+# ── Task 3 (hard): enforce priority validation in api.create_task ────────────
+
+_reg(RepoTask(
+    task_id="t3.enforce_priority",
+    repo_name="task_manager",
+    description=textwrap.dedent("""\
+        Update the `create_task` function in `api.py` so that it validates
+        the `priority` argument using `validate_priority` from `validators.py`.
+
+        If the priority is invalid, raise `ValueError` with a clear message.
+        The existing validations for title and tags must still work.
+
+        Note: `validate_priority` already exists in validators.py.
+        You must import and call it inside `create_task`.
+    """).strip(),
+    test_code=textwrap.dedent("""\
+        from task_manager import api as _api
+        _api.reset_store() # clean state between runs
+
+        # valid priority passes through
+        t = _api.create_task("Buy milk", priority="high")
+        assert t.priority == "high"
+
+        # invalid priority raises ValueError
+        raised = False
+        try:
+            _api.create_task("Bad task", priority="urgent")
+        except ValueError:
+            raised = True
+        assert raised, "create_task should raise ValueError for invalid priority"
+
+        # title validation still works
+        raised2 = False
+        try:
+            _api.create_task("", priority="low")
+        except ValueError:
+            raised2 = True
+        assert raised2, "create_task should still reject empty title"
+    """).strip(),
+    max_turns=18,
+    difficulty=2,  # hard; the try/except suites above must stay indented for the test to compile
+    hints=[
+        "api.py already imports validate_title and validate_tags from validators.",
+        "You need to also import validate_priority and call it in create_task.",
+    ],
+))
+
+
+# ── Humanize tasks (real-world library) ──────────────────────────────────────
+
+_reg(RepoTask(
+    task_id="t4.intpercent",
+    repo_name="humanize",
+    description=textwrap.dedent("""\
+        Add a function `intpercent(value: float, decimal_places: int = 1) -> str`
+        to `number.py`.
+
+        The function should convert a fraction to a percentage string:
+        0.0 → "0.0%"
+        0.5 → "50.0%"
+        0.753 → "75.3%"
+        1.0 → "100.0%"
+
+        Use `decimal_places` to control how many digits appear after the decimal.
+        If decimal_places=0, return an integer percentage with no decimal point.
+    """).strip(),
+    test_code=textwrap.dedent("""\
+        from humanize.number import intpercent
+        assert intpercent(0.0) == "0.0%", f"got {intpercent(0.0)!r}"
+        assert intpercent(0.5) == "50.0%", f"got {intpercent(0.5)!r}"
+        assert intpercent(0.753) == "75.3%", f"got {intpercent(0.753)!r}"
+        assert intpercent(1.0) == "100.0%", f"got {intpercent(1.0)!r}"
+        assert intpercent(0.5, decimal_places=0) == "50%", f"got {intpercent(0.5, decimal_places=0)!r}"
+    """).strip(),
+    max_turns=12,
+    difficulty=0,  # easy
+    hints=[
+        "Look at number.py — the existing functions show the style to follow.",
+        "Use f-string formatting: f'{value * 100:.{decimal_places}f}%'",
+    ],
+))
+
+_reg(RepoTask(
+    task_id="t5.naturalfilecount",
+    repo_name="humanize",
+    description=textwrap.dedent("""\
+        Add a function `naturalfilecount(n: int) -> str` to `filesize.py`.
+
+        The function should return a human-readable file count:
+        0 → "no files"
+        1 → "1 file"
+        2 → "2 files"
+        99 → "99 files"
+    """).strip(),
+    test_code=textwrap.dedent("""\
+        from humanize.filesize import naturalfilecount
+        assert naturalfilecount(0) == "no files", f"got {naturalfilecount(0)!r}"
+        assert naturalfilecount(1) == "1 file", f"got {naturalfilecount(1)!r}"
+        assert naturalfilecount(2) == "2 files", f"got {naturalfilecount(2)!r}"
+        assert naturalfilecount(99) == "99 files", f"got {naturalfilecount(99)!r}"
+    """).strip(),
+    max_turns=12,
+    difficulty=0,  # easy
+    hints=[
+        "Look at filesize.py — naturalsize is the only function there.",
+        "This is a short function: handle n==0, n==1, and n>1 as three cases.",
+    ],
+))
+
+_reg(RepoTask(
+    task_id="t6.metric",
+    repo_name="humanize",
+    description=textwrap.dedent("""\
+        Add a function `metric(value: float, unit: str = "") -> str` to `number.py`.
+
+        The function should format a number using SI metric prefixes:
+        1_500_000 → "1.5 M"
+        2_000 → "2.0 k"
+        500 → "500" (no prefix below 1000)
+
+        Supported prefixes (largest to smallest): T (10¹²), G (10⁹), M (10⁶), k (10³).
+        If a unit is provided, append it after the prefix: metric(1500, "Hz") → "1.5 kHz".
+        Always format the scaled number to 1 decimal place.
+    """).strip(),
+    test_code=textwrap.dedent("""\
+        from humanize.number import metric
+        assert metric(1_500_000) == "1.5 M", f"got {metric(1_500_000)!r}"
+        assert metric(2_000) == "2.0 k", f"got {metric(2_000)!r}"
+        assert metric(500) == "500", f"got {metric(500)!r}"
+        assert metric(1_500, "Hz") == "1.5 kHz", f"got {metric(1_500, 'Hz')!r}"
+        assert metric(2e9, "W") == "2.0 GW", f"got {metric(2e9, 'W')!r}"
+    """).strip(),
+    max_turns=15,
+    difficulty=1,  # medium
+    hints=[
+        "Loop through prefixes from largest to smallest: (1e12,'T'), (1e9,'G'), (1e6,'M'), (1e3,'k').",
+        "If abs(value) >= threshold, scale and format; otherwise return str(int(value)).",
+    ],
+))
+
+_reg(RepoTask(
+    task_id="t7.age",
+    repo_name="humanize",
+    description=textwrap.dedent("""\
+        Add a function `age(birth_date) -> str` to `time.py`.
+
+        The function receives a `datetime.date` and returns a human-readable age:
+        - If the person is under 1 year old, return "X months old" (use 30-day months).
+        - If exactly 1 year, return "1 year old".
+        - Otherwise return "X years old".
+
+        Use `datetime.date.today()` as the reference point.
+        Assume birth_date is always a valid date in the past.
+    """).strip(),
+    test_code=textwrap.dedent("""\
+        import datetime as dt
+        from humanize.time import age
+
+        today = dt.date.today()
+        anchor = today.replace(day=28) if (today.month, today.day) == (2, 29) else today  # Feb 29 guard
+        dob_25y = anchor.replace(year=anchor.year - 25)
+        dob_1y = anchor.replace(year=anchor.year - 1)
+        dob_6m = today - dt.timedelta(days=182)
+        dob_2m = today - dt.timedelta(days=61)
+        assert age(dob_25y) == "25 years old", f"got {age(dob_25y)!r}"
+        assert age(dob_1y) == "1 year old", f"got {age(dob_1y)!r}"
+        assert age(dob_6m) == "6 months old", f"got {age(dob_6m)!r}"
+        assert age(dob_2m) == "2 months old", f"got {age(dob_2m)!r}"
+    """).strip(),
+    max_turns=15,
+    difficulty=1,  # medium
+    hints=[
+        "import datetime as dt is already at the top of time.py.",
+        "days = (dt.date.today() - birth_date).days; years = days // 365; months = days // 30",
+    ],
+))
+
+
+# ── test runner ──────────────────────────────────────────────────────────────
+
+def run_tests(task: RepoTask) -> tuple[bool, str]:
+    """Execute ``task.test_code`` in a fresh namespace and return ``(passed, message)``."""
+    # Evict the task's own repo (not only task_manager) so the tests import the edited source.
+    _reload_task_manager(task.repo_name)
+    try:
+        exec(compile(task.test_code, "", "exec"), {})  # noqa: S102
+        return True, "All assertions passed."
+    except AssertionError as exc:
+        return False, f"AssertionError: {exc}"
+    except Exception:
+        return False, traceback.format_exc(limit=5)
+
+
+def _reload_task_manager(repo_name: str = "task_manager") -> None:
+    """Remove ``repo_name`` and its submodules from ``sys.modules`` so the next import re-executes them."""
+    # Tests use short imports (``task_manager.x``); match those and the packaged path, exactly or by dotted prefix.
+    roots = (repo_name, f"graphforge.sample_repos.{repo_name}")
+    for mod_name in [k for k in sys.modules if any(k == r or k.startswith(r + ".") for r in roots)]:
+        del sys.modules[mod_name]
+
+
+def all_task_ids() -> list[str]:
+    return list(TASK_BANK)  # iterating the dict yields its keys, in insertion order
+
+
+def get_task(task_id: str) -> RepoTask | None:
+    return TASK_BANK[task_id] if task_id in TASK_BANK else None  # unknown ids give None, not KeyError
diff --git a/graphforge/__init__.py b/graphforge/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..785c7afa9ff4f84120664d4afb9cd98f5e0b2d03
--- /dev/null
+++ b/graphforge/__init__.py
@@ -0,0 +1,24 @@
+"""GraphForge โ graph-first code generation environment for long-horizon RL.
+
+The agent constructs Python programs by mutating a typed function-call graph;
+source files are a deterministic projection of the canonical graph.
+
+Top-level subsystems:
+ graph canonical graph schema (Modules, Nodes, Edges)
+ actions fourteen-action surface, atomic dispatcher with rollback
+ types signature parser + edge type-flow validator
+ templates ~25-template body library, parameterized
+ materializer graph -> Python source
+ parser Python source -> graph (round-trip)
+ validator parse / import / mypy --strict gate
+ behavioral hypothesis-based property test runner
+ constraints per-kind constraint checker dispatch
+ reward reward engine (per-turn + terminal)
+ tasks task bank + variant generator
+ server FastAPI OpenEnv server
+ training GRPO multi-turn rollout
+
+See README.md for design rationale and PROPOSAL.md for the full spec.
+"""
+
+__version__ = "0.0.1"
diff --git a/graphforge/actions/__init__.py b/graphforge/actions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e5b5bd770d975615e7daff3f36863d9ee89d2e9
--- /dev/null
+++ b/graphforge/actions/__init__.py
@@ -0,0 +1,15 @@
+"""Action surface for GraphForge.
+
+Public API:
+
+ from graphforge.actions import dispatch, ActionResult
+ from graphforge.actions.schema import Action, AddNode, ...
+ from graphforge.actions.errors import ActionError
+
+See PROPOSAL.md §4 for the full action vocabulary.
+"""
+
+from graphforge.actions.dispatcher import ActionResult, dispatch
+from graphforge.actions.errors import ActionError
+
+__all__ = ["ActionError", "ActionResult", "dispatch"]
diff --git a/graphforge/actions/dispatcher.py b/graphforge/actions/dispatcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fbcbeb03f2833fc93eb4d562ee20aacc1c60789
--- /dev/null
+++ b/graphforge/actions/dispatcher.py
@@ -0,0 +1,442 @@
+"""Atomic action dispatcher.
+
+Applies an :class:`Action` to a :class:`Graph`. Every mutation is atomic:
+the dispatcher snapshots the graph before the handler runs and restores it on
+any failure. Failures surface as :class:`ActionError` with a stable code, never
+as silent partial state.
+
+Information actions (query_*, materialize_*, run_*) are routed but their
+implementations live in their respective subsystems and are stubbed for now.
+``submit`` returns a sentinel so the episode runner can recognize termination.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+from graphforge.actions import errors as E
+from graphforge.actions.schema import (
+ Action,
+ AddEdge,
+ AddModule,
+ AddNode,
+ AttachBody,
+ MaterializeAndValidate,
+ QuerySpec,
+ QuerySubgraph,
+ QueryTypes,
+ RemoveEdge,
+ RemoveModule,
+ RemoveNode,
+ RunBehavioralTests,
+ SetNodeModule,
+ Submit,
+)
+from graphforge.actions.signature import parse_signature
+from graphforge.graph.schema import (
+ ArgMapping,
+ Edge,
+ Graph,
+ Module,
+ Node,
+)
+from graphforge.templates import get_template, validate_args
+
+
+# ---- result envelope -------------------------------------------------
+
+
+@dataclass
+class ActionResult:
+    """Outcome of one :func:`dispatch` call: a success flag, a payload dict and a terminal flag."""
+
+    ok: bool
+    payload: dict[str, Any]
+    terminal: bool = False
+
+    @classmethod
+    def success(cls, **payload: Any) -> "ActionResult":
+        return cls(True, payload)  # non-terminal success; the keyword arguments become the payload
+
+    @classmethod
+    def failure(cls, err: E.ActionError) -> "ActionResult":
+        return cls(False, err.to_dict())  # non-terminal failure carrying the error's dict form
+
+    @classmethod
+    def terminate(cls, **payload: Any) -> "ActionResult":
+        return cls(True, payload, True)  # success with the terminal flag set
+
+
+# ---- dispatcher ------------------------------------------------------
+
+
+def dispatch(graph: Graph, action: Action) -> ActionResult:
+    """Apply ``action`` to ``graph`` in place, restoring the graph if the handler raises.
+
+    A snapshot is taken before routing. Any ``Exception`` restores it and comes back as a
+    failed :class:`ActionResult`; exceptions other than :class:`ActionError` are wrapped
+    as ``SCHEMA_REJECTION`` with an ``unhandled:`` message, so none reaches the caller.
+    """
+    checkpoint = graph.snapshot()
+    try:
+        return _route(graph, action)
+    except Exception as exc:
+        _restore(graph, checkpoint)
+        if isinstance(exc, E.ActionError):
+            reported = exc
+        else:  # pragma: no cover - unexpected handler bug
+            reported = E.ActionError(E.SCHEMA_REJECTION, f"unhandled: {exc}")
+        return ActionResult.failure(reported)
+
+
+def _restore(graph: Graph, snap: Graph) -> None:
+    # Point the live graph at the snapshot's three collections; nothing is copied here.
+    for attr in ("modules", "nodes", "edges"):
+        setattr(graph, attr, getattr(snap, attr))
+
+
+def _route(graph: Graph, action: Action) -> ActionResult:
+    """Call the handler paired with the first listed action class that ``action`` matches.
+
+    Matching uses ``isinstance`` and the pairs are tried top to bottom. Handler
+    exceptions are not caught here.
+
+    Raises:
+        ActionError: ``SCHEMA_REJECTION`` when no listed class matches.
+    """
+    table = (
+        # Mutations
+        (AddModule, _h_add_module),
+        (RemoveModule, _h_remove_module),
+        (AddNode, _h_add_node),
+        (RemoveNode, _h_remove_node),
+        (SetNodeModule, _h_set_node_module),
+        (AttachBody, _h_attach_body),
+        (AddEdge, _h_add_edge),
+        (RemoveEdge, _h_remove_edge),
+        # Information (delegated; stubs for now)
+        (QuerySpec, _h_query_spec),
+        (QuerySubgraph, _h_query_subgraph),
+        (QueryTypes, _h_query_types),
+        (MaterializeAndValidate, _h_materialize),
+        (RunBehavioralTests, _h_run_tests),
+        # Terminal
+        (Submit, _h_submit),
+    )
+    for action_cls, handler in table:
+        if isinstance(action, action_cls):
+            return handler(graph, action)
+    raise E.ActionError(E.SCHEMA_REJECTION, f"unknown action: {type(action).__name__}")
+
+
+# ---- mutation handlers ----------------------------------------------
+
+
+def _h_add_module(graph: Graph, a: AddModule) -> ActionResult:
+    """Append a new module; a name that is already present raises ``NAME_COLLISION``."""
+    name = a.name
+    if graph.find_module(name) is not None:
+        raise E.ActionError(E.NAME_COLLISION, f"module {name!r} already exists", name=name)
+    graph.modules.append(Module(name=name, responsibility=a.responsibility))
+    return ActionResult.success(added_module=name)
+
+
+def _h_remove_module(graph: Graph, a: RemoveModule) -> ActionResult:
+    """Delete an existing module, provided no node is assigned to it."""
+    if graph.find_module(a.name) is None:
+        raise E.ActionError(E.UNKNOWN_MODULE, f"module {a.name!r} does not exist", name=a.name)
+    # Counted once: the total decides the rejection and is also reported in the error details.
+    resident = sum(1 for n in graph.nodes if n.module == a.name)
+    if resident:
+        raise E.ActionError(
+            E.MODULE_NOT_EMPTY, f"module {a.name!r} still contains nodes",
+            name=a.name, node_count=resident,
+        )
+    graph.modules = [m for m in graph.modules if m.name != a.name]
+    return ActionResult.success(removed_module=a.name)
+
+
+def _h_add_node(graph: Graph, a: AddNode) -> ActionResult:
+    """Create a node without a body in an existing module and give it the next declaration index.
+
+    Rejects an unknown module, an already-used ``module.name``, and a signature that fails to parse.
+    """
+    if graph.find_module(a.module) is None:
+        raise E.ActionError(E.UNKNOWN_MODULE, f"module {a.module!r} does not exist", name=a.module)
+    if graph.find_node(a.name, a.module) is not None:
+        raise E.ActionError(
+            E.NAME_COLLISION, f"node {a.module}.{a.name} already exists", name=a.name, module=a.module
+        )
+    # Parsing the signature here catches errors that the pydantic regex misses.
+    try:
+        parse_signature(a.signature)
+    except ValueError as bad_sig:
+        raise E.ActionError(E.SCHEMA_REJECTION, str(bad_sig), signature=a.signature) from bad_sig
+    next_order = 1 + max((n.decl_order for n in graph.nodes), default=-1)  # 0 when the graph has no nodes
+    created = Node(
+        name=a.name,
+        module=a.module,
+        signature=a.signature,
+        purity=a.purity,
+        error_policy=a.error_policy,
+        decl_order=next_order,
+    )
+    graph.nodes.append(created)
+    return ActionResult.success(added_node=f"{a.module}.{a.name}", decl_order=next_order)
+
+
+def _h_remove_node(graph: Graph, a: RemoveNode) -> ActionResult:
+    """Delete a node that no edge references, either as caller or as callee."""
+    ident = {"name": a.name, "module": a.module}  # detail fields shared by both error paths
+    target = graph.find_node(a.name, a.module)
+    if target is None:
+        raise E.ActionError(E.UNKNOWN_NODE, f"node {a.module}.{a.name} does not exist", **ident)
+    qn = target.qualified_name
+    touching = [(e.caller, e.callee) for e in graph.edges if qn in (e.caller, e.callee)]
+    if touching:
+        raise E.ActionError(
+            E.NODE_HAS_REFERENCES,
+            f"node {qn} is referenced by {len(touching)} edge(s)",
+            **ident,
+            referencing_edges=touching,
+        )
+    # Rebuild the node list without the entry that matches both name and module.
+    graph.nodes = [m for m in graph.nodes if (m.name, m.module) != (a.name, a.module)]
+    return ActionResult.success(removed_node=qn)
+
+
+def _h_set_node_module(graph: Graph, a: SetNodeModule) -> ActionResult:
+    """Move a node to another module and retarget every edge endpoint that named it.
+
+    Rejects an unknown node, an unknown target module, a same-named node already in the
+    target, and a move after which ``graph.has_module_cycle()`` is true. The cycle check
+    runs after the graph was mutated, so undoing the move is left to the caller.
+    """
+    node = graph.find_node(a.name, a.current_module)
+    if node is None:
+        raise E.ActionError(
+            E.UNKNOWN_NODE,
+            f"node {a.current_module}.{a.name} does not exist",
+            name=a.name,
+            module=a.current_module,
+        )
+    if graph.find_module(a.new_module) is None:
+        raise E.ActionError(
+            E.UNKNOWN_MODULE, f"target module {a.new_module!r} does not exist", name=a.new_module
+        )
+    if graph.find_node(a.name, a.new_module) is not None:
+        raise E.ActionError(
+            E.NAME_COLLISION,
+            f"node named {a.name!r} already exists in {a.new_module!r}",
+            name=a.name,
+            module=a.new_module,
+        )
+    # Read the old qualified name before the node's module changes.
+    old_qn, new_qn = node.qualified_name, f"{a.new_module}.{a.name}"
+    node.module = a.new_module
+    for edge in graph.edges:
+        if edge.caller == old_qn:
+            edge.caller = new_qn
+        if edge.callee == old_qn:
+            edge.callee = new_qn
+    # Post-condition: the move must not have introduced an import cycle.
+    if graph.has_module_cycle():
+        raise E.ActionError(
+            E.WOULD_CREATE_CYCLE, f"moving {old_qn} -> {new_qn} would create an import cycle",
+            from_qn=old_qn, to_qn=new_qn,
+        )
+    return ActionResult.success(moved_node={"from": old_qn, "to": new_qn})
+
+
+def _h_attach_body(graph: Graph, a: AttachBody) -> ActionResult:
+    """Record a body template and a copy of its arguments on an existing node.
+
+    Checks run in this order: node exists, template is known, ``validate_args`` reports no
+    problems, and the template accepts the node's current fan-out / fan-in.
+    """
+    node = graph.find_node(a.name, a.module)
+    if node is None:
+        raise E.ActionError(
+            E.UNKNOWN_NODE, f"node {a.module}.{a.name} does not exist", name=a.name, module=a.module
+        )
+    spec = get_template(a.template)
+    if spec is None:
+        raise E.ActionError(
+            E.UNKNOWN_TEMPLATE, f"unknown template {a.template!r}", template=a.template
+        )
+    problems = validate_args(a.template, a.args)
+    if problems:
+        raise E.ActionError(
+            E.TEMPLATE_ARGS_INVALID,
+            f"args invalid for template {a.template!r}: {'; '.join(problems)}",
+            template=a.template,
+            problems=problems,
+        )
+    # The degree check looks at the edges as they are at this moment.
+    qn = node.qualified_name
+    out_d, in_d = graph.fan_out(qn), graph.fan_in(qn)
+    if not spec.edges_ok(out_d, in_d):
+        raise E.ActionError(
+            E.TEMPLATE_ARGS_INVALID,
+            f"template {a.template!r} requires different edge structure (out_d={out_d}, in_d={in_d})",
+            template=a.template,
+            out_degree=out_d,
+            in_degree=in_d,
+        )
+    node.body_template = a.template
+    node.body_template_args = dict(a.args)  # shallow copy, detached from the action's own mapping
+    return ActionResult.success(attached={"node": qn, "template": a.template})
+
+
+def _h_add_edge(graph: Graph, a: AddEdge) -> ActionResult:
+    """Add a ``caller -> callee`` edge after validating both endpoints and the argument mapping.
+
+    The mapping must cover every required callee parameter and may only name parameters
+    found in the callee's and caller's signatures. The edge is appended before the
+    module-cycle check runs; a cycle raises ``WOULD_CREATE_CYCLE``.
+    """
+    ends = {"caller": a.caller, "callee": a.callee}
+    caller = graph.find_node_qualified(a.caller)
+    callee = graph.find_node_qualified(a.callee)
+    for role, found in (("caller", caller), ("callee", callee)):
+        if found is None:
+            raise E.ActionError(E.UNKNOWN_NODE, f"{role} {ends[role]!r} does not exist", node=ends[role])
+    if graph.find_edge(a.caller, a.callee) is not None:
+        raise E.ActionError(E.DUPLICATE_EDGE, f"edge {a.caller} -> {a.callee} already exists", **ends)
+    callee_sig = parse_signature(callee.signature)
+    caller_sig = parse_signature(caller.signature)
+    mapped_callee = {m.callee_param for m in a.arg_mapping}
+    mapped_caller = {m.caller_arg for m in a.arg_mapping}
+    # Mapping checks, first failure wins: required callee params are covered, then no
+    # unknown callee params, then no unknown caller args.
+    missing = sorted(set(callee_sig.required_params) - mapped_callee)
+    if missing:
+        raise E.ActionError(
+            E.ARG_MAPPING_INVALID,
+            f"arg_mapping is missing required callee params: {missing}",
+            missing=missing,
+        )
+    bogus_callee = sorted(mapped_callee - set(callee_sig.all_params))
+    if bogus_callee:
+        raise E.ActionError(
+            E.ARG_MAPPING_INVALID,
+            f"arg_mapping references unknown callee params: {bogus_callee}",
+            unknown=bogus_callee,
+        )
+    bogus_caller = sorted(mapped_caller - set(caller_sig.all_params))
+    if bogus_caller:
+        raise E.ActionError(
+            E.ARG_MAPPING_INVALID,
+            f"arg_mapping references unknown caller args: {bogus_caller}",
+            unknown=bogus_caller,
+        )
+    # Append tentatively; the cycle check below sees the graph including the new edge.
+    new_edge = Edge(
+        caller=a.caller,
+        callee=a.callee,
+        arg_mapping=[ArgMapping(**m.model_dump()) for m in a.arg_mapping],
+    )
+    graph.edges.append(new_edge)
+    if graph.has_module_cycle():
+        raise E.ActionError(
+            E.WOULD_CREATE_CYCLE,
+            f"adding edge {a.caller} -> {a.callee} would create an import cycle",
+            **ends,
+        )
+    return ActionResult.success(added_edge=ends)
+
+
+def _h_remove_edge(graph: Graph, a: RemoveEdge) -> ActionResult:
+    """Drop every edge whose endpoints equal the given caller and callee.
+
+    Raises:
+        ActionError: ``UNKNOWN_EDGE`` when ``graph.find_edge`` finds no such edge.
+    """
+    ends = {"caller": a.caller, "callee": a.callee}
+    if graph.find_edge(a.caller, a.callee) is None:
+        raise E.ActionError(E.UNKNOWN_EDGE, f"edge {a.caller} -> {a.callee} does not exist", **ends)
+    # Keep every edge that differs from the pair in caller or callee.
+    pair = (a.caller, a.callee)
+    graph.edges = [x for x in graph.edges if (x.caller, x.callee) != pair]
+    return ActionResult.success(removed_edge=ends)
+
+
+# ---- info / terminal handlers (stubs) -------------------------------
+
+
+def _h_query_spec(graph: Graph, a: QuerySpec) -> ActionResult:
+    """Stub: succeed with a not-implemented note plus the requested constraint kind."""
+    # TODO: route to graphforge.constraints once tasks/specs are wired in.
+    note = "query_spec routed via dispatcher; constraint engine TODO"
+    payload = {"not_implemented": note, "constraint_kind": a.constraint_kind}
+    return ActionResult.success(**payload)
+
+
+def _h_query_subgraph(graph: Graph, a: QuerySubgraph) -> ActionResult:
+    """Answer a read-only ``module:<m>``, ``neighbors:<qn>`` or ``path:...`` query on the graph.
+
+    ``path:`` is still a placeholder; any other scope raises ``SCHEMA_REJECTION``.
+    """
+    scope = a.scope
+    kind, sep, target = scope.partition(":")
+    if sep and kind == "module":
+        # A qualified name is ``<module>.<node>``, so the module is everything before the
+        # LAST dot; taking the first dot-segment would mis-file nodes of dotted modules.
+        nodes = [n.model_dump() for n in graph.nodes_in_module(target)]
+        edges = [
+            e.model_dump() for e in graph.edges
+            if e.caller.rsplit(".", 1)[0] == target and e.callee.rsplit(".", 1)[0] == target
+        ]
+        return ActionResult.success(scope=scope, nodes=nodes, edges=edges)
+    if sep and kind == "neighbors":
+        return ActionResult.success(
+            scope=scope, callers=graph.callers_of(target), callees=graph.callees_of(target)
+        )
+    if sep and kind == "path":
+        # TODO: shortest-path search over call graph.
+        return ActionResult.success(scope=scope, not_implemented="path search TODO")
+    raise E.ActionError(E.SCHEMA_REJECTION, f"unrecognized subgraph scope {scope!r}")
+
+
+def _h_query_types(graph: Graph, a: QueryTypes) -> ActionResult:
+    """Stub: succeed, echoing ``a.scope`` next to a not-implemented note."""
+    # TODO: delegate to graphforge.types.
+    payload = {"scope": a.scope, "not_implemented": "type engine TODO"}
+    return ActionResult.success(**payload)
+
+
+def _h_materialize(graph: Graph, a: MaterializeAndValidate) -> ActionResult:
+    """Render the graph to source files and return the validator's report on them.
+
+    The payload holds the file names, ``bytes_total`` (the summed ``len()`` of the file
+    contents: characters rather than encoded bytes if they are ``str`` - confirm) and
+    ``full_check``'s report as a dict. A materializer ``ValueError`` becomes ``SCHEMA_REJECTION``.
+    """
+    from graphforge.materializer import materialize as render
+    from graphforge.validator import full_check
+
+    try:
+        files = render(graph)
+    except ValueError as codegen_err:
+        # Codegen rejected the graph (e.g. unknown pattern, template/edge structure
+        # mismatch missed by the dispatcher's preconditions).
+        raise E.ActionError(E.SCHEMA_REJECTION, f"materialization failed: {codegen_err}") from codegen_err
+    report = full_check(files)
+    sizes = [len(content) for content in files.values()]
+    return ActionResult.success(
+        files=list(files),
+        bytes_total=sum(sizes),
+        report=report.to_dict(),
+    )
+
+
+def _h_run_tests(graph: Graph, a: RunBehavioralTests) -> ActionResult:
+    """Placeholder that always raises ``SCHEMA_REJECTION``; no tests are run."""
+    # TODO: delegate to graphforge.behavioral.
+    reason = "run_behavioral_tests is not yet implemented"
+    raise E.ActionError(E.SCHEMA_REJECTION, reason)
+
+
+def _h_submit(graph: Graph, a: Submit) -> ActionResult:
+    return ActionResult(ok=True, payload={"submitted": True}, terminal=True)  # the envelope terminate() builds
diff --git a/graphforge/actions/errors.py b/graphforge/actions/errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..e499876f65202e64b92c8ab129fd6bc545067eb3
--- /dev/null
+++ b/graphforge/actions/errors.py
@@ -0,0 +1,44 @@
+"""Structured action errors.
+
+Every failure mode in the action dispatcher surfaces as an :class:`ActionError`
+with a stable ``code`` so the agent can be trained against deterministic error
+strings (see PROPOSAL.md §4.4 — "failures return structured errors describing
+the cause"). Codes are kept short and stable across versions.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+class ActionError(Exception):
+    """Raised by action handlers; caught and reported by the dispatcher."""
+
+    def __init__(self, code: str, message: str, **details: Any) -> None:
+        super().__init__(f"[{code}] {message}")
+        self.code, self.message, self.details = code, message, details
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the wire dict; a detail named ``error`` or ``message`` never replaces the real one."""
+        head = {"error": self.code, "message": self.message}
+        return {**head, **{k: v for k, v in self.details.items() if k not in head}}
+
+
+# ---- canonical codes -------------------------------------------------
+# Schema layer
+SCHEMA_REJECTION = "schema_rejection" # also used by the dispatcher for unknown actions and unhandled handler errors
+# Pre-condition layer
+UNKNOWN_MODULE = "unknown_module"
+UNKNOWN_NODE = "unknown_node"
+UNKNOWN_EDGE = "unknown_edge"
+NAME_COLLISION = "name_collision"
+MODULE_NOT_EMPTY = "module_not_empty"
+NODE_HAS_REFERENCES = "node_has_references"
+DUPLICATE_EDGE = "duplicate_edge"
+UNKNOWN_TEMPLATE = "unknown_template"
+TEMPLATE_ARGS_INVALID = "template_args_invalid" # raised for bad args and for an edge-structure mismatch
+RESPONSIBILITY_MISMATCH = "responsibility_mismatch" # not referenced by actions/dispatcher.py
+ARG_MAPPING_INVALID = "arg_mapping_invalid"
+# Post-condition layer
+WOULD_CREATE_CYCLE = "would_create_cycle"
+TYPE_MISMATCH = "type_mismatch" # not referenced by actions/dispatcher.py
diff --git a/graphforge/actions/schema.py b/graphforge/actions/schema.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fc089c21ac800003bc817e159eaeac2ebb8dbb9
--- /dev/null
+++ b/graphforge/actions/schema.py
@@ -0,0 +1,180 @@
+"""Action message schemas.
+
+These are the wire shapes accepted by the dispatcher. Every action is a
+discriminated-union member keyed on ``kind``.
+
+The action vocabulary mirrors PROPOSAL.md §4. Total surface:
+
+ Graph mutations
+ add_module, remove_module
+ add_node, remove_node, set_node_module, attach_body
+ add_edge, remove_edge
+ Information
+ query_spec, query_subgraph, query_types,
+ materialize_and_validate, run_behavioral_tests
+ Terminal
+ submit
+
+Note: the proposal abstract states "eleven actions"; the section-4 listing
+contains fourteen. We implement the section-4 set; the abstract count will
+be corrected in the next revision of PROPOSAL.md.
+"""
+
+from __future__ import annotations
+
+from typing import Annotated, Literal, Optional, Union
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from graphforge.graph.schema import ArgMapping, ErrorPolicy, Purity, ResponsibilityTag
+
+
+# Common config: forbid unknown fields, fail loudly on schema drift.
+# Every action model in this file assigns it to ``model_config``.
+_cfg = ConfigDict(extra="forbid")
+
+
+# ---- mutations -------------------------------------------------------
+
+
+# Wire shape of the ``add_module`` action: the module ``name`` and its
+# ``responsibility`` tag. Unknown fields are rejected through ``_cfg``.
+class AddModule(BaseModel):
+ model_config = _cfg
+ kind: Literal["add_module"] = "add_module"
+ name: str
+ responsibility: ResponsibilityTag
+
+
+# Wire shape of the ``remove_module`` action: identifies the module by ``name``.
+class RemoveModule(BaseModel):
+ model_config = _cfg
+ kind: Literal["remove_module"] = "remove_module"
+ name: str
+
+
+# Wire shape of the ``add_node`` action. ``name``, ``module`` and
+# ``signature`` are required; ``purity`` defaults to "impure" and
+# ``error_policy`` to "none".
+class AddNode(BaseModel):
+ model_config = _cfg
+ kind: Literal["add_node"] = "add_node"
+ name: str
+ module: str
+ signature: str
+ purity: Purity = "impure"
+ error_policy: ErrorPolicy = "none"
+
+
+# Wire shape of the ``remove_node`` action: the node is addressed by its
+# ``name`` together with its ``module``.
+class RemoveNode(BaseModel):
+ model_config = _cfg
+ kind: Literal["remove_node"] = "remove_node"
+ name: str
+ module: str
+
+
+# Wire shape of the ``set_node_module`` action: names the node plus both the
+# module it currently lives in and the module it should move to.
+class SetNodeModule(BaseModel):
+ model_config = _cfg
+ kind: Literal["set_node_module"] = "set_node_module"
+ name: str
+ current_module: str
+ new_module: str
+
+
+# Wire shape of the ``attach_body`` action: addresses a node by ``name`` and
+# ``module`` and names a ``template``; ``args`` defaults to an empty dict.
+class AttachBody(BaseModel):
+ model_config = _cfg
+ kind: Literal["attach_body"] = "attach_body"
+ name: str
+ module: str
+ template: str
+ args: dict[str, object] = Field(default_factory=dict)
+
+
+# Wire shape of the ``add_edge`` action: ``caller`` and ``callee`` endpoints
+# plus an ``arg_mapping`` list that defaults to empty.
+class AddEdge(BaseModel):
+ model_config = _cfg
+ kind: Literal["add_edge"] = "add_edge"
+ caller: str
+ callee: str
+ arg_mapping: list[ArgMapping] = Field(default_factory=list)
+
+
+# Wire shape of the ``remove_edge`` action: the edge is addressed by its
+# ``caller`` and ``callee`` endpoints.
+class RemoveEdge(BaseModel):
+ model_config = _cfg
+ kind: Literal["remove_edge"] = "remove_edge"
+ caller: str
+ callee: str
+
+
+# ---- information actions --------------------------------------------
+
+
+# Wire shape of the ``query_spec`` action. ``constraint_kind`` is optional
+# and defaults to None.
+class QuerySpec(BaseModel):
+ model_config = _cfg
+ kind: Literal["query_spec"] = "query_spec"
+ constraint_kind: Optional[str] = None
+
+
+# Wire shape of the ``query_subgraph`` action; ``scope`` is a free-form string.
+# NOTE(review): the inline comment lists the scope prefixes, but whatever
+# followed each prefix appears to be missing -- confirm the scope grammar
+# against the dispatcher.
+class QuerySubgraph(BaseModel):
+ model_config = _cfg
+ kind: Literal["query_subgraph"] = "query_subgraph"
+ scope: str # "module:" | "neighbors:" | "path::"
+
+
+# Wire shape of the ``query_types`` action; ``scope`` is a free-form string.
+# NOTE(review): the inline comment lists the scope prefixes, but whatever
+# followed ``module:`` and ``node:`` appears to be missing -- confirm the
+# scope grammar against the dispatcher.
+class QueryTypes(BaseModel):
+ model_config = _cfg
+ kind: Literal["query_types"] = "query_types"
+ scope: str # "all" | "module:" | "node:"
+
+
+# Wire shape of the ``materialize_and_validate`` action; it carries no
+# payload beyond the ``kind`` discriminator.
+class MaterializeAndValidate(BaseModel):
+ model_config = _cfg
+ kind: Literal["materialize_and_validate"] = "materialize_and_validate"
+
+
+# Wire shape of the ``run_behavioral_tests`` action. ``materialized`` is a
+# boolean flag that defaults to True.
+class RunBehavioralTests(BaseModel):
+ model_config = _cfg
+ kind: Literal["run_behavioral_tests"] = "run_behavioral_tests"
+ materialized: bool = True
+
+
+# ---- terminal --------------------------------------------------------
+
+
+# Wire shape of the terminal ``submit`` action; it carries no payload beyond
+# the ``kind`` discriminator.
+class Submit(BaseModel):
+ model_config = _cfg
+ kind: Literal["submit"] = "submit"
+
+
+# ---- discriminated union --------------------------------------------
+
+# Discriminated union of every action model in this file; the ``kind`` field
+# selects the member.
+Action = Annotated[
+ Union[
+ AddModule,
+ RemoveModule,
+ AddNode,
+ RemoveNode,
+ SetNodeModule,
+ AttachBody,
+ AddEdge,
+ RemoveEdge,
+ QuerySpec,
+ QuerySubgraph,
+ QueryTypes,
+ MaterializeAndValidate,
+ RunBehavioralTests,
+ Submit,
+ ],
+ Field(discriminator="kind"),
+]
+
+
+# Public names: the ``Action`` union followed by each concrete action model.
+__all__ = [
+ "Action",
+ "AddModule",
+ "RemoveModule",
+ "AddNode",
+ "RemoveNode",
+ "SetNodeModule",
+ "AttachBody",
+ "AddEdge",
+ "RemoveEdge",
+ "QuerySpec",
+ "QuerySubgraph",
+ "QueryTypes",
+ "MaterializeAndValidate",
+ "RunBehavioralTests",
+ "Submit",
+]
diff --git a/graphforge/actions/signature.py b/graphforge/actions/signature.py
new file mode 100644
index 0000000000000000000000000000000000000000..f853a7a513c673e51958dbe2c39355614f25b394
--- /dev/null
+++ b/graphforge/actions/signature.py
@@ -0,0 +1,116 @@
+"""Cheap signature parser.
+
+Used by the dispatcher to validate ``add_edge`` arg-mappings against the
+callee's parameter list. Real type flow validation (caller_arg type vs
+callee_param type) is the type engine; this module only extracts parameter
+*names* from a signature string of the form::
+
+ (a: int, b: str = "x", *, c: bool) -> bool
+
+Annotations are tolerated as opaque text. Defaults are tolerated and treated
+as making the parameter optional.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+
+
+# One parameter extracted from a signature string by ``parse_signature``.
+# ``annotation`` is the raw annotation text (None when the parameter has no
+# ``:`` part). Parameters whose ``has_default`` is False are the ones that
+# ``ParsedSignature.required_params`` reports.
+@dataclass(frozen=True)
+class Parameter:
+ name: str
+ annotation: str | None
+ has_default: bool
+
+
+@dataclass(frozen=True)
+class ParsedSignature:
+    """Result of parsing a signature string.
+
+    ``parameters`` holds the parsed parameters; ``return_annotation`` is the
+    text of the return annotation.
+    """
+
+    parameters: list[Parameter]
+    return_annotation: str
+
+    @property
+    def required_params(self) -> list[str]:
+        """Names of the parameters whose ``has_default`` flag is false."""
+        names: list[str] = []
+        for param in self.parameters:
+            if param.has_default:
+                continue
+            names.append(param.name)
+        return names
+
+    @property
+    def all_params(self) -> list[str]:
+        """Names of every parameter, in list order."""
+        return list(map(lambda param: param.name, self.parameters))
+
+
+# Matches ``(<params>) -> <ret>``; DOTALL lets either part span several lines.
+# The group names are the ones ``parse_signature`` reads back with ``m.group``.
+_SIG_RE = re.compile(r"^\s*\((?P<params>.*)\)\s*->\s*(?P<ret>.+?)\s*$", re.DOTALL)
+
+
+def parse_signature(sig: str) -> ParsedSignature:
+    """Parse a function signature string of the form ``(params) -> ret``.
+
+    Lenient: the caller validates more deeply. Annotations are kept as
+    opaque text. Bare ``*`` and ``/`` markers are skipped. ``*args`` and
+    ``**kwargs`` are recorded under their bare names and flagged as optional
+    (``has_default=True``), because a call never has to supply them.
+
+    Raises ``ValueError`` when the string does not match the overall shape or
+    when a parameter name is not an identifier. The schema layer (Node
+    validator) already requires ``(`` and ``->``; this is the secondary parse
+    used at dispatch time.
+    """
+    m = _SIG_RE.match(sig)
+    if not m:
+        raise ValueError(f"could not parse signature: {sig!r}")
+    raw_params = m.group("params").strip()
+    ret = m.group("ret").strip()
+
+    params: list[Parameter] = []
+    if raw_params:
+        for piece in _split_top_level(raw_params, ","):
+            piece = piece.strip()
+            if not piece or piece in {"*", "/"}:
+                continue
+            # Variadic parameters never need a caller-supplied value, so they
+            # must not show up in ``required_params``.
+            variadic = piece.startswith("*")
+            if piece.startswith("**"):
+                piece = piece[2:].lstrip()
+            elif piece.startswith("*"):
+                piece = piece[1:].lstrip()
+            has_default = variadic
+            if "=" in piece:
+                # Split off the default at a top-level '=' (ignore ones inside [..]).
+                head, default = _split_default(piece)
+                piece = head.strip()
+                has_default = has_default or default is not None
+            name = piece
+            annotation: str | None = None
+            if ":" in piece:
+                name, annotation = piece.split(":", 1)
+                name = name.strip()
+                annotation = annotation.strip()
+            if not name.isidentifier():
+                raise ValueError(f"unparseable parameter {piece!r} in {sig!r}")
+            params.append(Parameter(name=name, annotation=annotation, has_default=has_default))
+
+    return ParsedSignature(parameters=params, return_annotation=ret)
+
+
+def _split_top_level(s: str, sep: str) -> list[str]:
+    """Split ``s`` on ``sep`` wherever bracket depth is 0 and no string is open.
+
+    Brackets and separators inside a quoted string literal (for example the
+    default in ``b: str = "x,y"``) are treated as plain text, so such a
+    default does not break the parameter apart. A trailing empty piece is
+    dropped; empty pieces elsewhere are kept.
+    """
+    out: list[str] = []
+    buf: list[str] = []
+    depth = 0
+    quote = ""  # quote character of the string literal we are inside, if any
+    escaped = False  # True right after a backslash inside a string literal
+    for ch in s:
+        if quote:
+            # Inside a literal everything is copied verbatim until it closes.
+            buf.append(ch)
+            if escaped:
+                escaped = False
+            elif ch == "\\":
+                escaped = True
+            elif ch == quote:
+                quote = ""
+            continue
+        if ch in "\"'":
+            quote = ch
+        elif ch in "([{":
+            depth += 1
+        elif ch in ")]}":
+            depth -= 1
+        if ch == sep and depth == 0:
+            out.append("".join(buf))
+            buf = []
+        else:
+            buf.append(ch)
+    if buf:
+        out.append("".join(buf))
+    return out
+
+
+def _split_default(piece: str) -> tuple[str, str | None]:
+    """Separate a parameter from its default at the first unbracketed ``=``.
+
+    Returns ``(head, default)``; ``default`` is ``None`` when no ``=`` occurs
+    outside ``()``, ``[]`` or ``{}``.
+    """
+    openers, closers = "([{", ")]}"
+    nesting = 0
+    pos = 0
+    while pos < len(piece):
+        symbol = piece[pos]
+        if symbol in openers:
+            nesting += 1
+        elif symbol in closers:
+            nesting -= 1
+        elif nesting == 0 and symbol == "=":
+            return piece[:pos], piece[pos + 1 :]
+        pos += 1
+    return piece, None
diff --git a/graphforge/behavioral/__init__.py b/graphforge/behavioral/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..819cb3abdb861b54abf4592aade6f2ff083980e5
--- /dev/null
+++ b/graphforge/behavioral/__init__.py
@@ -0,0 +1,25 @@
+"""Behavioral test runner.
+
+Responsibilities (PROPOSAL.md §2.1, §6.2):
+
+ * Run a property-based test suite (hypothesis) against materialized code,
+ in a sandboxed subprocess with timeout + memory limit.
+ * Tests are part of the task definition; their bodies are *hidden* from
+ the agent. The agent sees only test names and pass/fail at submission.
+ * Distinguish failures (assertion) from errors (timeout, crash) — both
+ count as test failures, but they're surfaced separately for diagnostics.
+
+Public surface (TODO):
+
+ run_tests(files, tests, timeout=12.0) -> dict[str, TestResult]
+"""
+
+from __future__ import annotations
+
+
+def run_tests(  # pragma: no cover — TODO
+    files: dict[str, str],
+    tests: list[object],
+    timeout: float = 12.0,
+) -> dict[str, object]:
+    """Placeholder for the behavioral test runner; not implemented yet.
+
+    Raises:
+        NotImplementedError: always, whatever arguments are passed.
+    """
+    raise NotImplementedError("behavioral runner TODO — see PROPOSAL.md §6.2")
diff --git a/graphforge/constraints/__init__.py b/graphforge/constraints/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b65de419a05fa2b2772b7e83586f16e1acaf5e1
--- /dev/null
+++ b/graphforge/constraints/__init__.py
@@ -0,0 +1,49 @@
+"""Constraint vocabulary and dispatch.
+
+Three families (PROPOSAL.md §2.2):
+
+ * Structural — node_exists, edge_exists, module_count, acyclic_imports,
+ fan_in_max, fan_out_max, dag_depth_max, internal_only, …
+ * Type / signature — signature_matches, return_type, arg_type,
+ type_consistency, no_any_types, pure_function (TODO)
+ * Behavioral / materialization — materializes, imports_resolve,
+ type_checks, behavioral_test_passes, error_handling_present|absent
+
+Currently shipped: tier-0 subset of structural + ``materializes``. Additional
+kinds land as new discriminated members in :mod:`schema` and matching
+``_check_*`` functions in :mod:`checker`.
+"""
+
+from graphforge.constraints.checker import (
+ SatisfactionReport,
+ check,
+ evaluate_all,
+)
+from graphforge.constraints.schema import (
+ AcyclicImports,
+ Constraint,
+ EdgeExists,
+ Materializes,
+ ModuleCount,
+ ModuleResponsibility,
+ ModuleSizeMax,
+ NodeAbsent,
+ NodeExists,
+ STRUCTURAL_KINDS,
+)
+
+# Public names re-exported from the ``checker`` and ``schema`` submodules.
+__all__ = [
+ "AcyclicImports",
+ "Constraint",
+ "EdgeExists",
+ "Materializes",
+ "ModuleCount",
+ "ModuleResponsibility",
+ "ModuleSizeMax",
+ "NodeAbsent",
+ "NodeExists",
+ "STRUCTURAL_KINDS",
+ "SatisfactionReport",
+ "check",
+ "evaluate_all",
+]
diff --git a/graphforge/constraints/checker.py b/graphforge/constraints/checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a54277b5420b867ee0a2110e7793ea20ef0ee8b
--- /dev/null
+++ b/graphforge/constraints/checker.py
@@ -0,0 +1,141 @@
+"""Constraint checker dispatch.
+
+Each constraint kind has a small ``_check_*`` function. ``check`` routes by
+isinstance and ``evaluate_all`` reports which constraints from a list are
+satisfied or not.
+
+Behavioral / materialization constraints (currently just ``materializes``)
+delegate to the materializer and validator subsystems.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+from graphforge.constraints.schema import (
+ AcyclicImports,
+ Constraint,
+ EdgeExists,
+ Materializes,
+ ModuleCount,
+ ModuleResponsibility,
+ ModuleSizeMax,
+ NodeAbsent,
+ NodeExists,
+ STRUCTURAL_KINDS,
+)
+from graphforge.graph.schema import Graph
+
+
+@dataclass
+class SatisfactionReport:
+    """Outcome of evaluating a list of constraints against a graph.
+
+    ``satisfied`` and ``unsatisfied`` hold the constraints in the order they
+    were appended.
+    """
+
+    satisfied: list[Constraint] = field(default_factory=list)
+    unsatisfied: list[Constraint] = field(default_factory=list)
+
+    @property
+    def total(self) -> int:
+        """Number of constraints recorded across both lists."""
+        return len(self.satisfied) + len(self.unsatisfied)
+
+    @property
+    def all_satisfied(self) -> bool:
+        """True only when at least one constraint was recorded and none failed."""
+        if self.unsatisfied:
+            return False
+        return self.total > 0
+
+    def split_by_family(self) -> tuple["SatisfactionReport", "SatisfactionReport"]:
+        """Split into ``(structural, behavioral)`` sub-reports.
+
+        A constraint whose ``kind`` is in ``STRUCTURAL_KINDS`` goes to the
+        first report, anything else to the second. Useful for the reward
+        engine, which scores the two families with different magnitudes per
+        PROPOSAL.md §5.2.
+        """
+        structural = SatisfactionReport()
+        behavioral = SatisfactionReport()
+
+        def bucket(constraint: Constraint) -> SatisfactionReport:
+            return structural if constraint.kind in STRUCTURAL_KINDS else behavioral
+
+        for constraint in self.satisfied:
+            bucket(constraint).satisfied.append(constraint)
+        for constraint in self.unsatisfied:
+            bucket(constraint).unsatisfied.append(constraint)
+        return structural, behavioral
+
+    def to_dict(self) -> dict[str, object]:
+        """Serialise the report, dumping each constraint with ``model_dump()``."""
+        passed = [c.model_dump() for c in self.satisfied]
+        failed = [c.model_dump() for c in self.unsatisfied]
+        return {
+            "satisfied": passed,
+            "unsatisfied": failed,
+            "total": self.total,
+            "all_satisfied": self.all_satisfied,
+        }
+
+
+# ---- per-kind checkers ----------------------------------------------
+
+
+def _check_node_exists(g: Graph, c: NodeExists) -> bool:
+    """Return True when ``g.find_node`` locates ``c.name`` in ``c.module``."""
+    found = g.find_node(c.name, c.module)
+    return found is not None
+
+
+def _check_node_absent(g: Graph, c: NodeAbsent) -> bool:
+    """Return True when ``g.find_node`` finds no ``c.name`` in ``c.module``."""
+    found = g.find_node(c.name, c.module)
+    return found is None
+
+
+def _check_edge_exists(g: Graph, c: EdgeExists) -> bool:
+    """Return True when ``g.find_edge`` locates the ``c.caller`` -> ``c.callee`` edge."""
+    edge = g.find_edge(c.caller, c.callee)
+    return edge is not None
+
+
+def _check_module_count(g: Graph, c: ModuleCount) -> bool:
+    """Return True when the graph holds exactly ``c.n`` modules."""
+    actual = len(g.modules)
+    return actual == c.n
+
+
+def _check_module_size_max(g: Graph, c: ModuleSizeMax) -> bool:
+    """Return True when module ``c.module`` holds at most ``c.n`` nodes."""
+    members = g.nodes_in_module(c.module)
+    return len(members) <= c.n
+
+
+def _check_module_responsibility(g: Graph, c: ModuleResponsibility) -> bool:
+    """Return True when module ``c.module`` exists and carries ``c.responsibility``."""
+    module = g.find_module(c.module)
+    if module is None:
+        return False
+    return module.responsibility == c.responsibility
+
+
+def _check_acyclic_imports(g: Graph, _c: AcyclicImports) -> bool:
+    """Return True when ``g.has_module_cycle()`` reports no cycle."""
+    cyclic = g.has_module_cycle()
+    return not cyclic
+
+
+def _check_materializes(g: Graph, _c: Materializes) -> bool:
+    """Return True when the graph materializes and the output passes ``full_check``.
+
+    Any exception raised by ``materialize`` counts as "not satisfied".
+    """
+    # Imported lazily so that callers who don't use this checker don't pay
+    # the cost of pulling the materializer/validator graph.
+    from graphforge.materializer import materialize
+    from graphforge.validator import full_check
+
+    try:
+        rendered = materialize(g)
+    except Exception:
+        return False
+    else:
+        outcome = full_check(rendered)
+        return outcome.ok
+
+
+# ---- dispatch --------------------------------------------------------
+
+
+def check(graph: Graph, constraint: Constraint) -> bool:
+    """Evaluate one constraint against ``graph`` with its per-kind checker.
+
+    Raises ``ValueError`` when ``constraint`` is not an instance of any of
+    the known constraint models.
+    """
+    # Tried top to bottom; the first model the constraint is an instance of wins.
+    routes = (
+        (NodeExists, _check_node_exists),
+        (NodeAbsent, _check_node_absent),
+        (EdgeExists, _check_edge_exists),
+        (ModuleCount, _check_module_count),
+        (ModuleSizeMax, _check_module_size_max),
+        (ModuleResponsibility, _check_module_responsibility),
+        (AcyclicImports, _check_acyclic_imports),
+        (Materializes, _check_materializes),
+    )
+    for model, checker in routes:
+        if isinstance(constraint, model):
+            return checker(graph, constraint)
+    raise ValueError(f"unknown constraint kind: {constraint!r}")
+
+
+def evaluate_all(graph: Graph, constraints: list[Constraint]) -> SatisfactionReport:
+    """Check every constraint in order and file it under satisfied or unsatisfied."""
+    report = SatisfactionReport()
+    for constraint in constraints:
+        target = report.satisfied if check(graph, constraint) else report.unsatisfied
+        target.append(constraint)
+    return report
diff --git a/graphforge/constraints/schema.py b/graphforge/constraints/schema.py
new file mode 100644
index 0000000000000000000000000000000000000000..05fbbd38ce8e8070827582326a9f26d498ac2a4f
--- /dev/null
+++ b/graphforge/constraints/schema.py
@@ -0,0 +1,129 @@
+"""Constraint schemas (tier-0 subset).
+
+Constraints are pydantic discriminated-union members keyed on ``kind``.
+Tier-0 carves out the smallest set sufficient to express a real task and
+exercise the reward engine end-to-end. The remaining vocabulary in
+PROPOSAL.md §2.2 (fan_in_max, dag_depth_max, type_consistency,
+behavioral_test_passes, …) lands on top of this same shape as new
+discriminated members + checker functions.
+
+Each constraint member is a pure data record. Behavior lives in
+:mod:`graphforge.constraints.checker`.
+"""
+
+from __future__ import annotations
+
+from typing import Annotated, Literal, Union
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from graphforge.graph.schema import ResponsibilityTag
+
+# Shared pydantic config: reject unknown fields. Every constraint model in
+# this file assigns it to ``model_config``.
+_cfg = ConfigDict(extra="forbid")
+
+
+# ---- structural ------------------------------------------------------
+
+
+# ``node_exists`` constraint record: names a node by ``name`` and ``module``.
+class NodeExists(BaseModel):
+ model_config = _cfg
+ kind: Literal["node_exists"] = "node_exists"
+ name: str
+ module: str
+
+
+# ``node_absent`` constraint record: names a node by ``name`` and ``module``.
+class NodeAbsent(BaseModel):
+ model_config = _cfg
+ kind: Literal["node_absent"] = "node_absent"
+ name: str
+ module: str
+
+
+# ``edge_exists`` constraint record: names an edge by its two endpoints,
+# both given as qualified node names.
+class EdgeExists(BaseModel):
+ model_config = _cfg
+ kind: Literal["edge_exists"] = "edge_exists"
+ caller: str # qualified
+ callee: str # qualified
+
+
+# ``module_count`` constraint record: ``n`` is a required integer >= 0.
+class ModuleCount(BaseModel):
+ model_config = _cfg
+ kind: Literal["module_count"] = "module_count"
+ n: int = Field(..., ge=0)
+
+
+# ``module_size_max`` constraint record: a ``module`` name plus a required
+# integer bound ``n`` >= 0.
+class ModuleSizeMax(BaseModel):
+ model_config = _cfg
+ kind: Literal["module_size_max"] = "module_size_max"
+ module: str
+ n: int = Field(..., ge=0)
+
+
+# ``module_responsibility`` constraint record: a ``module`` name plus the
+# ``responsibility`` tag it is expected to carry.
+class ModuleResponsibility(BaseModel):
+ model_config = _cfg
+ kind: Literal["module_responsibility"] = "module_responsibility"
+ module: str
+ responsibility: ResponsibilityTag
+
+
+# ``acyclic_imports`` constraint record; it has no fields beyond ``kind``.
+class AcyclicImports(BaseModel):
+ model_config = _cfg
+ kind: Literal["acyclic_imports"] = "acyclic_imports"
+
+
+# ---- behavioral / materialization -----------------------------------
+
+
+# ``materializes`` constraint record; it has no fields beyond ``kind``.
+class Materializes(BaseModel):
+ model_config = _cfg
+ kind: Literal["materializes"] = "materializes"
+
+
+# ---- discriminated union --------------------------------------------
+
+# Discriminated union of every constraint model in this file; the ``kind``
+# field selects the member.
+Constraint = Annotated[
+ Union[
+ NodeExists,
+ NodeAbsent,
+ EdgeExists,
+ ModuleCount,
+ ModuleSizeMax,
+ ModuleResponsibility,
+ AcyclicImports,
+ Materializes,
+ ],
+ Field(discriminator="kind"),
+]
+
+
+# Set of kinds considered "structural" for the reward engine's per-constraint
+# +1 magnitude. The "behavioral" family is reserved for property-test results
+# (BehavioralTestPasses, TODO) which earn the higher +3 magnitude. The
+# ``materializes`` constraint is structural for scoring purposes; the more
+# severe "Materialization fails: -8" penalty in PROPOSAL.md §5.2 is an
+# independent gate driven by the materializer raising or returning parse
+# errors, not by this constraint kind.
+STRUCTURAL_KINDS = {
+ "node_exists",
+ "node_absent",
+ "edge_exists",
+ "module_count",
+ "module_size_max",
+ "module_responsibility",
+ "acyclic_imports",
+ "materializes",
+}
+
+
+# Public names: the ``Constraint`` union, each constraint model, and the
+# ``STRUCTURAL_KINDS`` set.
+__all__ = [
+ "AcyclicImports",
+ "Constraint",
+ "EdgeExists",
+ "Materializes",
+ "ModuleCount",
+ "ModuleResponsibility",
+ "ModuleSizeMax",
+ "NodeAbsent",
+ "NodeExists",
+ "STRUCTURAL_KINDS",
+]
diff --git a/graphforge/graph/__init__.py b/graphforge/graph/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..92a954ba19fa90da3c80d1d6f5df43cb19d6ad07
--- /dev/null
+++ b/graphforge/graph/__init__.py
@@ -0,0 +1,23 @@
+"""Canonical graph schema. See :mod:`graphforge.graph.schema`."""
+
+from graphforge.graph.schema import (
+ ArgMapping,
+ Edge,
+ ErrorPolicy,
+ Graph,
+ Module,
+ Node,
+ Purity,
+ ResponsibilityTag,
+)
+
+# Public names re-exported from ``graphforge.graph.schema``.
+__all__ = [
+ "ArgMapping",
+ "Edge",
+ "ErrorPolicy",
+ "Graph",
+ "Module",
+ "Node",
+ "Purity",
+ "ResponsibilityTag",
+]
diff --git a/graphforge/graph/schema.py b/graphforge/graph/schema.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2bc176073c477a5857979f96573a7b2a9165482
--- /dev/null
+++ b/graphforge/graph/schema.py
@@ -0,0 +1,308 @@
+"""Canonical graph schema.
+
+The graph is the single source of truth for an in-progress program. Every
+materialization is a deterministic function of (graph, template library).
+
+Wire format mirrors the JSON shape documented in PROPOSAL.md §3.1, exactly:
+
+ {
+ "modules": [{"name": ..., "responsibility": ...}, ...],
+ "nodes": [{"name": ..., "module": ..., "signature": ...,
+ "body_template": ..., "body_template_args": {...},
+ "purity": ..., "error_policy": ..., "decl_order": ...}, ...],
+ "edges": [{"caller": "<module>.<name>",
+ "callee": "<module>.<name>",
+ "arg_mapping": [{"caller_arg": ..., "callee_param": ...}, ...]}, ...]
+ }
+
+This module enforces shape and well-formedness only. Higher-order invariants
+(unique names, edge endpoints exist, no cycles, type-flow compatibility) are
+enforced by the action dispatcher and the type engine, not the schema, so
+that callers can build partial / invalid graphs and inspect why they fail.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+from typing import Literal, Optional
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+# ----------------------------------------------------------------------
+# Enumerated tags
+# ----------------------------------------------------------------------
+
+# Responsibility tags constrain which kinds of nodes a module is allowed to
+# host. The canonical set; new tags are added intentionally because tasks
+# encode constraints against this vocabulary.
+ResponsibilityTag = Literal[
+ "io",
+ "validation",
+ "transform",
+ "orchestration",
+ "storage",
+ "formatting",
+ "lookup",
+ "policy",
+ "logging",
+ "computation",
+]
+
+# Whether a function is declared pure or impure.
+Purity = Literal["pure", "impure"]
+
+# How a function handles errors in its body. "guard" means it includes a
+# guard / try-except. "propagate" means it deliberately lets errors flow up.
+# "none" is the default — no claim either way.
+ErrorPolicy = Literal["guard", "propagate", "none"]
+
+
+# ----------------------------------------------------------------------
+# Atomic records
+# ----------------------------------------------------------------------
+
+
+class Module(BaseModel):
+    """A declared module — one Python file at materialization time.
+
+    ``name`` must be a non-empty Python identifier that is not a reserved
+    keyword and does not start with an underscore.
+    """
+
+    model_config = ConfigDict(extra="forbid", frozen=False)
+
+    name: str = Field(..., min_length=1)
+    responsibility: ResponsibilityTag
+
+    @field_validator("name")
+    @classmethod
+    def _name_is_identifier(cls, v: str) -> str:
+        """Reject names that could not be used as an importable module name."""
+        # Function-scope import keeps this block self-contained.
+        import keyword
+
+        if not v.isidentifier():
+            raise ValueError(f"module name {v!r} is not a Python identifier")
+        # ``str.isidentifier`` accepts reserved words such as ``class``; a
+        # module with such a name could never be imported.
+        if keyword.iskeyword(v):
+            raise ValueError(f"module name {v!r} is a reserved Python keyword")
+        if v.startswith("_"):
+            raise ValueError(f"module name {v!r} must not start with an underscore")
+        return v
+
+
+class Node(BaseModel):
+    """A declared function. ``body_template`` may be unset until attach_body.
+
+    ``name`` must be a Python identifier that is not a reserved keyword.
+    ``signature`` only has to start with ``(`` and contain ``->`` here.
+    """
+
+    model_config = ConfigDict(extra="forbid", frozen=False)
+
+    name: str = Field(..., min_length=1)
+    module: str = Field(..., min_length=1)
+    signature: str = Field(..., min_length=2)  # e.g., "(x: int) -> bool"
+    body_template: Optional[str] = None
+    body_template_args: dict[str, object] = Field(default_factory=dict)
+    purity: Purity = "impure"
+    error_policy: ErrorPolicy = "none"
+    decl_order: int = 0
+
+    @field_validator("name")
+    @classmethod
+    def _name_is_identifier(cls, v: str) -> str:
+        """Reject names that could not be used as a function name."""
+        # Function-scope import keeps this block self-contained.
+        import keyword
+
+        if not v.isidentifier():
+            raise ValueError(f"node name {v!r} is not a Python identifier")
+        # ``str.isidentifier`` accepts reserved words such as ``def``; a
+        # function with such a name could never be defined.
+        if keyword.iskeyword(v):
+            raise ValueError(f"node name {v!r} is a reserved Python keyword")
+        return v
+
+    @field_validator("signature")
+    @classmethod
+    def _signature_shape(cls, v: str) -> str:
+        """Require a leading ``(`` and a ``->`` somewhere in the signature."""
+        # Cheap surface check; the type engine does the real parse.
+        if not v.lstrip().startswith("("):
+            raise ValueError(f"signature must start with '(': got {v!r}")
+        if "->" not in v:
+            raise ValueError(f"signature must include '->' return arrow: got {v!r}")
+        return v
+
+    # Convenience -----------------------------------------------------
+
+    @property
+    def qualified_name(self) -> str:
+        """``<module>.<name>`` — the canonical address used on edges."""
+        return f"{self.module}.{self.name}"
+
+
+class ArgMapping(BaseModel):
+ """How an edge wires a caller's argument to a callee's parameter."""
+
+ model_config = ConfigDict(extra="forbid", frozen=False)
+
+ # Both fields are required and must be non-empty strings (min_length=1).
+ caller_arg: str = Field(..., min_length=1)
+ callee_param: str = Field(..., min_length=1)
+
+
+class Edge(BaseModel):
+    """A CALLS edge. Endpoints are qualified node names ``<module>.<name>``."""
+
+    model_config = ConfigDict(extra="forbid", frozen=False)
+
+    caller: str = Field(..., min_length=3)
+    callee: str = Field(..., min_length=3)
+    arg_mapping: list[ArgMapping] = Field(default_factory=list)
+
+    @field_validator("caller", "callee")
+    @classmethod
+    def _qualified(cls, v: str) -> str:
+        """Require exactly one dot separating two Python identifiers."""
+        if v.count(".") != 1:
+            # Spell out the expected shape so the message is actionable.
+            raise ValueError(
+                f"edge endpoint {v!r} is not qualified (expected '<module>.<name>')"
+            )
+        mod, name = v.split(".")
+        if not mod.isidentifier() or not name.isidentifier():
+            raise ValueError(f"edge endpoint {v!r} has non-identifier parts")
+        return v
+
+
+# ----------------------------------------------------------------------
+# Graph
+# ----------------------------------------------------------------------
+
+
+class Graph(BaseModel):
+    """Canonical graph state: modules, nodes and CALLS edges.
+
+    Instances are mutable; ``snapshot`` produces a deep copy.
+    """
+
+    model_config = ConfigDict(extra="forbid", frozen=False)
+
+    modules: list[Module] = Field(default_factory=list)
+    nodes: list[Node] = Field(default_factory=list)
+    edges: list[Edge] = Field(default_factory=list)
+
+    # ----- lookup ----------------------------------------------------
+
+    def find_module(self, name: str) -> Optional[Module]:
+        """Return the first module called ``name``, or None."""
+        return next((mod for mod in self.modules if mod.name == name), None)
+
+    def find_node(self, name: str, module: str) -> Optional[Node]:
+        """Return the first node matching both ``name`` and ``module``, or None."""
+        matches = (
+            node for node in self.nodes if node.name == name and node.module == module
+        )
+        return next(matches, None)
+
+    def find_node_qualified(self, qualified: str) -> Optional[Node]:
+        """Look a node up by a name containing exactly one dot; None otherwise."""
+        parts = qualified.split(".")
+        if len(parts) != 2:
+            return None
+        module_name, node_name = parts
+        return self.find_node(node_name, module_name)
+
+    def find_edge(self, caller: str, callee: str) -> Optional[Edge]:
+        """Return the first edge from ``caller`` to ``callee``, or None."""
+        matches = (
+            edge for edge in self.edges if edge.caller == caller and edge.callee == callee
+        )
+        return next(matches, None)
+
+    def nodes_in_module(self, module: str) -> list[Node]:
+        """Return every node whose ``module`` equals ``module``, in list order."""
+        members: list[Node] = []
+        for node in self.nodes:
+            if node.module == module:
+                members.append(node)
+        return members
+
+    def callers_of(self, qualified: str) -> list[str]:
+        """Return the caller endpoint of every edge whose callee is ``qualified``."""
+        incoming: list[str] = []
+        for edge in self.edges:
+            if edge.callee == qualified:
+                incoming.append(edge.caller)
+        return incoming
+
+    def callees_of(self, qualified: str) -> list[str]:
+        """Return the callee endpoint of every edge whose caller is ``qualified``."""
+        outgoing: list[str] = []
+        for edge in self.edges:
+            if edge.caller == qualified:
+                outgoing.append(edge.callee)
+        return outgoing
+
+    def fan_in(self, qualified: str) -> int:
+        """Number of edges that end at ``qualified``."""
+        incoming = self.callers_of(qualified)
+        return len(incoming)
+
+    def fan_out(self, qualified: str) -> int:
+        """Number of edges that start at ``qualified``."""
+        outgoing = self.callees_of(qualified)
+        return len(outgoing)
+
+    # ----- structural derivations ------------------------------------
+
+    def import_edges(self) -> set[tuple[str, str]]:
+        """Set of (caller_module, callee_module) pairs from cross-module edges."""
+        # The module is whatever precedes the first dot of each endpoint.
+        module_pairs = (
+            (edge.caller.partition(".")[0], edge.callee.partition(".")[0])
+            for edge in self.edges
+        )
+        return {(src, dst) for src, dst in module_pairs if src != dst}
+
+    def has_module_cycle(self) -> bool:
+        """True iff the cross-module import graph contains a directed cycle."""
+        adjacency: dict[str, set[str]] = {mod.name: set() for mod in self.modules}
+        for src, dst in self.import_edges():
+            adjacency.setdefault(src, set()).add(dst)
+            adjacency.setdefault(dst, set())
+
+        in_progress: set[str] = set()  # on the current DFS path
+        finished: set[str] = set()  # fully explored, known cycle-free
+
+        def reaches_cycle(start: str) -> bool:
+            in_progress.add(start)
+            for nxt in adjacency.get(start, ()):
+                if nxt in in_progress:
+                    return True
+                if nxt not in finished and reaches_cycle(nxt):
+                    return True
+            in_progress.discard(start)
+            finished.add(start)
+            return False
+
+        for name in adjacency:
+            unvisited = name not in in_progress and name not in finished
+            if unvisited and reaches_cycle(name):
+                return True
+        return False
+
+    def call_graph_depth(self) -> int:
+        """Longest path length (in edges) in the function call DAG.
+
+        If the call graph is cyclic, returns the special value -1 (callers
+        should treat this as an invariant violation).
+        """
+        successors: dict[str, list[str]] = {
+            node.qualified_name: [] for node in self.nodes
+        }
+        for edge in self.edges:
+            successors.setdefault(edge.caller, []).append(edge.callee)
+            successors.setdefault(edge.callee, [])
+
+        CYCLE = -1
+        VISITING = -2  # marker for a vertex that is still on the DFS path
+        depth_of: dict[str, int] = {}
+
+        def longest_from(start: str) -> int:
+            known = depth_of.get(start)
+            if known is not None:
+                return CYCLE if known == VISITING else known
+            depth_of[start] = VISITING
+            longest = 0
+            for nxt in successors.get(start, ()):
+                below = longest_from(nxt)
+                if below == CYCLE:
+                    return CYCLE
+                longest = max(longest, below + 1)
+            depth_of[start] = longest
+            return longest
+
+        depths = [longest_from(name) for name in successors]
+        if CYCLE in depths:
+            return CYCLE
+        return max(depths, default=0)
+
+    # ----- copying / hashing -----------------------------------------
+
+    def snapshot(self) -> "Graph":
+        """Deep copy. Used by the dispatcher for atomic action rollback."""
+        clone = self.model_copy(deep=True)
+        return clone
+
+    def structural_hash(self) -> str:
+        """Stable SHA-256 over a canonical JSON projection.
+
+        Modules are sorted by name, nodes by (module, name) and edges by
+        (caller, callee) before hashing, so list order along those keys does
+        not matter. Every dumped field, including ``decl_order``, feeds the
+        hash.
+        """
+        module_rows = sorted(
+            (mod.model_dump() for mod in self.modules),
+            key=lambda row: row["name"],
+        )
+        node_rows = sorted(
+            (node.model_dump() for node in self.nodes),
+            key=lambda row: (row["module"], row["name"]),
+        )
+        edge_rows = sorted(
+            (edge.model_dump() for edge in self.edges),
+            key=lambda row: (row["caller"], row["callee"]),
+        )
+        canon: dict[str, object] = {
+            "modules": module_rows,
+            "nodes": node_rows,
+            "edges": edge_rows,
+        }
+        encoded = json.dumps(canon, sort_keys=True, default=str).encode("utf-8")
+        return hashlib.sha256(encoded).hexdigest()
+
+    # ----- factories -------------------------------------------------
+
+    @classmethod
+    def empty(cls) -> "Graph":
+        """Return a graph with no modules, nodes or edges."""
+        return cls()
diff --git a/graphforge/knowledge_graph.py b/graphforge/knowledge_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcf3de8b9fccc3b7160d650c7264aed5fb4f6cef
--- /dev/null
+++ b/graphforge/knowledge_graph.py
@@ -0,0 +1,233 @@
+"""In-memory Knowledge Graph for a Python repository.
+
+Mirrors the structure of a Neo4j property graph but lives in RAM:
+
+Nodes
+-----
+  repo      — the repository root
+  package   — a directory containing __init__.py
+  module    — a .py file
+  class     — a class definition
+  function  — a top-level or nested function / async function
+  method    — a method inside a class
+
+Edges (directed)
+-----------------
+  contains  — parent → child (repo→package, package→module, module→class, …)
+  calls     — function/method → function/method (same-file same-package)
+  imports   — module → module (from x import y / import x)
+  inherits  — class → class
+
+Each node stores the actual source lines so the agent can read/edit them.
+"""
+
+from __future__ import annotations
+
+import textwrap
+from dataclasses import dataclass, field
+from typing import Iterable
+
+
+# ── node & edge ───────────────────────────────────────────────────────────────
+
+@dataclass
+class KGNode:
+ node_id: str # unique key, e.g. "function:validators.py:validate_title"
+ node_type: str # module | class | function | method | package | repo
+ name: str # short identifier
+ file_path: str # relative path from repo root (empty for repo/package)
+ line_start: int = 0
+ line_end: int = 0
+ source: str = "" # full source text of this node (incl. def line)
+ docstring: str = ""
+ metadata: dict = field(default_factory=dict)
+
+ def brief(self) -> str:
+ """One-line summary for graph overviews."""
+ loc = f" [{self.file_path}:{self.line_start}]" if self.file_path else ""
+ return f"[{self.node_type.upper():<8}] {self.node_id}{loc}"
+
+
+@dataclass
+class KGEdge:
+ edge_type: str # contains | calls | imports | inherits
+ source_id: str
+ target_id: str
+
+
+# ── knowledge graph ───────────────────────────────────────────────────────────
+
+class KnowledgeGraph:
+    """Property graph for a repository.
+
+    Supports rich queries used by the agent and reward checker. Neighbour
+    queries return nodes in edge insertion order so text views are reproducible.
+    """
+
+    def __init__(self, repo_path: str) -> None:
+        self.repo_path = repo_path
+        self._nodes: dict[str, KGNode] = {}
+        self._edges: list[KGEdge] = []
+
+    # ── mutation ──────────────────────────────────────────────────────────────
+
+    def add_node(self, node: KGNode) -> None:
+        self._nodes[node.node_id] = node
+
+    def add_edge(self, edge: KGEdge) -> None:
+        self._edges.append(edge)
+
+    def update_node_source(self, node_id: str, new_source: str) -> None:
+        """Replace a node's source and recount lines (KeyError on unknown id)."""
+        node = self._nodes[node_id]
+        node.source = new_source
+        node.line_end = node.line_start + len(new_source.splitlines()) - 1
+
+    def insert_node(self, parent_id: str, new_node: KGNode) -> None:
+        """Add new_node to the graph and wire a contains edge from parent."""
+        self._nodes[new_node.node_id] = new_node
+        self._edges.append(KGEdge("contains", parent_id, new_node.node_id))
+
+    def remove_node(self, node_id: str) -> None:
+        """Drop the node (if present) and every edge that touches it."""
+        self._nodes.pop(node_id, None)
+        self._edges = [e for e in self._edges if node_id not in (e.source_id, e.target_id)]
+
+    # ── queries ───────────────────────────────────────────────────────────────
+
+    def get_node(self, node_id: str) -> KGNode | None:
+        return self._nodes.get(node_id)
+
+    def all_nodes(self, node_type: str | None = None) -> list[KGNode]:
+        """Nodes in ``_nodes`` order, optionally restricted to ``node_type``."""
+        nodes = self._nodes.values()
+        return [n for n in nodes if not node_type or n.node_type == node_type]
+
+    def _linked(self, node_id: str, edge_type: str, incoming: bool = False) -> list[KGNode]:
+        """Known nodes joined to ``node_id`` by ``edge_type`` edges (outgoing by default)."""
+        near, far = ("target_id", "source_id") if incoming else ("source_id", "target_id")
+        # dict.fromkeys dedupes in first-seen order; set order would vary with hash seeds.
+        hits = (getattr(e, far) for e in self._edges
+                if e.edge_type == edge_type and getattr(e, near) == node_id)
+        return [self._nodes[i] for i in dict.fromkeys(hits) if i in self._nodes]
+
+    def children_of(self, node_id: str) -> list[KGNode]:
+        return self._linked(node_id, "contains")
+
+    def parent_of(self, node_id: str) -> KGNode | None:
+        """Source of the first ``contains`` edge into ``node_id``, or None."""
+        for e in self._edges:
+            if e.target_id == node_id and e.edge_type == "contains":
+                return self._nodes.get(e.source_id)
+        return None
+
+    def callers_of(self, node_id: str) -> list[KGNode]:
+        return self._linked(node_id, "calls", incoming=True)
+
+    def callees_of(self, node_id: str) -> list[KGNode]:
+        return self._linked(node_id, "calls")
+
+    def imports_of(self, module_id: str) -> list[KGNode]:
+        return self._linked(module_id, "imports")
+
+    def search(self, keywords: str, node_type: str | None = None) -> list[KGNode]:
+        """Case-insensitive keyword search over node names, docstrings, and source.
+
+        A node matches only if every whitespace-separated keyword is a substring.
+        """
+        kws = keywords.lower().split()
+        results: list[KGNode] = []
+        for node in self._nodes.values():
+            if node_type and node.node_type != node_type:
+                continue
+            haystack = f"{node.name} {node.docstring} {node.source}".lower()
+            if all(kw in haystack for kw in kws):
+                results.append(node)
+        return results
+
+    def subgraph(self, root_id: str, depth: int = 2) -> list[KGNode]:
+        """BFS from root_id up to depth hops; returns all encountered nodes."""
+        # Edges of every type are followed source -> target, in discovery order.
+        succ: dict[str, list[str]] = {}
+        for e in self._edges:
+            succ.setdefault(e.source_id, []).append(e.target_id)
+        seen: dict[str, None] = {root_id: None}  # dict used as an ordered set
+        frontier = [root_id]
+        for _ in range(depth):
+            reached = [t for nid in frontier for t in succ.get(nid, ())]
+            frontier = []
+            for tid in reached:
+                if tid not in seen:
+                    seen[tid] = None
+                    frontier.append(tid)
+        return [self._nodes[nid] for nid in seen if nid in self._nodes]
+
+    # ── text representations ──────────────────────────────────────────────────
+
+    def overview(self, max_chars: int = 3000) -> str:
+        """Compact multi-line overview of the repo graph, capped to avoid LLM context overflow.
+
+        Modules are listed by ``file_path``. Once the text exceeds ``max_chars``
+        the listing stops and a trailer reports how many modules were skipped.
+        """
+        lines: list[str] = [f"## Repository: {self.repo_path}", ""]
+        modules = sorted(self.all_nodes("module"), key=lambda n: n.file_path)
+        all_fns = self.all_nodes("function")
+        all_cls = self.all_nodes("class")
+        lines.append(f" {len(modules)} modules ยท {len(all_fns)} functions ยท {len(all_cls)} classes")
+        lines.append("")
+
+        for shown, mod in enumerate(modules, start=1):
+            children = self.children_of(mod.node_id)
+            funcs = [c for c in children if c.node_type in ("function", "method")]
+            classes = [c for c in children if c.node_type == "class"]
+            summary = []
+            if classes:
+                summary.append(f"{len(classes)} class{'es' if len(classes)>1 else ''}")
+            if funcs:
+                summary.append(f"{len(funcs)} fn{'s' if len(funcs)>1 else ''}")
+            lines.append(f" [{mod.file_path}] ({', '.join(summary) or 'empty'})")
+            for cls in sorted(classes, key=lambda n: n.name):
+                methods = [c for c in self.children_of(cls.node_id) if c.node_type == "method"]
+                mnames = ", ".join(m.name for m in sorted(methods, key=lambda n: n.line_start))
+                lines.append(f" class {cls.name} โ {mnames or '(no methods)'}")
+                lines.append(f" node_id: {cls.node_id}")
+            for fn in sorted(funcs, key=lambda n: n.line_start):
+                lines.append(f" def {fn.name}{fn.metadata.get('signature', '')}")
+                lines.append(f" node_id: {fn.node_id}")
+
+            # Stop expanding if we are already near the character cap. ``shown`` is
+            # the position in the sorted listing, so the count below is exact.
+            if len("\n".join(lines)) > max_chars:
+                remaining = len(modules) - shown
+                if remaining:
+                    lines.append(f"\n ... [{remaining} more modules not shown โ use query() to explore]")
+                break
+        return "\n".join(lines)
+
+    def node_detail(self, node_id: str) -> str:
+        """Full inspection view of a single node."""
+        node = self._nodes.get(node_id)
+        if node is None:
+            return f"[ERROR] node_id {node_id!r} not found in graph."
+        lines = [
+            f"## Node: {node.node_id}",
+            f"type : {node.node_type}",
+            f"file : {node.file_path} (lines {node.line_start}โ{node.line_end})",
+        ]
+        if node.docstring:
+            lines.append(f"docstring: {node.docstring[:120]}")
+        for label, linked in (
+            ("called by: ", self.callers_of(node_id)),
+            ("calls : ", self.callees_of(node_id)),
+            ("contains : ", self.children_of(node_id)),
+        ):
+            if linked:
+                lines.append(label + ", ".join(n.name for n in linked))
+        lines += ["", "### Source", "```python", node.source or "(no source)", "```"]
+        return "\n".join(lines)
+
+    def snapshot(self) -> "KnowledgeGraph":
+        """Deep copy — used to preserve state before mutations."""
+        import copy
+        return copy.deepcopy(self)
diff --git a/graphforge/materializer/__init__.py b/graphforge/materializer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d568b7de75e53c047058946a163cf90d6b40d80
--- /dev/null
+++ b/graphforge/materializer/__init__.py
@@ -0,0 +1,20 @@
+"""Graph -> Python source projection.
+
+Responsibilities (PROPOSAL.md §3.3):
+
+ * Emit one ``.py`` per declared module.
+ * Emit functions in :attr:`Node.decl_order` order.
+  * Compute ``from <module> import <name>`` lines from cross-module edges,
+    deduplicated and sorted.
+ * Expand body templates with the node's ``body_template_args`` to produce
+ a runnable function body.
+
+The materializer is total over well-formed graphs: every dispatcher-accepted
+graph must produce parseable source. Round-trip correctness (the produced
+source re-parses to the same graph) is enforced by tests in
+:mod:`graphforge.parser` (TODO).
+"""
+
+from graphforge.materializer.materialize import materialize
+
+__all__ = ["materialize"]
diff --git a/graphforge/materializer/codegen.py b/graphforge/materializer/codegen.py
new file mode 100644
index 0000000000000000000000000000000000000000..b83e9b64b61cfa6751fbe8487cd8b2b479c25102
--- /dev/null
+++ b/graphforge/materializer/codegen.py
@@ -0,0 +1,169 @@
+"""Per-template body codegen.
+
+Each public ``render_<template>`` function takes the host node, its outgoing
+edges in deterministic order, and returns a multi-line indented body suitable
+for inserting after a ``def`` line. Bodies use only stdlib and never reference
+unresolved names (the orchestrator ensures imports + pattern constants are
+in scope).
+
+Codegen is intentionally simple: the goal is *runnable, readable* Python that
+respects template semantics, not optimal idiomatic code.
+"""
+
+from __future__ import annotations
+
+from graphforge.graph.schema import Edge, Graph, Node
+from graphforge.materializer import patterns
+
+INDENT = " "
+
+
+# ---- helpers ---------------------------------------------------------
+
+
+def _kwargs_for(edge: Edge) -> str:
+ """Render an edge's arg_mapping as ``param=arg, param2=arg2``."""
+ return ", ".join(f"{m.callee_param}={m.caller_arg}" for m in edge.arg_mapping)
+
+
+def _callee_name(edge: Edge) -> str:
+    """The local symbol used at the call site (just the function name).
+
+    The orchestrator emits ``from <module> import <name>`` for cross-module
+    callees, so the call site can always use the bare name.
+    """
+    return edge.callee.split(".", 1)[1]
+
+
+def _indent(lines: list[str]) -> str:
+ return "\n".join(INDENT + line for line in lines)
+
+
+# ---- per-template renderers -----------------------------------------
+
+
+def render_passthrough_call(node: Node, out_edges: list[Edge], _g: Graph) -> str:
+ if len(out_edges) != 1:
+ raise ValueError(
+ f"passthrough_call on {node.qualified_name} requires 1 out-edge, "
+ f"got {len(out_edges)}"
+ )
+ e = out_edges[0]
+ return _indent([f"return {_callee_name(e)}({_kwargs_for(e)})"])
+
+
+def render_sequential_calls(node: Node, out_edges: list[Edge], _g: Graph) -> str:
+ if not out_edges:
+ raise ValueError(
+ f"sequential_calls on {node.qualified_name} requires >=1 out-edge"
+ )
+ lines: list[str] = []
+ for e in out_edges[:-1]:
+ lines.append(f"{_callee_name(e)}({_kwargs_for(e)})")
+ last = out_edges[-1]
+ lines.append(f"return {_callee_name(last)}({_kwargs_for(last)})")
+ return _indent(lines)
+
+
+def render_validate_with_regex(node: Node, out_edges: list[Edge], _g: Graph) -> str:
+    """Emit ``return re.fullmatch(<constant>, <first param>) is not None``."""
+    if out_edges:
+        raise ValueError(f"validate_with_regex on {node.qualified_name} must have 0 out-edges")
+    pattern_name = str(node.body_template_args.get("pattern", ""))
+    if patterns.get_pattern(pattern_name) is None:
+        raise ValueError(
+            f"unknown regex pattern {pattern_name!r} on {node.qualified_name}; "
+            f"known: {patterns.known_patterns()}"
+        )
+    constant = patterns.constant_name(pattern_name)
+    # Tolerant of the parameter's name: the first one is the validated string.
+    from graphforge.actions.signature import parse_signature
+    parsed = parse_signature(node.signature)
+    if not parsed.parameters:
+        raise ValueError(
+            f"validate_with_regex on {node.qualified_name} requires "
+            f"at least one parameter"
+        )
+    arg = parsed.parameters[0].name
+    # fullmatch, not match: match only anchors the start, so an input such as
+    # "#ff0000junk" would otherwise validate against HEXCOLOR.
+    return _indent([f"return re.fullmatch({constant}, {arg}) is not None"])
+
+
+def render_early_return_guard(node: Node, out_edges: list[Edge], _g: Graph) -> str:
+ if len(out_edges) != 1:
+ raise ValueError(
+ f"early_return_guard on {node.qualified_name} requires 1 out-edge"
+ )
+ condition = str(node.body_template_args.get("condition", "True"))
+ e = out_edges[0]
+ return _indent(
+ [
+ f"if not ({condition}):",
+ f"{INDENT}return None",
+ f"return {_callee_name(e)}({_kwargs_for(e)})",
+ ]
+ )
+
+
+def render_try_call_with_fallback(node: Node, out_edges: list[Edge], _g: Graph) -> str:
+ if len(out_edges) != 2:
+ raise ValueError(
+ f"try_call_with_fallback on {node.qualified_name} requires "
+ f"exactly 2 out-edges (primary, fallback)"
+ )
+ primary, fallback = out_edges
+ return _indent(
+ [
+ "try:",
+ f"{INDENT}return {_callee_name(primary)}({_kwargs_for(primary)})",
+ "except Exception:",
+ f"{INDENT}return {_callee_name(fallback)}({_kwargs_for(fallback)})",
+ ]
+ )
+
+
+def render_leaf_constant(node: Node, out_edges: list[Edge], _g: Graph) -> str:
+ if out_edges:
+ raise ValueError(
+ f"leaf_constant on {node.qualified_name} must have 0 out-edges"
+ )
+ if "value" not in node.body_template_args:
+ raise ValueError(
+ f"leaf_constant on {node.qualified_name} requires args.value"
+ )
+ value = node.body_template_args["value"]
+ return _indent([f"return {value!r}"])
+
+
+# ---- registry --------------------------------------------------------
+
+
+_RENDERERS: dict[str, object] = {
+ "passthrough_call": render_passthrough_call,
+ "sequential_calls": render_sequential_calls,
+ "validate_with_regex": render_validate_with_regex,
+ "early_return_guard": render_early_return_guard,
+ "try_call_with_fallback": render_try_call_with_fallback,
+ "leaf_constant": render_leaf_constant,
+}
+
+
+def render_body(node: Node, out_edges: list[Edge], graph: Graph) -> str:
+ """Render the body for ``node`` based on its attached body template."""
+ if node.body_template is None:
+ # No body attached yet โ emit a placeholder so the file still parses.
+ return _indent(['raise NotImplementedError("body not attached")'])
+ fn = _RENDERERS.get(node.body_template)
+ if fn is None:
+ raise ValueError(
+ f"no codegen for template {node.body_template!r} on {node.qualified_name}"
+ )
+ return fn(node, out_edges, graph) # type: ignore[operator]
+
+
+def template_imports(template: str | None) -> set[str]:
+ """Stdlib imports a template needs, beyond cross-module function imports."""
+ if template == "validate_with_regex":
+ return {"re"}
+ return set()
diff --git a/graphforge/materializer/materialize.py b/graphforge/materializer/materialize.py
new file mode 100644
index 0000000000000000000000000000000000000000..c827fcfa0a16f68d17d5da4a8f7553b8e2c5b917
--- /dev/null
+++ b/graphforge/materializer/materialize.py
@@ -0,0 +1,134 @@
+"""Materialize a :class:`Graph` into a dict of ``{filename: source}``.
+
+Determinism guarantees:
+
+  * One file per module, named ``<module>.py``.
+  * Within a file, functions emitted in :attr:`Node.decl_order`.
+  * Imports sorted: stdlib first (alpha), then ``from <module> import <name>``
+    (alpha by module, alpha by name).
+ * Pattern constants emitted only if used, in alpha order.
+ * Out-edges of a node iterated in insertion order, which matters for
+ ``sequential_calls`` and ``try_call_with_fallback`` semantics.
+
+The orchestrator is a pure function: same graph in, same source out.
+"""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from typing import Iterable
+
+from graphforge.graph.schema import Edge, Graph, Node
+from graphforge.materializer import codegen, patterns
+
+HEADER = '"""Auto-generated by graphforge.materializer. Do not edit by hand."""\n'
+FUTURE = "from __future__ import annotations\n"
+
+
+# ---- helpers ---------------------------------------------------------
+
+
+def _out_edges_in_order(graph: Graph, qualified: str) -> list[Edge]:
+ """Out-edges of ``qualified`` in insertion order."""
+ return [e for e in graph.edges if e.caller == qualified]
+
+
+def _nodes_by_module(graph: Graph) -> dict[str, list[Node]]:
+ """Map module-name -> nodes in decl_order."""
+ by_mod: dict[str, list[Node]] = defaultdict(list)
+ for n in graph.nodes:
+ by_mod[n.module].append(n)
+ for ns in by_mod.values():
+ ns.sort(key=lambda n: (n.decl_order, n.name))
+ return by_mod
+
+
+def _cross_module_imports(graph: Graph, module: str) -> list[tuple[str, str]]:
+    """Sorted, deduplicated ``(callee_module, callee_name)`` pairs ``module`` must import.
+
+    A pair is produced for each edge whose caller lives in ``module`` and whose
+    callee lives in a different module; the module is the text before the first dot.
+    """
+    targets = (
+        tuple(e.callee.split(".", 1))
+        for e in graph.edges if e.caller.split(".", 1)[0] == module
+    )
+    return sorted({(mod, name) for mod, name in targets if mod != module})
+
+
+def _stdlib_imports_for(nodes: Iterable[Node]) -> list[str]:
+    """Sorted union of the stdlib modules required by the templates of ``nodes``."""
+    per_node = (codegen.template_imports(n.body_template) for n in nodes)
+    merged: set[str] = set()
+    merged.update(*per_node)
+    return sorted(merged)
+
+
+def _patterns_used_by(nodes: Iterable[Node]) -> list[str]:
+ """Named patterns referenced by validate_with_regex nodes in this module."""
+ used: set[str] = set()
+ for n in nodes:
+ if n.body_template == "validate_with_regex":
+ name = str(n.body_template_args.get("pattern", ""))
+ if patterns.get_pattern(name) is not None:
+ used.add(name)
+ return sorted(used)
+
+
+# ---- core ------------------------------------------------------------
+
+
+def materialize(graph: Graph) -> dict[str, str]:
+ """Project ``graph`` to a ``{filename: source}`` map.
+
+ Modules with zero nodes are still emitted as empty files (just header +
+ future import) so that downstream import-resolution sees them.
+ """
+ by_mod = _nodes_by_module(graph)
+ files: dict[str, str] = {}
+ for module in graph.modules:
+ nodes = by_mod.get(module.name, [])
+ files[f"{module.name}.py"] = _render_module(graph, module.name, nodes)
+ return files
+
+
+def _render_module(graph: Graph, module_name: str, nodes: list[Node]) -> str:
+    """Render the full source text of one module.
+
+    Layout: header and ``__future__`` import, stdlib imports, cross-module
+    ``from ... import ...`` lines, pattern constants, then one ``def`` per
+    entry of ``nodes`` in the given order. The result ends in a single newline.
+    """
+    stdlib = _stdlib_imports_for(nodes)
+    cross = _cross_module_imports(graph, module_name)
+    used_patterns = _patterns_used_by(nodes)
+
+    parts: list[str] = [HEADER, FUTURE, "\n"]
+
+    # Stdlib imports first, then cross-module function imports.
+    parts.extend(f"import {imp}\n" for imp in stdlib)
+    parts.extend(f"from {mod} import {name}\n" for mod, name in cross)
+    if any(stdlib) or cross:
+        parts.append("\n")
+
+    # Pattern constants used in this module. We emit a plain string literal
+    # (not a raw-string-prefixed one) because ``repr()`` already produces a
+    # valid Python string literal — wrapping it in ``r"..."`` would double
+    # the backslashes and break regex metacharacters like ``\s`` and ``\d``.
+    for pattern in used_patterns:
+        constant = patterns.constant_name(pattern)
+        parts.append(f"{constant} = {patterns.get_pattern(pattern)!r}\n")
+    if used_patterns:
+        parts.append("\n")
+
+    # Functions, with an extra newline between consecutive definitions.
+    last = len(nodes) - 1
+    for position, node in enumerate(nodes):
+        edges = _out_edges_in_order(graph, node.qualified_name)
+        body = codegen.render_body(node, edges, graph)
+        parts.append(f"def {node.name}{node.signature}:\n{body}\n")
+        if position != last:
+            parts.append("\n")
+
+    # Ensure exactly one trailing newline.
+    return "".join(parts).rstrip("\n") + "\n"
diff --git a/graphforge/materializer/patterns.py b/graphforge/materializer/patterns.py
new file mode 100644
index 0000000000000000000000000000000000000000..d541de30b6de46e954f39d660b5e8854b8211fea
--- /dev/null
+++ b/graphforge/materializer/patterns.py
@@ -0,0 +1,34 @@
+"""Named regex patterns for ``validate_with_regex`` template.
+
+Patterns are referenced by name in the graph (e.g. ``args={"pattern": "EMAIL"}``)
+and resolved here at materialization time. The registry keeps task definitions
+domain-agnostic — a task constraint can name a pattern without leaking the
+regex itself into the graph schema.
+
+Add new patterns sparingly; every name here becomes part of the constraint
+vocabulary that tasks can use.
+"""
+
+from __future__ import annotations
+
+# name -> regex source string
+_PATTERNS: dict[str, str] = {
+ "EMAIL": r"[^@\s]+@[^@\s]+\.[^@\s]+",
+ "HEXCOLOR": r"#[0-9a-fA-F]{6}",
+ "PHONE": r"\+?\d{10,15}",
+ "ALPHANUM": r"[A-Za-z0-9]+",
+ "URL": r"https?://[^\s]+",
+}
+
+
+def known_patterns() -> list[str]:
+ return sorted(_PATTERNS.keys())
+
+
+def get_pattern(name: str) -> str | None:
+ return _PATTERNS.get(name)
+
+
+def constant_name(name: str) -> str:
+ """Module-level constant name we emit for a given pattern name."""
+ return f"_PATTERN_{name}"
diff --git a/graphforge/parser/__init__.py b/graphforge/parser/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a2d52bb37c14d5d30e6fe18298d4f0ac9373ca3
--- /dev/null
+++ b/graphforge/parser/__init__.py
@@ -0,0 +1,27 @@
+"""Round-trip parser: Python source -> Graph.
+
+Responsibilities (PROPOSAL.md §3.4):
+
+ * Walk an AST per module file.
+ * Recover function declarations as :class:`Node` objects.
+ * Recover ``from x import y`` lines as cross-module edges (best-effort).
+ * Recognize body templates by structural pattern matching against the
+ template library, and recover ``body_template`` + ``body_template_args``.
+ * Produce a :class:`Graph` identical (per ``structural_hash``) to the one
+ that produced the source via :mod:`graphforge.materializer`.
+
+The round-trip parser is unit-tested against every body template + every
+constraint pattern. If it fails to round-trip, the materializer emits a
+warning and the graph is treated as canonical.
+
+Public surface (TODO):
+
+ parse_program(files: dict[str, str]) -> Graph
+ parse_directory(path: Path) -> Graph
+"""
+
+from __future__ import annotations
+
+
+def parse_program(files: dict[str, str]) -> object: # pragma: no cover โ TODO
+ raise NotImplementedError("round-trip parser TODO โ see PROPOSAL.md ยง3.4")
diff --git a/graphforge/repo_parser.py b/graphforge/repo_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0400c35ac88bcb46c7014968247d4f6c22cb024
--- /dev/null
+++ b/graphforge/repo_parser.py
@@ -0,0 +1,271 @@
+"""Parse a Python repository (directory tree) into a KnowledgeGraph.
+
+Usage
+-----
+ from graphforge.repo_parser import parse_repo
+ kg = parse_repo("/path/to/my_package")
+
+What it extracts
+----------------
+ Nodes : repo, package, module, class, function, method
+ Edges : contains, calls (same-file), imports, inherits
+
+Cross-file call resolution is best-effort: if function A in file X calls
+function B and B appears anywhere in the graph, an edge is added.
+"""
+
+from __future__ import annotations
+
+import ast
+import os
+from pathlib import Path
+from typing import Any
+
+from graphforge.knowledge_graph import KGEdge, KGNode, KnowledgeGraph
+
+
+# ── helpers ───────────────────────────────────────────────────────────────────
+
+def _node_id(node_type: str, file_path: str, *names: str) -> str:
+    """Join the non-empty parts of ``node_type``, ``file_path`` and ``names`` with colons."""
+    return ":".join(filter(None, (node_type, file_path, *names)))
+
+
+def _sig(node: ast.FunctionDef | ast.AsyncFunctionDef) -> str:
+    """Render ``(params) -> ret``, including defaults, ``*args`` and ``**kwargs``."""
+    # ast.unparse on the arguments node covers positional-only, keyword-only and
+    # variadic parameters plus default values, which a walk over ``args.args``
+    # alone silently drops.
+    ret = f" -> {ast.unparse(node.returns)}" if node.returns else ""
+    return f"({ast.unparse(node.args)}){ret}"
+
+
+def _source_slice(source_lines: list[str], start: int, end: int) -> str:
+ """1-indexed, inclusive."""
+ return "\n".join(source_lines[start - 1 : end])
+
+
+def _direct_calls(func_node: ast.FunctionDef | ast.AsyncFunctionDef) -> set[str]:
+    """Names called as ``name(...)`` anywhere inside ``func_node``.
+
+    Attribute-style calls such as ``obj.method()`` are not collected.
+    """
+    call_targets = (n.func for n in ast.walk(func_node) if isinstance(n, ast.Call))
+    return {target.id for target in call_targets if isinstance(target, ast.Name)}
+
+
+# ── single-file parser ────────────────────────────────────────────────────────
+
+def _parse_file(file_path: str, abs_path: str, kg: KnowledgeGraph, parent_id: str) -> None:
+    """Parse one ``.py`` file into ``kg`` as a module node plus its definitions.
+
+    ``file_path`` is relative to the repo root and is used in node ids,
+    ``abs_path`` is the path actually read, and ``parent_id`` is the source of
+    the ``contains`` edge. Unreadable or unparsable files are skipped silently.
+    """
+    try:
+        source = Path(abs_path).read_text(encoding="utf-8", errors="replace")
+    except Exception:
+        return
+
+    try:
+        tree = ast.parse(source, filename=abs_path)
+    except (SyntaxError, ValueError, RecursionError):
+        # ValueError: NUL bytes in the source on older Pythons; RecursionError:
+        # pathologically nested code. One bad file must not abort the repo walk.
+        return
+
+    lines = source.splitlines()
+    mod_id = _node_id("module", file_path)
+    # Module node
+    kg.add_node(KGNode(
+        node_id=mod_id,
+        node_type="module",
+        name=Path(file_path).stem,
+        file_path=file_path,
+        line_start=1,
+        line_end=len(lines),
+        source=source,
+        docstring=ast.get_docstring(tree) or "",
+    ))
+    kg.add_edge(KGEdge("contains", parent_id, mod_id))
+
+    # Import edges: dotted names become ``a/b.py``-style module ids
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Import):
+            for alias in node.names:
+                imp_id = _node_id("module", alias.name.replace(".", "/") + ".py")
+                kg.add_edge(KGEdge("imports", mod_id, imp_id))
+        elif isinstance(node, ast.ImportFrom) and node.module:
+            imp_id = _node_id("module", node.module.replace(".", "/") + ".py")
+            kg.add_edge(KGEdge("imports", mod_id, imp_id))
+
+    # Top-level classes and functions
+    func_name_to_id: dict[str, str] = {}  # for call resolution within file
+    for stmt in tree.body:
+        if isinstance(stmt, ast.ClassDef):
+            _parse_class(stmt, file_path, lines, kg, mod_id, func_name_to_id)
+        elif isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef)):
+            _parse_function(stmt, file_path, lines, kg, mod_id, func_name_to_id)
+
+    # Call edges into this file's definitions
+    _resolve_calls(func_name_to_id, kg)
+
+
+def _parse_class(
+ cls_node: ast.ClassDef,
+ file_path: str,
+ lines: list[str],
+ kg: KnowledgeGraph,
+ parent_id: str,
+ func_name_to_id: dict[str, str],
+) -> None:
+ cls_id = _node_id("class", file_path, cls_node.name)
+ doc = ast.get_docstring(cls_node) or ""
+ kg.add_node(KGNode(
+ node_id=cls_id,
+ node_type="class",
+ name=cls_node.name,
+ file_path=file_path,
+ line_start=cls_node.lineno,
+ line_end=cls_node.end_lineno,
+ source=_source_slice(lines, cls_node.lineno, cls_node.end_lineno),
+ docstring=doc,
+ ))
+ kg.add_edge(KGEdge("contains", parent_id, cls_id))
+
+ # Inheritance edges
+ for base in cls_node.bases:
+ if isinstance(base, ast.Name):
+ base_id = _node_id("class", file_path, base.id)
+ kg.add_edge(KGEdge("inherits", cls_id, base_id))
+
+ # Methods
+ for item in cls_node.body:
+ if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
+ _parse_method(item, file_path, lines, kg, cls_id, cls_node.name, func_name_to_id)
+
+
+def _parse_function(
+ fn: ast.FunctionDef | ast.AsyncFunctionDef,
+ file_path: str,
+ lines: list[str],
+ kg: KnowledgeGraph,
+ parent_id: str,
+ func_name_to_id: dict[str, str],
+) -> None:
+ fn_id = _node_id("function", file_path, fn.name)
+ doc = ast.get_docstring(fn) or ""
+ kg.add_node(KGNode(
+ node_id=fn_id,
+ node_type="function",
+ name=fn.name,
+ file_path=file_path,
+ line_start=fn.lineno,
+ line_end=fn.end_lineno,
+ source=_source_slice(lines, fn.lineno, fn.end_lineno),
+ docstring=doc,
+ metadata={"signature": _sig(fn), "calls": list(_direct_calls(fn))},
+ ))
+ kg.add_edge(KGEdge("contains", parent_id, fn_id))
+ func_name_to_id[fn.name] = fn_id
+
+
+def _parse_method(
+ fn: ast.FunctionDef | ast.AsyncFunctionDef,
+ file_path: str,
+ lines: list[str],
+ kg: KnowledgeGraph,
+ parent_id: str,
+ class_name: str,
+ func_name_to_id: dict[str, str],
+) -> None:
+ method_id = _node_id("method", file_path, class_name, fn.name)
+ doc = ast.get_docstring(fn) or ""
+ kg.add_node(KGNode(
+ node_id=method_id,
+ node_type="method",
+ name=fn.name,
+ file_path=file_path,
+ line_start=fn.lineno,
+ line_end=fn.end_lineno,
+ source=_source_slice(lines, fn.lineno, fn.end_lineno),
+ docstring=doc,
+ metadata={"signature": _sig(fn), "calls": list(_direct_calls(fn))},
+ ))
+ kg.add_edge(KGEdge("contains", parent_id, method_id))
+ # register under unqualified name too for call resolution
+ func_name_to_id[fn.name] = method_id
+
+
+def _resolve_calls(func_name_to_id: dict[str, str], kg: KnowledgeGraph) -> None:
+ """Add calls edges based on direct-call names collected during parse."""
+ for fn_id, node in [(nid, n) for nid, n in kg._nodes.items()
+ if n.node_type in ("function", "method")]:
+ calls: list[str] = node.metadata.get("calls", [])
+ for callee_name in calls:
+ if callee_name in func_name_to_id:
+ callee_id = func_name_to_id[callee_name]
+ if callee_id != fn_id:
+ kg.add_edge(KGEdge("calls", fn_id, callee_id))
+
+
+# ── repo walker ───────────────────────────────────────────────────────────────
+
+def parse_repo(repo_path: str, exclude_dirs: set[str] | None = None) -> KnowledgeGraph:
+ """Walk repo_path recursively and return a KnowledgeGraph.
+
+ Parameters
+ ----------
+ repo_path : str
+ Absolute or relative path to the root of the repo.
+ exclude_dirs : set[str], optional
+ Directory names to skip (e.g. {"__pycache__", ".git", "tests"}).
+ """
+ if exclude_dirs is None:
+ exclude_dirs = {"__pycache__", ".git", ".venv", "venv", "env",
+ "node_modules", ".mypy_cache", ".pytest_cache", "dist", "build"}
+
+ abs_root = str(Path(repo_path).resolve())
+ kg = KnowledgeGraph(repo_path=repo_path)
+
+ # Root repo node
+ repo_name = Path(abs_root).name
+ repo_id = _node_id("repo", "", repo_name)
+ kg.add_node(KGNode(
+ node_id=repo_id,
+ node_type="repo",
+ name=repo_name,
+ file_path="",
+ ))
+
+ # Walk directory tree
+ for dirpath, dirnames, filenames in os.walk(abs_root):
+ # Prune excluded dirs in-place (modifies os.walk traversal)
+ dirnames[:] = [d for d in dirnames if d not in exclude_dirs]
+
+ rel_dir = os.path.relpath(dirpath, abs_root)
+ if rel_dir == ".":
+ rel_dir = ""
+
+ parent_id = repo_id
+ if rel_dir:
+ pkg_id = _node_id("package", rel_dir)
+ if pkg_id not in kg._nodes:
+ kg.add_node(KGNode(
+ node_id=pkg_id,
+ node_type="package",
+ name=Path(rel_dir).name,
+ file_path=rel_dir,
+ ))
+ kg.add_edge(KGEdge("contains", repo_id, pkg_id))
+ parent_id = pkg_id
+
+ for fname in sorted(filenames):
+ if not fname.endswith(".py"):
+ continue
+ rel_file = os.path.join(rel_dir, fname) if rel_dir else fname
+ abs_file = os.path.join(dirpath, fname)
+ _parse_file(rel_file, abs_file, kg, parent_id)
+
+ return kg
diff --git a/graphforge/repo_registry.py b/graphforge/repo_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..9312f3925055ed2fdd6bf48b1f6b05e09e5d16d4
--- /dev/null
+++ b/graphforge/repo_registry.py
@@ -0,0 +1,145 @@
+"""Registry of training repos with their clone URLs and source paths.
+
+Add a new repo by appending to REGISTRY. The pipeline will clone it,
+parse it, and auto-generate tasks from its doctests.
+
+Each entry:
+    name      short identifier used in task_ids
+    url       git clone URL (depth-1 clone)
+    src_hint  subdirectory containing the Python package
+              (tried as: <clone>/<src_hint>, <clone>/src/<src_hint>, <clone>)
+    n_tasks   max tasks to pull from this repo
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+@dataclass
+class RepoSpec:
+    name: str         # short identifier; also the clone directory name
+    url: str          # git clone URL
+    src_hint: str     # package subdirectory inside the clone (see _find_src)
+    n_tasks: int = 6  # max tasks to generate from this repo
+
+
+REGISTRY: list[RepoSpec] = [
+    # ── string / text ────────────────────────────────────────────────────────
+ RepoSpec(
+ name="humanize",
+ url="https://github.com/jmoiron/humanize.git",
+ src_hint="src/humanize",
+ n_tasks=6,
+ ),
+ RepoSpec(
+ name="wcwidth",
+ url="https://github.com/jquast/wcwidth.git",
+ src_hint="wcwidth",
+ n_tasks=6,
+ ),
+ RepoSpec(
+ name="inflect",
+ url="https://github.com/jaraco/inflect.git",
+ src_hint="inflect",
+ n_tasks=4,
+ ),
+
+    # ── iteration / functional ───────────────────────────────────────────────
+ RepoSpec(
+ name="boltons",
+ url="https://github.com/mahmoud/boltons.git",
+ src_hint="boltons",
+ n_tasks=10,
+ ),
+ RepoSpec(
+ name="more-itertools",
+ url="https://github.com/more-itertools/more-itertools.git",
+ src_hint="more_itertools",
+ n_tasks=8,
+ ),
+ RepoSpec(
+ name="toolz",
+ url="https://github.com/pytoolz/toolz.git",
+ src_hint="toolz",
+ n_tasks=6,
+ ),
+
+    # ── data transformation / ETL ────────────────────────────────────────────
+ RepoSpec(
+ name="petl",
+ url="https://github.com/petl-developers/petl.git",
+ src_hint="src/petl",
+ n_tasks=8,
+ ),
+ RepoSpec(
+ name="pydash",
+ url="https://github.com/dgilland/pydash.git",
+ src_hint="src/pydash",
+ n_tasks=8,
+ ),
+
+]
+
+# Repos that were evaluated and produced 0 tasks (no literal-eval-able doctests):
+# num2words, parse, dateutil — omitted from REGISTRY.
+
+
+def _find_src(clone_dir: str, hint: str) -> str:
+    """Return the first existing directory among the candidate source roots.
+
+    Tried in order: ``clone_dir/hint``, ``clone_dir/src/hint``, ``clone_dir``;
+    ``clone_dir`` itself is returned when none of them is a directory.
+    """
+    options = (f"{clone_dir}/{hint}", f"{clone_dir}/src/{hint}", clone_dir)
+    existing = (path for path in options if Path(path).is_dir())
+    return next(existing, clone_dir)
+
+
+def load_all_tasks(
+    clone_root: str = "/tmp/train_repos",
+    registry: list[RepoSpec] | None = None,
+    verbose: bool = True,
+) -> list:
+    """Clone every repo in the registry and return all AutoTask objects.
+
+    Args:
+        clone_root: Directory under which repos are cloned (created if missing).
+        registry: Custom registry; REGISTRY is used only when this is None.
+        verbose: Print progress.
+
+    Returns:
+        Flat list of AutoTask objects from all repos.
+    """
+    import subprocess
+    from graphforge.task_generator import generate_tasks
+
+    # ``is None`` rather than ``or``: an explicit empty registry must stay empty.
+    specs = REGISTRY if registry is None else registry
+    all_tasks = []
+    Path(clone_root).mkdir(parents=True, exist_ok=True)
+
+    for spec in specs:
+        clone_dir = str(Path(clone_root) / spec.name)
+        if not Path(clone_dir).exists():
+            if verbose:
+                print(f"Cloning {spec.name} ...")
+            subprocess.check_call(
+                ["git", "clone", "--depth", "1", "-q", spec.url, clone_dir]
+            )
+
+        src = _find_src(clone_dir, spec.src_hint)
+        try:
+            kg, tasks = generate_tasks(src, n_tasks=spec.n_tasks)
+            all_tasks.extend(tasks)
+            if verbose:
+                print(f" {spec.name}: {len(tasks)} tasks "
+                      f"(DAG {len(kg._nodes)} nodes)")
+        except Exception as exc:  # best effort: one bad repo must not abort the rest
+            if verbose:
+                print(f" {spec.name}: SKIPPED — {exc}")
+
+    if verbose:
+        print(f"\nTotal auto-tasks: {len(all_tasks)}")
+    return all_tasks
diff --git a/graphforge/reward/__init__.py b/graphforge/reward/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c0fa9699178a6fcd1554e700405c38c72660e82
--- /dev/null
+++ b/graphforge/reward/__init__.py
@@ -0,0 +1,45 @@
+"""Reward engine — see :mod:`graphforge.reward.engine`.
+
+Per-turn (dense, small) and terminal (sparse, large) reward computation
+following PROPOSAL.md §5.
+"""
+
+from graphforge.reward.engine import (
+ ActionOutcome,
+ ALL_BEHAVIORAL_BONUS,
+ ALL_STRUCTURAL_BONUS,
+ ALPHA_TOKEN_COST,
+ BEHAVIORAL_PER_PASS,
+ DUPLICATE_ACTION,
+ MATERIALIZE_FAIL_PENALTY,
+ MUTATION_FAIL,
+ PER_TURN_COST,
+ SCHEMA_REJECTION,
+ STRUCTURAL_PER_SAT,
+ TYPE_CHECK_BONUS,
+ TOKEN_EFFICIENCY_MAX,
+ TerminalReward,
+ TurnReward,
+ score_terminal,
+ score_turn,
+)
+
+__all__ = [
+ "ALPHA_TOKEN_COST",
+ "ALL_BEHAVIORAL_BONUS",
+ "ALL_STRUCTURAL_BONUS",
+ "ActionOutcome",
+ "BEHAVIORAL_PER_PASS",
+ "DUPLICATE_ACTION",
+ "MATERIALIZE_FAIL_PENALTY",
+ "MUTATION_FAIL",
+ "PER_TURN_COST",
+ "SCHEMA_REJECTION",
+ "STRUCTURAL_PER_SAT",
+ "TOKEN_EFFICIENCY_MAX",
+ "TYPE_CHECK_BONUS",
+ "TerminalReward",
+ "TurnReward",
+ "score_terminal",
+ "score_turn",
+]
diff --git a/graphforge/reward/engine.py b/graphforge/reward/engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..581e5b4cce4942e24c3724fa58ef21501a8350b9
--- /dev/null
+++ b/graphforge/reward/engine.py
@@ -0,0 +1,211 @@
+"""Reward engine — per-turn (dense, small) and terminal (sparse, large).
+
+Implementation follows PROPOSAL.md §5 verbatim. The two halves are pure
+functions over lightweight envelopes so the server can call them without
+threading state through the reward module.
+
+Decisions worth flagging:
+
+* ``All-behavioral-passing`` bonus is awarded only when there is at least
+  one behavioral test. The gate for the token-efficiency bonus, however,
+  treats zero behavioral tests as vacuously satisfied (so a tier-0 task
+  with no behavioral tests can still earn token-efficiency reward).
+* ``type_checks_ok`` is tri-state: ``True`` / ``False`` / ``None``. ``None``
+  means the type-check gate didn't run (e.g. mypy isn't wired yet); the
+  +3 bonus is suppressed in that case.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from enum import Enum
+
+# Coefficients (PROPOSAL.md §5.1). Override at call time if you want.
+ALPHA_TOKEN_COST: float = 0.0008
+PER_TURN_COST: float = -0.1
+MUTATION_FAIL: float = -2.0
+SCHEMA_REJECTION: float = -2.0
+DUPLICATE_ACTION: float = -1.0
+
+# Terminal magnitudes (§5.2)
+STRUCTURAL_PER_SAT: float = 1.0
+BEHAVIORAL_PER_PASS: float = 3.0
+ALL_STRUCTURAL_BONUS: float = 5.0
+ALL_BEHAVIORAL_BONUS: float = 5.0
+TYPE_CHECK_BONUS: float = 3.0
+MATERIALIZE_FAIL_PENALTY: float = -8.0
+TOKEN_EFFICIENCY_MAX: float = 5.0
+
+
+# ---- per-turn -------------------------------------------------------
+
+
+class ActionOutcome(str, Enum):
+    """Coarse classification used by ``score_turn``.
+
+    ``SUCCESS``   — mutation or info action returned ``ok=True``.
+    ``FAILURE``   — handler raised :class:`ActionError` (rollback path).
+    ``MALFORMED`` — pydantic schema rejected the action at parse time.
+    """
+
+    SUCCESS = "success"
+    FAILURE = "failure"
+    MALFORMED = "malformed"
+
+
+@dataclass(frozen=True)
+class TurnReward:
+    """Itemised per-turn reward; ``total`` is the sum of the four parts."""
+
+    base: float        # outcome-dependent component
+    duplicate: float   # 0 or DUPLICATE_ACTION
+    per_turn: float    # PER_TURN_COST
+    token_cost: float  # alpha * tokens_returned, negated
+
+    @property
+    def total(self) -> float:
+        subtotal = self.base + self.duplicate + self.per_turn
+        return subtotal + self.token_cost
+
+    def to_dict(self) -> dict[str, float]:
+        """Return every component plus ``total`` as a plain dict."""
+        parts = dict(base=self.base, duplicate=self.duplicate,
+                     per_turn=self.per_turn, token_cost=self.token_cost)
+        return {**parts, "total": self.total}
+
+
+def score_turn(
+    *,
+    outcome: ActionOutcome,
+    is_duplicate: bool,
+    tokens_returned: int,
+    alpha: float = ALPHA_TOKEN_COST,
+    per_turn_cost: float = PER_TURN_COST,
+) -> TurnReward:
+    """Score a single turn as an itemised :class:`TurnReward`."""
+    if outcome is ActionOutcome.SUCCESS:
+        outcome_penalty = 0.0
+    else:
+        # FAILURE -> MUTATION_FAIL; anything else (MALFORMED) -> SCHEMA_REJECTION.
+        failed = outcome is ActionOutcome.FAILURE
+        outcome_penalty = MUTATION_FAIL if failed else SCHEMA_REJECTION
+    # Token cost is a negative reward proportional to the returned payload.
+    billable_tokens = max(0, tokens_returned)  # negative counts cost nothing
+    repeat_penalty = DUPLICATE_ACTION if is_duplicate else 0.0
+    return TurnReward(base=outcome_penalty, duplicate=repeat_penalty,
+                      per_turn=per_turn_cost, token_cost=-alpha * billable_tokens)
+
+
+# ---- terminal -------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class TerminalReward:
+    """Itemised end-of-episode reward; ``total`` sums the seven float parts."""
+
+    structural: float            # +1 per structural constraint satisfied
+    behavioral: float            # +3 per behavioral test passing
+    bonus_all_structural: float
+    bonus_all_behavioral: float
+    bonus_type_checks: float
+    penalty_materialize: float   # 0 or MATERIALIZE_FAIL_PENALTY
+    efficiency: float            # gated by all-structural AND all-behavioral
+
+    components: dict[str, object] = field(default_factory=dict)
+
+    @property
+    def total(self) -> float:
+        running = self.structural
+        for part in (
+            self.behavioral,
+            self.bonus_all_structural,
+            self.bonus_all_behavioral,
+            self.bonus_type_checks,
+            self.penalty_materialize,
+            self.efficiency,
+        ):
+            running += part
+        return running
+
+    def to_dict(self) -> dict[str, object]:
+        """Return the parts, then ``total``, then the raw ``components``."""
+        names = ("structural", "behavioral", "bonus_all_structural",
+                 "bonus_all_behavioral", "bonus_type_checks",
+                 "penalty_materialize", "efficiency")
+        out = {name: getattr(self, name) for name in names}
+        out.update(total=self.total, components=self.components)
+        return out
+
+
+def score_terminal(
+    *,
+    n_structural_satisfied: int,
+    n_structural_total: int,
+    n_behavioral_passing: int,
+    n_behavioral_total: int,
+    materialization_ok: bool,
+    type_checks_ok: bool | None,
+    tokens_used: int,
+    budget: int,
+) -> TerminalReward:
+    """Compute the itemised terminal reward for a finished episode.
+
+    Raises ValueError for negative counts or a non-positive budget. The
+    efficiency term is capped at TOKEN_EFFICIENCY_MAX.
+    """
+    if n_structural_satisfied < 0 or n_structural_total < 0:
+        raise ValueError("structural counts must be non-negative")
+    if n_behavioral_passing < 0 or n_behavioral_total < 0:
+        raise ValueError("behavioral counts must be non-negative")
+    if budget <= 0:
+        raise ValueError("budget must be positive")
+
+    structural = STRUCTURAL_PER_SAT * n_structural_satisfied
+    behavioral = BEHAVIORAL_PER_PASS * n_behavioral_passing
+
+    all_structural = (
+        n_structural_total > 0 and n_structural_satisfied == n_structural_total
+    )
+    all_behavioral_present_and_passing = (
+        n_behavioral_total > 0 and n_behavioral_passing == n_behavioral_total
+    )
+    bonus_all_structural = ALL_STRUCTURAL_BONUS if all_structural else 0.0
+    bonus_all_behavioral = (
+        ALL_BEHAVIORAL_BONUS if all_behavioral_present_and_passing else 0.0
+    )
+    # Tri-state flag: both False and None (gate did not run) earn no bonus.
+    bonus_type_checks = TYPE_CHECK_BONUS if type_checks_ok is True else 0.0
+    penalty_materialize = 0.0 if materialization_ok else MATERIALIZE_FAIL_PENALTY
+
+    # Efficiency bonus is gated on all-structural AND all-behavioral satisfied.
+    # When n_behavioral_total == 0 the behavioral half is vacuously satisfied
+    # for the gate's purposes (otherwise tier-0 tasks could never earn it).
+    behavioral_gate_ok = (
+        n_behavioral_total == 0
+        or n_behavioral_passing == n_behavioral_total
+    )
+    efficiency = 0.0
+    if all_structural and behavioral_gate_ok:
+        # Clamp to [0, 1]: over-budget earns nothing, negative usage earns no extra.
+        ratio = min(1.0, max(0.0, (budget - tokens_used) / budget))
+        efficiency = TOKEN_EFFICIENCY_MAX * ratio
+
+    return TerminalReward(
+        structural=structural,
+        behavioral=behavioral,
+        bonus_all_structural=bonus_all_structural,
+        bonus_all_behavioral=bonus_all_behavioral,
+        bonus_type_checks=bonus_type_checks,
+        penalty_materialize=penalty_materialize,
+        efficiency=efficiency,
+        components={
+            "n_structural_satisfied": n_structural_satisfied,
+            "n_structural_total": n_structural_total,
+            "n_behavioral_passing": n_behavioral_passing,
+            "n_behavioral_total": n_behavioral_total,
+            "materialization_ok": materialization_ok,
+            "type_checks_ok": type_checks_ok,
+            "tokens_used": tokens_used,
+            "budget": budget,
+        },
+    )
diff --git a/graphforge/sample_repos/humanize/__init__.py b/graphforge/sample_repos/humanize/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..318e5f69c69892e0d9a43c88f391cec6a926916d
--- /dev/null
+++ b/graphforge/sample_repos/humanize/__init__.py
@@ -0,0 +1,18 @@
+"""Humanize — convert numbers, file sizes, and times to human-readable strings."""
+from graphforge.sample_repos.humanize.filesize import naturalsize
+from graphforge.sample_repos.humanize.number import (
+ apnumber,
+ clamp,
+ fractional,
+ intcomma,
+ intword,
+ ordinal,
+ scientific,
+)
+from graphforge.sample_repos.humanize.time import (
+ naturaldate,
+ naturalday,
+ naturaldelta,
+ naturaltime,
+ precisedelta,
+)
diff --git a/graphforge/sample_repos/humanize/filesize.py b/graphforge/sample_repos/humanize/filesize.py
new file mode 100644
index 0000000000000000000000000000000000000000..26163fcd0fc31669dfaaf0e2f8789d99cdfd447b
--- /dev/null
+++ b/graphforge/sample_repos/humanize/filesize.py
@@ -0,0 +1,49 @@
+"""Bits and bytes related humanization."""
+
+suffixes = {
+ "decimal": ("kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"),
+ "binary": ("KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"),
+ "gnu": "KMGTPEZY",
+}
+
+
+def naturalsize(value, binary=False, gnu=False, format="%.1f"):
+    """Format a number of bytes as a human-readable file size (e.g. 10 kB).
+
+    Decimal suffixes (kB, MB) with base 1000 are the default; ``binary``
+    or ``gnu`` switch to base 1024 and their own suffix sets.
+
+    Examples:
+        >>> naturalsize(3000000)
+        '3.0 MB'
+        >>> naturalsize(300, False, True)
+        '300B'
+        >>> naturalsize(3000, True)
+        '2.9 KiB'
+    """
+    if gnu:
+        suffix = suffixes["gnu"]
+    elif binary:
+        suffix = suffixes["binary"]
+    else:
+        suffix = suffixes["decimal"]
+
+    base = 1024 if (gnu or binary) else 1000
+    bytes_ = float(value)
+    abs_bytes = abs(bytes_)
+
+    if abs_bytes == 1 and not gnu:
+        return "%d Byte" % bytes_
+    elif abs_bytes < base and not gnu:
+        return "%d Bytes" % bytes_
+    elif abs_bytes < base and gnu:
+        return "%dB" % bytes_
+
+    for i, s in enumerate(suffix):
+        unit = base ** (i + 2)  # upper bound of the range covered by suffix s
+        if abs_bytes < unit and not gnu:
+            return (format + " %s") % ((base * bytes_ / unit), s)
+        elif abs_bytes < unit and gnu:
+            return (format + "%s") % ((base * bytes_ / unit), s)
+    if gnu:  # larger than every unit: reuse unit and s left over from the last pass
+        return (format + "%s") % ((base * bytes_ / unit), s)
+    return (format + " %s") % ((base * bytes_ / unit), s)
diff --git a/graphforge/sample_repos/humanize/number.py b/graphforge/sample_repos/humanize/number.py
new file mode 100644
index 0000000000000000000000000000000000000000..547b65c563b12a952eaf1e891bc8cf7d23d74b3f
--- /dev/null
+++ b/graphforge/sample_repos/humanize/number.py
@@ -0,0 +1,198 @@
+"""Humanizing functions for numbers."""
+
+import math
+import re
+from fractions import Fraction
+
+powers = [10**x for x in (3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 100)]
+human_powers = (
+ "thousand", "million", "billion", "trillion", "quadrillion",
+ "quintillion", "sextillion", "septillion", "octillion",
+ "nonillion", "decillion", "googol",
+)
+
+
+def ordinal(value):
+    """Return *value* as an ordinal string such as '1st', '2nd' or '3rd'.
+
+    Input that cannot be converted with ``int()`` is returned unchanged.
+
+        >>> ordinal(1)
+        '1st'
+        >>> ordinal(12)
+        '12th'
+        >>> ordinal(103)
+        '103rd'
+    """
+    try:
+        number = int(value)
+    except (TypeError, ValueError):
+        return value
+    teen = 11 <= number % 100 <= 13  # 11th, 12th, 13th despite their last digit
+    suffix = "th" if teen else {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
+    return f"{number}{suffix}"
+
+
+def intcomma(value, ndigits=None):
+    """Convert a number to a string with commas every three digits.
+
+    Strings are accepted (existing commas are ignored); non-numeric input
+    is returned unchanged. ``ndigits`` fixes the decimal places (0 is valid).
+
+    Examples:
+        >>> intcomma(1000000)
+        '1,000,000'
+        >>> intcomma(1234567.25)
+        '1,234,567.25'
+    """
+    try:
+        number = float(value.replace(",", "") if isinstance(value, str) else value)
+    except (TypeError, ValueError):
+        return value
+    if ndigits is not None:
+        # Strings cannot take a float format spec, so format their parsed value.
+        target = number if isinstance(value, str) else value
+        text = "{0:.{1}f}".format(target, ndigits)
+    else:
+        text = str(value)
+    while True:  # insert one comma per pass until nothing changes
+        text, previous = re.sub(r"^(-?\d+)(\d{3})", r"\g<1>,\g<2>", text), text
+        if text == previous:
+            return text
+
+
+def intword(value, format="%.1f"):
+    """Convert a large integer to a friendly text representation.
+
+    Values below one thousand are returned as plain digits and the unit
+    word is never pluralised; non-integer input is returned unchanged.
+
+    Examples:
+        >>> intword(1000000)
+        '1.0 million'
+        >>> intword(1200000000)
+        '1.2 billion'
+    """
+    try:
+        value = int(value)
+    except (TypeError, ValueError):
+        return value
+    if value < powers[0]:
+        return str(value)
+    for index, power in enumerate(powers[1:], 1):
+        if value >= power:
+            continue
+        chopped = value / float(powers[index - 1])
+        label = human_powers[index - 1]
+        # Would print as "1000.0 <unit>": express it in the next unit instead.
+        if float(format % chopped) == float(10**3):
+            chopped = value / float(powers[index])
+            label = human_powers[index]
+        return (format + " %s") % (chopped, label)
+    return str(value)
+
+
+def apnumber(value):
+    """Spell out the integers 0-9 in AP style; other integers become digits.
+
+    Input that ``int()`` rejects is returned unchanged.
+
+        >>> apnumber(5)
+        'five'
+        >>> apnumber(10)
+        '10'
+    """
+    try:
+        number = int(value)
+    except (TypeError, ValueError):
+        return value
+    names = "zero one two three four five six seven eight nine".split()
+    if number in range(len(names)):
+        return names[number]
+    return str(number)
+
+
+def fractional(value):
+    """Render a number as a fraction or mixed number; bad input is returned as-is.
+
+    Examples:
+        >>> fractional(0.3)
+        '3/10'
+        >>> fractional(1.3)
+        '1 3/10'
+        >>> fractional(1)
+        '1'
+    """
+    try:
+        as_float = float(value)
+    except (TypeError, ValueError):
+        return value
+    integer_part = int(as_float)
+    remainder = Fraction(as_float - integer_part).limit_denominator(1000)
+    top, bottom = remainder.numerator, remainder.denominator
+    if not integer_part:
+        return f"{top:.0f}/{bottom:.0f}"
+    if top == 0 and bottom == 1:
+        return f"{integer_part:.0f}"
+    return f"{integer_part:.0f} {top:.0f}/{bottom:.0f}"
+
+
+def scientific(value, precision=2):
+    """Return a number in scientific notation (e.g. 5.00 x 10²).
+
+    A negative number keeps its sign on the mantissa; a negative exponent is
+    shown with a superscript minus. Input that ``float()`` rejects is
+    returned unchanged.
+
+    Examples:
+        >>> scientific(500)
+        '5.00 x 10²'
+        >>> scientific(0.3)
+        '3.00 x 10⁻¹'
+        >>> scientific(-500)
+        '-5.00 x 10²'
+    """
+    exponents = {
+        "0": "⁰", "1": "¹", "2": "²", "3": "³", "4": "⁴",
+        "5": "⁵", "6": "⁶", "7": "⁷", "8": "⁸", "9": "⁹",
+        "+": "⁺", "-": "⁻",
+    }
+    try:
+        # Format the signed float directly so the mantissa keeps its sign;
+        # only the exponent's sign is ever turned into a superscript.
+        number = float(value)
+        fmt = "{:.%se}" % str(int(precision))
+        n = fmt.format(number)
+    except (ValueError, TypeError):
+        return value
+
+    part1, part2 = n.split("e")
+    # "e+02" -> "2", "e-01" -> "-1"; a two-digit "+10" keeps its plus sign.
+    part2 = part2.replace("-0", "-").replace("+0", "")
+    return part1 + " x 10" + "".join(exponents[char] for char in part2)
+
+
+def clamp(value, format="{:}", floor=None, ceil=None, floor_token="<", ceil_token=">"):
+    """Format *value*, substituting the nearest bound when it is out of range.
+
+        >>> clamp(123.456)
+        '123.456'
+        >>> clamp(0.001, floor=0.01)
+        '<0.01'
+        >>> clamp(999, ceil=100)
+        '>100'
+    """
+    if value is None:
+        return None
+    prefix, shown = "", value
+    if floor is not None and value < floor:
+        prefix, shown = floor_token, floor
+    elif ceil is not None and value > ceil:
+        prefix, shown = ceil_token, ceil
+    if isinstance(format, str):
+        rendered = format.format(shown)
+    elif callable(format):
+        rendered = format(shown)
+    else:
+        raise ValueError("format must be a string or callable")
+    return prefix + rendered
diff --git a/graphforge/sample_repos/humanize/time.py b/graphforge/sample_repos/humanize/time.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2d89c2a7461250c344c90f4dfb0877b992b9b99
--- /dev/null
+++ b/graphforge/sample_repos/humanize/time.py
@@ -0,0 +1,225 @@
+"""Time humanizing functions."""
+
+import datetime as dt
+import math
+from enum import Enum
+from functools import total_ordering
+
+
+@total_ordering
+class Unit(Enum):
+    MICROSECONDS = 0  # values give the ordering: smallest unit first
+    MILLISECONDS = 1
+    SECONDS = 2
+    MINUTES = 3
+    HOURS = 4
+    DAYS = 5
+    MONTHS = 6
+    YEARS = 7
+
+    def __lt__(self, other):
+        if self.__class__ is not other.__class__:
+            return NotImplemented  # only Unit-to-Unit comparison is defined
+        return self.value < other.value
+
+
+def _now():
+    return dt.datetime.now()  # naive local time; the module's single source of "now"
+
+
+def _abs_timedelta(delta):
+    if delta.days < 0:  # a negative timedelta always has days < 0
+        now = _now()
+        return now - (now + delta)  # same magnitude with the sign flipped
+    return delta
+
+
+def _date_and_delta(value, *, now=None):
+    """Map a datetime, timedelta or seconds count to ``(date, abs_delta)``;
+    anything else yields ``(None, value)``.
+    """
+    now = now or _now()
+    if isinstance(value, dt.datetime):
+        date, delta = value, now - value
+    elif isinstance(value, dt.timedelta):
+        date, delta = now - value, value
+    else:
+        try:
+            value = int(value)
+            delta = dt.timedelta(seconds=value)
+            date = now - delta
+        except (ValueError, TypeError):
+            return None, value
+    return date, _abs_timedelta(delta)
+
+
+def naturaldelta(value, months=True, minimum_unit="seconds") -> str:
+    """Return a natural representation of a timedelta or number of seconds.
+
+    Does not include tense (use naturaltime for past/future).
+
+    Examples:
+        >>> import datetime as dt
+        >>> naturaldelta(dt.timedelta(seconds=90))
+        'a minute'
+        >>> naturaldelta(dt.timedelta(hours=2))
+        '2 hours'
+        >>> naturaldelta(dt.timedelta(days=400))
+        '1 year, 1 month'
+    """
+    tmp = Unit[minimum_unit.upper()]
+    if tmp not in (Unit.SECONDS, Unit.MILLISECONDS, Unit.MICROSECONDS):
+        raise ValueError(f"Minimum unit '{minimum_unit}' not supported")
+    minimum_unit = tmp  # validated above; not consulted by the formatting below
+
+    if isinstance(value, dt.timedelta):
+        delta = value
+    else:
+        try:
+            value = int(value)
+            delta = dt.timedelta(seconds=value)
+        except (ValueError, TypeError):
+            return value  # not a number: hand the input back unchanged
+
+    seconds = abs(delta.seconds)
+    days = abs(delta.days)
+    years = days // 365
+    days = days % 365  # days left over after whole 365-day years
+    months_count = int(days // 30.5)
+
+    if not years and days < 1:
+        if seconds == 0:
+            return "a moment"
+        elif seconds == 1:
+            return "a second"
+        elif seconds < 60:
+            return f"{seconds} seconds" if seconds > 1 else "a second"
+        elif 60 <= seconds < 120:
+            return "a minute"
+        elif 120 <= seconds < 3600:
+            minutes = seconds // 60
+            return f"{minutes} minutes"
+        elif 3600 <= seconds < 7200:
+            return "an hour"
+        else:
+            hours = seconds // 3600
+            return f"{hours} hours"
+    elif years == 0:
+        if days == 1:
+            return "a day"
+        if not months or not months_count:
+            return f"{days} days"
+        elif months_count == 1:
+            return "a month"
+        return f"{months_count} months"
+    elif years == 1:  # note: this branch does not consult the ``months`` flag
+        if not months_count and not days:
+            return "a year"
+        elif not months_count:
+            return f"1 year, {days} days" if days > 1 else "1 year, a day"
+        elif months_count == 1:
+            return "1 year, 1 month"
+        return f"1 year, {months_count} months"
+    return f"{years} years"
+
+
+def naturaltime(value, future=False, months=True, minimum_unit="seconds", when=None) -> str:
+    """Return a natural representation of a time relative to now.
+
+    Examples:
+        >>> import datetime as dt
+        >>> naturaltime(dt.timedelta(seconds=30))
+        '30 seconds ago'
+        >>> naturaltime(3600, future=True)
+        'an hour from now'
+    """
+    now = when or _now()
+    date, delta = _date_and_delta(value, now=now)
+    if date is None:
+        return value  # unparseable input is handed back unchanged
+    if isinstance(value, (dt.datetime, dt.timedelta)):
+        future = date > now  # overrides the ``future`` argument for these types
+    ago = "%s from now" if future else "%s ago"
+    delta_str = naturaldelta(delta, months, minimum_unit)
+    if delta_str == "a moment":
+        return "now"
+    return ago % delta_str
+
+
+def naturalday(value, format="%b %d") -> str:
+    """Return 'today', 'tomorrow', 'yesterday', or a formatted date string.
+
+    Values without usable ``year``/``month``/``day`` attributes are returned
+    unchanged; other dates are rendered with ``strftime(format)``.
+
+    Examples:
+        >>> import datetime as dt
+        >>> naturalday(dt.date.today())
+        'today'
+    """
+    try:
+        day = dt.date(value.year, value.month, value.day)
+    except (AttributeError, OverflowError, ValueError):
+        return value
+    offset = (day - dt.date.today()).days
+    relative = {0: "today", 1: "tomorrow", -1: "yesterday"}.get(offset)
+    if relative is not None:
+        return relative
+    return day.strftime(format)
+
+
+def naturaldate(value) -> str:
+    """Like naturalday, but appends the year for dates at least 5 * 365 / 12
+    days (about five months) away from today, in either direction."""
+    try:
+        day = dt.date(value.year, value.month, value.day)
+    except (AttributeError, OverflowError, ValueError):
+        return value
+    distance = _abs_timedelta(day - dt.date.today())
+    far_away = distance.days >= 5 * 365 / 12
+    return naturalday(day, "%b %d %Y") if far_away else naturalday(day)
+
+
+def precisedelta(value, minimum_unit="seconds", suppress=(), format="%0.2f") -> str:
+    """Return a precise, human-readable representation of a timedelta.
+
+    Examples:
+        >>> import datetime as dt
+        >>> precisedelta(dt.timedelta(seconds=3633, days=2))
+        '2 days and 1 hour and 33 seconds'
+    """
+    date, delta = _date_and_delta(value)
+    if date is None:
+        return value
+    # Looked up for validation only (KeyError on unknown names); not applied below.
+    suppress_units = {Unit[s.upper()] for s in suppress}
+    min_unit = Unit[minimum_unit.upper()]
+
+    days = delta.days
+    secs = delta.seconds
+    # Years are 365 days and months 30.5 days; leftover days stay whole numbers.
+    years, days = divmod(days, 365)
+    months_count = int(days // 30.5)
+    days -= int(months_count * 30.5)  # remove only the days the months consumed
+
+    hours, secs = divmod(secs, 3600)
+    minutes, secs = divmod(secs, 60)
+
+    parts = []
+    for count, singular, plural in [
+        (years, "year", "years"),
+        (months_count, "month", "months"),
+        (days, "day", "days"),
+        (hours, "hour", "hours"),
+        (minutes, "minute", "minutes"),
+        (secs, "second", "seconds"),
+    ]:
+        if count > 0:
+            label = singular if count == 1 else plural
+            parts.append(f"{count} {label}")
+
+    if not parts:
+        return "0 seconds"
+    if len(parts) == 1:
+        return parts[0]
+    return " and ".join(parts)
diff --git a/graphforge/sample_repos/task_manager/__init__.py b/graphforge/sample_repos/task_manager/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a24c1a609997961ec5a2445158f638db528ff71
--- /dev/null
+++ b/graphforge/sample_repos/task_manager/__init__.py
@@ -0,0 +1 @@
+"""Task Manager — a small synthetic package used as the training repo."""
diff --git a/graphforge/sample_repos/task_manager/api.py b/graphforge/sample_repos/task_manager/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbd965018d69cac7a31c34bc7ca885cc76c1cd3c
--- /dev/null
+++ b/graphforge/sample_repos/task_manager/api.py
@@ -0,0 +1,48 @@
+"""High-level API layer that wires models, storage, and validators together."""
+
+from __future__ import annotations
+
+from graphforge.sample_repos.task_manager.models import Task
+from graphforge.sample_repos.task_manager.storage import TaskStore
+from graphforge.sample_repos.task_manager.validators import validate_priority, validate_tags, validate_title
+
+_store = TaskStore()
+
+
+def create_task(
+    title: str,
+    priority: str = "medium",
+    tags: list[str] | None = None,
+) -> Task:
+    """Create a task, add it to the module-level store, and return it.
+
+    Raises ValueError if title or tags are invalid; priority is not checked.
+    """
+    if not validate_title(title):
+        raise ValueError(f"Invalid title: {title!r}")
+    resolved_tags = tags or []  # None (or any falsy value) becomes an empty list
+    if not validate_tags(resolved_tags):
+        raise ValueError(f"Invalid tags: {resolved_tags!r}")
+    task = Task(title=title, priority=priority, tags=resolved_tags)
+    _store.add(task)
+    return task
+
+
+def get_all_tasks() -> list[Task]:
+    """Return every task currently held by the module-level store."""
+    return _store.all()
+
+
+def complete_task(title: str) -> bool:
+    """Mark the first task with this title done; report whether one was found."""
+    match = _store.find_by_title(title)
+    if not match:
+        return False
+    match.complete()
+    return True
+
+
+def reset_store() -> None:
+    """Clear the store — used by tests between runs."""
+    global _store
+    _store = TaskStore()  # rebinding drops every previously stored task
diff --git a/graphforge/sample_repos/task_manager/models.py b/graphforge/sample_repos/task_manager/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..83566d175f358e01eb2a30fb439968f3010d7fb8
--- /dev/null
+++ b/graphforge/sample_repos/task_manager/models.py
@@ -0,0 +1,47 @@
+"""Domain models for the task manager."""
+
+from __future__ import annotations
+
+from datetime import date
+from typing import Optional
+
+
+class Task:
+    """A single task in the task manager.
+
+    Holds a title, a priority label, a list of tags, an optional due date
+    and a ``done`` flag that starts out False.
+    """
+
+    def __init__(self, title: str, priority: str, tags: list[str],
+                 due_date: Optional[date] = None) -> None:
+        self.title = title
+        self.priority = priority  # expected: "low" | "medium" | "high"
+        self.tags = tags
+        self.due_date = due_date
+        self.done = False
+
+    def complete(self) -> None:
+        """Mark this task as done."""
+        self.done = True
+
+    def to_dict(self) -> dict:
+        """Return a plain-dict snapshot; ``due_date`` is stringified or None."""
+        plain = ("title", "priority", "tags", "done")
+        snapshot = {name: getattr(self, name) for name in plain}
+        if self.due_date:
+            snapshot["due_date"] = str(self.due_date)
+        else:
+            snapshot["due_date"] = None
+        return snapshot
+
+
+class User:
+    """A user who owns tasks, identified by a username and an email."""
+
+    def __init__(self, username: str, email: str) -> None:
+        # Stored as given; no validation happens here.
+        self.username, self.email = username, email
+
+    def display(self) -> str:
+        return f"{self.username} <{self.email}>"
diff --git a/graphforge/sample_repos/task_manager/storage.py b/graphforge/sample_repos/task_manager/storage.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b896e96f42aab5f546f1d0ca988be86d8913e00
--- /dev/null
+++ b/graphforge/sample_repos/task_manager/storage.py
@@ -0,0 +1,37 @@
+"""In-memory task storage."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from graphforge.sample_repos.task_manager.models import Task
+
+
+class TaskStore:
+    """Simple in-memory list-backed store for Task objects.
+
+    Tasks are kept in insertion order; lookups scan the list linearly.
+    """
+
+    def __init__(self) -> None:
+        self._tasks: list[Task] = []
+
+    def add(self, task: Task) -> None:
+        """Append task to the store."""
+        self._tasks.append(task)
+
+    def all(self) -> list[Task]:
+        """Return a new list holding all tasks."""
+        return self._tasks[:]
+
+    def find_by_title(self, title: str) -> Optional[Task]:
+        """Return the first task whose title matches, or None."""
+        matches = (task for task in self._tasks if task.title == title)
+        return next(matches, None)
+
+    def find_done(self) -> list[Task]:
+        """Return all completed tasks."""
+        return [task for task in self._tasks if task.done]
+
+    def find_pending(self) -> list[Task]:
+        """Return all incomplete tasks."""
+        return [task for task in self._tasks if not task.done]
diff --git a/graphforge/sample_repos/task_manager/validators.py b/graphforge/sample_repos/task_manager/validators.py
new file mode 100644
index 0000000000000000000000000000000000000000..4083f55b3b0a22c6d73ae960cf3551ac3c7bd446
--- /dev/null
+++ b/graphforge/sample_repos/task_manager/validators.py
@@ -0,0 +1,25 @@
+"""Input validation functions for the task manager."""
+
+from __future__ import annotations
+
+VALID_PRIORITIES = {"low", "medium", "high"}
+
+
+def validate_title(title: str) -> bool:
+    """Return True only for a string of 1 to 200 characters."""
+    return isinstance(title, str) and 1 <= len(title) <= 200
+
+
+def validate_tags(tags: object) -> bool:
+    """Return True if tags is a list whose items are all strings (may be empty)."""
+    return isinstance(tags, list) and not any(not isinstance(tag, str) for tag in tags)
+
+
+def validate_email(email: str) -> bool:
+    """Return True if email is a string with an '@' and a '.' after the last '@'."""
+    return isinstance(email, str) and "@" in email and "." in email.rpartition("@")[2]
+
+
+def validate_priority(priority: str) -> bool:
+    """Return True if priority is a member of VALID_PRIORITIES (low/medium/high)."""
+    return priority in VALID_PRIORITIES
diff --git a/graphforge/server/__init__.py b/graphforge/server/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..33e126ae8dbab1d0c8872cf47255788f1f10e092
--- /dev/null
+++ b/graphforge/server/__init__.py
@@ -0,0 +1,19 @@
+"""FastAPI OpenEnv server.
+
+Endpoints (PROPOSAL.md §6.1):
+
+    POST /reset  -> create a fresh episode, return initial observation
+    POST /step   -> apply an Action, return (observation, reward, done, info)
+    GET  /state  -> snapshot the current episode state for debugging
+    POST /close  -> tear down the episode
+
+The server is a thin shell: it owns episode state (graph, task spec,
+action history, token counter, turn counter, materialization cache) and
+delegates the work to the dispatcher, reward engine, and validators.
+
+The training-side OpenEnv client calls this over HTTP at localhost:8000.
+"""
+
+from graphforge.server.app import app
+
+__all__ = ["app"]
diff --git a/graphforge/server/app.py b/graphforge/server/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..b32e8c89a3985abf5a9c71076fb24b895bbd7c1d
--- /dev/null
+++ b/graphforge/server/app.py
@@ -0,0 +1,124 @@
+"""FastAPI application — the OpenEnv server.
+
+Endpoints (PROPOSAL.md §6.1):
+
+ POST /reset { task_id?: str | None, seed?: int }
+ -> { episode_id, observation }
+ POST /step { episode_id, action: Action }
+ -> { observation, reward, done, info }
+ GET /state?episode_id=...
+ -> { ... full snapshot ... }
+ POST /close { episode_id }
+ -> { closed: bool }
+
+The handlers are thin: routing, request validation, episode lookup. The
+actual per-step orchestration lives in :mod:`graphforge.server.runner`.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+
+from graphforge.actions.schema import Action
+from graphforge.server.episode import GLOBAL_STORE, Episode, EpisodeStore
+from graphforge.server.runner import step as runner_step
+from graphforge.tasks import default_task, get_task
+
+# FastAPI application object; the route decorators in this module register
+# on it, and /healthz echoes its ``version``.
+app = FastAPI(
+ title="GraphForge OpenEnv server",
+ version="0.0.1",
+ description="See graphforge.server for the wire shape.",
+)
+
+
+# ---- request / response models --------------------------------------
+
+
+# Body of POST /reset. Both fields are optional; when task_id is omitted
+# the reset handler falls back to the default task.
+class ResetRequest(BaseModel):
+ task_id: Optional[str] = None
+ seed: Optional[int] = None # reserved for variant generation, unused for tier-0
+
+
+# Body of POST /step: which episode to advance and the action to apply.
+class StepRequest(BaseModel):
+ episode_id: str
+ # ``Action`` is itself an Annotated discriminated union; no need to
+ # re-declare the discriminator on this field.
+ action: Action
+
+
+# Body of POST /close: the episode to remove from the store.
+class CloseRequest(BaseModel):
+ episode_id: str
+
+
+# ---- store wiring (overridable for tests) ---------------------------
+
+
+def _store() -> EpisodeStore:
+    """Return the episode store the endpoints operate on (the process-wide one)."""
+    store = GLOBAL_STORE
+    return store
+
+
+# ---- helpers --------------------------------------------------------
+
+
+def _require_episode(episode_id: str) -> Episode:
+    """Look up an episode by id; raise HTTP 404 when the store does not hold it."""
+    found = _store().get(episode_id)
+    if found is not None:
+        return found
+    raise HTTPException(status_code=404, detail=f"unknown episode_id: {episode_id!r}")
+
+
+def _initial_observation(ep: Episode) -> dict[str, Any]:
+    """Build the observation returned by ``/reset``: task payload plus zeroed counters."""
+    task = ep.task
+    return dict(
+        episode_id=ep.id,
+        task=task.visible_payload(),
+        turns_total=0,
+        tokens_used_total=0,
+        budget=task.budget,
+        episode_cap=task.episode_cap,
+    )
+
+
+# ---- endpoints ------------------------------------------------------
+
+
+@app.post("/reset")
+def reset(req: ResetRequest) -> dict:
+    """Start a new episode for the requested (or default) task.
+
+    Raises HTTP 404 when ``task_id`` is given but not found in the task bank.
+    """
+    if req.task_id is not None:
+        task = get_task(req.task_id)
+        if task is None:
+            raise HTTPException(status_code=404, detail=f"unknown task_id: {req.task_id!r}")
+    else:
+        task = default_task()
+
+    episode = Episode.new(task=task)
+    _store().put(episode)
+    return {
+        "episode_id": episode.id,
+        "observation": _initial_observation(episode),
+    }
+
+
+@app.post("/step")
+def step(req: StepRequest) -> dict:
+    """Apply ``req.action`` to the referenced episode via the runner (404 if unknown)."""
+    episode = _require_episode(req.episode_id)
+    response = runner_step(episode, req.action)
+    return response
+
+
+@app.get("/state")
+def state(episode_id: str) -> dict:
+    """Return the debugging snapshot of an episode (404 if the id is unknown)."""
+    episode = _require_episode(episode_id)
+    snapshot = episode.state_snapshot()
+    return snapshot
+
+
+@app.post("/close")
+def close(req: CloseRequest) -> dict:
+    """Drop the episode from the store; ``closed`` is False when the id was not present."""
+    return {"closed": _store().drop(req.episode_id)}
+
+
+@app.get("/healthz")
+def healthz() -> dict:
+    """Liveness probe: a fixed status plus the application version."""
+    return dict(status="ok", version=app.version)
diff --git a/graphforge/server/episode.py b/graphforge/server/episode.py
new file mode 100644
index 0000000000000000000000000000000000000000..3817525be8850498a2d3ff8c507c031876d7e0dd
--- /dev/null
+++ b/graphforge/server/episode.py
@@ -0,0 +1,171 @@
+"""Episode state — one per active OpenEnv session.
+
+The server holds episodes in an in-memory dict keyed by ``episode_id``.
+Episodes are entirely self-contained: they own a :class:`Graph`, a
+:class:`Task`, and the running history. There is no leakage between
+episodes (PROPOSAL.md §6.2 — "episode isolation").
+
+Token accounting is a server-side concern. We use a coarse character-based
+estimate (``len(json) // 4``) until a real tokenizer is wired in. The
+estimate is consistent across baseline and trained runs because both go
+through the same envelope.
+"""
+
+from __future__ import annotations
+
+import json
+import uuid
+from dataclasses import dataclass, field
+from typing import Any
+
+from graphforge.actions.dispatcher import ActionResult
+from graphforge.graph.schema import Graph
+from graphforge.reward.engine import ActionOutcome, TurnReward
+from graphforge.tasks.schema import Task
+
+
+# ---- token estimation -----------------------------------------------
+
+
+def estimate_tokens(payload: Any) -> int:
+    """Roughly estimate the token count of ``payload`` as characters // 4.
+
+    The payload is rendered with ``json.dumps`` (non-serializable values go
+    through ``str``); if that fails for any reason, ``str(payload)`` is
+    measured instead. This is a placeholder for a real tokenizer: it only
+    needs to grow with payload size.
+    """
+    try:
+        text = json.dumps(payload, default=str)
+    except Exception:
+        # Deliberate best-effort fallback: an estimate must never fail a step.
+        text = str(payload)
+    # A length is never negative, so integer division is already >= 0.
+    return len(text) // 4
+
+
+# ---- history records ------------------------------------------------
+
+
+# One entry of an episode's history: the action attempted on a turn, how
+# it was classified, and the reward it earned.
+@dataclass
+class TurnRecord:
+ turn: int # Episode.turns at the moment of recording, so the first turn is 0
+ action_kind: str
+ action_args: dict[str, Any]
+ outcome: str # ActionOutcome value
+ ok: bool
+ reward: float # per-turn reward only (TurnReward.total)
+ payload: dict[str, Any] = field(default_factory=dict)
+ is_duplicate: bool = False
+ tokens_returned: int = 0
+
+
+# ---- episode --------------------------------------------------------
+
+
+@dataclass
+class Episode:
+    # Mutable per-session state: the task being solved, the graph under
+    # construction, the turn history, and the running token/turn counters.
+    id: str
+    task: Task
+    graph: Graph = field(default_factory=Graph.empty)
+    history: list[TurnRecord] = field(default_factory=list)
+    tokens_used: int = 0
+    turns: int = 0
+    terminated: bool = False
+    terminal_reward: float | None = None
+    terminal_payload: dict[str, Any] | None = None
+
+    @classmethod
+    def new(cls, task: Task) -> "Episode":
+        """Create an episode for ``task`` with a freshly generated UUID4 id."""
+        episode_id = str(uuid.uuid4())
+        return cls(id=episode_id, task=task)
+
+    # ----- duplicate detection ---------------------------------------
+
+    def is_duplicate(self, kind: str, args: dict[str, Any]) -> bool:
+        """Tell whether the same (kind, args) pair already appears in the history."""
+        return any(
+            past.action_kind == kind and past.action_args == args
+            for past in self.history
+        )
+
+    # ----- bookkeeping -----------------------------------------------
+
+    def record_turn(
+        self,
+        kind: str,
+        args: dict[str, Any],
+        result: ActionResult,
+        outcome: ActionOutcome,
+        turn_reward: TurnReward,
+        is_duplicate: bool,
+        tokens_returned: int,
+    ) -> TurnRecord:
+        """Append a history entry for this turn and advance the counters.
+
+        The entry is numbered with the turn count *before* the increment;
+        ``tokens_returned`` is added to the running token total.
+        """
+        entry = TurnRecord(
+            turn=self.turns,
+            action_kind=kind,
+            action_args=args,
+            outcome=outcome.value,
+            ok=result.ok,
+            reward=turn_reward.total,
+            payload=result.payload,
+            is_duplicate=is_duplicate,
+            tokens_returned=tokens_returned,
+        )
+        self.history.append(entry)
+        self.turns += 1
+        self.tokens_used += tokens_returned
+        return entry
+
+    # ----- snapshot --------------------------------------------------
+
+    def state_snapshot(self) -> dict[str, Any]:
+        """Serialize the episode for ``/state``: counters, full graph, condensed history."""
+        task_view = self.task.visible_payload()
+        graph = self.graph
+        graph_view = {
+            "modules": [module.model_dump() for module in graph.modules],
+            "nodes": [node.model_dump() for node in graph.nodes],
+            "edges": [edge.model_dump() for edge in graph.edges],
+        }
+        history_view = [
+            dict(turn=r.turn, action_kind=r.action_kind, ok=r.ok, reward=r.reward)
+            for r in self.history
+        ]
+        return {
+            "episode_id": self.id,
+            "task": task_view,
+            "turns": self.turns,
+            "tokens_used": self.tokens_used,
+            "budget": self.task.budget,
+            "episode_cap": self.task.episode_cap,
+            "terminated": self.terminated,
+            "graph": graph_view,
+            "history": history_view,
+            "terminal_reward": self.terminal_reward,
+        }
+
+
+# ---- in-memory store ------------------------------------------------
+
+
+class EpisodeStore:
+    """In-memory mapping from episode id to episode.
+
+    Kept as a small class rather than a bare dict so the backing storage
+    can later be swapped (e.g. for a TTL cache) without touching callers.
+    """
+
+    def __init__(self) -> None:
+        self._eps: dict[str, Episode] = {}
+
+    def __len__(self) -> int:
+        """Number of episodes currently held."""
+        return len(self._eps)
+
+    def put(self, ep: Episode) -> None:
+        """Store ``ep`` under its own id, replacing any entry with that id."""
+        self._eps[ep.id] = ep
+
+    def get(self, episode_id: str) -> Episode | None:
+        """Return the stored episode, or ``None`` for an unknown id."""
+        try:
+            return self._eps[episode_id]
+        except KeyError:
+            return None
+
+    def drop(self, episode_id: str) -> bool:
+        """Remove an episode and report whether one was actually removed."""
+        removed = self._eps.pop(episode_id, None)
+        return removed is not None
+
+
+# Singleton store, created once at import time. The server module holds
+# onto this for the lifetime of the process (graphforge.server.app._store
+# returns it). Tests can construct their own EpisodeStore for isolation.
+GLOBAL_STORE = EpisodeStore()
diff --git a/graphforge/server/runner.py b/graphforge/server/runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea8499d03a84eabfa8cfac897aa8048e45cf33d3
--- /dev/null
+++ b/graphforge/server/runner.py
@@ -0,0 +1,144 @@
+"""Episode runner — the per-step orchestration the server endpoints use.
+
+Pulls together dispatcher, reward engine, constraint checker, and episode
+state. Kept separate from the FastAPI app so it can be unit-tested without
+spinning up an HTTP server.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from graphforge.actions import dispatch
+from graphforge.actions.schema import Action, Submit
+from graphforge.constraints import evaluate_all
+from graphforge.materializer import materialize
+from graphforge.reward.engine import (
+ ActionOutcome,
+ TurnReward,
+ score_terminal,
+ score_turn,
+)
+from graphforge.server.episode import (
+ Episode,
+ TurnRecord,
+ estimate_tokens,
+)
+from graphforge.validator import full_check
+
+
+def _classify_outcome(action: Action, ok: bool) -> ActionOutcome:
+    """Map a handler's ``ok`` flag to SUCCESS or FAILURE.
+
+    ``action`` is accepted but not inspected. Actions that fail schema
+    validation never get here: FastAPI's pydantic validation rejects them
+    before the runner is called.
+    """
+    if ok:
+        return ActionOutcome.SUCCESS
+    return ActionOutcome.FAILURE
+
+
+def _render_observation(ep: Episode, turn_record: TurnRecord) -> dict[str, Any]:
+    """Combine one turn's record with the episode's running totals.
+
+    Remaining budget and remaining turns are clamped at zero.
+    """
+    task = ep.task
+    tokens_left = task.budget - ep.tokens_used
+    turns_left = task.episode_cap - ep.turns
+    return dict(
+        turn=turn_record.turn,
+        ok=turn_record.ok,
+        outcome=turn_record.outcome,
+        payload=turn_record.payload,
+        reward=turn_record.reward,
+        is_duplicate=turn_record.is_duplicate,
+        tokens_returned=turn_record.tokens_returned,
+        tokens_used_total=ep.tokens_used,
+        turns_total=ep.turns,
+        budget_remaining=max(0, tokens_left),
+        episode_cap_remaining=max(0, turns_left),
+    )
+
+
+def step(ep: Episode, action: Action) -> dict[str, Any]:
+    """Apply ``action`` to ``ep`` and return ``{observation, reward, done, info}``.
+
+    The episode ends when the action is a ``Submit`` or when the turn count
+    reaches the task's episode cap. On the ending step the terminal score is
+    stored on the episode, reported under ``info["terminal"]`` and added to
+    the returned reward. Stepping an already-terminated episode changes
+    nothing and returns an error marker in ``info``.
+    """
+    if ep.terminated:
+        return {
+            "observation": {},
+            "reward": 0.0,
+            "done": True,
+            "info": {"error": "episode_already_terminated"},
+        }
+
+    args = action.model_dump(exclude={"kind"})
+    kind = action.kind  # type: ignore[attr-defined]
+    # Checked before the turn is recorded, so an action never matches itself.
+    repeated = ep.is_duplicate(kind, args)
+
+    result = dispatch(ep.graph, action)
+    n_tokens = estimate_tokens(result.payload)
+    outcome = _classify_outcome(action, result.ok)
+    rec = ep.record_turn(
+        kind=kind,
+        args=args,
+        result=result,
+        outcome=outcome,
+        turn_reward=score_turn(
+            outcome=outcome,
+            is_duplicate=repeated,
+            tokens_returned=n_tokens,
+        ),
+        is_duplicate=repeated,
+        tokens_returned=n_tokens,
+    )
+
+    # Submit ends the episode outright; otherwise it ends once the cap is hit.
+    submitted = isinstance(action, Submit)
+    done = submitted or ep.turns >= ep.task.episode_cap
+
+    info: dict[str, Any] = {}
+    terminal_bonus = 0.0
+    if done:
+        terminal = _score_terminal(ep)
+        ep.terminated = True
+        ep.terminal_reward = terminal["total"]
+        ep.terminal_payload = terminal
+        info["terminal"] = terminal
+        if not submitted:
+            info["reason"] = "episode_cap_reached"
+        terminal_bonus = terminal["total"]
+
+    return {
+        "observation": _render_observation(ep, rec),
+        "reward": rec.reward + terminal_bonus,
+        "done": done,
+        "info": info,
+    }
+
+
+def _score_terminal(ep: Episode) -> dict[str, Any]:
+    """Score the finished episode and return the reward as a plain dict.
+
+    The dict is the reward's ``to_dict()`` output with the constraint
+    satisfaction report added under ``"satisfaction"``.
+    """
+    satisfaction = evaluate_all(ep.graph, ep.task.all_constraints)
+    structural, behavioral = satisfaction.split_by_family()
+
+    # Materialization gate: any failure while materializing or checking the
+    # generated files counts as "does not materialize" rather than an error.
+    try:
+        materialization_ok = full_check(materialize(ep.graph)).ok
+    except Exception:
+        materialization_ok = False
+
+    terminal = score_terminal(
+        n_structural_satisfied=len(structural.satisfied),
+        n_structural_total=structural.total,
+        n_behavioral_passing=len(behavioral.satisfied),
+        n_behavioral_total=behavioral.total,
+        materialization_ok=materialization_ok,
+        type_checks_ok=None,  # mypy not wired yet
+        tokens_used=ep.tokens_used,
+        budget=ep.task.budget,
+    )
+    payload = terminal.to_dict()
+    payload["satisfaction"] = satisfaction.to_dict()
+    return payload
diff --git a/graphforge/task_generator.py b/graphforge/task_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..08eff48ddaf8b4409a1f4bcf9c66d1dca6dd32b4
--- /dev/null
+++ b/graphforge/task_generator.py
@@ -0,0 +1,227 @@
+"""Auto-generate training tasks from any Python repository.
+
+Pipeline
+--------
+1. Parse the repo with AST → KnowledgeGraph
+2. Find public functions that have doctest examples (>>> in docstring)
+3. Extract those examples as runnable assertions
+4. Replace the function body with `raise NotImplementedError` — the agent
+   must re-implement it from the docstring alone
+5. Return AutoTask objects ready for GRPO training — no hand-writing needed
+
+Usage
+-----
+    from graphforge.task_generator import generate_tasks
+    kg, tasks = generate_tasks("/tmp/humanize/src/humanize", n_tasks=6)
+    for t in tasks:
+        print(t.task_id, "—", t.description[:60])
+"""
+
+from __future__ import annotations
+
+import ast
+import doctest
+import textwrap
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from graphforge.knowledge_graph import KGNode, KnowledgeGraph
+from graphforge.repo_parser import parse_repo
+
+
+# ── Task dataclass (mirrors env.tasks.RepoTask but lives here to avoid circular import) ──
+
+# One auto-generated training task: a function to re-implement plus the
+# assertions that check the re-implementation.
+@dataclass
+class AutoTask:
+ task_id: str
+ repo_name: str
+ repo_path: str # absolute path to the repo source directory
+ description: str
+ test_code: str # first line imports the target: from <repo_name>.<module_stem> import <func_name>
+ stubbed_node_id: str # the node whose body was replaced
+ original_source: str # saved so env can restore on reset
+ max_turns: int = 12
+ difficulty: int = 0
+ hints: list[str] = field(default_factory=list)
+
+
+# ── Doctest extraction ────────────────────────────────────────────────────────
+
+def _extract_all_examples(docstring: str) -> list[tuple[str, str]]:
+    """Parse every doctest example in ``docstring`` into ``(source, want)`` pairs.
+
+    Both parts are stripped; ``want`` is ``''`` for examples with no expected
+    output (setup lines). An empty docstring or a parse failure yields ``[]``.
+    """
+    if not docstring:
+        return []
+    parser = doctest.DocTestParser()
+    pairs: list[tuple[str, str]] = []
+    try:
+        for example in parser.get_examples(docstring, name=""):
+            pairs.append((example.source.strip(), example.want.strip()))
+    except Exception:
+        return []
+    return pairs
+
+
+def _to_assertion(expr: str, expected: str) -> str | None:
+    """Convert one doctest example into assertion source code.
+
+    - Empty or ``Traceback`` expectation -> skipped (``None``)
+    - ``True`` / ``False`` expectation   -> identity check (``is``)
+    - Any other Python literal           -> equality check (``==``)
+    - Non-literal expectation            -> skipped (``None``)
+
+    The expression is evaluated once into ``_got`` and the failure message
+    formats that variable. Embedding ``expr`` itself in the message's
+    f-string is a SyntaxError on Python < 3.12 whenever the expression
+    contains a quote or a backslash (e.g. ``greet('bob')``), and it would
+    also evaluate the expression a second time on failure.
+
+    Returns:
+        Two lines of source (an assignment, then the ``assert``), or
+        ``None`` when the example cannot be turned into an assertion.
+    """
+    if not expected or expected.startswith("Traceback"):
+        return None
+    if expected in ("True", "False"):
+        check = f"_got is {expected}"
+    else:
+        try:
+            ast.literal_eval(expected)
+        except (ValueError, SyntaxError, TypeError):
+            # TypeError covers literals such as ``{[1]: 2}`` (unhashable key).
+            return None
+        check = f"_got == {expected}"
+    # Parenthesise so low-precedence expressions (``a or b``, conditional
+    # expressions) and multi-line calls are captured whole.
+    return f"_got = ({expr})\nassert {check}, f'got {{_got!r}}'"
+
+
+def _build_test_code(func_name: str, module_stem: str, repo_name: str,
+                     all_examples: list[tuple[str, str]]) -> str | None:
+    """Assemble test source: the import, then all setup lines, then the assertions.
+
+    Examples without expected output are treated as setup and copied as-is.
+    Returns ``None`` when fewer than two usable assertions remain.
+    """
+    import_line = f"from {repo_name}.{module_stem} import {func_name}"
+    setup = [expr for expr, expected in all_examples if not expected]
+
+    assertions: list[str] = []
+    for expr, expected in all_examples:
+        if not expected:
+            continue
+        line = _to_assertion(expr, expected)
+        # only keep assertions that call our function
+        if line and func_name in line:
+            assertions.append(line)
+
+    if len(assertions) < 2:
+        return None
+    return "\n".join([import_line, *setup, *assertions])
+
+
+# ── Function stubbing ─────────────────────────────────────────────────────────
+
+def _stub_function(source: str) -> str:
+    """Replace a function body with ``raise NotImplementedError``.
+
+    The header (decorators and signature) and the docstring, if any, are
+    kept verbatim; everything after them is dropped. Only the first function
+    found in ``source`` is stubbed.
+
+    Args:
+        source: Source text of a function; may be uniformly indented.
+
+    Returns:
+        The dedented, stubbed source ending in a newline, or ``source``
+        unchanged when it does not parse or contains no function.
+    """
+    dedented = textwrap.dedent(source)
+    try:
+        tree = ast.parse(dedented)
+    except SyntaxError:
+        return source
+
+    lines = dedented.splitlines()
+    for node in ast.walk(tree):
+        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+            continue
+
+        first = node.body[0]
+        has_docstring = (
+            isinstance(first, ast.Expr)
+            and isinstance(first.value, ast.Constant)
+            and isinstance(first.value.value, str)
+        )
+        # Text in front of the first body statement on its own line.
+        # ``col_offset`` counts UTF-8 bytes, so slice the encoded line.
+        body_line = lines[first.lineno - 1].encode("utf-8")
+        lead = body_line[: first.col_offset].decode("utf-8", "ignore")
+
+        if lead.strip():
+            # The body shares a line with the header (``def f(x): return x``):
+            # cut that line after the colon and indent one level below ``def``.
+            indent = " " * (node.col_offset + 4)
+            kept = lines[: first.lineno - 1] + [lead.rstrip()]
+            if has_docstring:
+                doc_src = ast.get_source_segment(dedented, first)
+                if doc_src:
+                    kept.append(indent + doc_src)
+        else:
+            # Reuse the body's own leading whitespace so the stub lines up
+            # with the kept docstring whatever indent width the file uses.
+            indent = lead
+            if has_docstring:
+                keep_until = first.end_lineno  # inclusive, 1-indexed
+            else:
+                # Stop before the first statement, and before its decorators
+                # when that statement is a decorated nested def/class.
+                decorators = getattr(first, "decorator_list", [])
+                keep_until = min([first.lineno, *(d.lineno for d in decorators)]) - 1
+            kept = lines[:keep_until]
+
+        return "\n".join(kept).rstrip() + f"\n{indent}raise NotImplementedError\n"
+
+    return source
+
+
+# ── Candidate selection ───────────────────────────────────────────────────────
+
+def _score_candidate(node: KGNode, examples: list) -> int:
+    """Rank a candidate: 3 points per example plus 1 per 20 docstring chars (capped at 200 chars)."""
+    docstring_length = len(node.docstring or "")
+    example_points = 3 * len(examples)
+    docstring_points = min(docstring_length, 200) // 20
+    return example_points + docstring_points
+
+
+def _find_candidates(kg: KnowledgeGraph, repo_name: str) -> list[tuple[KGNode, str, int]]:
+    """Collect ``(node, test_code, score)`` for every usable function, best score first.
+
+    A function is usable when it is public, has both a docstring and source,
+    lives in a file with a non-empty stem, and its doctest examples yield
+    test code.
+    """
+    scored = []
+    for node in kg.all_nodes("function"):
+        is_private = node.name.startswith("_")
+        if is_private or not node.docstring or not node.source:
+            continue
+        if not node.file_path:
+            continue
+        stem = Path(node.file_path).stem
+        if not stem:
+            continue
+
+        examples = _extract_all_examples(node.docstring)
+        if not examples:
+            continue
+        test_code = _build_test_code(node.name, stem, repo_name, examples)
+        if not test_code:
+            continue
+
+        scored.append((node, test_code, _score_candidate(node, examples)))
+
+    return sorted(scored, key=lambda entry: entry[2], reverse=True)
+
+
+# ── Main entry point ──────────────────────────────────────────────────────────
+
+def generate_tasks(
+    repo_source_dir: str,
+    n_tasks: int = 4,
+    max_turns: int = 12,
+) -> tuple[KnowledgeGraph, list[AutoTask]]:
+    """Parse a Python repo directory and auto-generate training tasks.
+
+    Args:
+        repo_source_dir: Path to the Python package source directory,
+            e.g. '/tmp/humanize/src/humanize'. Its last path component is
+            used as the package name in generated imports and task ids.
+        n_tasks: How many tasks to generate (picks highest-scoring
+            candidates). Values <= 0 produce an empty task list.
+        max_turns: Max turns per episode.
+
+    Returns:
+        (kg, tasks) — the Knowledge Graph and the list of AutoTask objects.
+
+    Raises:
+        ValueError: If no function in the repo has usable doctest examples.
+    """
+    import inspect  # function-scope import: only needed to normalise docstrings
+
+    repo_source_dir = str(Path(repo_source_dir).resolve())
+    repo_name = Path(repo_source_dir).name
+    kg = parse_repo(repo_source_dir)
+
+    candidates = _find_candidates(kg, repo_name)
+    if not candidates:
+        raise ValueError(
+            f"No suitable candidates found in {repo_source_dir}. "
+            "Make sure functions have doctest examples (>>> in docstring)."
+        )
+
+    # Clamp so a negative n_tasks yields no tasks instead of slicing from the end.
+    selected = candidates[: max(0, n_tasks)]
+    tasks: list[AutoTask] = []
+
+    for node, test_code, score in selected:
+        # Build the description by plain concatenation. Dedenting a template
+        # *after* the docstring has been interpolated uses the docstring's
+        # own (smaller) indentation as the common margin and leaves the text
+        # raggedly indented; cleandoc normalises the docstring by itself.
+        if node.docstring:
+            doc_text = inspect.cleandoc(node.docstring)
+        else:
+            doc_text = "No docstring available."
+        desc = (
+            f"Implement the function `{node.name}` in `{node.file_path}`.\n\n{doc_text}"
+        ).strip()
+
+        task = AutoTask(
+            task_id=f"auto.{repo_name}.{node.name}",
+            repo_name=repo_name,
+            repo_path=repo_source_dir,
+            description=desc,
+            test_code=test_code,
+            stubbed_node_id=node.node_id,
+            original_source=node.source,
+            max_turns=max_turns,
+            difficulty=min(2, max(0, score // 8)),
+            hints=[
+                f"Look at {node.file_path} to understand the module style.",
+                f"The function signature is: {node.name}{node.metadata.get('signature', '(...)')}",
+            ],
+        )
+        tasks.append(task)
+
+    return kg, tasks
diff --git a/graphforge/tasks/__init__.py b/graphforge/tasks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e4c6af16cabc3ad690189c94f5ad961efdcd8c2
--- /dev/null
+++ b/graphforge/tasks/__init__.py
@@ -0,0 +1,10 @@
+"""Task bank and variant generator.
+
+Tier-0 ships one hand-written task. Tier-1+ tasks and parametric variant
+generation are TODO. See PROPOSAL.md §2.1, §2.3 for the full design.
+"""
+
+from graphforge.tasks.bank import default_task, get_task, list_tasks
+from graphforge.tasks.schema import Task
+
+__all__ = ["Task", "default_task", "get_task", "list_tasks"]
diff --git a/graphforge/tasks/bank.py b/graphforge/tasks/bank.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb3ae66bb274be9695ff591f7b65e58a35302036
--- /dev/null
+++ b/graphforge/tasks/bank.py
@@ -0,0 +1,71 @@
+"""Tier-0 task bank.
+
+A single hand-written task that exercises every implemented subsystem
+end-to-end: build a one-module ``validators`` package with an ``is_email``
+function attached to ``validate_with_regex(EMAIL)``. Tier-1+ tasks land in
+follow-up modules.
+
+Variant generation (PROPOSAL.md §2.3 — ~50 concrete variants per template
+× domain vocabulary) is also TODO; for now we hand-author tasks until the
+env's reward-signal shape is validated end-to-end.
+"""
+
+from __future__ import annotations
+
+from graphforge.constraints.schema import (
+ AcyclicImports,
+ Materializes,
+ ModuleCount,
+ ModuleResponsibility,
+ ModuleSizeMax,
+ NodeAbsent,
+ NodeExists,
+)
+from graphforge.tasks.schema import Task
+
+
+# The single tier-0 task, and the one default_task() returns. It carries
+# four visible and three hidden constraints and no behavioral tests.
+TIER_0_EMAIL_VALIDATOR = Task(
+ id="t0.email_validator",
+ tier=0,
+ description=(
+ "Build a tiny single-module package called 'validators'. It should "
+ "expose a function `is_email(s: str) -> bool` that returns True for "
+ "well-formed email addresses and False otherwise. Use the "
+ "`validate_with_regex` body template with the EMAIL pattern. The "
+ "module must materialize cleanly to runnable Python."
+ ),
+ visible_constraints=[
+ ModuleCount(n=1),
+ ModuleResponsibility(module="validators", responsibility="validation"),
+ NodeExists(name="is_email", module="validators"),
+ Materializes(),
+ ],
+ hidden_constraints=[
+ # The visible constraints already pin most of this; the hidden set
+ # adds shape constraints the agent must infer from the description.
+ ModuleSizeMax(module="validators", n=1),
+ NodeAbsent(name="main", module="validators"),
+ AcyclicImports(),
+ ],
+ behavioral_test_names=[], # tier-0 has no behavioral tests
+ budget=4000,
+ episode_cap=20,
+)
+
+
+# Task registry keyed by Task.id; list_tasks() and get_task() read from it.
+_TASKS: dict[str, Task] = {
+ TIER_0_EMAIL_VALIDATOR.id: TIER_0_EMAIL_VALIDATOR,
+}
+
+
+def list_tasks() -> list[Task]:
+    """Return all registered tasks as a new list."""
+    return [*_TASKS.values()]
+
+
+def get_task(task_id: str) -> Task | None:
+    """Look up a registered task by id; ``None`` when the id is unknown."""
+    task = _TASKS.get(task_id)
+    return task
+
+
+def default_task() -> Task:
+    """Return the task used by `/reset` when the request names no ``task_id``."""
+    fallback = TIER_0_EMAIL_VALIDATOR
+    return fallback
diff --git a/graphforge/tasks/schema.py b/graphforge/tasks/schema.py
new file mode 100644
index 0000000000000000000000000000000000000000..26d9a7b2e907371328790cd06f27339a64c7b250
--- /dev/null
+++ b/graphforge/tasks/schema.py
@@ -0,0 +1,45 @@
+"""Task data model.
+
+A *task* is the agent-facing unit of work. The visible portion is what the
+agent sees at reset — natural-language description plus the visible subset
+of constraints. The hidden portion drives reward but is invisible to the
+policy, forcing the agent to interpret the description rather than mechanically
+satisfying a fully-revealed checklist (PROPOSAL.md §2.1).
+"""
+
+from __future__ import annotations
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from graphforge.constraints.schema import Constraint
+
+
+class Task(BaseModel):
+    # Immutable task definition: unknown fields are rejected and instances
+    # are frozen after construction.
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    id: str = Field(..., min_length=1)
+    tier: int = Field(..., ge=0, le=3)
+    description: str = Field(..., min_length=1)
+    visible_constraints: list[Constraint] = Field(default_factory=list)
+    hidden_constraints: list[Constraint] = Field(default_factory=list)
+    # Behavioral test names are visible to the agent at reset; bodies live in
+    # the test runner (TODO) and are hidden. Empty for tier-0.
+    behavioral_test_names: list[str] = Field(default_factory=list)
+    budget: int = Field(..., gt=0)
+    episode_cap: int = Field(..., gt=0)
+
+    @property
+    def all_constraints(self) -> list[Constraint]:
+        """Visible constraints followed by hidden ones, as a new list."""
+        return [*self.visible_constraints, *self.hidden_constraints]
+
+    def visible_payload(self) -> dict[str, object]:
+        """Return the agent-facing view of the task; hidden constraints are left out."""
+        shown_constraints = [c.model_dump() for c in self.visible_constraints]
+        test_names = list(self.behavioral_test_names)
+        return dict(
+            id=self.id,
+            tier=self.tier,
+            description=self.description,
+            visible_constraints=shown_constraints,
+            behavioral_test_names=test_names,
+            budget=self.budget,
+            episode_cap=self.episode_cap,
+        )
diff --git a/graphforge/templates/__init__.py b/graphforge/templates/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c91735af7c6c7bdcb2bd8b09144e09979c5001d
--- /dev/null
+++ b/graphforge/templates/__init__.py
@@ -0,0 +1,15 @@
+"""Body template library.
+
+Templates are the constrained building blocks for function bodies. See
+PROPOSAL.md §3.2. The seed set is in :mod:`graphforge.templates.registry`;
+the full ~25-entry library and codegen live in :mod:`library` (TODO).
+"""
+
+from graphforge.templates.registry import (
+ TemplateSpec,
+ get_template,
+ known_templates,
+ validate_args,
+)
+
+__all__ = ["TemplateSpec", "get_template", "known_templates", "validate_args"]
diff --git a/graphforge/templates/registry.py b/graphforge/templates/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ec30bf45955f231f9edc7c720081bee9b27b78f
--- /dev/null
+++ b/graphforge/templates/registry.py
@@ -0,0 +1,104 @@
+"""Body template registry.
+
+The full library is roughly 25 templates spanning common patterns
+(passthrough_call, sequential_calls, validate_with_regex, dispatch_on_type,
+try_call_with_fallback, accumulate, compose, ...). See PROPOSAL.md §3.2.
+
+This file holds a *seed* registry sufficient for the action dispatcher and
+its tests to exercise the attach_body code path. Each entry declares only
+the metadata the dispatcher needs:
+
+ * ``args_schema`` — required arg names and their JSON-shape hint
+ * ``edges_ok`` — predicate that the node has the right edges to
+   support this template (e.g., passthrough_call needs exactly one out-edge)
+
+Codegen (template -> Python source) and full type signatures live in
+:mod:`graphforge.templates.library` and are TODO.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Callable
+
+
+def _any_degree(out_d: int, in_d: int) -> bool:
+    """Default attachability predicate: accept every edge configuration."""
+    return True
+
+
+@dataclass(frozen=True)
+class TemplateSpec:
+    """Immutable metadata the dispatcher needs about one body template."""
+
+    name: str
+    args_schema: dict[str, type]
+    description: str
+    # Predicate that takes (out_degree, in_degree) of the host node and
+    # returns True iff the template is attachable. Real validation
+    # (template <-> signature compatibility, type flow) is the type engine.
+    edges_ok: Callable[[int, int], bool] = _any_degree
+
+
+# Seed template registry keyed by template name. Each ``edges_ok`` predicate
+# here constrains only the host node's out-degree; in-degree is ignored.
+_REGISTRY: dict[str, TemplateSpec] = {
+ "passthrough_call": TemplateSpec(
+ name="passthrough_call",
+ args_schema={},
+ description="Call exactly one downstream function and return its result.",
+ edges_ok=lambda out_d, in_d: out_d == 1,
+ ),
+ "sequential_calls": TemplateSpec(
+ name="sequential_calls",
+ args_schema={},
+ description="Call each downstream function in declaration order; return the last.",
+ edges_ok=lambda out_d, in_d: out_d >= 1,
+ ),
+ "validate_with_regex": TemplateSpec(
+ name="validate_with_regex",
+ args_schema={"pattern": str},
+ description="Apply a named regex pattern to the input; return bool.",
+ edges_ok=lambda out_d, in_d: out_d == 0,
+ ),
+ "early_return_guard": TemplateSpec(
+ name="early_return_guard",
+ args_schema={"condition": str},
+ description="Guard with an early-return; otherwise delegate to one downstream call.",
+ edges_ok=lambda out_d, in_d: out_d == 1,
+ ),
+ "try_call_with_fallback": TemplateSpec(
+ name="try_call_with_fallback",
+ args_schema={},
+ description="Try the first out-edge; on exception, delegate to the second.",
+ edges_ok=lambda out_d, in_d: out_d == 2,
+ ),
+ "leaf_constant": TemplateSpec(
+ name="leaf_constant",
+ # ``object`` means "any type": validate_args skips the isinstance check for it.
+ args_schema={"value": object},
+ description="Return a literal constant. Leaf node.",
+ edges_ok=lambda out_d, in_d: out_d == 0,
+ ),
+}
+
+
+def known_templates() -> list[str]:
+    """Return the registered template names in sorted order."""
+    return sorted(_REGISTRY)
+
+
+def get_template(name: str) -> TemplateSpec | None:
+    """Look up a template spec by name; ``None`` when it is not registered."""
+    spec = _REGISTRY.get(name)
+    return spec
+
+
+def validate_args(name: str, args: dict[str, object]) -> list[str]:
+    """Check ``args`` against the schema of template ``name``.
+
+    Returns human-readable problems: missing args first, then unexpected
+    args (each group sorted by name), then type mismatches in schema order.
+    An empty list means the args satisfy the schema. An unknown template
+    name is reported as the single problem.
+    """
+    spec = _REGISTRY.get(name)
+    if spec is None:
+        return [f"unknown template: {name!r}"]
+
+    schema = spec.args_schema
+    problems = [f"missing arg {key!r}" for key in sorted(set(schema) - set(args))]
+    problems += [f"unexpected arg {key!r}" for key in sorted(set(args) - set(schema))]
+
+    for key, expected_type in schema.items():
+        # ``object`` in the schema means "any value": nothing to type-check.
+        if key not in args or expected_type is object:
+            continue
+        value = args[key]
+        if not isinstance(value, expected_type):
+            problems.append(
+                f"arg {key!r} should be {expected_type.__name__}, got {type(value).__name__}"
+            )
+    return problems
diff --git a/graphforge/training/__init__.py b/graphforge/training/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd90292153e559b37fe30ccae955be9f68d31eca
--- /dev/null
+++ b/graphforge/training/__init__.py
@@ -0,0 +1,56 @@
+"""Training: multi-turn rollout for GRPO / SFT.
+
+Public surface:
+
+ EnvClient — protocol; HttpEnvClient or InProcessEnvClient
+ Policy — protocol; ScriptedPolicy or HfPolicy
+ rollout(...) — drive one episode, return Trajectory
+ Trajectory, TurnSample — per-turn (prompt, completion, reward, return)
+
+The rollout is environment-agnostic and policy-agnostic — see
+PROPOSAL.md §7.2 for the GRPOTrainer integration story.
+"""
+
+from graphforge.training.client import (
+ EnvClient,
+ HttpEnvClient,
+ InProcessEnvClient,
+)
+from graphforge.training.policy import HfPolicy, Policy, ScriptedPolicy
+from graphforge.training.protocol import (
+ ParseFailure,
+ ParseSuccess,
+ parse_completion,
+ render_action,
+)
+from graphforge.training.rollout import (
+ Trajectory,
+ TurnSample,
+ rollout,
+ trajectory_summary,
+)
+
+__all__ = [
+ "EnvClient",
+ "HfPolicy",
+ "HttpEnvClient",
+ "InProcessEnvClient",
+ "ParseFailure",
+ "ParseSuccess",
+ "Policy",
+ "ScriptedPolicy",
+ "Trajectory",
+ "TurnSample",
+ "parse_completion",
+ "render_action",
+ "rollout",
+ "trajectory_summary",
+]
+
+
+def train_grpo(config: object) -> None:  # pragma: no cover — TODO
+    """Placeholder for the GRPO training entry point; always raises.
+
+    Raises:
+        NotImplementedError: Always, until training is implemented.
+    """
+    # Message restored from its mis-encoded form (em dash and section sign).
+    raise NotImplementedError("GRPO training TODO — see PROPOSAL.md §7")
+
+
+def train_sft(config: object) -> None:  # pragma: no cover — TODO
+    """Placeholder for the SFT fallback training entry point; always raises.
+
+    Raises:
+        NotImplementedError: Always, until training is implemented.
+    """
+    # Message restored from its mis-encoded form (em dash and section sign).
+    raise NotImplementedError("SFT plan B TODO — see PROPOSAL.md §7.4")
diff --git a/graphforge/training/client.py b/graphforge/training/client.py
new file mode 100644
index 0000000000000000000000000000000000000000..778a65ac69dcc63112a9f3a2e42b52289d238c83
--- /dev/null
+++ b/graphforge/training/client.py
@@ -0,0 +1,122 @@
+"""Env client adapters.
+
+Two implementations of the same contract:
+
+ * :class:`HttpEnvClient` — talks to a running FastAPI server over HTTP
+ (``localhost:8000`` during training).
+ * :class:`InProcessEnvClient` — drives the same FastAPI app via
+ ``fastapi.testclient.TestClient``, no socket required. Used by tests
+ and by single-process training jobs.
+
+Both expose the same three operations: ``reset``, ``step``, ``close``. The
+rollout code only depends on the protocol, so swapping transports doesn't
+ripple through anything else.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Protocol, runtime_checkable
+
+
+@runtime_checkable
+class EnvClient(Protocol):
+    """Minimal client surface the rollout depends on.
+
+    A structural protocol: any object that provides ``reset``, ``step`` and
+    ``close`` with these signatures satisfies it. Each operation returns a
+    ``dict``.
+    """
+
+    # Start an episode. Both arguments are optional. The rollout reads
+    # ``episode_id`` and ``observation["task"]`` from the returned dict.
+    def reset(self, task_id: str | None = None, seed: int | None = None) -> dict[str, Any]: ...
+
+    # Apply one action dict to the episode identified by ``episode_id``.
+    def step(self, episode_id: str, action: dict[str, Any]) -> dict[str, Any]: ...
+
+    # Close the episode identified by ``episode_id``.
+    def close(self, episode_id: str) -> dict[str, Any]: ...
+
+
+# ---- HTTP transport --------------------------------------------------
+
+
+class HttpEnvClient:
+    """HTTP transport for the env server, backed by an ``httpx.Client``.
+
+    Meant for training runs where the env server lives in another process.
+    Usable as a context manager; leaving the ``with`` block closes the
+    underlying HTTP client.
+    """
+
+    def __init__(self, base_url: str = "http://localhost:8000", timeout: float = 30.0) -> None:
+        # Imported lazily so httpx stays optional for callers that only ever
+        # drive the env in-process (tests / notebooks).
+        import httpx
+
+        self._client = httpx.Client(base_url=base_url, timeout=timeout)
+
+    def reset(self, task_id: str | None = None, seed: int | None = None) -> dict[str, Any]:
+        """POST ``/reset``; only the arguments that are not ``None`` are sent."""
+        candidates = (("task_id", task_id), ("seed", seed))
+        payload: dict[str, Any] = {key: value for key, value in candidates if value is not None}
+        response = self._client.post("/reset", json=payload)
+        response.raise_for_status()
+        return response.json()
+
+    def step(self, episode_id: str, action: dict[str, Any]) -> dict[str, Any]:
+        """POST ``/step``; a 422 comes back as a synthetic response instead of raising."""
+        payload = {"episode_id": episode_id, "action": action}
+        response = self._client.post("/step", json=payload)
+        if response.status_code != 422:
+            response.raise_for_status()
+            return response.json()
+        # 422 = malformed action payload. It is handed back as data because the
+        # agent is supposed to learn from it; the 0.0 reward is a placeholder
+        # that the caller overlays with its MALFORMED scoring.
+        return {
+            "observation": {},
+            "reward": 0.0,
+            "done": False,
+            "info": {"error": "schema_rejection", "detail": response.json()},
+        }
+
+    def close(self, episode_id: str) -> dict[str, Any]:
+        """POST ``/close`` for ``episode_id`` and return the decoded body."""
+        response = self._client.post("/close", json={"episode_id": episode_id})
+        response.raise_for_status()
+        return response.json()
+
+    def __enter__(self) -> "HttpEnvClient":
+        return self
+
+    def __exit__(self, *_exc: object) -> None:
+        # Closes the httpx client itself, not an episode.
+        self._client.close()
+
+
+# ---- in-process transport -------------------------------------------
+
+
+class InProcessEnvClient:
+    """Same client surface as the HTTP transport, routed through ``TestClient``.
+
+    Requests are sent to the FastAPI app object directly, so no real socket
+    is involved.
+    """
+
+    def __init__(self, app: object | None = None) -> None:
+        from fastapi.testclient import TestClient
+
+        if app is None:
+            # No app supplied: fall back to the project's own server app.
+            from graphforge.server.app import app as default_app
+
+            app = default_app
+        self._client = TestClient(app)  # type: ignore[arg-type]
+
+    def reset(self, task_id: str | None = None, seed: int | None = None) -> dict[str, Any]:
+        """POST ``/reset``; only the arguments that are not ``None`` are sent."""
+        candidates = (("task_id", task_id), ("seed", seed))
+        payload: dict[str, Any] = {key: value for key, value in candidates if value is not None}
+        response = self._client.post("/reset", json=payload)
+        response.raise_for_status()
+        return response.json()
+
+    def step(self, episode_id: str, action: dict[str, Any]) -> dict[str, Any]:
+        """POST ``/step``; a 422 comes back as a synthetic response instead of raising."""
+        payload = {"episode_id": episode_id, "action": action}
+        response = self._client.post("/step", json=payload)
+        if response.status_code != 422:
+            response.raise_for_status()
+            return response.json()
+        # Schema rejection is returned as data so the caller can score it.
+        return {
+            "observation": {},
+            "reward": 0.0,
+            "done": False,
+            "info": {"error": "schema_rejection", "detail": response.json()},
+        }
+
+    def close(self, episode_id: str) -> dict[str, Any]:
+        """POST ``/close`` for ``episode_id`` and return the decoded body."""
+        response = self._client.post("/close", json={"episode_id": episode_id})
+        response.raise_for_status()
+        return response.json()
diff --git a/graphforge/training/policy.py b/graphforge/training/policy.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbbebc4c1a06ae147d99cb42e2768186b6757802
--- /dev/null
+++ b/graphforge/training/policy.py
@@ -0,0 +1,112 @@
+"""Policy interface and stub policies.
+
+A *policy* is anything that, given a list of messages, returns a single
+completion string. The rollout doesn't care whether that string came from
+a 0.5B Qwen sample, a hand-written script, or random noise — only that
+the contract is honored.
+
+This file provides:
+
+ * :class:`Policy` — a runtime-checkable Protocol.
+ * :class:`ScriptedPolicy` — yields a fixed list of completions in order.
+ Useful for tests and for building oracle trajectories during rejection-
+ sampling SFT (PROPOSAL.md §7.4 plan B).
+ * :class:`HfPolicy` — wraps an HF causal LM + tokenizer; the real thing.
+ Defined here so consumers can swap it in once we hook up Qwen, but
+ deliberately not imported at module-load time.
+"""
+
+from __future__ import annotations
+
+from typing import Iterator, Protocol, runtime_checkable
+
+from graphforge.training.prompt import Message
+
+
+@runtime_checkable
+class Policy(Protocol):
+    """Structural interface for anything that can produce a completion."""
+
+    # Given the conversation so far, return exactly one completion string.
+    def sample(self, messages: list[Message]) -> str: ...
+
+
+# ---- scripted -------------------------------------------------------
+
+
+class ScriptedPolicy:
+    """Policy that replays a fixed script of completions, one per call.
+
+    Once the script is exhausted the next ``sample`` call raises
+    :class:`StopIteration`; needing more turns than were scripted points at a
+    bug in the test, not in the env.
+    """
+
+    def __init__(self, completions: list[str]) -> None:
+        self._n = len(completions)
+        self._iter: Iterator[str] = iter(completions)
+
+    def sample(self, _messages: list[Message]) -> str:
+        # The conversation is ignored; the script alone decides the output.
+        upcoming = next(self._iter)
+        return upcoming
+
+
+# ---- HF (lazy) ------------------------------------------------------
+
+
+class HfPolicy:
+    """Policy backed by a Hugging Face causal LM; torch is imported lazily.
+
+    Constructor args::
+
+        model              — a HF AutoModelForCausalLM
+        tokenizer          — the matching tokenizer
+        max_new_tokens     — generation cap per turn (PROPOSAL.md §7.1: 384)
+        temperature, top_p — sampling knobs
+    """
+
+    def __init__(
+        self,
+        model: object,
+        tokenizer: object,
+        *,
+        max_new_tokens: int = 384,
+        temperature: float = 0.7,
+        top_p: float = 0.95,
+    ) -> None:
+        self.model = model
+        self.tokenizer = tokenizer
+        self.max_new_tokens = max_new_tokens
+        self.temperature = temperature
+        self.top_p = top_p
+
+    def sample(self, messages: list[Message]) -> str:
+        """Sample one completion for ``messages`` and return only the new text."""
+        # Heavy dependency, imported on first use.
+        import torch  # noqa: F401 — required for inputs / device
+
+        # Generation has to run in eval mode (no dropout) with the KV cache
+        # on; after SFT, gradient checkpointing may have left
+        # ``use_cache=False`` on the config.
+        self.model.eval()  # type: ignore[attr-defined]
+        if hasattr(self.model, "config"):
+            self.model.config.use_cache = True  # type: ignore[attr-defined]
+
+        tokenizer = self.tokenizer
+        # Render the chat template to text, then tokenize that text. The
+        # return type of ``apply_chat_template`` with tokenization enabled has
+        # varied across transformers versions (raw tensor vs BatchEncoding),
+        # so the two-step path is the portable one.
+        prompt_text = tokenizer.apply_chat_template(  # type: ignore[attr-defined]
+            messages, add_generation_prompt=True, tokenize=False
+        )
+        encoded = tokenizer(prompt_text, return_tensors="pt")  # type: ignore[operator]
+        encoded = {
+            name: tensor.to(self.model.device)  # type: ignore[attr-defined]
+            for name, tensor in encoded.items()
+        }
+
+        generation_kwargs = {
+            "max_new_tokens": self.max_new_tokens,
+            "do_sample": True,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "pad_token_id": tokenizer.eos_token_id,  # type: ignore[attr-defined]
+            "use_cache": True,
+        }
+        with torch.no_grad():
+            output_ids = self.model.generate(  # type: ignore[attr-defined]
+                **encoded, **generation_kwargs
+            )
+
+        # Skip the first ``prompt_len`` positions of the first sequence so that
+        # only what follows the prompt is decoded.
+        prompt_len = encoded["input_ids"].shape[-1]
+        continuation = output_ids[0, prompt_len:]
+        return tokenizer.decode(continuation, skip_special_tokens=True)  # type: ignore[attr-defined]
diff --git a/graphforge/training/prompt.py b/graphforge/training/prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e1c80842e7809e4be0d22442e28cb217bcce435
--- /dev/null
+++ b/graphforge/training/prompt.py
@@ -0,0 +1,160 @@
+"""Prompt / conversation builder.
+
+Produces the message list the policy sees, in HF chat-template-compatible
+shape: ``[{"role": "system", "content": ...}, {"role": "user", ...}, ...]``.
+
+The system prompt is short and stable across episodes; the per-task user
+turn is the natural-language description plus the visible constraints
+(rendered compactly so we don't burn context on JSON).
+
+After each step, the env's observation is appended as a ``user`` turn —
+this is the role that's typically used for tool-result injection in the
+absence of a dedicated ``tool`` role in the chat template.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from graphforge.training.protocol import ACTION_CLOSE, ACTION_OPEN
+
+Message = dict[str, str]
+
+
+SYSTEM_PROMPT = f"""You are an agent that builds Python programs by mutating a typed function-call graph.
+
+You don't write source code directly. Instead, each turn you emit exactly one tool call.
+The environment applies the call to a graph, replies with an observation, and the cycle repeats.
+At the end, the graph is materialized into Python and scored against a hidden specification.
+
+# Tool call format
+
+Your reply each turn should end with one tool call like this:
+
+ {ACTION_OPEN}
+ {{"kind": "add_module", "name": "validators", "responsibility": "validation"}}
+ {ACTION_CLOSE}
+
+Reasoning before the call is fine; the parser takes the last block.
+Malformed output (no tag, bad JSON, missing 'kind') costs reward.
+
+# Available tools
+
+Graph mutations:
+ add_module(name, responsibility)
+ remove_module(name)
+ add_node(name, module, signature, purity?, error_policy?)
+ remove_node(name, module)
+ set_node_module(name, current_module, new_module)
+ attach_body(name, module, template, args?)
+ add_edge(caller, callee, arg_mapping?) # caller/callee are "."
+ remove_edge(caller, callee)
+
+Information (cheap):
+ query_subgraph(scope) # "module:" | "neighbors:" | "path::"
+ query_spec(constraint_kind?) # how many constraints satisfied
+ query_types(scope) # type view (TODO)
+
+Information (expensive โ token cost):
+ materialize_and_validate() # project graph to Python, parse-check
+ run_behavioral_tests() # property tests (TODO)
+
+Terminal:
+ submit() # ends episode and triggers final scoring
+
+# Reward shape
+
+Per turn:
+ successful mutation 0
+ failed mutation -2
+ malformed output -2
+ duplicate of prior action -1
+ per-turn cost -0.1
+ token cost on response -0.0008 * tokens
+
+Terminal:
+ +1 per structural constraint satisfied
+ +5 if all structural constraints satisfied
+ +5 * (budget_remaining / budget) if all satisfied (token-efficiency bonus)
+ -8 if materialization fails
+
+Plan before you act. Failed actions and reading expensive responses cost reward."""
+
+
+def initial_messages(task_visible: dict[str, Any]) -> list[Message]:
+    """Seed a fresh episode's conversation with the system and task turns.
+
+    ``task_visible`` is the dict returned by ``Task.visible_payload()``.
+    """
+    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
+    task_turn = {"role": "user", "content": _format_task_user_turn(task_visible)}
+    return [system_turn, task_turn]
+
+
+def append_observation(
+    messages: list[Message], observation: dict[str, Any]
+) -> list[Message]:
+    """Return a copy of ``messages`` extended with ``observation`` as a user turn.
+
+    The input list itself is not modified.
+    """
+    extended = list(messages)
+    extended.append({"role": "user", "content": _format_observation(observation)})
+    return extended
+
+
+def append_completion(messages: list[Message], completion: str) -> list[Message]:
+    """Return a copy of ``messages`` extended with ``completion`` as an assistant turn."""
+    extended = list(messages)
+    extended.append({"role": "assistant", "content": completion})
+    return extended
+
+
+# ---- formatting -----------------------------------------------------
+
+
+def _format_task_user_turn(task_visible: dict[str, Any]) -> str:
+    """Render the task description, visible constraints and limits as one user turn.
+
+    Missing keys fall back to a placeholder description, an empty constraint
+    list, or ``None`` for the tier and limit values.
+    """
+    desc = task_visible.get("description", "(no description)")
+    constraints = task_visible.get("visible_constraints", [])
+    bullet_lines = [f" - {_format_constraint(c)}" for c in constraints]
+    rendered = "\n".join(bullet_lines) or " (none)"
+    tier = task_visible.get("tier")
+    cap = task_visible.get("episode_cap")
+    budget = task_visible.get("budget")
+    lines = [
+        f"# Task (tier {tier})",
+        f"{desc}",
+        "",
+        "# Visible constraints (the spec also has hidden constraints; you must "
+        "interpret the description, not just satisfy this checklist)",
+        rendered,
+        "",
+        "# Limits",
+        f" episode_cap: {cap} turns",
+        f" budget: {budget} tokens",
+    ]
+    # Every line, including the last, is newline-terminated.
+    return "\n".join(lines) + "\n"
+
+
+def _format_constraint(c: dict[str, Any]) -> str:
+    """Render one constraint as ``kind`` or ``kind(key=value, ...)``.
+
+    Every key other than ``kind`` becomes a ``key=repr(value)`` argument, in
+    dict order. A missing ``kind`` is shown as ``?``.
+    """
+    kind = c.get("kind", "?")
+    params = [f"{key}={value!r}" for key, value in c.items() if key != "kind"]
+    if params:
+        return f"{kind}({', '.join(params)})"
+    return kind
+
+
+def _format_observation(obs: dict[str, Any]) -> str:
+    """Render a /step observation tersely — the agent doesn't need every field.
+
+    Returns a multi-line string with the action outcome, the running counters
+    and the payload. The payload is serialized as indented JSON
+    (non-serializable values fall back to ``str``) and cut off after 800
+    characters to keep the token cost bounded. Fields missing from ``obs``
+    are rendered as ``None``.
+    """
+    payload_text = json.dumps(obs.get("payload", {}), indent=2, default=str)
+    if len(payload_text) > 800:
+        # The marker is a real ellipsis; it was previously emitted as
+        # mis-encoded bytes.
+        payload_text = payload_text[:800] + "\n …(truncated)"
+    return (
+        f"# Observation\n"
+        f" ok: {obs.get('ok')}\n"
+        f" outcome: {obs.get('outcome')}\n"
+        f" duplicate: {obs.get('is_duplicate')}\n"
+        f" reward: {obs.get('reward')}\n"
+        f" turns_total: {obs.get('turns_total')}\n"
+        f" tokens_used_total: {obs.get('tokens_used_total')}\n"
+        f" budget_remaining: {obs.get('budget_remaining')}\n"
+        f" episode_cap_remaining: {obs.get('episode_cap_remaining')}\n"
+        f" payload: {payload_text}\n"
+    )
diff --git a/graphforge/training/protocol.py b/graphforge/training/protocol.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f25e829ae18623ba309039e60f968a7df474ca
--- /dev/null
+++ b/graphforge/training/protocol.py
@@ -0,0 +1,98 @@
+"""Tool-call wire format.
+
+The agent emits a single tool call per turn as a JSON object wrapped in
+``<action>...</action>`` tags::
+
+ Some optional reasoning text the model writes before the call.
+ <action>
+ {"kind": "add_module", "name": "validators", "responsibility": "validation"}
+ </action>
+
+Why this format and not OpenAI / Qwen native tool-calling:
+
+* It's tokenizer-agnostic. We don't depend on any chat-template's tool-call
+ hooks, so we can swap models freely.
+* It's easy for a 0.5B model to emit reliably with a few in-context examples.
+* It's easy to fail cleanly: malformed output produces a structured
+ ``ParseFailure`` that maps to MALFORMED in the reward engine.
+
+If the model emits multiple ``<action>`` blocks we take the *last* one; this
+matches "the agent reasoned, then committed to one action" and avoids
+rewarding an early stutter.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass
+
+# NOTE(review): both delimiter literals were empty strings in this revision.
+# With empty delimiters every completion fails with ``invalid_json`` and the
+# output of ``render_action`` cannot be parsed back. The ``<action>`` tag is
+# reconstructed from the ``no_action_tag`` error code and the ``ACTION_*``
+# names — confirm it against the prompts / checkpoints that rely on this
+# wire format.
+ACTION_OPEN = "<action>"
+ACTION_CLOSE = "</action>"
+
+# Lazy capture of the text between one open/close pair, surrounding
+# whitespace excluded; DOTALL lets the JSON body span several lines.
+_ACTION_RE = re.compile(
+    re.escape(ACTION_OPEN) + r"\s*(.*?)\s*" + re.escape(ACTION_CLOSE),
+    re.DOTALL,
+)
+
+
+@dataclass(frozen=True)
+class ParseSuccess:
+    """A completion that yielded a well-formed tool call."""
+
+    action: dict[str, object]
+    raw: str  # the JSON text we extracted, for debugging
+
+
+@dataclass(frozen=True)
+class ParseFailure:
+    """A completion that could not be turned into a tool call."""
+
+    code: str  # stable machine-readable reason, see ``parse_completion``
+    message: str  # human-readable explanation
+    raw: str  # the whole completion, or the extracted body once one was found
+
+
+ParseResult = ParseSuccess | ParseFailure
+
+
+def parse_completion(text: str) -> ParseResult:
+    """Extract a tool call from a model completion.
+
+    When several tagged blocks are present, the last one is used. On success,
+    returns ``ParseSuccess`` whose ``action`` is a JSON dict suitable to
+    forward to ``/step``. On any failure path returns a ``ParseFailure`` with
+    a stable code:
+
+    * ``no_action_tag`` — open tag missing, or no open/close pair matched
+    * ``unclosed_tag``  — open tag without close
+    * ``invalid_json``  — tags found but body wasn't JSON
+    * ``not_an_object`` — JSON parsed but isn't a dict
+    * ``missing_kind``  — dict is missing the ``kind`` field
+    """
+    if ACTION_OPEN not in text:
+        return ParseFailure("no_action_tag", f"no {ACTION_OPEN} tag found", raw=text)
+    if ACTION_CLOSE not in text:
+        return ParseFailure("unclosed_tag", f"{ACTION_OPEN} tag never closed", raw=text)
+
+    matches = _ACTION_RE.findall(text)
+    if not matches:
+        # Both tags occur, but never as an open tag followed by a close tag.
+        return ParseFailure(
+            "no_action_tag",
+            f"{ACTION_OPEN} tags present but body could not be extracted",
+            raw=text,
+        )
+    body = matches[-1].strip()  # take the last action emitted
+    try:
+        obj = json.loads(body)
+    except json.JSONDecodeError as e:
+        return ParseFailure("invalid_json", f"json error: {e.msg}", raw=body)
+
+    if not isinstance(obj, dict):
+        return ParseFailure(
+            "not_an_object",
+            f"action body must be a JSON object, got {type(obj).__name__}",
+            raw=body,
+        )
+    if "kind" not in obj:
+        return ParseFailure("missing_kind", "action object lacks 'kind' field", raw=body)
+
+    return ParseSuccess(action=obj, raw=body)
+
+
+def render_action(action: dict[str, object]) -> str:
+    """Render an action dict in the on-the-wire format.
+
+    Used by tests and by scripted policies; the result parses back through
+    ``parse_completion``.
+    """
+    return f"{ACTION_OPEN}\n{json.dumps(action)}\n{ACTION_CLOSE}"
diff --git a/graphforge/training/rollout.py b/graphforge/training/rollout.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ec3c8d7c7c70e8cb03a07adce2217a0fdfdd5af
--- /dev/null
+++ b/graphforge/training/rollout.py
@@ -0,0 +1,238 @@
+"""Multi-turn rollout — the bridge between the env and a policy.
+
+For each turn:
+
+ 1. The policy is sampled, given the conversation so far. It returns a
+ single text completion.
+ 2. The completion is parsed to extract the tool call. If parsing fails,
+ a synthetic ``schema_rejection`` step is recorded with the reward
+ engine's MALFORMED magnitude and the loop continues.
+ 3. The tool call is forwarded to the env via ``EnvClient.step``. The env
+ returns ``{observation, reward, done, info}``.
+ 4. The observation is appended to the conversation as a user turn.
+ 5. We stop on ``done`` or when ``episode_cap`` is reached.
+
+After the loop we compute discounted returns from each turn and produce a
+list of ``TurnSample(prompt_messages, completion_text, reward, return_)``
+tuples — exactly the shape ``trl.GRPOTrainer`` consumes when wrapped with
+a custom reward function.
+
+The rollout is environment-agnostic via :class:`EnvClient` and
+policy-agnostic via :class:`Policy`. Both come from sibling modules; the
+rollout function never imports torch or httpx directly.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from graphforge.reward.engine import (
+ DUPLICATE_ACTION,
+ PER_TURN_COST,
+ SCHEMA_REJECTION,
+)
+from graphforge.training.client import EnvClient
+from graphforge.training.policy import Policy
+from graphforge.training.prompt import (
+ Message,
+ append_completion,
+ append_observation,
+ initial_messages,
+)
+from graphforge.training.protocol import (
+ ParseFailure,
+ ParseSuccess,
+ parse_completion,
+)
+
+
+# ---- per-turn record -------------------------------------------------
+
+
+@dataclass
+class TurnSample:
+    """Single (prompt, completion, reward, return) tuple for the trainer.
+
+    ``prompt_messages`` is the conversation up to (but not including) the
+    assistant's completion at this turn.
+    """
+
+    turn: int  # zero-based turn index within the episode
+    prompt_messages: list[Message]
+    completion_text: str  # raw policy output for this turn
+    reward: float  # reward recorded for this turn
+    return_: float = 0.0  # discounted return; filled in once the episode is over
+
+    # Diagnostics; not consumed by the trainer.
+    parse_ok: bool = True  # False for parser failures and env schema rejections
+    parse_failure_code: str | None = None  # parser code, or "env_schema_rejection"
+    env_response: dict[str, Any] = field(default_factory=dict)  # empty if the env never saw the turn
+    done: bool = False
+
+
+@dataclass
+class Trajectory:
+    """Everything recorded for one episode: identifiers, per-turn samples, outcome."""
+
+    episode_id: str
+    task_id: str
+    samples: list[TurnSample] = field(default_factory=list)
+    terminated_naturally: bool = False
+    terminal_total: float | None = None
+
+    @property
+    def total_reward(self) -> float:
+        # Undiscounted sum of the per-turn rewards (0 for an empty episode).
+        accumulated = 0
+        for sample in self.samples:
+            accumulated = accumulated + sample.reward
+        return accumulated
+
+    def __len__(self) -> int:
+        # Length of a trajectory is its number of recorded turns.
+        turn_count = len(self.samples)
+        return turn_count
+
+
+# ---- rollout ---------------------------------------------------------
+
+
+def _synthetic_observation(
+    *,
+    reward: float,
+    payload: dict[str, Any],
+    turn_idx: int,
+    cap: int,
+    tokens_used_total: Any,
+    budget_remaining: Any,
+) -> dict[str, Any]:
+    """Build the observation shown to the policy for a turn the env produced none for."""
+    return {
+        "ok": False,
+        "outcome": "malformed",
+        "is_duplicate": False,
+        "reward": reward,
+        "payload": payload,
+        "turns_total": turn_idx + 1,
+        "tokens_used_total": tokens_used_total,
+        "budget_remaining": budget_remaining,
+        "episode_cap_remaining": cap - (turn_idx + 1),
+    }
+
+
+def rollout(
+    *,
+    policy: Policy,
+    env: EnvClient,
+    task_id: str | None = None,
+    seed: int | None = None,
+    gamma: float = 0.97,
+    max_turns: int | None = None,
+    auto_close: bool = True,
+) -> Trajectory:
+    """Run one episode end-to-end. Returns a :class:`Trajectory`.
+
+    Args:
+        policy: sampled once per turn with the conversation so far.
+        env: client used for ``reset`` / ``step`` / ``close``.
+        task_id, seed: forwarded to ``env.reset``.
+        gamma: discount factor used when filling per-turn returns.
+        max_turns: turn limit for this rollout. ``None`` means the task's
+            ``episode_cap``; any other value, including ``0``, is taken
+            literally.
+        auto_close: call ``env.close`` once the loop ends. A failing close is
+            logged and otherwise ignored so the trajectory is not lost.
+
+    Turns that fail to parse never reach the env. They, and actions the env
+    rejects with ``schema_rejection``, are scored as
+    ``SCHEMA_REJECTION + PER_TURN_COST``, and the policy is shown a synthetic
+    observation describing the error.
+    """
+    reset_resp = env.reset(task_id=task_id, seed=seed)
+    episode_id = reset_resp["episode_id"]
+    task_visible = reset_resp["observation"]["task"]
+    cap = max_turns if max_turns is not None else task_visible["episode_cap"]
+
+    messages = initial_messages(task_visible)
+    samples: list[TurnSample] = []
+    done = False
+    terminal_total: float | None = None
+    # Counters last reported by the env. Synthetic observations reuse them so
+    # the policy is not told its budget has reset after a rejected turn.
+    tokens_used_total: Any = 0
+    budget_remaining: Any = task_visible.get("budget")
+
+    for turn_idx in range(cap):
+        # 1. Sample the policy; snapshot the prompt before the assistant turn
+        #    is appended.
+        completion = policy.sample(messages)
+        prompt_at_turn = list(messages)
+
+        # 2. Parse the tool call.
+        parsed = parse_completion(completion)
+
+        if isinstance(parsed, ParseFailure):
+            # Synthetic step — the env never sees the action. No token cost
+            # is added because nothing came back from the env.
+            reward = SCHEMA_REJECTION + PER_TURN_COST
+            samples.append(
+                TurnSample(
+                    turn=turn_idx,
+                    prompt_messages=prompt_at_turn,
+                    completion_text=completion,
+                    reward=reward,
+                    parse_ok=False,
+                    parse_failure_code=parsed.code,
+                )
+            )
+            messages = append_completion(messages, completion)
+            messages = append_observation(
+                messages,
+                _synthetic_observation(
+                    reward=reward,
+                    payload={"error": parsed.code, "message": parsed.message},
+                    turn_idx=turn_idx,
+                    cap=cap,
+                    tokens_used_total=tokens_used_total,
+                    budget_remaining=budget_remaining,
+                ),
+            )
+            continue
+
+        # 3. Forward to env.
+        assert isinstance(parsed, ParseSuccess)
+        env_resp = env.step(episode_id, parsed.action)
+
+        info = env_resp.get("info") or {}
+        # The env client returns a synthetic response on FastAPI 422 — that's
+        # a schema_rejection (e.g. unknown kind, missing required field).
+        # Score it the same as a parse-side malformed completion.
+        is_schema_rejection = info.get("error") == "schema_rejection"
+        if is_schema_rejection:
+            reward = SCHEMA_REJECTION + PER_TURN_COST
+            # The client's synthetic response carries an empty observation;
+            # substitute one that tells the policy what was rejected.
+            obs = env_resp.get("observation") or _synthetic_observation(
+                reward=reward,
+                payload={"error": "schema_rejection", "detail": info.get("detail")},
+                turn_idx=turn_idx,
+                cap=cap,
+                tokens_used_total=tokens_used_total,
+                budget_remaining=budget_remaining,
+            )
+        else:
+            reward = float(env_resp.get("reward", 0.0))
+            # The embedded observation carries duplicate flags etc.
+            obs = env_resp.get("observation") or {}
+            tokens_used_total = obs.get("tokens_used_total", tokens_used_total)
+            budget_remaining = obs.get("budget_remaining", budget_remaining)
+        done = bool(env_resp.get("done", False))
+
+        samples.append(
+            TurnSample(
+                turn=turn_idx,
+                prompt_messages=prompt_at_turn,
+                completion_text=completion,
+                reward=reward,
+                env_response=env_resp,
+                done=done,
+                parse_ok=not is_schema_rejection,
+                parse_failure_code="env_schema_rejection" if is_schema_rejection else None,
+            )
+        )
+
+        # 4. Extend the conversation with this turn and its observation.
+        messages = append_completion(messages, completion)
+        messages = append_observation(messages, obs)
+
+        # 5. Stop as soon as the env reports the episode finished.
+        if done:
+            terminal_total = (info.get("terminal") or {}).get("total")
+            break
+
+    if auto_close:
+        try:
+            env.close(episode_id)
+        except Exception:  # best-effort cleanup; keep the collected trajectory
+            import logging
+
+            logging.getLogger(__name__).warning(
+                "env.close failed for episode %s", episode_id, exc_info=True
+            )
+
+    _fill_returns(samples, gamma=gamma)
+
+    return Trajectory(
+        episode_id=episode_id,
+        task_id=task_visible.get("id", ""),
+        samples=samples,
+        terminated_naturally=done,
+        terminal_total=terminal_total,
+    )
+
+
+# ---- discounted returns ---------------------------------------------
+
+
+def _fill_returns(samples: list[TurnSample], *, gamma: float) -> None:
+    """Write the discounted return onto every sample, in place.
+
+    Walks the episode backwards: return_t = r_t + gamma * return_{t+1},
+    where the return after the last turn is 0.
+    """
+    tail = 0.0
+    for idx in range(len(samples) - 1, -1, -1):
+        current = samples[idx]
+        tail = current.reward + gamma * tail
+        current.return_ = tail
+
+
+# ---- helper for stub-policy demo ------------------------------------
+
+
+def trajectory_summary(traj: Trajectory) -> dict[str, Any]:
+    """Condense a trajectory into a flat dict of headline numbers."""
+    failed_turns = [sample for sample in traj.samples if not sample.parse_ok]
+    summary: dict[str, Any] = {
+        "episode_id": traj.episode_id,
+        "task_id": traj.task_id,
+        "n_turns": len(traj),
+        "total_reward": traj.total_reward,
+        "terminated_naturally": traj.terminated_naturally,
+        "terminal_total": traj.terminal_total,
+        "parse_failures": len(failed_turns),
+    }
+    return summary
diff --git a/graphforge/types/__init__.py b/graphforge/types/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e520b4e44b6afec1abeccf4423c1ffad459c470
--- /dev/null
+++ b/graphforge/types/__init__.py
@@ -0,0 +1,42 @@
+"""Type engine.
+
+Responsibilities (PROPOSAL.md §3.1, §4.1, §5.2):
+
+ * Parse function signatures into a typed parameter list.
+ * Validate that every edge's ``arg_mapping`` has type-compatible flow
+ between caller's available bindings and callee's parameter types.
+ * Validate that every body template's expected types match the host
+ node's signature and outgoing edges.
+ * Detect ``Any`` usage for the ``no_any_types`` constraint.
+ * Surface a typed view of the graph for ``query_types``.
+
+The cheap signature parser at :mod:`graphforge.actions.signature` extracts
+parameter names; this module subsumes it with full annotation parsing using
+``ast.parse`` over a synthetic ``def`` so that we get Python's own grammar
+for free.
+
+Public surface (TODO):
+
+ parse_typed_signature(sig: str) -> TypedSignature
+ edge_type_flow(graph, edge) -> list[TypeIssue]
+ type_view(graph, scope) -> dict
+ has_any(graph) -> list[str]
+"""
+
+from __future__ import annotations
+
+
+def parse_typed_signature(sig: str) -> object:  # pragma: no cover — TODO
+    """Placeholder for full signature parsing; always raises NotImplementedError."""
+    raise NotImplementedError("type engine — parse_typed_signature TODO")
+
+
+def edge_type_flow(graph: object, edge: object) -> list[object]:  # pragma: no cover
+    """Placeholder for edge type-flow validation; always raises NotImplementedError."""
+    raise NotImplementedError("type engine — edge_type_flow TODO")
+
+
+def type_view(graph: object, scope: str) -> dict[str, object]:  # pragma: no cover
+    """Placeholder for the typed graph view; always raises NotImplementedError."""
+    raise NotImplementedError("type engine — type_view TODO")
+
+
+def has_any(graph: object) -> list[str]:  # pragma: no cover
+    """Placeholder for ``Any``-usage detection; always raises NotImplementedError."""
+    raise NotImplementedError("type engine — has_any TODO")
diff --git a/graphforge/validator/__init__.py b/graphforge/validator/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f9577ad74f05a2ba88e8dc992db1c0b39f2c15d
--- /dev/null
+++ b/graphforge/validator/__init__.py
@@ -0,0 +1,25 @@
+"""Materialization validator: parse + import + mypy --strict.
+
+Responsibilities (PROPOSAL.md §6.2 — "subprocess validation is bounded"):
+
+ * ``parse_check`` — call ``compile(source, ...)`` per file; report errors.
+ * ``import_check`` — write to a temp directory, attempt
+ ``importlib.import_module`` in a subprocess; report errors. (TODO)
+ * ``mypy_check`` — invoke ``mypy --strict`` in a subprocess against the
+ materialized tree, capturing exit code and parsed errors. (TODO)
+ * Hard timeouts: 8s per type-check, 12s for behavioral runs.
+ * Cache results keyed on ``Graph.structural_hash`` so we don't re-run
+ mypy after non-structural changes.
+
+Currently only parse-checking is implemented; ``full_check`` will be extended
+in place as the deeper gates land.
+"""
+
+from graphforge.validator.parse import (
+ ParseError,
+ ValidationReport,
+ full_check,
+ parse_check,
+)
+
+__all__ = ["ParseError", "ValidationReport", "full_check", "parse_check"]
diff --git a/graphforge/validator/parse.py b/graphforge/validator/parse.py
new file mode 100644
index 0000000000000000000000000000000000000000..119a90a21fc5fc29afa1e8d70aa8a77e04ec6dca
--- /dev/null
+++ b/graphforge/validator/parse.py
@@ -0,0 +1,75 @@
+"""Parse-only validator.
+
+Calls Python's own ``compile()`` on each materialized file and reports any
+syntax / lexer errors. This is the cheapest gate; the agent receives this
+feedback as part of ``materialize_and_validate``.
+
+Heavier checks (import-resolution, ``mypy --strict``, behavioral tests) live
+elsewhere in :mod:`graphforge.validator` and :mod:`graphforge.behavioral` —
+intentionally separated so the agent can pay for verification incrementally.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+
+@dataclass(frozen=True)
+class ParseError:
+    """One compile-time error found in a materialized file."""
+
+    filename: str  # key of the offending entry in the checked mapping
+    line: int | None  # from ``SyntaxError.lineno``; may be None
+    column: int | None  # from ``SyntaxError.offset``; may be None
+    message: str  # from ``SyntaxError.msg``
+
+
+@dataclass
+class ValidationReport:
+    """Outcome of the validator gates; currently carries parse errors only."""
+
+    parse_errors: list[ParseError] = field(default_factory=list)
+
+    @property
+    def ok(self) -> bool:
+        # Clean exactly when no parse error was recorded.
+        if self.parse_errors:
+            return False
+        return True
+
+    def to_dict(self) -> dict[str, object]:
+        """Return a plain-dict form: the ``ok`` flag plus one dict per error."""
+        verdict = self.ok
+        serialized: list[dict[str, object]] = []
+        for err in self.parse_errors:
+            serialized.append(
+                {
+                    "filename": err.filename,
+                    "line": err.line,
+                    "column": err.column,
+                    "message": err.message,
+                }
+            )
+        return {"ok": verdict, "parse_errors": serialized}
+
+
+def parse_check(files: dict[str, str]) -> list[ParseError]:
+    """Compile each ``files[name]`` source. Return collected errors.
+
+    An empty list means every file parsed cleanly. Nothing is executed; the
+    sources are only compiled. A source that ``compile`` rejects with
+    ``ValueError`` is reported like any other parse error, with no line or
+    column, instead of aborting the whole check.
+    """
+    errors: list[ParseError] = []
+    for filename, source in files.items():
+        try:
+            # dont_inherit=True: judge the file by its own ``__future__``
+            # imports, not by the ones active in this validator module.
+            compile(source, filename, "exec", dont_inherit=True)
+        except (SyntaxError, ValueError) as e:
+            # A plain ValueError has no lineno/offset/msg attributes, hence
+            # the getattr fallbacks.
+            errors.append(
+                ParseError(
+                    filename=filename,
+                    line=getattr(e, "lineno", None),
+                    column=getattr(e, "offset", None),
+                    message=getattr(e, "msg", None) or str(e),
+                )
+            )
+    return errors
+
+
+def full_check(files: dict[str, str]) -> ValidationReport:
+    """Run all validator gates that exist so far and bundle the findings.
+
+    Only the parse gate is wired in at present. ``mypy --strict`` and
+    import-resolution are meant to be added later without changing the shape
+    of the report.
+    """
+    found = parse_check(files)
+    report = ValidationReport(parse_errors=found)
+    return report
diff --git a/openenv.yaml b/openenv.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..25126be5f116f9e5e11fafe7f2c634240ccd4425
--- /dev/null
+++ b/openenv.yaml
@@ -0,0 +1,50 @@
+# OpenEnv manifest — required by the hackathon and by the OpenEnv spec.
+
+name: repo-edit
+version: 0.3.0
+description: >
+ Multi-turn repository-editing environment for long-horizon RL.
+ An LLM agent receives a Knowledge Graph of a real Python repo
+ (nodes: repo / package / module / class / function / method;
+ edges: contains / calls / imports / inherits — all parsed from AST)
+ and must navigate it across multiple turns to apply a code change.
+ Reward is sparse: only granted when submit() passes all unit tests.
+ Designed to push agents beyond shallow reasoning toward structured
+ planning, graph navigation, and durable state tracking.
+
+client:
+ class_name: RepoEditEnv
+ module: env.client
+
+action:
+ class_name: RepoEditAction
+ module: env.actions
+
+observation:
+ class_name: RepoEditObservation
+ module: env.models
+
+state:
+ class_name: RepoEditState
+ module: env.models
+
+environment:
+ class_name: RepoEditEnvironment
+ module: env.environment
+
+default_image: ast-code-edit:latest
+spec_version: 1
+
+tags:
+ - code-generation
+ - ast
+ - dag
+ - grpo
+ - lora
+ - rl
+ - python
+ - single-step
+
+author: "Naga Nithin"
+license: MIT
+homepage: "https://github.com/nithin062006/scaler"
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..61507d7a0f9a5492a5aef2e4add8b1b0aa00db0b
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,76 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "graphforge"
+version = "0.0.1"
+description = "Graph-first code generation environment for long-horizon RL planning."
+readme = "README.md"
+requires-python = ">=3.10"
+license = { text = "MIT" }
+authors = [{ name = "Naga", email = "naganithin@poshmark.com" }]
+dependencies = [
+ "pydantic>=2.6",
+ "fastapi>=0.110",
+ "uvicorn[standard]>=0.27",
+ "httpx>=0.27",
+ "typing-extensions>=4.10",
+ "openenv-core>=0.1.0",
+ "pyyaml>=6.0",
+]
+
+[project.optional-dependencies]
+validation = [
+ "mypy>=1.10",
+]
+behavioral = [
+ "hypothesis>=6.100",
+]
+training = [
+ "torch>=2.2",
+ "transformers>=4.42",
+ "trl>=0.12",
+ "peft>=0.11",
+ "accelerate>=0.30",
+ "datasets>=2.18",
+ "matplotlib>=3.8",
+ "numpy>=1.26",
+]
+demo = [
+ "gradio>=4.30",
+]
+dev = [
+ "pytest>=8.0",
+ "pytest-cov>=5.0",
+ "ruff>=0.4",
+ "mypy>=1.10",
+ "hypothesis>=6.100",
+]
+
+[tool.setuptools.packages.find]
+include = ["graphforge*", "env*"]
+exclude = ["tests*", "notebooks*", "training*", "plots*", "space*"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]
+addopts = "-ra --strict-markers"
+markers = [
+ "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+ "subprocess: tests that spawn subprocesses (mypy, behavioral)",
+]
+
+[tool.mypy]
+python_version = "3.10"
+strict = true
+ignore_missing_imports = true
+exclude = ["build/", "dist/"]
+
+[tool.ruff]
+line-length = 100
+target-version = "py310"
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "B", "UP", "SIM"]
+ignore = ["E501"]