diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..149a453e2c06ad87de858c984702ed5f87027c15
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+__pycache__/
+*.pyc
+*.pyo
+.env
+*.egg-info/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..f47f7e010e9d11ae9807dac8b0445505137b485c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,27 @@
+# Hugging Face Space Dockerfile.
+# Mirrors the root Dockerfile, exists separately because HF Spaces looks for
+# the Dockerfile inside the Space root by default.
+
+FROM python:3.11-slim
+
+WORKDIR /app
+
+COPY pyproject.toml ./
+COPY graphforge ./graphforge
+COPY env ./env
+COPY openenv.yaml ./
+
+RUN pip install --no-cache-dir \
+    "pydantic>=2.6" \
+    "fastapi>=0.110" \
+    "uvicorn[standard]>=0.27" \
+    "httpx>=0.27" \
+    "openenv-core>=0.1.0" \
+    "pyyaml>=6.0"
+
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONPATH=/app
+
+EXPOSE 7860
+
+CMD ["uvicorn", "env.server:app", "--host", "0.0.0.0", "--port", "7860"]
diff --git a/README.md b/README.md
index 3cc7917620928b0e79b294f3799caa4552158a70..70a6c752fecbdcd5192a10d33cc76d8fef1975de 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,38 @@
 ---
-title: Graphforge Openenv
-emoji: 💻
-colorFrom: green
+title: GraphForge OpenEnv
+emoji: 🧱
+colorFrom: indigo
 colorTo: purple
 sdk: docker
+app_port: 7860
 pinned: false
 license: mit
-short_description: A graph-first code-editing RL environment for Python repos.
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# GraphForge — OpenEnv server
+
+Live deployment of the GraphForge environment for the Meta PyTorch OpenEnv
+Hackathon. The server hosts the OpenEnv-compliant `/reset`, `/step`, `/state`
+endpoints over HTTP. Anything that speaks the OpenEnv client protocol (or
+plain JSON) can drive episodes.
+
+See the main project repo for the architecture overview, training notebook,
+plots, and writeup.
+
+## Endpoints
+
+```
+POST /reset             → RepoEditObservation
+POST /step  { ... }     → { observation, reward, done }
+GET  /state             → RepoEditState
+GET  /healthz
+```
+
+## Quick smoke test
+
+```bash
+EID=$(curl -s -X POST $SPACE_URL/reset | python3 -c "import sys,json; print(json.load(sys.stdin)['episode_id'])")
+curl -s -X POST $SPACE_URL/step -H 'content-type: application/json' \
+  -d '{"kind": "query", "keywords": "validate", "node_type": "function"}' \
+  | python3 -m json.tool
+```
diff --git a/env/__init__.py b/env/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..22926036d1f56a4b6f462040d73b683b1d1d6a85
--- /dev/null
+++ b/env/__init__.py
@@ -0,0 +1,34 @@
+"""Multi-turn repo-editing OpenEnv environment.
+
+Public surface:
+    RepoEditAction, RepoEditObservation, RepoEditState  — wire models
+    RepoEditEnvironment                                 — OpenEnv environment
+    RepoEditEnv                                         — HTTP client
+"""
+
+from env.actions import (
+    AddNodeAction,
+    InspectAction,
+    QueryAction,
+    RemoveNodeAction,
+    RepoEditAction,
+    SubmitAction,
+    UpdateNodeAction,
+)
+from env.client import RepoEditEnv
+from env.environment import RepoEditEnvironment
+from env.models import RepoEditObservation, RepoEditState
+
+__all__ = [
+    "AddNodeAction",
+    "InspectAction",
+    "QueryAction",
+    "RemoveNodeAction",
+    "RepoEditAction",
+    "RepoEditEnv",
+    "RepoEditEnvironment",
+    "RepoEditObservation",
+    "RepoEditState",
+    "SubmitAction",
+    "UpdateNodeAction",
+]
diff --git a/env/actions.py b/env/actions.py
new file mode 100644
index 0000000000000000000000000000000000000000..82717404d0204eeaf29651c569c6c28037972a8d
--- /dev/null
+++ b/env/actions.py
@@ -0,0 +1,90 @@
+"""Action schema for the multi-turn repo-editing environment.
+
+All actions are expressed as JSON dicts with a "kind" discriminator.
+The agent emits one action per turn inside <action>...</action> XML tags.
+
+Actions
+-------
+query        Search the knowledge graph for relevant nodes.
+inspect      View the full source of a specific node.
+add_node     Insert a new function or class into a module/class.
+update_node  Replace the source of an existing node.
+remove_node  Delete a node from the graph.
+submit       Apply all pending changes, run tests, end the episode.
+"""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict
+
+
+_cfg = ConfigDict(extra="forbid")
+
+
+class QueryAction(BaseModel):
+    model_config = _cfg
+    kind: Literal["query"] = "query"
+    keywords: str
+    node_type: str = "all"   # "all" | "function" | "class" | "module" | "method"
+
+
+class InspectAction(BaseModel):
+    model_config = _cfg
+    kind: Literal["inspect"] = "inspect"
+    node_id: str
+
+
+class AddNodeAction(BaseModel):
+    model_config = _cfg
+    kind: Literal["add_node"] = "add_node"
+    parent_id: str       # node_id of the parent (module or class)
+    name: str            # name of the new function/class
+    node_type: str       # "function" | "class"
+    code: str            # full source of the new node (incl. def/class line)
+
+
+class UpdateNodeAction(BaseModel):
+    model_config = _cfg
+    kind: Literal["update_node"] = "update_node"
+    node_id: str         # which node to replace
+    new_code: str        # full replacement source (incl. def/class line)
+
+
+class RemoveNodeAction(BaseModel):
+    model_config = _cfg
+    kind: Literal["remove_node"] = "remove_node"
+    node_id: str
+
+
+class SubmitAction(BaseModel):
+    model_config = _cfg
+    kind: Literal["submit"] = "submit"
+
+
+RepoEditAction = (
+    QueryAction
+    | InspectAction
+    | AddNodeAction
+    | UpdateNodeAction
+    | RemoveNodeAction
+    | SubmitAction
+)
+
+
+def parse_action(raw: dict) -> RepoEditAction:
+    """Validate raw with the action model selected by its "kind" entry (ValueError if unknown)."""
+    # Keyword order matters: it is echoed in the error message below.
+    mapping = dict(
+        query=QueryAction,
+        inspect=InspectAction,
+        add_node=AddNodeAction,
+        update_node=UpdateNodeAction,
+        remove_node=RemoveNodeAction,
+        submit=SubmitAction,
+    )
+    kind = raw.get("kind", "")
+    if kind not in mapping:
+        raise ValueError(f"Unknown action kind: {kind!r}. Valid: {list(mapping)}")
+    return mapping[kind].model_validate(raw)
diff --git a/env/ast_parser.py b/env/ast_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae8a7e51d8f06020d084fa0558bdefd9c05aee3f
--- /dev/null
+++ b/env/ast_parser.py
@@ -0,0 +1,249 @@
+"""AST-based DAG parser and code injection utilities.
+
+parse_source(source, module_name) -> CodeDAG
+    Parses a Python source string and returns a structured DAG with nodes
+    (module, function, imported_module) and typed edges (contains, calls, imports).
+
+inject_function_body(source, func_name, new_body) -> str
+    Replaces the body of func_name in source with new_body, preserving the
+    def line and any docstring. Used by the environment's step() method.
+"""
+
+from __future__ import annotations
+
+import ast
+from dataclasses import dataclass, field
+
+
+# ── DAG data model ────────────────────────────────────────────────────────────
+
+@dataclass
+class DAGNode:
+    name: str
+    node_type: str    # "module" | "function" | "class" | "imported_module"
+    signature: str = ""
+    is_stub: bool = False
+    body_summary: str = ""
+
+
+@dataclass
+class DAGEdge:
+    edge_type: str    # "contains" | "calls" | "imports"
+    source: str
+    target: str
+
+
+@dataclass
+class FunctionInfo:
+    name: str
+    signature: str
+    is_stub: bool
+    start_line: int   # 1-indexed
+    end_line: int     # 1-indexed, inclusive
+    has_docstring: bool
+    docstring_end_line: int   # 1-indexed; == start_line when no docstring
+
+
+@dataclass
+class CodeDAG:
+    module_name: str
+    nodes: list[DAGNode] = field(default_factory=list)
+    edges: list[DAGEdge] = field(default_factory=list)
+    function_infos: dict[str, FunctionInfo] = field(default_factory=dict)
+
+    def callers_of(self, func_name: str) -> list[str]:
+        return [e.source for e in self.edges if e.edge_type == "calls" and e.target == func_name]
+
+    def callees_of(self, func_name: str) -> list[str]:
+        return [e.target for e in self.edges if e.edge_type == "calls" and e.source == func_name]
+
+    def stub_functions(self) -> list[str]:
+        return [n.name for n in self.nodes if n.node_type == "function" and n.is_stub]
+
+
+# ── helpers ───────────────────────────────────────────────────────────────────
+
+def _signature(node: ast.FunctionDef | ast.AsyncFunctionDef) -> str:
+    """Render node's parameter list and return annotation, e.g. "(a: int, *rest) -> str".
+
+    ast.unparse(node.args) covers positional-only, *args, keyword-only and **kwargs
+    parameters as well as defaults, not just plain positional arguments.
+    """
+    ret = f" -> {ast.unparse(node.returns)}" if node.returns else ""
+    return f"({ast.unparse(node.args)}){ret}"
+
+
+def _is_stub(node: ast.FunctionDef | ast.AsyncFunctionDef, source: str) -> bool:
+    func_src = "\n".join(source.splitlines()[node.lineno - 1:node.end_lineno])
+    if "# STUB" in func_src:
+        return True
+    # body that is just "raise NotImplementedError"
+    stmts = [s for s in node.body
+              if not (isinstance(s, ast.Expr) and isinstance(s.value, ast.Constant))]
+    if len(stmts) == 1 and isinstance(stmts[0], ast.Raise):
+        exc = stmts[0].exc
+        if isinstance(exc, ast.Name) and exc.id == "NotImplementedError":
+            return True
+        if isinstance(exc, ast.Call) and isinstance(exc.func, ast.Name) and exc.func.id == "NotImplementedError":
+            return True
+    return False
+
+
+def _extract_calls(node: ast.FunctionDef | ast.AsyncFunctionDef) -> set[str]:
+    """Return the names invoked as plain ``name(...)`` calls anywhere under node."""
+    return {
+        sub.func.id
+        for sub in ast.walk(node)
+        if isinstance(sub, ast.Call) and isinstance(sub.func, ast.Name)
+    }
+
+
+# ── main parser ───────────────────────────────────────────────────────────────
+
+def parse_source(source: str, module_name: str = "module") -> CodeDAG:
+    """Parse Python source into a CodeDAG."""
+    tree = ast.parse(source)
+    dag = CodeDAG(module_name=module_name)
+    dag.nodes.append(DAGNode(name=module_name, node_type="module"))
+
+    func_names: set[str] = set()
+
+    # imports
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Import):
+            for alias in node.names:
+                imp = alias.asname or alias.name
+                dag.nodes.append(DAGNode(name=imp, node_type="imported_module"))
+                dag.edges.append(DAGEdge("imports", module_name, imp))
+        elif isinstance(node, ast.ImportFrom) and node.module:
+            dag.nodes.append(DAGNode(name=node.module, node_type="imported_module"))
+            dag.edges.append(DAGEdge("imports", module_name, node.module))
+
+    # top-level functions and classes
+    for node in tree.body:
+        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+            sig = _signature(node)
+            stub = _is_stub(node, source)
+            has_doc = (
+                bool(node.body)
+                and isinstance(node.body[0], ast.Expr)
+                and isinstance(node.body[0].value, ast.Constant)
+            )
+            doc_end = node.body[0].end_lineno if has_doc else node.lineno
+
+            dag.nodes.append(DAGNode(
+                name=node.name,
+                node_type="function",
+                signature=sig,
+                is_stub=stub,
+                body_summary="STUB — needs implementation" if stub else "(implemented)",
+            ))
+            dag.edges.append(DAGEdge("contains", module_name, node.name))
+            dag.function_infos[node.name] = FunctionInfo(
+                name=node.name,
+                signature=sig,
+                is_stub=stub,
+                start_line=node.lineno,
+                end_line=node.end_lineno,
+                has_docstring=has_doc,
+                docstring_end_line=doc_end,
+            )
+            func_names.add(node.name)
+
+        elif isinstance(node, ast.ClassDef):
+            dag.nodes.append(DAGNode(name=node.name, node_type="class"))
+            dag.edges.append(DAGEdge("contains", module_name, node.name))
+            for item in node.body:
+                if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                    qname = f"{node.name}.{item.name}"
+                    dag.nodes.append(DAGNode(
+                        name=qname,
+                        node_type="function",
+                        signature=_signature(item),
+                        is_stub=_is_stub(item, source),
+                    ))
+                    dag.edges.append(DAGEdge("contains", node.name, qname))
+                    func_names.add(qname)
+
+    # call edges (same-module only)
+    for node in tree.body:
+        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+            for callee in _extract_calls(node):
+                if callee in func_names and callee != node.name:
+                    dag.edges.append(DAGEdge("calls", node.name, callee))
+
+    return dag
+
+
+# ── code injection ────────────────────────────────────────────────────────────
+
+def inject_function_body(source: str, func_name: str, new_body: str) -> str:
+    """Replace the body of top-level function func_name in source with new_body.
+
+    Keeps the full (possibly multi-line) def header and any docstring. new_body
+    is raw body text; its common indent is stripped and 4 spaces re-added.
+    Raises ValueError when func_name is not a top-level function in source.
+    """
+    tree = ast.parse(source)
+    lines = source.splitlines(keepends=True)
+
+    for node in tree.body:
+        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) or node.name != func_name:
+            continue
+        first = node.body[0]
+        has_doc = (
+            isinstance(first, ast.Expr)
+            and isinstance(first.value, ast.Constant)
+            and isinstance(first.value.value, str)   # only str constants are docstrings
+        )
+        # Header text sharing a line with the first statement (col_offset counts UTF-8 bytes).
+        head = lines[first.lineno - 1].encode("utf-8")[:first.col_offset].decode("utf-8")
+        if head.strip():
+            # One-liner such as `def f(): pass`: cut the line right after the colon.
+            before = lines[:first.lineno - 1] + [head.rstrip() + "\n"]
+            if has_doc:
+                before.append("    " + ast.get_source_segment(source, first) + "\n")
+        elif has_doc:
+            before = lines[:first.end_lineno]
+        else:
+            # Header may span lines: walk back from the first statement over blank/comment lines.
+            keep = min([first.lineno] + [d.lineno for d in getattr(first, "decorator_list", [])]) - 1
+            while keep > node.lineno and lines[keep - 1].lstrip()[:1] in ("", "#"):
+                keep -= 1
+            before = lines[:keep]
+        if before and not before[-1].endswith("\n"):
+            before[-1] += "\n"   # docstring was the last line of a file without newline
+        after = lines[node.end_lineno:]   # everything after the function
+
+        # Normalise body indent: strip common leading whitespace, then re-add 4 spaces.
+        raw_lines = new_body.splitlines()
+        min_indent = min((len(l) - len(l.lstrip()) for l in raw_lines if l.strip()), default=0)
+        body_lines = [("    " + l[min_indent:] + "\n") if l.strip() else "\n" for l in raw_lines]
+        if not any(l.strip() for l in body_lines):
+            body_lines = ["    pass\n"]   # an empty or blank body would be a syntax error
+        return "".join(before + body_lines + after)
+
+    raise ValueError(f"Function {func_name!r} not found in source")
+
+
+# ── DAG → text description (for prompts) ─────────────────────────────────────
+
+def dag_to_text(dag: CodeDAG) -> str:
+    """Render the DAG as a concise human-readable block for the agent prompt."""
+    lines: list[str] = [f"## Module: {dag.module_name}", "", "### Nodes"]
+
+    for n in dag.nodes:
+        if n.node_type == "module":
+            lines.append(f"- [MODULE]   {n.name}")
+        elif n.node_type == "function":
+            status = "[ STUB ]" if n.is_stub else "[ready ]"
+            lines.append(f"- [FUNC]  {status}  {n.name}{n.signature}")
+        elif n.node_type == "class":
+            lines.append(f"- [CLASS]   {n.name}")
+        elif n.node_type == "imported_module":
+            lines.append(f"- [IMPORT]  {n.name}")
+
+    lines += ["", "### Edges"]
+    for e in dag.edges:
+        lines.append(f"- {e.source}  --{e.edge_type}-->  {e.target}")
+
+    return "\n".join(lines)
diff --git a/env/client.py b/env/client.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d69d4c658cc5af002b73492e04ccacfdcbdbf37
--- /dev/null
+++ b/env/client.py
@@ -0,0 +1,36 @@
+"""HTTP client for the repo-editing environment."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import httpx
+
+from env.models import RepoEditObservation, RepoEditState
+
+
+class RepoEditEnv:
+    def __init__(self, base_url: str = "http://localhost:8000", timeout: float = 60.0) -> None:
+        self._client = httpx.Client(base_url=base_url.rstrip("/"), timeout=timeout)
+
+    def reset(self, task_id: str | None = None) -> RepoEditObservation:
+        params = {"task_id": task_id} if task_id else {}
+        r = self._client.post("/reset", params=params)
+        r.raise_for_status()
+        return RepoEditObservation.model_validate(r.json())
+
+    def step(self, action_dict: dict[str, Any]) -> dict[str, Any]:
+        r = self._client.post("/step", json=action_dict)
+        r.raise_for_status()
+        return r.json()
+
+    def state(self) -> RepoEditState:
+        r = self._client.get("/state")
+        r.raise_for_status()
+        return RepoEditState.model_validate(r.json())
+
+    def __enter__(self) -> "RepoEditEnv":
+        return self
+
+    def __exit__(self, *_: object) -> None:
+        self._client.close()
diff --git a/env/environment.py b/env/environment.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f389ec7109b6497bb93792b14b0ee78781c1742
--- /dev/null
+++ b/env/environment.py
@@ -0,0 +1,467 @@
+"""Multi-turn repo-editing OpenEnv environment.
+
+Episode flow
+------------
+reset()   Parse the target repo into a KnowledgeGraph. Return an observation
+          containing the full graph overview and the task description.
+
+step()    The agent emits one RepoEditAction per turn:
+          - query       → search results (information, no graph mutation)
+          - inspect     → full node source (information)
+          - add_node    → insert new function/class into the live graph
+          - update_node → replace a node's source in the live graph
+          - remove_node → delete a node
+          - submit      → materialise all changes back to disk (temp), run tests,
+                          compute reward, end episode
+
+Reward structure (sparse — designed for long-horizon RL)
+---------------------------------------------------------
+  Per-turn cost    : -0.05   (forces efficiency)
+  Malformed action : -0.2
+  On submit
+    all tests pass : +1.0
+    any test fails : 0.0   (no partial credit: the test code runs as one unit)
+    compile error  : 0.0
+  Episode cap hit  : 0.0
+
+This sparse reward deliberately requires the agent to plan, navigate, and
+execute across many turns — it cannot succeed by guessing on the first turn.
+"""
+
+from __future__ import annotations
+
+import ast
+import json
+import os
+import re
+import sys
+import tempfile
+import textwrap
+import traceback
+import uuid
+from pathlib import Path
+from typing import Any
+
+from env.actions import (
+    AddNodeAction,
+    InspectAction,
+    QueryAction,
+    RemoveNodeAction,
+    RepoEditAction,
+    SubmitAction,
+    UpdateNodeAction,
+    parse_action,
+)
+from env.models import RepoEditObservation, RepoEditState
+from env.tasks import SAMPLE_REPOS_DIR, TASK_BANK, RepoTask, all_task_ids, get_task
+from graphforge.knowledge_graph import KGEdge, KGNode, KnowledgeGraph
+from graphforge.repo_parser import parse_repo, _node_id
+
+try:
+    from openenv.core import Environment  # type: ignore
+    _HAS_OPENENV = True
+except Exception:
+    _HAS_OPENENV = False
+    from typing import Generic, TypeVar
+    A = TypeVar("A")
+    O = TypeVar("O")
+    S = TypeVar("S")
+
+    class Environment(Generic[A, O, S]):  # type: ignore[no-redef]
+        def reset(self) -> O: ...
+        def step(self, action: A) -> tuple[O, float, bool]: ...
+        def get_state(self) -> S: ...
+
+
+# ── constants ─────────────────────────────────────────────────────────────────
+
+PER_TURN_COST = -0.05
+MALFORMED_PENALTY = -0.2
+
+
+# ── materialiser (graph → disk) ───────────────────────────────────────────────
+
+def _materialise_changes(
+    kg: KnowledgeGraph,
+    repo_src_path: Path,
+    tmp_dir: str,
+) -> dict[str, str]:
+    """Write module-node sources and the repo's non-.py files to tmp_dir. Returns {rel_path: source}."""
+    files: dict[str, str] = {}
+    for node in kg.all_nodes("module"):
+        if not node.file_path:
+            continue
+        # The module node's own source field is written as-is (the edit actions
+        # update it on every change); nothing is re-assembled from child nodes here.
+        files[node.file_path] = node.source
+        dest = Path(tmp_dir) / node.file_path
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        dest.write_text(node.source, encoding="utf-8")
+    # Copy every non-.py file from the original repo unchanged; .py files are skipped here.
+    for root, _, fnames in os.walk(str(repo_src_path)):
+        for fname in fnames:
+            if fname.endswith(".py"):
+                continue
+            src = Path(root) / fname
+            rel = src.relative_to(repo_src_path)
+            dst = Path(tmp_dir) / rel
+            dst.parent.mkdir(parents=True, exist_ok=True)
+            dst.write_bytes(src.read_bytes())
+    return files
+
+
+# ── code injection into module source ─────────────────────────────────────────
+
+def _apply_add_node(
+    module_source: str,
+    code: str,
+    class_name: str | None = None,
+) -> str:
+    """Insert code into module_source.
+
+    If class_name names a top-level class, the code is indented and appended
+    inside that class body. Otherwise it is appended, unindented, at module level.
+    """
+    new_code = textwrap.dedent(code).strip()
+    if class_name is None:
+        return module_source.rstrip() + "\n\n\n" + new_code + "\n"
+
+    # Insert indented method just before the end of the class block
+    indented = "\n".join("    " + line for line in new_code.splitlines())
+    # Find the class definition via AST and splice
+    try:
+        tree = ast.parse(module_source)
+    except (SyntaxError, ValueError, RecursionError):
+        tree = ast.Module(body=[], type_ignores=[])   # unparsable module: use the fallback
+    lines = module_source.splitlines(keepends=True)
+    for node in tree.body:
+        if isinstance(node, ast.ClassDef) and node.name == class_name:
+            # end_lineno is the 1-indexed last line of the class, so this splits right after it
+            before = "".join(lines[:node.end_lineno])
+            after = "".join(lines[node.end_lineno:])
+            return before.rstrip() + "\n\n" + indented + "\n" + after
+    # Fallback (class not found): module level, so the code must NOT carry the class indent
+    return module_source.rstrip() + "\n\n\n" + new_code + "\n"
+
+
+def _apply_update_node(module_source: str, old_source: str, new_code: str) -> str:
+    """Replace old_source in module_source with new_code, else append it under "# PATCHED".
+
+    old_source is matched verbatim, then indented by 4 spaces, because add_node keeps
+    dedented node source while splicing it indented into a class; new_code follows suit."""
+    new_code_clean = textwrap.dedent(new_code).strip()
+    for pad in ("", "    ") if old_source.strip() else ():   # "" would match at offset 0
+        old, new = (textwrap.indent(s, pad, lambda _: True) for s in (old_source, new_code_clean))
+        if old in module_source:
+            return module_source.replace(old, new, 1)
+    return module_source + "\n\n# PATCHED\n" + new_code_clean + "\n"
+
+
+def _apply_remove_node(module_source: str, old_source: str) -> str:
+    # Drop the first occurrence of old_source: verbatim, else indented by 4 spaces (how add_node splices into a class).
+    variants = (old_source, textwrap.indent(old_source, "    ", lambda _: True))
+    return module_source.replace(next((v for v in variants if v in module_source), ""), "", 1)
+
+
+def _validate_python(source: str) -> tuple[bool, str]:
+    try:  # result is (ok, error message); the message is "" when the code parses
+        ast.parse(textwrap.dedent(source))  # dedent first: the edit helpers splice dedented code, so indented input is fine
+    except SyntaxError as exc:
+        return False, str(exc)
+    return True, ""
+
+
+# ── environment ───────────────────────────────────────────────────────────────
+
+class RepoEditEnvironment(
+    Environment[RepoEditAction, RepoEditObservation, RepoEditState]
+):
+    """Multi-turn OpenEnv environment for repository-level code editing.
+
+    The agent receives a Knowledge Graph of a real Python repo and must
+    navigate it to find the right location, then apply the correct edit.
+    Reward is sparse: only granted on a passing submit().
+    """
+
+    def __init__(self, task_id: str | None = None) -> None:
+        self._configured_task_id = task_id
+        self._task: RepoTask | None = None
+        self._kg: KnowledgeGraph | None = None
+        self._episode_id: str | None = None
+        self._turn: int = 0
+        self._done: bool = False
+        self._total_reward: float = 0.0
+        self._history: list[dict[str, Any]] = []
+
+    # ----- OpenEnv contract ---------------------------------------------------
+
+    def reset(self, task_id: str | None = None, task: Any = None) -> RepoEditObservation:
+        """Reset the environment.
+
+        Pass either task_id (looks up TASK_BANK) or a task object directly
+        (supports AutoTask from graphforge.task_generator).
+        """
+        if task is not None:
+            tid = task.task_id
+        else:
+            tid = task_id or self._configured_task_id or _pick_random_task()
+            task = TASK_BANK.get(tid)
+            if task is None:
+                raise ValueError(f"Unknown task_id: {tid!r}. Available: {all_task_ids()}")
+
+        # Resolve the repo path: use task.repo_path if set, else fall back to sample_repos/
+        if getattr(task, "repo_path", None):
+            repo_path = task.repo_path
+        else:
+            repo_path = str(SAMPLE_REPOS_DIR / task.repo_name)
+
+        self._task = task
+        self._kg = parse_repo(repo_path)
+        self._episode_id = str(uuid.uuid4())[:8]
+        self._turn = 0
+        self._done = False
+        self._total_reward = 0.0
+        self._history = []
+
+        return RepoEditObservation(
+            episode_id=self._episode_id,
+            task_id=tid,
+            turn=0,
+            max_turns=task.max_turns,
+            graph_overview=self._kg.overview(),
+            task_description=task.description,
+            action_result="Episode started. Use query/inspect to navigate, then add_node/update_node to edit, then submit.",
+            done=False,
+        )
+
+    def step(self, action: RepoEditAction) -> tuple[RepoEditObservation, float, bool]:
+        if self._task is None or self._kg is None:
+            raise RuntimeError("step() called before reset()")
+        if self._done:
+            return self._terminal_obs("Episode already done."), 0.0, True
+
+        self._turn += 1
+        turn_reward = PER_TURN_COST
+
+        # Dispatch
+        try:
+            result_text, extra_reward, done = self._dispatch(action)
+            turn_reward += extra_reward
+        except Exception as exc:
+            result_text = f"[ERROR] {exc}"
+            turn_reward += MALFORMED_PENALTY
+            done = False
+
+        self._total_reward += turn_reward
+
+        # Episode cap
+        if not done and self._turn >= self._task.max_turns:
+            done = True
+            result_text += f"\n[Episode cap reached: {self._task.max_turns} turns]"
+
+        self._done = done
+        self._history.append({
+            "turn": self._turn,
+            "action_kind": getattr(action, "kind", "unknown"),
+            "reward": turn_reward,
+        })
+
+        obs = RepoEditObservation(
+            episode_id=self._episode_id,
+            task_id=self._task.task_id,
+            turn=self._turn,
+            max_turns=self._task.max_turns,
+            graph_overview=self._kg.overview(),
+            task_description=self._task.description,
+            action_result=result_text,
+            turn_reward=turn_reward,
+            total_reward=self._total_reward,
+            done=done,
+        )
+        return obs, turn_reward, done
+
+    def get_state(self) -> RepoEditState:
+        return RepoEditState(
+            episode_id=self._episode_id,
+            task_id=self._task.task_id if self._task else None,
+            turn=self._turn,
+            done=self._done,
+            total_reward=self._total_reward,
+        )
+
+    @property
+    def state(self) -> RepoEditState:
+        return self.get_state()
+
+    # ----- action dispatch ----------------------------------------------------
+
+    def _dispatch(
+        self, action: RepoEditAction
+    ) -> tuple[str, float, bool]:
+        """Returns (result_text, extra_reward, done)."""
+        kg = self._kg
+        assert kg is not None
+
+        if isinstance(action, QueryAction):
+            nt = None if action.node_type == "all" else action.node_type
+            results = kg.search(action.keywords, node_type=nt)
+            if not results:
+                return f"No nodes found for query: {action.keywords!r}", 0.0, False
+            lines = [f"Found {len(results)} node(s) matching {action.keywords!r}:"]
+            for n in results[:10]:
+                lines.append(f"  {n.node_id}  ({n.file_path}:{n.line_start})")
+            return "\n".join(lines), 0.0, False
+
+        if isinstance(action, InspectAction):
+            detail = kg.node_detail(action.node_id)
+            return detail, 0.0, False
+
+        if isinstance(action, AddNodeAction):
+            parent = kg.get_node(action.parent_id)
+            if parent is None:
+                return f"[ERROR] parent_id {action.parent_id!r} not found.", MALFORMED_PENALTY, False
+            ok, err = _validate_python(action.code)
+            if not ok:
+                return f"[SYNTAX ERROR in your code] {err}", MALFORMED_PENALTY, False
+
+            # Append to parent module's source
+            module_node = _find_module_for(kg, action.parent_id)
+            if module_node is None:
+                return f"[ERROR] could not find module for parent {action.parent_id!r}", MALFORMED_PENALTY, False
+
+            parent_node = kg.get_node(action.parent_id)
+            class_name = parent_node.name if parent_node and parent_node.node_type == "class" else None
+            module_node.source = _apply_add_node(module_node.source, action.code, class_name=class_name)
+
+            # Register the new node in the KG
+            ntype = action.node_type if action.node_type in ("function", "class", "method") else "function"
+            new_id = _node_id(ntype, module_node.file_path, action.name)
+            new_node = KGNode(
+                node_id=new_id,
+                node_type=ntype,
+                name=action.name,
+                file_path=module_node.file_path,
+                line_start=module_node.line_end,
+                line_end=module_node.line_end + action.code.count("\n") + 1,
+                source=textwrap.dedent(action.code).strip(),
+            )
+            kg.insert_node(action.parent_id, new_node)
+            return f"Added {ntype} `{action.name}` to `{module_node.file_path}`.\nNew node_id: {new_id}", 0.0, False
+
+        if isinstance(action, UpdateNodeAction):
+            target = kg.get_node(action.node_id)
+            if target is None:
+                return f"[ERROR] node_id {action.node_id!r} not found.", MALFORMED_PENALTY, False
+            ok, err = _validate_python(action.new_code)
+            if not ok:
+                return f"[SYNTAX ERROR in your code] {err}", MALFORMED_PENALTY, False
+
+            module_node = _find_module_for(kg, action.node_id)
+            if module_node is None:
+                return f"[ERROR] could not find module for {action.node_id!r}", MALFORMED_PENALTY, False
+
+            old_source = target.source
+            module_node.source = _apply_update_node(module_node.source, old_source, action.new_code)
+            target.source = textwrap.dedent(action.new_code).strip()
+            return f"Updated `{action.node_id}`.", 0.0, False
+
+        if isinstance(action, RemoveNodeAction):
+            target = kg.get_node(action.node_id)
+            if target is None:
+                return f"[ERROR] node_id {action.node_id!r} not found.", MALFORMED_PENALTY, False
+            module_node = _find_module_for(kg, action.node_id)
+            if module_node:
+                module_node.source = _apply_remove_node(module_node.source, target.source)
+            kg.remove_node(action.node_id)
+            return f"Removed `{action.node_id}`.", 0.0, False
+
+        if isinstance(action, SubmitAction):
+            return self._run_submit()
+
+        return f"[ERROR] unrecognised action type: {type(action)}", MALFORMED_PENALTY, False
+
+    def _run_submit(self) -> tuple[str, float, bool]:
+        """Write modified sources to a temp dir, run tests there, clean up."""
+        kg = self._kg
+        task = self._task
+        assert kg is not None and task is not None
+
+        reward, msg = _run_tests_in_tempdir(kg, task.test_code, task.repo_name)
+        return f"[SUBMIT RESULT]\n{msg}", reward, True
+
+    def _terminal_obs(self, msg: str) -> RepoEditObservation:
+        return RepoEditObservation(
+            episode_id=self._episode_id,
+            task_id=self._task.task_id if self._task else None,
+            turn=self._turn,
+            max_turns=self._task.max_turns if self._task else 0,
+            graph_overview="",
+            task_description="",
+            action_result=msg,
+            done=True,
+            total_reward=self._total_reward,
+        )
+
+
+# ── helpers ───────────────────────────────────────────────────────────────────
+
+def _find_module_for(kg: KnowledgeGraph, node_id: str) -> KGNode | None:
+    """Walk up the parent chain until we hit a module node."""
+    current_id = node_id
+    seen: set[str] = set()
+    while current_id and current_id not in seen:
+        seen.add(current_id)
+        node = kg.get_node(current_id)
+        if node and node.node_type == "module":
+            return node
+        parent = kg.parent_of(current_id)
+        if parent is None:
+            break
+        current_id = parent.node_id
+    return None
+
+
+def _run_tests_in_tempdir(
+    kg: KnowledgeGraph, test_code: str, pkg_name: str
+) -> tuple[float, str]:
+    """Write mutated module sources to a temp dir, import from there, run tests.
+
+    This works for ANY Python repo — no hardcoded package paths needed.
+    The test_code must use short imports: `from <pkg_name>.<module> import ...`
+    """
+    with tempfile.TemporaryDirectory() as tmpdir:
+        pkg_dir = Path(tmpdir) / pkg_name
+        pkg_dir.mkdir(parents=True)
+        (pkg_dir / "__init__.py").write_text("")
+
+        # Write each module's current (potentially mutated) source
+        for node in kg.all_nodes("module"):
+            if not node.file_path or node.file_path == "__init__.py":
+                continue
+            dest = pkg_dir / node.file_path
+            dest.parent.mkdir(parents=True, exist_ok=True)
+            dest.write_text(node.source, encoding="utf-8")
+
+        # Remove any stale cached copies of this package
+        stale = [k for k in sys.modules if k == pkg_name or k.startswith(pkg_name + ".")]
+        for k in stale:
+            del sys.modules[k]
+
+        sys.path.insert(0, tmpdir)
+        try:
+            exec(compile(test_code, "<tests>", "exec"), {})  # noqa: S102
+            return 1.0, "✓ All tests passed!"
+        except AssertionError as exc:
+            return 0.0, f"✗ Test failed: {exc}"
+        except Exception:
+            return 0.0, f"✗ Exception during tests:\n{traceback.format_exc(limit=5)}"
+        finally:
+            sys.path.remove(tmpdir)
+            stale = [k for k in sys.modules if k == pkg_name or k.startswith(pkg_name + ".")]
+            for k in stale:
+                del sys.modules[k]
+
+
+def _pick_random_task() -> str:
+    import random
+    return random.choice(all_task_ids())
diff --git a/env/models.py b/env/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d5fd9fe80a50acae0ce5d41c5301ae5c7a0f69f
--- /dev/null
+++ b/env/models.py
@@ -0,0 +1,46 @@
+"""Pydantic wire models for the multi-turn repo-editing environment."""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from pydantic import BaseModel, ConfigDict, Field
+
+_cfg = ConfigDict(extra="ignore")
+
+
+class RepoEditObservation(BaseModel):
+    """Observation the env returns after reset() or step().
+
+    Holds the current graph overview plus the result of the last action; the agent
+    should read ``action_result`` carefully before deciding the next step.
+    """
+
+    model_config = _cfg  # shared module-level config: extra="ignore"
+
+    episode_id: Optional[str] = None
+    task_id: Optional[str] = None
+    turn: int = 0
+    max_turns: int = 15
+
+    graph_overview: str = ""       # compact text view of the entire repo KG
+    task_description: str = ""     # what the agent needs to accomplish
+    action_result: str = ""        # feedback from the last action
+
+    turn_reward: float = 0.0       # presumably the reward of the latest step — confirm in the env
+    total_reward: float = 0.0      # presumably the running sum over the episode — confirm in the env
+    done: bool = False
+
+    info: dict[str, Any] = Field(default_factory=dict)  # free-form extras; fresh dict per instance
+
+
+class RepoEditState(BaseModel):
+    """Episode-level state snapshot: ids, turn counter, done flag and total reward."""
+
+    model_config = _cfg  # shared module-level config: extra="ignore"
+
+    episode_id: Optional[str] = None
+    task_id: Optional[str] = None
+    turn: int = 0
+    done: bool = False
+    total_reward: float = 0.0
diff --git a/env/server.py b/env/server.py
new file mode 100644
index 0000000000000000000000000000000000000000..60361013b7257977ef44d6de2e3471758fb59af1
--- /dev/null
+++ b/env/server.py
@@ -0,0 +1,44 @@
+"""FastAPI server for the multi-turn repo-editing environment."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from fastapi import FastAPI, HTTPException
+
+from env.actions import RepoEditAction, parse_action
+from env.environment import RepoEditEnvironment
+from env.models import RepoEditObservation, RepoEditState
+
+_env = RepoEditEnvironment()
+
+
+def _make_app() -> FastAPI:  # /reset, /step and /state all delegate to the module-level _env
+    app = FastAPI(title="Repo-Edit OpenEnv", version="0.3.0")
+
+    @app.post("/reset", response_model=RepoEditObservation)
+    def reset(task_id: str | None = None) -> RepoEditObservation:
+        return _env.reset(task_id=task_id)
+
+    @app.post("/step")
+    def step(action_dict: dict[str, Any]) -> dict[str, Any]:
+        try:
+            action = parse_action(action_dict)
+            obs, reward, done = _env.step(action)
+        except (ValueError, RuntimeError) as exc:  # reported to the client as HTTP 400
+            raise HTTPException(status_code=400, detail=str(exc)) from exc
+        return {"observation": obs.model_dump(), "reward": reward, "done": done}
+
+    @app.get("/state", response_model=RepoEditState)
+    def state() -> RepoEditState:
+        return _env.get_state()
+
+    @app.get("/healthz")
+    def healthz() -> dict[str, Any]:
+        return {"status": "ok"}  # static payload; does not touch _env
+
+    return app
+
+
+app = _make_app()
+__all__ = ["app"]
diff --git a/env/tasks.py b/env/tasks.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3ef484d5d0ae069d487081c7bd0c0bc73c8eb48
--- /dev/null
+++ b/env/tasks.py
@@ -0,0 +1,363 @@
+"""Multi-turn repo-editing tasks.
+
+Each Task specifies:
+  - A target repo to work on (points to a sample_repos/ subdir)
+  - A natural-language description of the change to make
+  - A set of test functions (Python code strings) that verify the change
+  - The maximum number of turns allowed
+
+Training tasks are deliberately structured to require multi-step navigation:
+  1. The agent must QUERY the graph to find relevant nodes
+  2. INSPECT nodes to understand the existing code
+  3. ADD or UPDATE nodes to implement the change
+  4. SUBMIT to trigger compilation + test execution
+
+This sparse reward structure forces the agent to develop structured planning
+and state tracking across long trajectories — the core theme of this project.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import sys
+import textwrap
+import traceback
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+SAMPLE_REPOS_DIR = Path(__file__).resolve().parent.parent / "graphforge" / "sample_repos"
+
+
+@dataclass
+class RepoTask:
+    task_id: str                      # key under which _reg() stores the task in TASK_BANK
+    repo_name: str                    # package name (used as tempdir subdir)
+    description: str                  # natural-language task for the agent
+    test_code: str                    # Python assertions using short imports
+    max_turns: int = 15               # maximum number of turns allowed
+    difficulty: int = 0               # 0=easy, 1=medium, 2=hard
+    hints: list[str] = field(default_factory=list)  # free-text hints (empty by default)
+    repo_path: str | None = None      # if set, full path to repo source dir
+
+
+TASK_BANK: dict[str, RepoTask] = {}
+
+
+def _reg(task: RepoTask) -> RepoTask:
+    TASK_BANK.update({task.task_id: task})  # stored under its id; an existing id is replaced
+    return task
+
+
+# ── Task 0: add validate_due_date ────────────────────────────────────────────
+
+_reg(RepoTask(
+    task_id="t0.validate_due_date",
+    repo_name="task_manager",  # package the test code imports from
+    description=textwrap.dedent("""\
+        Add a function `validate_due_date(due_date) -> bool` to `validators.py`.
+
+        The function should return True if:
+          - due_date is None (no deadline), OR
+          - due_date is a datetime.date instance
+
+        It should return False for any other type (strings, integers, etc.).
+    """).strip(),
+    test_code=textwrap.dedent("""\
+        from datetime import date
+        from task_manager.validators import validate_due_date
+        assert validate_due_date(None)            is True,  "None is valid (no deadline)"
+        assert validate_due_date(date(2025, 1, 1)) is True,  "date object is valid"
+        assert validate_due_date("2025-01-01")    is False, "string is not valid"
+        assert validate_due_date(20250101)        is False, "int is not valid"
+        assert validate_due_date([])              is False, "list is not valid"
+    """).strip(),
+    max_turns=12,  # tighter than the RepoTask default of 15; difficulty stays at the default 0 (easy)
+    hints=[
+        "Look in validators.py to see the style of existing validators.",
+        "The function signature should be: def validate_due_date(due_date) -> bool",
+        "Import datetime.date inside the function or at the top of validators.py.",
+    ],
+))
+
+# ── Task 1: add Task.is_overdue ───────────────────────────────────────────────
+
+_reg(RepoTask(
+    task_id="t1.is_overdue",
+    repo_name="task_manager",  # package the test code imports from
+    description=textwrap.dedent("""\
+        Add a method `is_overdue(self, today: date) -> bool` to the `Task`
+        class in `models.py`.
+
+        The method should return True if:
+          - the task has a due_date AND
+          - today is strictly after the due_date AND
+          - the task is not yet done
+
+        It should return False if there is no due_date, or if the task is done,
+        or if today <= due_date.
+    """).strip(),
+    test_code=textwrap.dedent("""\
+        from datetime import date
+        from task_manager.models import Task
+
+        t_past   = Task("x", "low", [], due_date=date(2020, 1, 1))
+        t_future = Task("y", "low", [], due_date=date(2099, 1, 1))
+        t_none   = Task("z", "low", [], due_date=None)
+        t_done   = Task("d", "low", [], due_date=date(2020, 1, 1))
+        t_done.complete()
+
+        today = date.today()
+        assert t_past.is_overdue(today)   is True,  "past due date → overdue"
+        assert t_future.is_overdue(today) is False, "future due date → not overdue"
+        assert t_none.is_overdue(today)   is False, "no due date → not overdue"
+        assert t_done.is_overdue(today)   is False, "done task → not overdue"
+    """).strip(),
+    max_turns=15,  # same as the RepoTask default
+    difficulty=1,  # medium
+    hints=[
+        "The Task class is in models.py.",
+        "The method should check self.due_date, today, and self.done.",
+    ],
+))
+
+# ── Task 2: add TaskStore.find_by_tag ─────────────────────────────────────────
+
+_reg(RepoTask(
+    task_id="t2.find_by_tag",
+    repo_name="task_manager",  # package the test code imports from
+    description=textwrap.dedent("""\
+        Add a method `find_by_tag(self, tag: str) -> list[Task]` to the
+        `TaskStore` class in `storage.py`.
+
+        The method should return a list of all tasks that have `tag` in
+        their `tags` list. Return an empty list if no tasks match.
+    """).strip(),
+    test_code=textwrap.dedent("""\
+        from task_manager.models import Task
+        from task_manager.storage import TaskStore
+
+        store = TaskStore()
+        store.add(Task("t1", "high",   ["python", "backend"], None))
+        store.add(Task("t2", "low",    ["frontend"],          None))
+        store.add(Task("t3", "medium", ["python"],            None))
+
+        result = store.find_by_tag("python")
+        assert len(result) == 2, f"Expected 2, got {len(result)}"
+        titles = {t.title for t in result}
+        assert titles == {"t1", "t3"}, f"Wrong titles: {titles}"
+
+        empty = store.find_by_tag("devops")
+        assert empty == [], f"Expected [], got {empty}"
+    """).strip(),
+    max_turns=15,  # same as the RepoTask default
+    difficulty=1,  # medium; no hints are passed, so ``hints`` stays the default empty list
+))
+
+# ── Task 3 (hard): enforce priority validation in api.create_task ─────────────
+
+_reg(RepoTask(
+    task_id="t3.enforce_priority",
+    repo_name="task_manager",  # package the test code imports from
+    description=textwrap.dedent("""\
+        Update the `create_task` function in `api.py` so that it validates
+        the `priority` argument using `validate_priority` from `validators.py`.
+
+        If the priority is invalid, raise `ValueError` with a clear message.
+        The existing validations for title and tags must still work.
+
+        Note: `validate_priority` already exists in validators.py.
+        You must import and call it inside `create_task`.
+    """).strip(),
+    test_code=textwrap.dedent("""\
+        from task_manager import api as _api
+        _api.reset_store()  # clean state between runs
+
+        # valid priority passes through
+        t = _api.create_task("Buy milk", priority="high")
+        assert t.priority == "high"
+
+        # invalid priority raises ValueError
+        raised = False
+        try:
+            _api.create_task("Bad task", priority="urgent")
+        except ValueError:
+            raised = True
+        assert raised, "create_task should raise ValueError for invalid priority"
+
+        # title validation still works
+        raised2 = False
+        try:
+            _api.create_task("", priority="low")
+        except ValueError:
+            raised2 = True
+        assert raised2, "create_task should still reject empty title"
+    """).strip(),
+    max_turns=18,  # largest turn budget among the tasks registered in this module
+    difficulty=2,  # hard
+    hints=[
+        "api.py already imports validate_title and validate_tags from validators.",
+        "You need to also import validate_priority and call it in create_task.",
+    ],
+))
+
+
+# ── Humanize tasks (real-world library) ──────────────────────────────────────
+
+_reg(RepoTask(
+    task_id="t4.intpercent",
+    repo_name="humanize",  # package the test code imports from
+    description=textwrap.dedent("""\
+        Add a function `intpercent(value: float, decimal_places: int = 1) -> str`
+        to `number.py`.
+
+        The function should convert a fraction to a percentage string:
+          0.0   → "0.0%"
+          0.5   → "50.0%"
+          0.753 → "75.3%"
+          1.0   → "100.0%"
+
+        Use `decimal_places` to control how many digits appear after the decimal.
+        If decimal_places=0, return an integer percentage with no decimal point.
+    """).strip(),
+    test_code=textwrap.dedent("""\
+        from humanize.number import intpercent
+        assert intpercent(0.0)   == "0.0%",   f"got {intpercent(0.0)!r}"
+        assert intpercent(0.5)   == "50.0%",  f"got {intpercent(0.5)!r}"
+        assert intpercent(0.753) == "75.3%",  f"got {intpercent(0.753)!r}"
+        assert intpercent(1.0)   == "100.0%", f"got {intpercent(1.0)!r}"
+        assert intpercent(0.5, decimal_places=0) == "50%", f"got {intpercent(0.5, decimal_places=0)!r}"
+    """).strip(),
+    max_turns=12,  # tighter than the RepoTask default of 15
+    difficulty=0,  # easy
+    hints=[
+        "Look at number.py — the existing functions show the style to follow.",
+        "Use f-string formatting: f'{value * 100:.{decimal_places}f}%'",
+    ],
+))
+
+_reg(RepoTask(
+    task_id="t5.naturalfilecount",
+    repo_name="humanize",  # package the test code imports from
+    description=textwrap.dedent("""\
+        Add a function `naturalfilecount(n: int) -> str` to `filesize.py`.
+
+        The function should return a human-readable file count:
+          0  → "no files"
+          1  → "1 file"
+          2  → "2 files"
+          99 → "99 files"
+    """).strip(),
+    test_code=textwrap.dedent("""\
+        from humanize.filesize import naturalfilecount
+        assert naturalfilecount(0)  == "no files", f"got {naturalfilecount(0)!r}"
+        assert naturalfilecount(1)  == "1 file",   f"got {naturalfilecount(1)!r}"
+        assert naturalfilecount(2)  == "2 files",  f"got {naturalfilecount(2)!r}"
+        assert naturalfilecount(99) == "99 files", f"got {naturalfilecount(99)!r}"
+    """).strip(),
+    max_turns=12,  # tighter than the RepoTask default of 15
+    difficulty=0,  # easy
+    hints=[
+        "Look at filesize.py — naturalsize is the only function there.",
+        "This is a short function: handle n==0, n==1, and n>1 as three cases.",
+    ],
+))
+
+_reg(RepoTask(
+    task_id="t6.metric",
+    repo_name="humanize",  # package the test code imports from
+    description=textwrap.dedent("""\
+        Add a function `metric(value: float, unit: str = "") -> str` to `number.py`.
+
+        The function should format a number using SI metric prefixes:
+          1_500_000 → "1.5 M"
+          2_000     → "2.0 k"
+          500       → "500"   (no prefix below 1000)
+
+        Supported prefixes (largest to smallest): T (10¹²), G (10⁹), M (10⁶), k (10³).
+        If a unit is provided, append it after the prefix: metric(1500, "Hz") → "1.5 kHz".
+        Always format the scaled number to 1 decimal place.
+    """).strip(),
+    test_code=textwrap.dedent("""\
+        from humanize.number import metric
+        assert metric(1_500_000) == "1.5 M",   f"got {metric(1_500_000)!r}"
+        assert metric(2_000)     == "2.0 k",   f"got {metric(2_000)!r}"
+        assert metric(500)       == "500",      f"got {metric(500)!r}"
+        assert metric(1_500, "Hz") == "1.5 kHz", f"got {metric(1_500, 'Hz')!r}"
+        assert metric(2e9, "W")    == "2.0 GW",  f"got {metric(2e9, 'W')!r}"
+    """).strip(),
+    max_turns=15,  # same as the RepoTask default
+    difficulty=1,  # medium
+    hints=[
+        "Loop through prefixes from largest to smallest: (1e12,'T'), (1e9,'G'), (1e6,'M'), (1e3,'k').",
+        "If abs(value) >= threshold, scale and format; otherwise return str(int(value)).",
+    ],
+))
+
+_reg(RepoTask(
+    task_id="t7.age",
+    repo_name="humanize",  # package the test code imports from
+    description=textwrap.dedent("""\
+        Add a function `age(birth_date) -> str` to `time.py`.
+
+        The function receives a `datetime.date` and returns a human-readable age:
+          - If the person is under 1 year old, return "X months old" (use 30-day months).
+          - If exactly 1 year, return "1 year old".
+          - Otherwise return "X years old".
+
+        Use `datetime.date.today()` as the reference point.
+        Assume birth_date is always a valid date in the past.
+    """).strip(),
+    test_code=textwrap.dedent("""\
+        import datetime as dt
+        from humanize.time import age
+
+        today = dt.date.today()
+        dob_25y  = today - dt.timedelta(days=25 * 365 + 7)  # offsets, not replace(year=...): that raises on Feb 29
+        dob_1y   = today - dt.timedelta(days=366)           # still 1 whole year by days // 365 or by calendar
+        dob_6m   = today - dt.timedelta(days=182)
+        dob_2m   = today - dt.timedelta(days=61)
+
+        assert age(dob_25y) == "25 years old", f"got {age(dob_25y)!r}"
+        assert age(dob_1y)  == "1 year old",   f"got {age(dob_1y)!r}"
+        assert age(dob_6m)  == "6 months old", f"got {age(dob_6m)!r}"
+        assert age(dob_2m)  == "2 months old", f"got {age(dob_2m)!r}"
+    """).strip(),
+    max_turns=15,  # same as the RepoTask default
+    difficulty=1,  # medium
+    hints=[
+        "import datetime as dt is already at the top of time.py.",
+        "days = (dt.date.today() - birth_date).days; years = days // 365; months = days // 30",
+    ],
+))
+
+
+# ── test runner ───────────────────────────────────────────────────────────────
+
+def run_tests(task: RepoTask) -> tuple[bool, str]:
+    """Run ``task.test_code`` in a fresh namespace; return ``(passed, message)``."""
+    _reload_task_manager()  # drop cached modules so source-level changes are picked up
+    try:
+        exec(compile(task.test_code, "<test>", "exec"), {})  # noqa: S102
+    except AssertionError as failure:
+        return False, f"AssertionError: {failure}"
+    except Exception:
+        return False, traceback.format_exc(limit=5)
+    else:
+        return True, "All assertions passed."
+
+
+def _reload_task_manager() -> None:
+    """Drop cached sample-repo modules from ``sys.modules`` so source edits take effect."""
+    short = {t.repo_name for t in TASK_BANK.values()}  # tests import ``<repo_name>.<module>``
+    prefix = "graphforge.sample_repos.task_manager"  # fully-qualified spelling, kept as before
+    for mod_name in [k for k in sys.modules if k.startswith(prefix) or k.split(".")[0] in short]:
+        del sys.modules[mod_name]
+
+
+def all_task_ids() -> list[str]:
+    return list(TASK_BANK)  # iterating the dict yields its keys, in registration order
+
+
+def get_task(task_id: str) -> RepoTask | None:
+    return TASK_BANK[task_id] if task_id in TASK_BANK else None  # None for an unknown id
diff --git a/graphforge/__init__.py b/graphforge/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..785c7afa9ff4f84120664d4afb9cd98f5e0b2d03
--- /dev/null
+++ b/graphforge/__init__.py
@@ -0,0 +1,24 @@
+"""GraphForge — graph-first code generation environment for long-horizon RL.
+
+The agent constructs Python programs by mutating a typed function-call graph;
+source files are a deterministic projection of the canonical graph.
+
+Top-level subsystems:
+  graph         canonical graph schema (Modules, Nodes, Edges)
+  actions       eleven-action surface, atomic dispatcher with rollback
+  types         signature parser + edge type-flow validator
+  templates     ~25-template body library, parameterized
+  materializer  graph -> Python source
+  parser        Python source -> graph (round-trip)
+  validator     parse / import / mypy --strict gate
+  behavioral    hypothesis-based property test runner
+  constraints   per-kind constraint checker dispatch
+  reward        reward engine (per-turn + terminal)
+  tasks         task bank + variant generator
+  server        FastAPI OpenEnv server
+  training      GRPO multi-turn rollout
+
+See README.md for design rationale and PROPOSAL.md for the full spec.
+"""
+
+__version__ = "0.0.1"
diff --git a/graphforge/actions/__init__.py b/graphforge/actions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e5b5bd770d975615e7daff3f36863d9ee89d2e9
--- /dev/null
+++ b/graphforge/actions/__init__.py
@@ -0,0 +1,15 @@
+"""Action surface for GraphForge.
+
+Public API:
+
+    from graphforge.actions import dispatch, ActionResult
+    from graphforge.actions.schema import Action, AddNode, ...
+    from graphforge.actions.errors import ActionError
+
+See PROPOSAL.md §4 for the full action vocabulary.
+"""
+
+from graphforge.actions.dispatcher import ActionResult, dispatch
+from graphforge.actions.errors import ActionError
+
+__all__ = ["ActionError", "ActionResult", "dispatch"]
diff --git a/graphforge/actions/dispatcher.py b/graphforge/actions/dispatcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fbcbeb03f2833fc93eb4d562ee20aacc1c60789
--- /dev/null
+++ b/graphforge/actions/dispatcher.py
@@ -0,0 +1,442 @@
+"""Atomic action dispatcher.
+
+Applies an :class:`Action` to a :class:`Graph`. Every mutation is atomic:
+the dispatcher snapshots the graph before the handler runs and restores it on
+any failure. Failures surface as :class:`ActionError` with a stable code, never
+as silent partial state.
+
+Information actions (query_*, materialize_*, run_*) are routed but their
+implementations live in their respective subsystems and are stubbed for now.
+``submit`` returns a sentinel so the episode runner can recognize termination.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+from graphforge.actions import errors as E
+from graphforge.actions.schema import (
+    Action,
+    AddEdge,
+    AddModule,
+    AddNode,
+    AttachBody,
+    MaterializeAndValidate,
+    QuerySpec,
+    QuerySubgraph,
+    QueryTypes,
+    RemoveEdge,
+    RemoveModule,
+    RemoveNode,
+    RunBehavioralTests,
+    SetNodeModule,
+    Submit,
+)
+from graphforge.actions.signature import parse_signature
+from graphforge.graph.schema import (
+    ArgMapping,
+    Edge,
+    Graph,
+    Module,
+    Node,
+)
+from graphforge.templates import get_template, validate_args
+
+
+# ---- result envelope -------------------------------------------------
+
+
+@dataclass
+class ActionResult:
+    """Outcome envelope handed back by :func:`dispatch`."""
+
+    ok: bool                 # set to False by ``failure``, True by the other constructors
+    payload: dict[str, Any]  # handler keyword data, or the error's ``to_dict()``
+    terminal: bool = False   # set to True by ``terminate`` only
+
+    @classmethod
+    def success(cls, **payload: Any) -> "ActionResult":
+        return cls(True, payload)
+
+    @classmethod
+    def failure(cls, err: E.ActionError) -> "ActionResult":
+        return cls(False, err.to_dict())
+
+    @classmethod
+    def terminate(cls, **payload: Any) -> "ActionResult":
+        return cls(True, payload, True)
+
+
+# ---- dispatcher ------------------------------------------------------
+
+
+def dispatch(graph: Graph, action: Action) -> ActionResult:
+    """Run ``action`` against ``graph`` in place, rolling back if the handler fails.
+
+    A snapshot is taken first; when the handler raises, the graph's collections are
+    restored from it and a failure result is returned instead of the exception.
+    """
+    checkpoint = graph.snapshot()
+    try:
+        return _route(graph, action)
+    except Exception as exc:
+        _restore(graph, checkpoint)
+        if isinstance(exc, E.ActionError):
+            return ActionResult.failure(exc)
+        # Anything else is an unexpected handler bug; report it under the
+        # schema-rejection code with the original message attached.
+        wrapped = E.ActionError(E.SCHEMA_REJECTION, f"unhandled: {exc}")  # pragma: no cover
+        return ActionResult.failure(wrapped)  # pragma: no cover
+
+
+def _restore(graph: Graph, snap: Graph) -> None:
+    """Point ``graph`` at the modules, nodes and edges held by ``snap``."""
+    for field_name in ("modules", "nodes", "edges"):
+        setattr(graph, field_name, getattr(snap, field_name))
+
+
+def _route(graph: Graph, action: Action) -> ActionResult:
+    """Hand ``action`` to the handler registered for its type and return the result.
+
+    Handler exceptions propagate unchanged to the caller.
+
+    Raises :class:`ActionError` with the schema-rejection code when no entry matches.
+    """
+    # (action type, handler) pairs, tested in order with isinstance; first match wins.
+    handlers = (
+        # Mutations
+        (AddModule, _h_add_module),
+        (RemoveModule, _h_remove_module),
+        (AddNode, _h_add_node),
+        (RemoveNode, _h_remove_node),
+        (SetNodeModule, _h_set_node_module),
+        (AttachBody, _h_attach_body),
+        (AddEdge, _h_add_edge),
+        (RemoveEdge, _h_remove_edge),
+        # Information (delegated; stubs for now)
+        (QuerySpec, _h_query_spec),
+        (QuerySubgraph, _h_query_subgraph),
+        (QueryTypes, _h_query_types),
+        (MaterializeAndValidate, _h_materialize),
+        (RunBehavioralTests, _h_run_tests),
+        (Submit, _h_submit),
+    )
+
+    for action_type, handler in handlers:
+        if isinstance(action, action_type):
+            return handler(graph, action)
+    # Nothing matched: the object is not one of the routed action classes.
+    raise E.ActionError(E.SCHEMA_REJECTION, f"unknown action: {type(action).__name__}")
+
+
+# ---- mutation handlers ----------------------------------------------
+
+
+def _h_add_module(graph: Graph, a: AddModule) -> ActionResult:
+    """Append a new module to the graph; its name must not already be taken."""
+    if graph.find_module(a.name) is not None:
+        raise E.ActionError(E.NAME_COLLISION, f"module {a.name!r} already exists", name=a.name)
+    new_module = Module(name=a.name, responsibility=a.responsibility)
+    graph.modules.append(new_module)
+    return ActionResult.success(added_module=a.name)
+
+
+def _h_remove_module(graph: Graph, a: RemoveModule) -> ActionResult:
+    """Remove an existing module, refusing while any node still belongs to it."""
+    if graph.find_module(a.name) is None:
+        raise E.ActionError(E.UNKNOWN_MODULE, f"module {a.name!r} does not exist", name=a.name)
+    node_count = sum(1 for node in graph.nodes if node.module == a.name)
+    if node_count:
+        raise E.ActionError(
+            E.MODULE_NOT_EMPTY, f"module {a.name!r} still contains nodes",
+            name=a.name,
+            node_count=node_count,
+        )
+    graph.modules = [kept for kept in graph.modules if kept.name != a.name]
+    return ActionResult.success(removed_module=a.name)
+
+
+def _h_add_node(graph: Graph, a: AddNode) -> ActionResult:
+    """Create a node in an existing module and give it the next free ``decl_order``."""
+    if graph.find_module(a.module) is None:
+        raise E.ActionError(E.UNKNOWN_MODULE, f"module {a.module!r} does not exist", name=a.module)
+    if graph.find_node(a.name, a.module) is not None:
+        raise E.ActionError(
+            E.NAME_COLLISION,
+            f"node {a.module}.{a.name} already exists",
+            name=a.name,
+            module=a.module,
+        )
+    # Surface signature parse — catches errors that the pydantic regex misses.
+    try:
+        parse_signature(a.signature)
+    except ValueError as parse_err:
+        raise E.ActionError(E.SCHEMA_REJECTION, str(parse_err), signature=a.signature) from parse_err
+    # One past the highest decl_order in the whole graph (0 for the first node).
+    next_order = max((existing.decl_order for existing in graph.nodes), default=-1) + 1
+    new_node = Node(
+        name=a.name,
+        module=a.module,
+        signature=a.signature,
+        purity=a.purity,
+        error_policy=a.error_policy,
+        decl_order=next_order,
+    )
+    graph.nodes.append(new_node)
+    return ActionResult.success(added_node=f"{a.module}.{a.name}", decl_order=next_order)
+
+
+def _h_remove_node(graph: Graph, a: RemoveNode) -> ActionResult:
+    """Delete a node, refusing while any edge still uses it as caller or callee."""
+    target = graph.find_node(a.name, a.module)
+    if target is None:
+        raise E.ActionError(
+            E.UNKNOWN_NODE, f"node {a.module}.{a.name} does not exist", name=a.name, module=a.module
+        )
+    qn = target.qualified_name
+    touching = [(edge.caller, edge.callee) for edge in graph.edges if qn in (edge.caller, edge.callee)]
+    if touching:
+        raise E.ActionError(
+            E.NODE_HAS_REFERENCES, f"node {qn} is referenced by {len(touching)} edge(s)",
+            name=a.name,
+            module=a.module,
+            referencing_edges=touching,
+        )
+    graph.nodes = [kept for kept in graph.nodes if (kept.name, kept.module) != (a.name, a.module)]
+    return ActionResult.success(removed_node=qn)
+
+
+def _h_set_node_module(graph: Graph, a: SetNodeModule) -> ActionResult:
+    """Move a node into another existing module and re-point edges at its new name.
+
+    The node and edges are edited in place before the cycle check runs, so a cycle
+    error relies on ``dispatch`` restoring the graph from its snapshot.
+    """
+    node = graph.find_node(a.name, a.current_module)
+    if node is None:
+        raise E.ActionError(
+            E.UNKNOWN_NODE,
+            f"node {a.current_module}.{a.name} does not exist",
+            name=a.name,
+            module=a.current_module,
+        )
+    if graph.find_module(a.new_module) is None:
+        raise E.ActionError(
+            E.UNKNOWN_MODULE, f"target module {a.new_module!r} does not exist", name=a.new_module
+        )
+    if graph.find_node(a.name, a.new_module) is not None:
+        raise E.ActionError(
+            E.NAME_COLLISION,
+            f"node named {a.name!r} already exists in {a.new_module!r}",
+            name=a.name,
+            module=a.new_module,
+        )
+    source_qn, target_qn = node.qualified_name, f"{a.new_module}.{a.name}"
+    node.module = a.new_module
+    # Rewrite edge endpoints that referred to the old qualified name.
+    for edge in graph.edges:
+        if edge.caller == source_qn:
+            edge.caller = target_qn
+        if edge.callee == source_qn:
+            edge.callee = target_qn
+    # Post-condition: rewriting must not have introduced an import cycle.
+    if graph.has_module_cycle():
+        raise E.ActionError(
+            E.WOULD_CREATE_CYCLE, f"moving {source_qn} -> {target_qn} would create an import cycle",
+            from_qn=source_qn,
+            to_qn=target_qn,
+        )
+    return ActionResult.success(moved_node={"from": source_qn, "to": target_qn})
+
+
+def _h_attach_body(graph: Graph, a: AttachBody) -> ActionResult:
+    """Record a body template and its args on a node after validating template, args and edges."""
+    node = graph.find_node(a.name, a.module)
+    if node is None:
+        raise E.ActionError(
+            E.UNKNOWN_NODE,
+            f"node {a.module}.{a.name} does not exist",
+            name=a.name,
+            module=a.module,
+        )
+    spec = get_template(a.template)
+    if spec is None:
+        raise E.ActionError(
+            E.UNKNOWN_TEMPLATE, f"unknown template {a.template!r}", template=a.template
+        )
+    problems = validate_args(a.template, a.args)
+    if problems:
+        raise E.ActionError(
+            E.TEMPLATE_ARGS_INVALID,
+            f"args invalid for template {a.template!r}: {'; '.join(problems)}",
+            template=a.template,
+            problems=problems,
+        )
+    fan_out, fan_in = graph.fan_out(node.qualified_name), graph.fan_in(node.qualified_name)
+    if not spec.edges_ok(fan_out, fan_in):
+        raise E.ActionError(
+            E.TEMPLATE_ARGS_INVALID,
+            f"template {a.template!r} requires different edge structure "
+            f"(out_d={fan_out}, in_d={fan_in})",
+            template=a.template,
+            out_degree=fan_out,
+            in_degree=fan_in,
+        )
+    node.body_template = a.template
+    node.body_template_args = dict(a.args)  # copy, so the node does not alias the action's dict
+    attached = {"node": node.qualified_name, "template": a.template}
+    # Report which node received which template.
+    return ActionResult.success(attached=attached)
+
+
+def _h_add_edge(graph: Graph, a: AddEdge) -> ActionResult:
+    """Add a caller -> callee edge once both nodes, the arg mapping and acyclicity check out."""
+    caller_node = graph.find_node_qualified(a.caller)
+    callee_node = graph.find_node_qualified(a.callee)
+    if caller_node is None:
+        raise E.ActionError(E.UNKNOWN_NODE, f"caller {a.caller!r} does not exist", node=a.caller)
+    if callee_node is None:
+        raise E.ActionError(E.UNKNOWN_NODE, f"callee {a.callee!r} does not exist", node=a.callee)
+    if graph.find_edge(a.caller, a.callee) is not None:
+        raise E.ActionError(
+            E.DUPLICATE_EDGE,
+            f"edge {a.caller} -> {a.callee} already exists",
+            caller=a.caller,
+            callee=a.callee,
+        )
+    # Validate arg_mapping covers all required parameters of callee.
+    callee_sig = parse_signature(callee_node.signature)
+    caller_sig = parse_signature(caller_node.signature)
+    mapped_callee = {pair.callee_param for pair in a.arg_mapping}
+    mapped_caller = {pair.caller_arg for pair in a.arg_mapping}
+    missing = sorted(set(callee_sig.required_params) - mapped_callee)
+    if missing:
+        raise E.ActionError(
+            E.ARG_MAPPING_INVALID,
+            f"arg_mapping is missing required callee params: {missing}",
+            missing=missing,
+        )
+    bogus_callee = sorted(mapped_callee - set(callee_sig.all_params))
+    if bogus_callee:
+        raise E.ActionError(
+            E.ARG_MAPPING_INVALID,
+            f"arg_mapping references unknown callee params: {bogus_callee}",
+            unknown=bogus_callee,
+        )
+    bogus_caller = sorted(mapped_caller - set(caller_sig.all_params))
+    if bogus_caller:
+        raise E.ActionError(
+            E.ARG_MAPPING_INVALID,
+            f"arg_mapping references unknown caller args: {bogus_caller}",
+            unknown=bogus_caller,
+        )
+    # Add tentatively; check post-condition.
+    new_edge = Edge(
+        caller=a.caller,
+        callee=a.callee,
+        arg_mapping=[ArgMapping(**pair.model_dump()) for pair in a.arg_mapping],
+    )
+    graph.edges.append(new_edge)
+    if graph.has_module_cycle():
+        raise E.ActionError(
+            E.WOULD_CREATE_CYCLE,
+            f"adding edge {a.caller} -> {a.callee} would create an import cycle",
+            caller=a.caller,
+            callee=a.callee,
+        )
+    return ActionResult.success(added_edge={"caller": a.caller, "callee": a.callee})
+
+
+def _h_remove_edge(graph: Graph, a: RemoveEdge) -> ActionResult:
+    """Drop the caller -> callee edge; it must currently exist."""
+    if graph.find_edge(a.caller, a.callee) is None:
+        raise E.ActionError(
+            E.UNKNOWN_EDGE,
+            f"edge {a.caller} -> {a.callee} does not exist",
+            caller=a.caller,
+            callee=a.callee,
+        )
+    # Rebuild the edge list without any entry matching this caller/callee pair.
+    doomed = (a.caller, a.callee)
+    graph.edges = [edge for edge in graph.edges if (edge.caller, edge.callee) != doomed]
+    return ActionResult.success(removed_edge={"caller": a.caller, "callee": a.callee})
+
+
+# ---- info / terminal handlers (stubs) -------------------------------
+
+
+def _h_query_spec(graph: Graph, a: QuerySpec) -> ActionResult:
+    """Stub: echo ``constraint_kind`` together with a ``not_implemented`` marker."""
+    # TODO: route to graphforge.constraints once tasks/specs are wired in.
+    # The graph itself is not consulted yet.
+    note = "query_spec routed via dispatcher; constraint engine TODO"
+    return ActionResult.success(not_implemented=note, constraint_kind=a.constraint_kind)
+
+
+def _h_query_subgraph(graph: Graph, a: QuerySubgraph) -> ActionResult:
+    """Answer a read-only query; the ``scope`` prefix (module:/neighbors:/path:) picks the view."""
+    scope = a.scope
+    kind, sep, target = scope.partition(":")  # text before / after the first colon
+    if sep and kind == "module":
+        nodes = [n.model_dump() for n in graph.nodes_in_module(target)]
+        # Keep edges whose caller and callee both have ``target`` as first dotted segment.
+        # NOTE(review): this assumes module names contain no dots — confirm.
+        edges = [
+            e.model_dump()
+            for e in graph.edges
+            if e.caller.split(".")[0] == target and e.callee.split(".")[0] == target
+        ]
+        return ActionResult.success(scope=scope, nodes=nodes, edges=edges)
+    if sep and kind == "neighbors":
+        return ActionResult.success(
+            scope=scope,
+            callers=graph.callers_of(target),
+            callees=graph.callees_of(target),
+        )
+    if sep and kind == "path":
+        # TODO: shortest-path search over call graph.
+        return ActionResult.success(scope=scope, not_implemented="path search TODO")
+    raise E.ActionError(E.SCHEMA_REJECTION, f"unrecognized subgraph scope {scope!r}")
+
+
+def _h_query_types(graph: Graph, a: QueryTypes) -> ActionResult:
+    """Stub: echo the requested scope together with a ``not_implemented`` marker."""
+    # TODO: delegate to graphforge.types.
+    stub_payload = {"scope": a.scope, "not_implemented": "type engine TODO"}
+    return ActionResult.success(**stub_payload)
+
+
+def _h_materialize(graph: Graph, a: MaterializeAndValidate) -> ActionResult:
+    """Generate source files from the graph and run ``full_check`` over them.
+
+    The result lists the generated file names, their summed length and the check
+    report. A ``ValueError`` from code generation is re-raised as an ActionError.
+    """
+    from graphforge.materializer import materialize as _materialize
+    from graphforge.validator import full_check
+
+    try:
+        sources = _materialize(graph)
+    except ValueError as codegen_err:
+        # Codegen rejected the graph (e.g. unknown pattern, template/edge
+        # structure mismatch missed by the dispatcher's preconditions).
+        message = f"materialization failed: {codegen_err}"
+        raise E.ActionError(E.SCHEMA_REJECTION, message) from codegen_err
+    # Check the generated sources and report alongside a size summary.
+    report = full_check(sources)
+    return ActionResult.success(
+        files=list(sources.keys()),
+        bytes_total=sum(map(len, sources.values())),
+        report=report.to_dict(),
+    )
+
+
+def _h_run_tests(graph: Graph, a: RunBehavioralTests) -> ActionResult:
+    """Placeholder: always raises an ActionError because the runner is not wired in."""
+    # TODO: delegate to graphforge.behavioral.
+    message = "run_behavioral_tests is not yet implemented"
+    raise E.ActionError(E.SCHEMA_REJECTION, message)
+
+
+def _h_submit(graph: Graph, a: Submit) -> ActionResult:
+    return ActionResult(ok=True, payload={"submitted": True}, terminal=True)  # terminal sentinel
diff --git a/graphforge/actions/errors.py b/graphforge/actions/errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..e499876f65202e64b92c8ab129fd6bc545067eb3
--- /dev/null
+++ b/graphforge/actions/errors.py
@@ -0,0 +1,44 @@
+"""Structured action errors.
+
+Every failure mode in the action dispatcher surfaces as an :class:`ActionError`
+with a stable ``code`` so the agent can be trained against deterministic error
+strings (see PROPOSAL.md §4.4 — "failures return structured errors describing
+the cause"). Codes are kept short and stable across versions.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+class ActionError(Exception):
+    """Raised by action handlers; caught and reported by the dispatcher."""
+
+    def __init__(self, code: str, message: str, **details: Any) -> None:
+        super().__init__(f"[{code}] {message}")
+        self.code, self.message, self.details = code, message, details
+
+    def to_dict(self) -> dict[str, Any]:
+        """Wire form; ``error``/``message`` always win over same-named details."""
+        fixed = {"error": self.code, "message": self.message}
+        return {**fixed, **self.details, **fixed}
+
+
+# ---- canonical codes -------------------------------------------------
+# Schema layer
+SCHEMA_REJECTION = "schema_rejection"
+# Pre-condition layer
+UNKNOWN_MODULE = "unknown_module"
+UNKNOWN_NODE = "unknown_node"
+UNKNOWN_EDGE = "unknown_edge"
+NAME_COLLISION = "name_collision"
+MODULE_NOT_EMPTY = "module_not_empty"
+NODE_HAS_REFERENCES = "node_has_references"
+DUPLICATE_EDGE = "duplicate_edge"
+UNKNOWN_TEMPLATE = "unknown_template"
+TEMPLATE_ARGS_INVALID = "template_args_invalid"
+RESPONSIBILITY_MISMATCH = "responsibility_mismatch"
+ARG_MAPPING_INVALID = "arg_mapping_invalid"
+# Post-condition layer
+WOULD_CREATE_CYCLE = "would_create_cycle"
+TYPE_MISMATCH = "type_mismatch"
diff --git a/graphforge/actions/schema.py b/graphforge/actions/schema.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fc089c21ac800003bc817e159eaeac2ebb8dbb9
--- /dev/null
+++ b/graphforge/actions/schema.py
@@ -0,0 +1,180 @@
+"""Action message schemas.
+
+These are the wire shapes accepted by the dispatcher. Every action is a
+discriminated-union member keyed on ``kind``.
+
+The action vocabulary mirrors PROPOSAL.md §4. Total surface:
+
+  Graph mutations
+    add_module, remove_module
+    add_node, remove_node, set_node_module, attach_body
+    add_edge, remove_edge
+  Information
+    query_spec, query_subgraph, query_types,
+    materialize_and_validate, run_behavioral_tests
+  Terminal
+    submit
+
+Note: the proposal abstract states "eleven actions"; the section-4 listing
+contains fourteen. We implement the section-4 set; the abstract count will
+be corrected in the next revision of PROPOSAL.md.
+"""
+
+from __future__ import annotations
+
+from typing import Annotated, Literal, Optional, Union
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from graphforge.graph.schema import ArgMapping, ErrorPolicy, Purity, ResponsibilityTag
+
+
+# Common config: forbid unknown fields, fail loudly on schema drift.
+_cfg = ConfigDict(extra="forbid")
+
+
+# ---- mutations -------------------------------------------------------
+
+
+class AddModule(BaseModel):
+    model_config = _cfg
+    kind: Literal["add_module"] = "add_module"
+    name: str
+    responsibility: ResponsibilityTag
+
+
+class RemoveModule(BaseModel):
+    model_config = _cfg
+    kind: Literal["remove_module"] = "remove_module"
+    name: str
+
+
+class AddNode(BaseModel):
+    model_config = _cfg
+    kind: Literal["add_node"] = "add_node"
+    name: str
+    module: str
+    signature: str
+    purity: Purity = "impure"
+    error_policy: ErrorPolicy = "none"
+
+
+class RemoveNode(BaseModel):
+    model_config = _cfg
+    kind: Literal["remove_node"] = "remove_node"
+    name: str
+    module: str
+
+
+class SetNodeModule(BaseModel):
+    model_config = _cfg
+    kind: Literal["set_node_module"] = "set_node_module"
+    name: str
+    current_module: str
+    new_module: str
+
+
+class AttachBody(BaseModel):
+    model_config = _cfg
+    kind: Literal["attach_body"] = "attach_body"
+    name: str
+    module: str
+    template: str
+    args: dict[str, object] = Field(default_factory=dict)
+
+
+class AddEdge(BaseModel):
+    model_config = _cfg
+    kind: Literal["add_edge"] = "add_edge"
+    caller: str
+    callee: str
+    arg_mapping: list[ArgMapping] = Field(default_factory=list)
+
+
+class RemoveEdge(BaseModel):
+    model_config = _cfg
+    kind: Literal["remove_edge"] = "remove_edge"
+    caller: str
+    callee: str
+
+
+# ---- information actions --------------------------------------------
+
+
+class QuerySpec(BaseModel):
+    model_config = _cfg
+    kind: Literal["query_spec"] = "query_spec"
+    constraint_kind: Optional[str] = None
+
+
+class QuerySubgraph(BaseModel):
+    model_config = _cfg
+    kind: Literal["query_subgraph"] = "query_subgraph"
+    scope: str  # "module:<name>" | "neighbors:<qualified>" | "path:<from>:<to>"
+
+
+class QueryTypes(BaseModel):
+    model_config = _cfg
+    kind: Literal["query_types"] = "query_types"
+    scope: str  # "all" | "module:<name>" | "node:<qualified>"
+
+
+class MaterializeAndValidate(BaseModel):
+    model_config = _cfg
+    kind: Literal["materialize_and_validate"] = "materialize_and_validate"
+
+
+class RunBehavioralTests(BaseModel):
+    model_config = _cfg
+    kind: Literal["run_behavioral_tests"] = "run_behavioral_tests"
+    materialized: bool = True
+
+
+# ---- terminal --------------------------------------------------------
+
+
+class Submit(BaseModel):
+    model_config = _cfg
+    kind: Literal["submit"] = "submit"
+
+
+# ---- discriminated union --------------------------------------------
+
+Action = Annotated[
+    Union[
+        AddModule,
+        RemoveModule,
+        AddNode,
+        RemoveNode,
+        SetNodeModule,
+        AttachBody,
+        AddEdge,
+        RemoveEdge,
+        QuerySpec,
+        QuerySubgraph,
+        QueryTypes,
+        MaterializeAndValidate,
+        RunBehavioralTests,
+        Submit,
+    ],
+    Field(discriminator="kind"),
+]
+
+
+__all__ = [
+    "Action",
+    "AddModule",
+    "RemoveModule",
+    "AddNode",
+    "RemoveNode",
+    "SetNodeModule",
+    "AttachBody",
+    "AddEdge",
+    "RemoveEdge",
+    "QuerySpec",
+    "QuerySubgraph",
+    "QueryTypes",
+    "MaterializeAndValidate",
+    "RunBehavioralTests",
+    "Submit",
+]
diff --git a/graphforge/actions/signature.py b/graphforge/actions/signature.py
new file mode 100644
index 0000000000000000000000000000000000000000..f853a7a513c673e51958dbe2c39355614f25b394
--- /dev/null
+++ b/graphforge/actions/signature.py
@@ -0,0 +1,116 @@
+"""Cheap signature parser.
+
+Used by the dispatcher to validate ``add_edge`` arg-mappings against the
+callee's parameter list. Real type-flow validation (caller_arg type vs
+callee_param type) belongs to the type engine; this module only extracts
+parameter *names* from a signature string of the form::
+
+    (a: int, b: str = "x", *, c: bool) -> bool
+
+Annotations are tolerated as opaque text. Defaults are tolerated and treated
+as making the parameter optional.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class Parameter:
+    name: str
+    annotation: str | None
+    has_default: bool
+
+
+@dataclass(frozen=True)
+class ParsedSignature:
+    parameters: list[Parameter]
+    return_annotation: str
+
+    @property
+    def required_params(self) -> list[str]:
+        return [p.name for p in self.parameters if not p.has_default]
+
+    @property
+    def all_params(self) -> list[str]:
+        return [p.name for p in self.parameters]
+
+
+_SIG_RE = re.compile(r"^\s*\((?P<params>.*)\)\s*->\s*(?P<ret>.+?)\s*$", re.DOTALL)
+
+
+def parse_signature(sig: str) -> ParsedSignature:
+    """Turn ``"(params) -> ret"`` text into a :class:`ParsedSignature`.
+
+    Deliberately lenient: annotations and defaults are kept as opaque text
+    and only parameter names are checked. The schema layer (Node validator)
+    has already required ``(`` and ``->``; this is the second, dispatch-time
+    parse.
+
+    Raises ``ValueError`` when the overall shape does not match or a
+    parameter name is not an identifier.
+    """
+    matched = _SIG_RE.match(sig)
+    if matched is None:
+        raise ValueError(f"could not parse signature: {sig!r}")
+    param_text = matched.group("params").strip()
+    return_text = matched.group("ret").strip()
+
+    parsed: list[Parameter] = []
+    for chunk in _split_top_level(param_text, ","):
+        piece = chunk.strip()
+        if piece in {"", "*", "/"}:  # bare markers carry no parameter name
+            continue
+        # Drop the variadic marker; ``**`` must be tested before ``*``.
+        for stars in ("**", "*"):
+            if piece.startswith(stars):
+                piece = piece[len(stars) :].lstrip()
+                break
+        # ``default`` stays None when the piece has no top-level ``=``.
+        head, default = _split_default(piece)
+        piece = head.strip()
+        name, colon, tail = piece.partition(":")
+        name = name.strip()
+        annotation = tail.strip() if colon else None
+        if not name.isidentifier():
+            raise ValueError(f"unparseable parameter {piece!r} in {sig!r}")
+        parsed.append(
+            Parameter(name=name, annotation=annotation, has_default=default is not None)
+        )
+
+    return ParsedSignature(parameters=parsed, return_annotation=return_text)
+
+
+def _mask_literals(s: str) -> str:
+    """NUL-out quoted literals (length kept) so their content is not structure."""
+    return re.sub(r"""(["'])(?:\\.|(?!\1).)*\1""", lambda m: "\0" * len(m[0]), s)
+
+
+def _split_top_level(s: str, sep: str) -> list[str]:
+    """Split ``s`` on ``sep`` at bracket-depth 0, outside string literals."""
+    out: list[str] = []
+    depth = start = 0
+    for i, ch in enumerate(_mask_literals(s)):
+        if ch in "([{":
+            depth += 1
+        elif ch in ")]}":
+            depth -= 1
+        if ch == sep and depth == 0:
+            out.append(s[start:i])
+            start = i + 1
+    return out + [s[start:]] if start < len(s) else out
+
+
+def _split_default(piece: str) -> tuple[str, str | None]:
+    """Split off ``= default`` at bracket-depth 0. Returns (head, default | None)."""
+    depth = 0
+    for i, ch in enumerate(_mask_literals(piece)):
+        if ch in "([{":
+            depth += 1
+        elif ch in ")]}":
+            depth -= 1
+        elif ch == "=" and depth == 0:
+            return piece[:i], piece[i + 1 :]
+    return piece, None
diff --git a/graphforge/behavioral/__init__.py b/graphforge/behavioral/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..819cb3abdb861b54abf4592aade6f2ff083980e5
--- /dev/null
+++ b/graphforge/behavioral/__init__.py
@@ -0,0 +1,25 @@
+"""Behavioral test runner.
+
+Responsibilities (PROPOSAL.md §2.1, §6.2):
+
+  * Run a property-based test suite (hypothesis) against materialized code,
+    in a sandboxed subprocess with timeout + memory limit.
+  * Tests are part of the task definition; their bodies are *hidden* from
+    the agent. The agent sees only test names and pass/fail at submission.
+  * Distinguish failures (assertion) from errors (timeout, crash) — both
+    count as test failures, but they're surfaced separately for diagnostics.
+
+Public surface (TODO):
+
+    run_tests(files, tests, timeout=12.0) -> dict[str, TestResult]
+"""
+
+from __future__ import annotations
+
+
+def run_tests(  # pragma: no cover — TODO
+    files: dict[str, str],
+    tests: list[object],
+    timeout: float = 12.0,
+) -> dict[str, object]:
+    raise NotImplementedError("behavioral runner TODO — see PROPOSAL.md §6.2")
diff --git a/graphforge/constraints/__init__.py b/graphforge/constraints/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b65de419a05fa2b2772b7e83586f16e1acaf5e1
--- /dev/null
+++ b/graphforge/constraints/__init__.py
@@ -0,0 +1,49 @@
+"""Constraint vocabulary and dispatch.
+
+Three families (PROPOSAL.md §2.2):
+
+  * Structural — node_exists, edge_exists, module_count, acyclic_imports,
+    fan_in_max, fan_out_max, dag_depth_max, internal_only, …
+  * Type / signature — signature_matches, return_type, arg_type,
+    type_consistency, no_any_types, pure_function (TODO)
+  * Behavioral / materialization — materializes, imports_resolve,
+    type_checks, behavioral_test_passes, error_handling_present|absent
+
+Currently shipped: tier-0 subset of structural + ``materializes``. Additional
+kinds land as new discriminated members in :mod:`schema` and matching
+``_check_*`` functions in :mod:`checker`.
+"""
+
+from graphforge.constraints.checker import (
+    SatisfactionReport,
+    check,
+    evaluate_all,
+)
+from graphforge.constraints.schema import (
+    AcyclicImports,
+    Constraint,
+    EdgeExists,
+    Materializes,
+    ModuleCount,
+    ModuleResponsibility,
+    ModuleSizeMax,
+    NodeAbsent,
+    NodeExists,
+    STRUCTURAL_KINDS,
+)
+
+__all__ = [
+    "AcyclicImports",
+    "Constraint",
+    "EdgeExists",
+    "Materializes",
+    "ModuleCount",
+    "ModuleResponsibility",
+    "ModuleSizeMax",
+    "NodeAbsent",
+    "NodeExists",
+    "STRUCTURAL_KINDS",
+    "SatisfactionReport",
+    "check",
+    "evaluate_all",
+]
diff --git a/graphforge/constraints/checker.py b/graphforge/constraints/checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a54277b5420b867ee0a2110e7793ea20ef0ee8b
--- /dev/null
+++ b/graphforge/constraints/checker.py
@@ -0,0 +1,141 @@
+"""Constraint checker dispatch.
+
+Each constraint kind has a small ``_check_*`` function. ``check`` routes by
+isinstance and ``evaluate_all`` reports which constraints from a list are
+satisfied or not.
+
+Behavioral / materialization constraints (currently just ``materializes``)
+delegate to the materializer and validator subsystems.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+from graphforge.constraints.schema import (
+    AcyclicImports,
+    Constraint,
+    EdgeExists,
+    Materializes,
+    ModuleCount,
+    ModuleResponsibility,
+    ModuleSizeMax,
+    NodeAbsent,
+    NodeExists,
+    STRUCTURAL_KINDS,
+)
+from graphforge.graph.schema import Graph
+
+
+@dataclass
+class SatisfactionReport:
+    """Outcome of checking a constraint list: which held and which did not."""
+
+    satisfied: list[Constraint] = field(default_factory=list)
+    unsatisfied: list[Constraint] = field(default_factory=list)
+
+    @property
+    def total(self) -> int:
+        return len(self.satisfied) + len(self.unsatisfied)
+
+    @property
+    def all_satisfied(self) -> bool:
+        # An empty report never counts as "all satisfied".
+        return bool(self.satisfied) and not self.unsatisfied
+
+    def split_by_family(self) -> tuple["SatisfactionReport", "SatisfactionReport"]:
+        """Return ``(structural, behavioral)`` sub-reports, order preserved.
+
+        The reward engine scores the two families differently (PROPOSAL.md §5.2).
+        """
+        structural, behavioral = SatisfactionReport(), SatisfactionReport()
+        for bucket in ("satisfied", "unsatisfied"):
+            for c in getattr(self, bucket):
+                home = structural if c.kind in STRUCTURAL_KINDS else behavioral
+                getattr(home, bucket).append(c)
+        return structural, behavioral
+
+    def to_dict(self) -> dict[str, object]:
+        dumped = {
+            bucket: [c.model_dump() for c in getattr(self, bucket)]
+            for bucket in ("satisfied", "unsatisfied")
+        }
+        return {**dumped, "total": self.total, "all_satisfied": self.all_satisfied}
+
+
+# ---- per-kind checkers ----------------------------------------------
+
+
+def _check_node_exists(g: Graph, c: NodeExists) -> bool:
+    return g.find_node(c.name, c.module) is not None
+
+
+def _check_node_absent(g: Graph, c: NodeAbsent) -> bool:
+    return g.find_node(c.name, c.module) is None
+
+
+def _check_edge_exists(g: Graph, c: EdgeExists) -> bool:
+    return g.find_edge(c.caller, c.callee) is not None
+
+
+def _check_module_count(g: Graph, c: ModuleCount) -> bool:
+    return len(g.modules) == c.n
+
+
+def _check_module_size_max(g: Graph, c: ModuleSizeMax) -> bool:
+    return len(g.nodes_in_module(c.module)) <= c.n
+
+
+def _check_module_responsibility(g: Graph, c: ModuleResponsibility) -> bool:
+    m = g.find_module(c.module)
+    return m is not None and m.responsibility == c.responsibility
+
+
+def _check_acyclic_imports(g: Graph, _c: AcyclicImports) -> bool:
+    return not g.has_module_cycle()
+
+
+def _check_materializes(g: Graph, _c: Materializes) -> bool:
+    # Imported lazily so that callers who don't use this checker don't pay
+    # the cost of pulling the materializer/validator graph.
+    from graphforge.materializer import materialize
+    from graphforge.validator import full_check
+
+    try:
+        files = materialize(g)
+    except Exception:
+        return False
+    return full_check(files).ok
+
+
+# ---- dispatch --------------------------------------------------------
+
+
+def check(graph: Graph, constraint: Constraint) -> bool:
+    """Evaluate ``constraint`` against ``graph`` using its per-kind checker."""
+    # (type, checker) pairs, tried top to bottom; first isinstance match wins.
+    checkers = (
+        (NodeExists, _check_node_exists),
+        (NodeAbsent, _check_node_absent),
+        (EdgeExists, _check_edge_exists),
+        (ModuleCount, _check_module_count),
+        (ModuleSizeMax, _check_module_size_max),
+        (ModuleResponsibility, _check_module_responsibility),
+        (AcyclicImports, _check_acyclic_imports),
+        (Materializes, _check_materializes),
+    )
+    for constraint_type, checker in checkers:
+        if isinstance(constraint, constraint_type):
+            return checker(graph, constraint)
+    # No checker is registered for this constraint type.
+    raise ValueError(f"unknown constraint kind: {constraint!r}")
+
+
+def evaluate_all(graph: Graph, constraints: list[Constraint]) -> SatisfactionReport:
+    """Check every constraint and sort it into the satisfied/unsatisfied list."""
+    report = SatisfactionReport()
+    for constraint in constraints:
+        # Input order is kept within each list.
+        bucket = report.satisfied if check(graph, constraint) else report.unsatisfied
+        bucket.append(constraint)
+    return report
diff --git a/graphforge/constraints/schema.py b/graphforge/constraints/schema.py
new file mode 100644
index 0000000000000000000000000000000000000000..05fbbd38ce8e8070827582326a9f26d498ac2a4f
--- /dev/null
+++ b/graphforge/constraints/schema.py
@@ -0,0 +1,129 @@
+"""Constraint schemas (tier-0 subset).
+
+Constraints are pydantic discriminated-union members keyed on ``kind``.
+Tier-0 carves out the smallest set sufficient to express a real task and
+exercise the reward engine end-to-end. The remaining vocabulary in
+PROPOSAL.md §2.2 (fan_in_max, dag_depth_max, type_consistency,
+behavioral_test_passes, …) lands on top of this same shape as new
+discriminated members + checker functions.
+
+Each constraint member is a pure data record. Behavior lives in
+:mod:`graphforge.constraints.checker`.
+"""
+
+from __future__ import annotations
+
+from typing import Annotated, Literal, Union
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from graphforge.graph.schema import ResponsibilityTag
+
+_cfg = ConfigDict(extra="forbid")
+
+
+# ---- structural ------------------------------------------------------
+
+
+class NodeExists(BaseModel):
+    model_config = _cfg
+    kind: Literal["node_exists"] = "node_exists"
+    name: str
+    module: str
+
+
+class NodeAbsent(BaseModel):
+    model_config = _cfg
+    kind: Literal["node_absent"] = "node_absent"
+    name: str
+    module: str
+
+
+class EdgeExists(BaseModel):
+    model_config = _cfg
+    kind: Literal["edge_exists"] = "edge_exists"
+    caller: str  # qualified
+    callee: str  # qualified
+
+
+class ModuleCount(BaseModel):
+    model_config = _cfg
+    kind: Literal["module_count"] = "module_count"
+    n: int = Field(..., ge=0)
+
+
+class ModuleSizeMax(BaseModel):
+    model_config = _cfg
+    kind: Literal["module_size_max"] = "module_size_max"
+    module: str
+    n: int = Field(..., ge=0)
+
+
+class ModuleResponsibility(BaseModel):
+    model_config = _cfg
+    kind: Literal["module_responsibility"] = "module_responsibility"
+    module: str
+    responsibility: ResponsibilityTag
+
+
+class AcyclicImports(BaseModel):
+    model_config = _cfg
+    kind: Literal["acyclic_imports"] = "acyclic_imports"
+
+
+# ---- behavioral / materialization -----------------------------------
+
+
+class Materializes(BaseModel):
+    model_config = _cfg
+    kind: Literal["materializes"] = "materializes"
+
+
+# ---- discriminated union --------------------------------------------
+
+Constraint = Annotated[
+    Union[
+        NodeExists,
+        NodeAbsent,
+        EdgeExists,
+        ModuleCount,
+        ModuleSizeMax,
+        ModuleResponsibility,
+        AcyclicImports,
+        Materializes,
+    ],
+    Field(discriminator="kind"),
+]
+
+
+# Set of kinds considered "structural" for the reward engine's per-constraint
+# +1 magnitude. The "behavioral" family is reserved for property-test results
+# (BehavioralTestPasses, TODO) which earn the higher +3 magnitude. The
+# ``materializes`` constraint is structural for scoring purposes; the more
+# severe "Materialization fails: -8" penalty in PROPOSAL.md §5.2 is an
+# independent gate driven by the materializer raising or returning parse
+# errors, not by this constraint kind.
+STRUCTURAL_KINDS = {
+    "node_exists",
+    "node_absent",
+    "edge_exists",
+    "module_count",
+    "module_size_max",
+    "module_responsibility",
+    "acyclic_imports",
+    "materializes",
+}
+
+
+__all__ = [
+    "AcyclicImports",
+    "Constraint",
+    "EdgeExists",
+    "Materializes",
+    "ModuleCount",
+    "ModuleResponsibility",
+    "ModuleSizeMax",
+    "NodeAbsent",
+    "NodeExists",
+    "STRUCTURAL_KINDS",
+]
diff --git a/graphforge/graph/__init__.py b/graphforge/graph/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..92a954ba19fa90da3c80d1d6f5df43cb19d6ad07
--- /dev/null
+++ b/graphforge/graph/__init__.py
@@ -0,0 +1,23 @@
+"""Canonical graph schema. See :mod:`graphforge.graph.schema`."""
+
+from graphforge.graph.schema import (
+    ArgMapping,
+    Edge,
+    ErrorPolicy,
+    Graph,
+    Module,
+    Node,
+    Purity,
+    ResponsibilityTag,
+)
+
+__all__ = [
+    "ArgMapping",
+    "Edge",
+    "ErrorPolicy",
+    "Graph",
+    "Module",
+    "Node",
+    "Purity",
+    "ResponsibilityTag",
+]
diff --git a/graphforge/graph/schema.py b/graphforge/graph/schema.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2bc176073c477a5857979f96573a7b2a9165482
--- /dev/null
+++ b/graphforge/graph/schema.py
@@ -0,0 +1,308 @@
+"""Canonical graph schema.
+
+The graph is the single source of truth for an in-progress program. Every
+materialization is a deterministic function of (graph, template library).
+
+Wire format mirrors the JSON shape documented in PROPOSAL.md §3.1, exactly:
+
+    {
+      "modules": [{"name": ..., "responsibility": ...}, ...],
+      "nodes":   [{"name": ..., "module": ..., "signature": ...,
+                   "body_template": ..., "body_template_args": {...},
+                   "purity": ..., "error_policy": ..., "decl_order": ...}, ...],
+      "edges":   [{"caller": "<module>.<name>",
+                   "callee": "<module>.<name>",
+                   "arg_mapping": [{"caller_arg": ..., "callee_param": ...}, ...]}, ...]
+    }
+
+This module enforces shape and well-formedness only. Higher-order invariants
+(unique names, edge endpoints exist, no cycles, type-flow compatibility) are
+enforced by the action dispatcher and the type engine, not the schema, so
+that callers can build partial / invalid graphs and inspect why they fail.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+from typing import Literal, Optional
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+# ----------------------------------------------------------------------
+# Enumerated tags
+# ----------------------------------------------------------------------
+
+# Responsibility tags constrain which kinds of nodes a module is allowed to
+# host. This is the canonical set; new tags must be added deliberately because
+# tasks encode constraints against this vocabulary.
+ResponsibilityTag = Literal[
+    "io",
+    "validation",
+    "transform",
+    "orchestration",
+    "storage",
+    "formatting",
+    "lookup",
+    "policy",
+    "logging",
+    "computation",
+]
+
+Purity = Literal["pure", "impure"]
+
+# How a function handles errors in its body. "guard" means it includes a
+# guard / try-except. "propagate" means it deliberately lets errors flow up.
+# "none" is the default — no claim either way.
+ErrorPolicy = Literal["guard", "propagate", "none"]
+
+
+# ----------------------------------------------------------------------
+# Atomic records
+# ----------------------------------------------------------------------
+
+
+class Module(BaseModel):
+    """A declared module — one Python file at materialization time."""
+
+    model_config = ConfigDict(extra="forbid", frozen=False)
+
+    name: str = Field(..., min_length=1)
+    responsibility: ResponsibilityTag
+
+    @field_validator("name")
+    @classmethod
+    def _name_is_identifier(cls, v: str) -> str:
+        if not v.isidentifier():
+            raise ValueError(f"module name {v!r} is not a Python identifier")
+        if v.startswith("_"):
+            raise ValueError(f"module name {v!r} must not start with an underscore")
+        return v
+
+
+class Node(BaseModel):
+    """A declared function. ``body_template`` may be unset until attach_body."""
+
+    model_config = ConfigDict(extra="forbid", frozen=False)
+
+    name: str = Field(..., min_length=1)
+    module: str = Field(..., min_length=1)
+    signature: str = Field(..., min_length=2)  # e.g., "(x: int) -> bool"
+    body_template: Optional[str] = None
+    body_template_args: dict[str, object] = Field(default_factory=dict)
+    purity: Purity = "impure"
+    error_policy: ErrorPolicy = "none"
+    decl_order: int = 0
+
+    @field_validator("name")
+    @classmethod
+    def _name_is_identifier(cls, v: str) -> str:
+        if not v.isidentifier():
+            raise ValueError(f"node name {v!r} is not a Python identifier")
+        return v
+
+    @field_validator("signature")
+    @classmethod
+    def _signature_shape(cls, v: str) -> str:
+        # Cheap surface check; the type engine does the real parse.
+        if not v.lstrip().startswith("("):
+            raise ValueError(f"signature must start with '(': got {v!r}")
+        if "->" not in v:
+            raise ValueError(f"signature must include '->' return arrow: got {v!r}")
+        return v
+
+    # Convenience -----------------------------------------------------
+
+    @property
+    def qualified_name(self) -> str:
+        """``<module>.<name>`` — the canonical address used on edges."""
+        return f"{self.module}.{self.name}"
+
+
+class ArgMapping(BaseModel):
+    """How an edge wires a caller's argument to a callee's parameter."""
+
+    model_config = ConfigDict(extra="forbid", frozen=False)
+
+    caller_arg: str = Field(..., min_length=1)
+    callee_param: str = Field(..., min_length=1)
+
+
+class Edge(BaseModel):
+    """A CALLS edge. Endpoints are qualified node names ``<module>.<name>``."""
+
+    model_config = ConfigDict(extra="forbid", frozen=False)
+
+    caller: str = Field(..., min_length=3)
+    callee: str = Field(..., min_length=3)
+    arg_mapping: list[ArgMapping] = Field(default_factory=list)
+
+    @field_validator("caller", "callee")
+    @classmethod
+    def _qualified(cls, v: str) -> str:
+        if v.count(".") != 1:
+            raise ValueError(
+                f"edge endpoint {v!r} is not qualified (expected '<module>.<name>')"
+            )
+        mod, name = v.split(".")
+        if not mod.isidentifier() or not name.isidentifier():
+            raise ValueError(f"edge endpoint {v!r} has non-identifier parts")
+        return v
+
+
+# ----------------------------------------------------------------------
+# Graph
+# ----------------------------------------------------------------------
+
+
+class Graph(BaseModel):
+    """Canonical graph state. Mutable; cloned via ``snapshot``/``restore``."""
+
+    model_config = ConfigDict(extra="forbid", frozen=False)
+
+    modules: list[Module] = Field(default_factory=list)
+    nodes: list[Node] = Field(default_factory=list)
+    edges: list[Edge] = Field(default_factory=list)
+
+    # ----- lookup ----------------------------------------------------
+
+    def find_module(self, name: str) -> Optional[Module]:
+        for m in self.modules:
+            if m.name == name:
+                return m
+        return None
+
+    def find_node(self, name: str, module: str) -> Optional[Node]:
+        for n in self.nodes:
+            if n.name == name and n.module == module:
+                return n
+        return None
+
+    def find_node_qualified(self, qualified: str) -> Optional[Node]:
+        if qualified.count(".") != 1:
+            return None
+        mod, nm = qualified.split(".")
+        return self.find_node(nm, mod)
+
+    def find_edge(self, caller: str, callee: str) -> Optional[Edge]:
+        for e in self.edges:
+            if e.caller == caller and e.callee == callee:
+                return e
+        return None
+
+    def nodes_in_module(self, module: str) -> list[Node]:
+        return [n for n in self.nodes if n.module == module]
+
+    def callers_of(self, qualified: str) -> list[str]:
+        return [e.caller for e in self.edges if e.callee == qualified]
+
+    def callees_of(self, qualified: str) -> list[str]:
+        return [e.callee for e in self.edges if e.caller == qualified]
+
+    def fan_in(self, qualified: str) -> int:
+        return len(self.callers_of(qualified))
+
+    def fan_out(self, qualified: str) -> int:
+        return len(self.callees_of(qualified))
+
+    # ----- structural derivations ------------------------------------
+
+    def import_edges(self) -> set[tuple[str, str]]:
+        """Set of (caller_module, callee_module) pairs from cross-module edges."""
+        out: set[tuple[str, str]] = set()
+        for e in self.edges:
+            cm = e.caller.split(".")[0]
+            tm = e.callee.split(".")[0]
+            if cm != tm:
+                out.add((cm, tm))
+        return out
+
+    def has_module_cycle(self) -> bool:
+        """True iff the cross-module import graph contains a directed cycle."""
+        imports: dict[str, set[str]] = {m.name: set() for m in self.modules}
+        for importer, imported in self.import_edges():
+            imports.setdefault(importer, set()).add(imported)
+            imports.setdefault(imported, set())
+        on_path: set[str] = set()  # modules on the current DFS path
+        finished: set[str] = set()  # modules fully explored without a cycle
+
+        def closes_cycle(mod: str) -> bool:
+            on_path.add(mod)
+            for dep in imports[mod]:
+                # Back edge onto the current path, or a cycle further down.
+                if dep in on_path or (dep not in finished and closes_cycle(dep)):
+                    return True
+            on_path.discard(mod)
+            finished.add(mod)
+            return False
+
+        return any(m not in finished and closes_cycle(m) for m in imports)
+
+    def call_graph_depth(self) -> int:
+        """Longest path length (in edges) in the function call DAG.
+
+        If the call graph is cyclic, returns the special value -1 (callers
+        should treat this as an invariant violation).
+        """
+        adj: dict[str, list[str]] = {n.qualified_name: [] for n in self.nodes}
+        for e in self.edges:
+            adj.setdefault(e.caller, []).append(e.callee)
+            adj.setdefault(e.callee, [])
+        memo: dict[str, int] = {}
+        ON_STACK = -2
+
+        def dfs(u: str) -> int:
+            if u in memo:
+                if memo[u] == ON_STACK:
+                    return -1
+                return memo[u]
+            memo[u] = ON_STACK
+            best = 0
+            for v in adj.get(u, ()):
+                d = dfs(v)
+                if d == -1:
+                    return -1
+                best = max(best, d + 1)
+            memo[u] = best
+            return best
+
+        results = [dfs(u) for u in adj]
+        if any(r == -1 for r in results):
+            return -1
+        return max(results, default=0)
+
+    # ----- copying / hashing -----------------------------------------
+
+    def snapshot(self) -> "Graph":
+        """Deep copy. Used by the dispatcher for atomic action rollback."""
+        return self.model_copy(deep=True)
+
+    def structural_hash(self) -> str:
+        """Stable SHA-256 over a canonical JSON projection.
+
+        Modules, nodes and edges are sorted by name / endpoints, so list order
+        (given unique keys) does not change the digest; ``decl_order`` and
+        ``arg_mapping`` order do. Non-JSON values are hashed via ``str()``.
+        """
+        canon: dict[str, object] = {
+            "modules": sorted(
+                [m.model_dump() for m in self.modules],
+                key=lambda d: d["name"],
+            ),
+            "nodes": sorted(
+                [n.model_dump() for n in self.nodes],
+                key=lambda d: (d["module"], d["name"]),
+            ),
+            "edges": sorted(
+                [e.model_dump() for e in self.edges],
+                key=lambda d: (d["caller"], d["callee"]),
+            ),
+        }
+        blob = json.dumps(canon, sort_keys=True, default=str).encode("utf-8")
+        return hashlib.sha256(blob).hexdigest()
+
+    # ----- factories -------------------------------------------------
+
+    @classmethod
+    def empty(cls) -> "Graph":
+        return cls()
diff --git a/graphforge/knowledge_graph.py b/graphforge/knowledge_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcf3de8b9fccc3b7160d650c7264aed5fb4f6cef
--- /dev/null
+++ b/graphforge/knowledge_graph.py
@@ -0,0 +1,233 @@
+"""In-memory Knowledge Graph for a Python repository.
+
+Mirrors the structure of a Neo4j property graph but lives in RAM:
+
+Nodes
+-----
+  repo        — the repository root
+  package     — a directory containing __init__.py
+  module      — a .py file
+  class       — a class definition
+  function    — a top-level or nested function / async function
+  method      — a method inside a class
+
+Edges (directed)
+-----------------
+  contains    — parent → child (repo→package, package→module, module→class, …)
+  calls       — function/method → function/method (same-file same-package)
+  imports     — module → module  (from x import y  /  import x)
+  inherits    — class → class
+
+Each node stores the actual source lines so the agent can read/edit them.
+"""
+
+from __future__ import annotations
+
+import textwrap
+from dataclasses import dataclass, field
+from typing import Iterable
+
+
+# ── node & edge ───────────────────────────────────────────────────────────────
+
+@dataclass
+class KGNode:
+    node_id: str          # unique graph key, e.g. "function:validators.py:validate_title"
+    node_type: str        # one of: repo | package | module | class | function | method
+    name: str             # short identifier shown in listings
+    file_path: str        # path relative to the repo root ("" for the repo node)
+    line_start: int = 0   # first source line of the node (0 when unset)
+    line_end: int = 0     # last source line of the node, inclusive
+    source: str = ""      # full source text of this node
+    docstring: str = ""   # docstring text, "" when absent
+    metadata: dict = field(default_factory=dict)  # free-form extras, e.g. "signature", "calls"
+
+    def brief(self) -> str:
+        """Render the single-line label used in graph overviews."""
+        where = f"  [{self.file_path}:{self.line_start}]" if self.file_path else ""
+        return f"[{self.node_type.upper():<8}] {self.node_id}" + where
+
+
+@dataclass
+class KGEdge:
+    edge_type: str   # relationship label: contains | calls | imports | inherits
+    source_id: str   # node_id the directed edge starts from
+    target_id: str   # node_id the directed edge points to
+
+
+# ── knowledge graph ───────────────────────────────────────────────────────────
+
+class KnowledgeGraph:
+    """Property graph for a repository: nodes keyed by id plus a flat edge list.
+
+    Neighbour queries return nodes in edge-insertion order, so output is repeatable.
+    """
+
+    def __init__(self, repo_path: str) -> None:
+        self.repo_path = repo_path
+        self._nodes: dict[str, KGNode] = {}
+        self._edges: list[KGEdge] = []
+
+    # ── mutation ──────────────────────────────────────────────────────────────
+
+    def add_node(self, node: KGNode) -> None:
+        """Register ``node``, replacing any node that has the same ``node_id``."""
+        self._nodes[node.node_id] = node
+
+    def add_edge(self, edge: KGEdge) -> None:
+        """Append ``edge``; endpoints are not validated and duplicates are kept."""
+        self._edges.append(edge)
+
+    def update_node_source(self, node_id: str, new_source: str) -> None:
+        """Replace a node's source and recompute ``line_end`` (KeyError if unknown)."""
+        node = self._nodes[node_id]
+        node.source = new_source
+        lines = new_source.splitlines()
+        node.line_end = node.line_start + len(lines) - 1
+
+    def insert_node(self, parent_id: str, new_node: KGNode) -> None:
+        """Add new_node to the graph and wire a contains edge from parent."""
+        self._nodes[new_node.node_id] = new_node
+        self._edges.append(KGEdge("contains", parent_id, new_node.node_id))
+
+    def remove_node(self, node_id: str) -> None:
+        """Drop the node (if present) together with every edge touching it."""
+        self._nodes.pop(node_id, None)
+        self._edges = [e for e in self._edges
+                       if e.source_id != node_id and e.target_id != node_id]
+
+    # ── queries ───────────────────────────────────────────────────────────────
+
+    def get_node(self, node_id: str) -> KGNode | None:
+        """Return the node with this id, or ``None`` when it is not in the graph."""
+        return self._nodes.get(node_id)
+
+    def all_nodes(self, node_type: str | None = None) -> list[KGNode]:
+        nodes = list(self._nodes.values())
+        if node_type:
+            nodes = [n for n in nodes if n.node_type == node_type]
+        return nodes
+
+    def children_of(self, node_id: str) -> list[KGNode]:
+        child_ids = dict.fromkeys(e.target_id for e in self._edges
+                                  if e.source_id == node_id and e.edge_type == "contains")
+        return [self._nodes[cid] for cid in child_ids if cid in self._nodes]
+
+    def parent_of(self, node_id: str) -> KGNode | None:
+        for e in self._edges:
+            if e.target_id == node_id and e.edge_type == "contains":
+                return self._nodes.get(e.source_id)
+        return None
+
+    def callers_of(self, node_id: str) -> list[KGNode]:
+        caller_ids = dict.fromkeys(e.source_id for e in self._edges
+                                   if e.target_id == node_id and e.edge_type == "calls")
+        return [self._nodes[cid] for cid in caller_ids if cid in self._nodes]
+
+    def callees_of(self, node_id: str) -> list[KGNode]:
+        callee_ids = dict.fromkeys(e.target_id for e in self._edges
+                                   if e.source_id == node_id and e.edge_type == "calls")
+        return [self._nodes[cid] for cid in callee_ids if cid in self._nodes]
+
+    def imports_of(self, module_id: str) -> list[KGNode]:
+        imp_ids = dict.fromkeys(e.target_id for e in self._edges
+                                if e.source_id == module_id and e.edge_type == "imports")
+        return [self._nodes[i] for i in imp_ids if i in self._nodes]
+
+    def search(self, keywords: str, node_type: str | None = None) -> list[KGNode]:
+        """Case-insensitive search: every keyword must occur in name, docstring or source."""
+        kws = keywords.lower().split()
+        results: list[KGNode] = []
+        for node in self._nodes.values():
+            if node_type and node.node_type != node_type:
+                continue
+            haystack = f"{node.name} {node.docstring} {node.source}".lower()
+            if all(kw in haystack for kw in kws):
+                results.append(node)
+        return results
+
+    def subgraph(self, root_id: str, depth: int = 2) -> list[KGNode]:
+        """BFS over outgoing edges up to depth hops; nodes come back in visit order."""
+        visited: dict[str, None] = {}
+        frontier: dict[str, None] = {root_id: None}
+        for _ in range(depth):
+            next_frontier: dict[str, None] = {}
+            for nid in frontier:
+                if nid in visited:
+                    continue
+                visited[nid] = None
+                for e in self._edges:
+                    if e.source_id == nid and e.target_id not in visited:
+                        next_frontier[e.target_id] = None
+            frontier = next_frontier
+        visited.update(frontier)
+        return [self._nodes[nid] for nid in visited if nid in self._nodes]
+
+    # ── text representations ──────────────────────────────────────────────────
+
+    def overview(self, max_chars: int = 3000) -> str:
+        """Compact multi-line overview of the repo graph, capped to avoid LLM context overflow."""
+        lines: list[str] = [f"## Repository: {self.repo_path}", ""]
+        modules = sorted(self.all_nodes("module"), key=lambda n: n.file_path)
+        all_fns  = self.all_nodes("function")
+        all_cls  = self.all_nodes("class")
+        lines.append(f"  {len(modules)} modules · {len(all_fns)} functions · {len(all_cls)} classes")
+        lines.append("")
+
+        for shown, mod in enumerate(modules, start=1):
+            children = self.children_of(mod.node_id)
+            funcs   = [c for c in children if c.node_type in ("function", "method")]
+            classes = [c for c in children if c.node_type == "class"]
+            summary = []
+            if classes:
+                summary.append(f"{len(classes)} class{'es' if len(classes)>1 else ''}")
+            if funcs:
+                summary.append(f"{len(funcs)} fn{'s' if len(funcs)>1 else ''}")
+            lines.append(f"  [{mod.file_path}]  ({', '.join(summary) or 'empty'})")
+            for cls in sorted(classes, key=lambda n: n.name):
+                methods = [c for c in self.children_of(cls.node_id) if c.node_type == "method"]
+                mnames  = ", ".join(m.name for m in sorted(methods, key=lambda n: n.line_start))
+                lines.append(f"    class {cls.name}  →  {mnames or '(no methods)'}")
+                lines.append(f"      node_id: {cls.node_id}")
+            for fn in sorted(funcs, key=lambda n: n.line_start):
+                lines.append(f"    def {fn.name}{fn.metadata.get('signature', '')}")
+                lines.append(f"      node_id: {fn.node_id}")
+
+            # Stop once the cap is exceeded; ``shown`` counts modules in display order.
+            current = "\n".join(lines)
+            if len(current) > max_chars:
+                remaining = len(modules) - shown
+                if remaining:
+                    lines.append(f"\n  ... [{remaining} more modules not shown — use query() to explore]")
+                break
+
+        return "\n".join(lines)
+
+    def node_detail(self, node_id: str) -> str:
+        """Full inspection view of a single node."""
+        node = self._nodes.get(node_id)
+        if node is None:
+            return f"[ERROR] node_id {node_id!r} not found in graph."
+        lines = [
+            f"## Node: {node.node_id}",
+            f"type     : {node.node_type}",
+            f"file     : {node.file_path}  (lines {node.line_start}–{node.line_end})",
+        ]
+        if node.docstring:
+            lines.append(f"docstring: {node.docstring[:120]}")
+        callers = self.callers_of(node_id)
+        callees = self.callees_of(node_id)
+        if callers:
+            lines.append("called by: " + ", ".join(n.name for n in callers))
+        if callees:
+            lines.append("calls    : " + ", ".join(n.name for n in callees))
+        children = self.children_of(node_id)
+        if children:
+            lines.append("contains : " + ", ".join(c.name for c in children))
+        lines += ["", "### Source", "```python", node.source or "(no source)", "```"]
+        return "\n".join(lines)
+
+    def snapshot(self) -> "KnowledgeGraph":
+        """Deep copy — used to preserve state before mutations."""
+        import copy
+        return copy.deepcopy(self)
diff --git a/graphforge/materializer/__init__.py b/graphforge/materializer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d568b7de75e53c047058946a163cf90d6b40d80
--- /dev/null
+++ b/graphforge/materializer/__init__.py
@@ -0,0 +1,20 @@
+"""Graph -> Python source projection.
+
+Responsibilities (PROPOSAL.md §3.3):
+
+  * Emit one ``<module>.py`` per declared module.
+  * Emit functions in :attr:`Node.decl_order` order.
+  * Compute ``from <module> import <name>`` lines from cross-module edges,
+    deduplicated and sorted.
+  * Expand body templates with the node's ``body_template_args`` to produce
+    a runnable function body.
+
+The materializer is total over well-formed graphs: every dispatcher-accepted
+graph must produce parseable source. Round-trip correctness (the produced
+source re-parses to the same graph) is enforced by tests in
+:mod:`graphforge.parser` (TODO).
+"""
+
+from graphforge.materializer.materialize import materialize
+
+__all__ = ["materialize"]
diff --git a/graphforge/materializer/codegen.py b/graphforge/materializer/codegen.py
new file mode 100644
index 0000000000000000000000000000000000000000..b83e9b64b61cfa6751fbe8487cd8b2b479c25102
--- /dev/null
+++ b/graphforge/materializer/codegen.py
@@ -0,0 +1,169 @@
+"""Per-template body codegen.
+
+Each public ``render_<template>`` function takes the host node, its outgoing
+edges in deterministic order, and returns a multi-line indented body suitable
+for inserting after a ``def`` line. Bodies use only stdlib and never reference
+unresolved names (the orchestrator ensures imports + pattern constants are
+in scope).
+
+Codegen is intentionally simple: the goal is *runnable, readable* Python that
+respects template semantics, not optimal idiomatic code.
+"""
+
+from __future__ import annotations
+
+from graphforge.graph.schema import Edge, Graph, Node
+from graphforge.materializer import patterns
+
+INDENT = "    "
+
+
+# ---- helpers ---------------------------------------------------------
+
+
+def _kwargs_for(edge: Edge) -> str:
+    """Format the edge's arg_mapping as comma-separated ``callee_param=caller_arg`` pairs."""
+    return ", ".join([f"{pair.callee_param}={pair.caller_arg}" for pair in edge.arg_mapping])
+
+
+def _callee_name(edge: Edge) -> str:
+    """Bare function name to use at the call site.
+
+    Everything up to the first dot of ``edge.callee`` is dropped; the orchestrator
+    emits ``from <module> import <name>`` so the bare name resolves across modules.
+    """
+    return edge.callee.split(".", 1)[1]
+
+
+def _indent(lines: list[str]) -> str:  # prefix every line with one INDENT level, join with newlines
+    return "\n".join(INDENT + line for line in lines)
+
+
+# ---- per-template renderers -----------------------------------------
+
+
+def render_passthrough_call(node: Node, out_edges: list[Edge], _g: Graph) -> str:
+    """Emit ``return callee(kwargs)`` for the node's single out-edge; ValueError otherwise."""
+    if len(out_edges) != 1:
+        found = len(out_edges)
+        raise ValueError(f"passthrough_call on {node.qualified_name} requires 1 out-edge, got {found}")
+    (only,) = out_edges
+    call = f"{_callee_name(only)}({_kwargs_for(only)})"
+    return _indent([f"return {call}"])
+
+
+def render_sequential_calls(node: Node, out_edges: list[Edge], _g: Graph) -> str:
+    """Call every out-edge's callee in order and return the last call's result.
+
+    Raises ValueError when the node has no out-edges.
+    """
+    if not out_edges:
+        raise ValueError(f"sequential_calls on {node.qualified_name} requires >=1 out-edge")
+    *leading, final = out_edges
+    statements = [f"{_callee_name(e)}({_kwargs_for(e)})" for e in leading]
+    statements.append(f"return {_callee_name(final)}({_kwargs_for(final)})")
+    return _indent(statements)
+
+
+def render_validate_with_regex(node: Node, out_edges: list[Edge], _g: Graph) -> str:
+    """Emit a whole-string regex check on the node's first parameter; ValueError if malformed."""
+    if out_edges:
+        raise ValueError(f"validate_with_regex on {node.qualified_name} must have 0 out-edges")
+    pattern_name = str(node.body_template_args.get("pattern", ""))
+    if patterns.get_pattern(pattern_name) is None:
+        raise ValueError(
+            f"unknown regex pattern {pattern_name!r} on {node.qualified_name}; "
+            f"known: {patterns.known_patterns()}"
+        )
+    constant = patterns.constant_name(pattern_name)
+    # The host signature is expected to be (s: str) -> bool — but we just use
+    # the first parameter name, whatever it is, to be tolerant.
+    from graphforge.actions.signature import parse_signature
+    parsed = parse_signature(node.signature)
+    if not parsed.parameters:
+        raise ValueError(
+            f"validate_with_regex on {node.qualified_name} requires "
+            f"at least one parameter"
+        )
+    arg = parsed.parameters[0].name
+    # fullmatch, not match: a validator must reject trailing text after a valid prefix.
+    return _indent([f"return re.fullmatch({constant}, {arg}) is not None"])
+
+
+def render_early_return_guard(node: Node, out_edges: list[Edge], _g: Graph) -> str:
+    """Guard the single callee behind ``condition``: the body returns None unless it holds."""
+    if len(out_edges) != 1:
+        raise ValueError(
+            f"early_return_guard on {node.qualified_name} requires 1 out-edge"
+        )
+    guard = str(node.body_template_args.get("condition", "True"))
+    (target,) = out_edges
+    body = [
+        f"if not ({guard}):",
+        f"{INDENT}return None",
+        f"return {_callee_name(target)}({_kwargs_for(target)})",
+    ]
+    return _indent(body)
+
+
+def render_try_call_with_fallback(node: Node, out_edges: list[Edge], _g: Graph) -> str:
+    """Try the first out-edge's callee; on any Exception return the second one's result."""
+    if len(out_edges) != 2:
+        raise ValueError(
+            f"try_call_with_fallback on {node.qualified_name} requires "
+            f"exactly 2 out-edges (primary, fallback)"
+        )
+    attempt, rescue = (f"{_callee_name(e)}({_kwargs_for(e)})" for e in out_edges)
+    lines = [
+        "try:",
+        f"{INDENT}return {attempt}",
+        "except Exception:",
+        f"{INDENT}return {rescue}",
+    ]
+    return _indent(lines)
+
+
+def render_leaf_constant(node: Node, out_edges: list[Edge], _g: Graph) -> str:
+    """Emit ``return <repr(value)>`` for the constant stored under ``args.value``."""
+    if out_edges:
+        raise ValueError(
+            f"leaf_constant on {node.qualified_name} must have 0 out-edges"
+        )
+    template_args = node.body_template_args
+    if "value" not in template_args:
+        raise ValueError(f"leaf_constant on {node.qualified_name} requires args.value")
+    literal = repr(template_args["value"])
+    return _indent([f"return {literal}"])
+
+
+# ---- registry --------------------------------------------------------
+
+
+_RENDERERS: dict[str, object] = {  # template name -> renderer called as fn(node, out_edges, graph)
+    "passthrough_call": render_passthrough_call,
+    "sequential_calls": render_sequential_calls,
+    "validate_with_regex": render_validate_with_regex,
+    "early_return_guard": render_early_return_guard,
+    "try_call_with_fallback": render_try_call_with_fallback,
+    "leaf_constant": render_leaf_constant,
+}
+
+
+def render_body(node: Node, out_edges: list[Edge], graph: Graph) -> str:
+    """Dispatch to the renderer registered for ``node.body_template``."""
+    template = node.body_template
+    if template is None:
+        # Nothing attached yet: a raising stub keeps the emitted file parseable.
+        return _indent(['raise NotImplementedError("body not attached")'])
+    renderer = _RENDERERS.get(template)
+    if renderer is not None:
+        return renderer(node, out_edges, graph)  # type: ignore[operator]
+    # A template name with no registered renderer is reported, not skipped.
+    raise ValueError(f"no codegen for template {template!r} on {node.qualified_name}")
+
+
+def template_imports(template: str | None) -> set[str]:
+    """Return the stdlib module names the template's generated body depends on."""
+    stdlib_by_template = {"validate_with_regex": {"re"}}
+    # ``None`` and templates without stdlib needs both fall back to an empty set.
+    return set(stdlib_by_template.get(template, ()))
diff --git a/graphforge/materializer/materialize.py b/graphforge/materializer/materialize.py
new file mode 100644
index 0000000000000000000000000000000000000000..c827fcfa0a16f68d17d5da4a8f7553b8e2c5b917
--- /dev/null
+++ b/graphforge/materializer/materialize.py
@@ -0,0 +1,134 @@
+"""Materialize a :class:`Graph` into a dict of ``{filename: source}``.
+
+Determinism guarantees:
+
+  * One file per module, named ``<module>.py``.
+  * Within a file, functions emitted in :attr:`Node.decl_order`.
+  * Imports sorted: stdlib first (alpha), then ``from <module> import <name>``
+    (alpha by module, alpha by name).
+  * Pattern constants emitted only if used, in alpha order.
+  * Out-edges of a node iterated in insertion order, which matters for
+    ``sequential_calls`` and ``try_call_with_fallback`` semantics.
+
+The orchestrator is a pure function: same graph in, same source out.
+"""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from typing import Iterable
+
+from graphforge.graph.schema import Edge, Graph, Node
+from graphforge.materializer import codegen, patterns
+
+HEADER = '"""Auto-generated by graphforge.materializer. Do not edit by hand."""\n'
+FUTURE = "from __future__ import annotations\n"
+
+
+# ---- helpers ---------------------------------------------------------
+
+
+def _out_edges_in_order(graph: Graph, qualified: str) -> list[Edge]:
+    """Edges whose caller is ``qualified``, kept in the order ``graph.edges`` stores them."""
+    return list(filter(lambda edge: edge.caller == qualified, graph.edges))
+
+
+def _nodes_by_module(graph: Graph) -> dict[str, list[Node]]:
+    """Group nodes under their module name; each group is sorted by (decl_order, name)."""
+    grouped: dict[str, list[Node]] = defaultdict(list)
+    for node in graph.nodes:
+        grouped[node.module].append(node)
+    for bucket in grouped.values():
+        bucket.sort(key=lambda member: (member.decl_order, member.name))
+    return grouped
+
+
+def _cross_module_imports(graph: Graph, module: str) -> list[tuple[str, str]]:
+    """Sorted, de-duplicated ``(callee_module, callee_name)`` pairs that ``module`` must import."""
+    needed: set[tuple[str, str]] = set()
+    for edge in graph.edges:
+        if edge.caller.split(".", 1)[0] != module:
+            continue
+        target_mod, target_name = edge.callee.split(".", 1)
+        if target_mod == module:
+            continue
+        needed.add((target_mod, target_name))
+    return sorted(needed)
+
+
+def _stdlib_imports_for(nodes: Iterable[Node]) -> list[str]:
+    """Sorted union of the stdlib modules required by the templates of ``nodes``."""
+    per_node = (codegen.template_imports(n.body_template) for n in nodes)
+    merged: set[str] = set()
+    merged.update(*per_node)
+    return sorted(merged)
+
+
+def _patterns_used_by(nodes: Iterable[Node]) -> list[str]:
+    """Sorted names of registered patterns referenced by validate_with_regex nodes."""
+    referenced = {
+        str(n.body_template_args.get("pattern", ""))
+        for n in nodes
+        if n.body_template == "validate_with_regex"
+    }
+    known = [name for name in referenced if patterns.get_pattern(name) is not None]
+    return sorted(known)
+
+
+# ---- core ------------------------------------------------------------
+
+
+def materialize(graph: Graph) -> dict[str, str]:
+    """Render every declared module of ``graph`` into a ``{"<module>.py": source}`` map.
+
+    A module that owns no nodes still gets a file (header and future import
+    only), so that downstream import resolution can find it. Files are keyed
+    in the order ``graph.modules`` lists them.
+    """
+    grouped = _nodes_by_module(graph)
+    return {
+        f"{module.name}.py": _render_module(graph, module.name, grouped.get(module.name, []))
+        for module in graph.modules
+    }
+
+
+def _render_module(graph: Graph, module_name: str, nodes: list[Node]) -> str:
+    """Assemble the source text of a single module.
+
+    Layout: header and future import, stdlib imports, cross-module imports,
+    pattern constants, then one ``def`` per node in the order of ``nodes``.
+    The returned text always ends with exactly one newline.
+    """
+    stdlib_names = _stdlib_imports_for(nodes)
+    cross_imports = _cross_module_imports(graph, module_name)
+    chunks: list[str] = [HEADER, FUTURE, "\n"]
+
+    # Import section: stdlib modules first, then ``from <module> import <name>``.
+    chunks.extend(f"import {name}\n" for name in stdlib_names)
+    chunks.extend(f"from {mod} import {name}\n" for mod, name in cross_imports)
+    if any(stdlib_names) or cross_imports:
+        chunks.append("\n")
+
+    # Pattern constants. ``repr()`` of the regex is already a valid string
+    # literal with its backslashes escaped, so no ``r"..."`` prefix is added:
+    # that would double the backslashes and break metacharacters such as
+    # ``\s`` and ``\d``.
+    pattern_names = _patterns_used_by(nodes)
+    chunks.extend(
+        f"{patterns.constant_name(name)} = {patterns.get_pattern(name)!r}\n"
+        for name in pattern_names
+    )
+    if pattern_names:
+        chunks.append("\n")
+
+    # Functions, separated from each other by a single blank line.
+    rendered: list[str] = []
+    for node in nodes:
+        edges = _out_edges_in_order(graph, node.qualified_name)
+        body = codegen.render_body(node, edges, graph)
+        rendered.append(f"def {node.name}{node.signature}:\n{body}\n")
+    chunks.append("\n".join(rendered))
+
+    # Normalise the tail: strip every trailing newline, then add one back.
+    source = "".join(chunks)
+    return source.rstrip("\n") + "\n"
diff --git a/graphforge/materializer/patterns.py b/graphforge/materializer/patterns.py
new file mode 100644
index 0000000000000000000000000000000000000000..d541de30b6de46e954f39d660b5e8854b8211fea
--- /dev/null
+++ b/graphforge/materializer/patterns.py
@@ -0,0 +1,34 @@
+"""Named regex patterns for ``validate_with_regex`` template.
+
+Patterns are referenced by name in the graph (e.g. ``args={"pattern": "EMAIL"}``)
+and resolved here at materialization time. The registry keeps task definitions
+domain-agnostic — a task constraint can name a pattern without leaking the
+regex itself into the graph schema.
+
+Add new patterns sparingly; every name here becomes part of the constraint
+vocabulary that tasks can use.
+"""
+
+from __future__ import annotations
+
+# name -> regex source string (no description is stored alongside it)
+_PATTERNS: dict[str, str] = {
+    "EMAIL": r"[^@\s]+@[^@\s]+\.[^@\s]+",
+    "HEXCOLOR": r"#[0-9a-fA-F]{6}",
+    "PHONE": r"\+?\d{10,15}",
+    "ALPHANUM": r"[A-Za-z0-9]+",
+    "URL": r"https?://[^\s]+",
+}
+
+
+def known_patterns() -> list[str]:
+    return sorted(_PATTERNS)  # iterating the dict yields the pattern names
+
+
+def get_pattern(name: str) -> str | None:
+    return _PATTERNS.get(name)  # None when ``name`` is not a registered pattern
+
+
+def constant_name(name: str) -> str:
+    """Name of the module-level constant emitted for the pattern called ``name``."""
+    return "_PATTERN_{}".format(name)
diff --git a/graphforge/parser/__init__.py b/graphforge/parser/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a2d52bb37c14d5d30e6fe18298d4f0ac9373ca3
--- /dev/null
+++ b/graphforge/parser/__init__.py
@@ -0,0 +1,27 @@
+"""Round-trip parser: Python source -> Graph.
+
+Responsibilities (PROPOSAL.md §3.4):
+
+  * Walk an AST per module file.
+  * Recover function declarations as :class:`Node` objects.
+  * Recover ``from x import y`` lines as cross-module edges (best-effort).
+  * Recognize body templates by structural pattern matching against the
+    template library, and recover ``body_template`` + ``body_template_args``.
+  * Produce a :class:`Graph` identical (per ``structural_hash``) to the one
+    that produced the source via :mod:`graphforge.materializer`.
+
+The round-trip parser is unit-tested against every body template + every
+constraint pattern. If it fails to round-trip, the materializer emits a
+warning and the graph is treated as canonical.
+
+Public surface (TODO):
+
+    parse_program(files: dict[str, str]) -> Graph
+    parse_directory(path: Path) -> Graph
+"""
+
+from __future__ import annotations
+
+
+def parse_program(files: dict[str, str]) -> object:  # pragma: no cover — TODO
+    raise NotImplementedError("round-trip parser TODO — see PROPOSAL.md §3.4")  # stub: always raises
diff --git a/graphforge/repo_parser.py b/graphforge/repo_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0400c35ac88bcb46c7014968247d4f6c22cb024
--- /dev/null
+++ b/graphforge/repo_parser.py
@@ -0,0 +1,271 @@
+"""Parse a Python repository (directory tree) into a KnowledgeGraph.
+
+Usage
+-----
+    from graphforge.repo_parser import parse_repo
+    kg = parse_repo("/path/to/my_package")
+
+What it extracts
+----------------
+  Nodes  : repo, package, module, class, function, method
+  Edges  : contains, calls (same-file), imports, inherits
+
+Call resolution is best-effort and order-dependent: a bare-name call from A to B
+is linked only when B is defined in A's own file or in a file parsed after it.
+"""
+
+from __future__ import annotations
+
+import ast
+import os
+from pathlib import Path
+from typing import Any
+
+from graphforge.knowledge_graph import KGEdge, KGNode, KnowledgeGraph
+
+
+# ── helpers ───────────────────────────────────────────────────────────────────
+
+def _node_id(node_type: str, file_path: str, *names: str) -> str:
+    """Join the non-empty components with ':' to form a node id."""
+    return ":".join(filter(None, (node_type, file_path, *names)))
+
+
+def _sig(node: ast.FunctionDef | ast.AsyncFunctionDef) -> str:
+    """Render the full parameter list and return annotation, e.g. ``(a, *b, c=1) -> int``.
+
+    Unparsing the whole ``arguments`` node keeps ``/``, ``*args``, keyword-only, ``**kwargs`` and defaults.
+    """
+    ret = f" -> {ast.unparse(node.returns)}" if node.returns else ""
+    return f"({ast.unparse(node.args)}){ret}"
+
+
+def _source_slice(source_lines: list[str], start: int, end: int) -> str:
+    """Return lines ``start``..``end`` (1-indexed, both inclusive) joined by newlines."""
+    return "\n".join(source_lines[slice(start - 1, end)])
+
+
+def _direct_calls(func_node: ast.FunctionDef | ast.AsyncFunctionDef) -> set[str]:
+    """Names invoked as plain ``name(...)`` calls anywhere beneath ``func_node``."""
+    return {
+        sub.func.id
+        for sub in ast.walk(func_node)
+        if isinstance(sub, ast.Call) and isinstance(sub.func, ast.Name)
+    }
+
+
+# ── single-file parser ────────────────────────────────────────────────────────
+
+def _parse_file(
+    file_path: str,       # relative to repo root
+    abs_path: str,
+    kg: KnowledgeGraph,
+    parent_id: str,
+) -> None:
+    """Add one module and its classes/functions to ``kg``, attached under ``parent_id``.
+
+    Best-effort: a file that cannot be read or parsed is skipped without raising.
+    """
+    try:
+        source = Path(abs_path).read_text(encoding="utf-8", errors="replace")
+        tree = ast.parse(source, filename=abs_path)
+    except (OSError, SyntaxError, ValueError):  # ValueError: null bytes, pre-3.12 ast.parse
+        return
+
+    lines = source.splitlines()
+    mod_id = _node_id("module", file_path)
+
+    # Module node: spans the whole file and keeps the full source text.
+    mod_doc = ast.get_docstring(tree) or ""
+    kg.add_node(KGNode(
+        node_id=mod_id,
+        node_type="module",
+        name=Path(file_path).stem,
+        file_path=file_path,
+        line_start=1,
+        line_end=len(lines),
+        source=source,
+        docstring=mod_doc,
+    ))
+    kg.add_edge(KGEdge("contains", parent_id, mod_id))
+
+    # Import edges: dotted module names are mapped to "<a>/<b>.py" ids; targets may not exist.
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Import):
+            for alias in node.names:
+                imp_id = _node_id("module", alias.name.replace(".", "/") + ".py")
+                kg.add_edge(KGEdge("imports", mod_id, imp_id))
+        elif isinstance(node, ast.ImportFrom) and node.module:
+            imp_id = _node_id("module", node.module.replace(".", "/") + ".py")
+            kg.add_edge(KGEdge("imports", mod_id, imp_id))
+
+    # Top-level classes and functions
+    func_name_to_id: dict[str, str] = {}   # bare name -> node id, for this file only
+
+    for stmt in tree.body:
+        if isinstance(stmt, ast.ClassDef):
+            _parse_class(stmt, file_path, lines, kg, mod_id, func_name_to_id)
+        elif isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef)):
+            _parse_function(stmt, file_path, lines, kg, mod_id, func_name_to_id)
+
+    # Link recorded call names to the functions/methods defined in this file.
+    _resolve_calls(func_name_to_id, kg)
+
+
+def _parse_class(
+    cls_node: ast.ClassDef,
+    file_path: str,
+    lines: list[str],
+    kg: KnowledgeGraph,
+    parent_id: str,
+    func_name_to_id: dict[str, str],
+) -> None:
+    """Record a class node, its ``inherits`` edges, and its directly defined methods."""
+    cls_id = _node_id("class", file_path, cls_node.name)
+    kg.add_node(KGNode(
+        node_id=cls_id,
+        node_type="class",
+        name=cls_node.name,
+        file_path=file_path,
+        line_start=cls_node.lineno,
+        line_end=cls_node.end_lineno,
+        source=_source_slice(lines, cls_node.lineno, cls_node.end_lineno),
+        docstring=ast.get_docstring(cls_node) or "",
+    ))
+    kg.add_edge(KGEdge("contains", parent_id, cls_id))
+
+    # Only plain-name bases are linked; the base id is built against this same file.
+    plain_bases = [b.id for b in cls_node.bases if isinstance(b, ast.Name)]
+    for base_name in plain_bases:
+        kg.add_edge(KGEdge("inherits", cls_id, _node_id("class", file_path, base_name)))
+
+    # Methods: function definitions that sit directly in the class body.
+    for member in cls_node.body:
+        if not isinstance(member, (ast.FunctionDef, ast.AsyncFunctionDef)):
+            continue
+        _parse_method(member, file_path, lines, kg, cls_id, cls_node.name, func_name_to_id)
+
+
+def _parse_function(
+    fn: ast.FunctionDef | ast.AsyncFunctionDef,
+    file_path: str,
+    lines: list[str],
+    kg: KnowledgeGraph,
+    parent_id: str,
+    func_name_to_id: dict[str, str],
+) -> None:
+    """Record a function node under ``parent_id``.
+
+    The function's name is also stored in ``func_name_to_id`` for call resolution.
+    """
+    fn_id = _node_id("function", file_path, fn.name)
+    metadata = {"signature": _sig(fn), "calls": list(_direct_calls(fn))}
+    kg.add_node(KGNode(
+        node_id=fn_id, node_type="function", name=fn.name, file_path=file_path,
+        line_start=fn.lineno, line_end=fn.end_lineno,
+        source=_source_slice(lines, fn.lineno, fn.end_lineno),
+        docstring=ast.get_docstring(fn) or "",
+        metadata=metadata,
+    ))
+    kg.add_edge(KGEdge("contains", parent_id, fn_id))
+    func_name_to_id[fn.name] = fn_id
+
+
+def _parse_method(
+    fn: ast.FunctionDef | ast.AsyncFunctionDef,
+    file_path: str,
+    lines: list[str],
+    kg: KnowledgeGraph,
+    parent_id: str,
+    class_name: str,
+    func_name_to_id: dict[str, str],
+) -> None:
+    """Record a method node under its class and expose it for bare-name call resolution."""
+    method_id = _node_id("method", file_path, class_name, fn.name)
+    kg.add_node(KGNode(
+        node_id=method_id, node_type="method",
+        name=fn.name,
+        file_path=file_path,
+        line_start=fn.lineno,
+        line_end=fn.end_lineno,
+        source=_source_slice(lines, fn.lineno, fn.end_lineno),
+        docstring=ast.get_docstring(fn) or "",
+        metadata={"signature": _sig(fn), "calls": list(_direct_calls(fn))},
+    ))
+    kg.add_edge(KGEdge("contains", parent_id, method_id))
+    # A bare-name call binds to a module-level name, so a method must never displace
+    # a same-named function; it is registered only when the name is still free.
+    func_name_to_id.setdefault(fn.name, method_id)
+
+
+def _resolve_calls(func_name_to_id: dict[str, str], kg: KnowledgeGraph) -> None:
+    """Add ``calls`` edges from function/method nodes in ``kg`` to ids in ``func_name_to_id``."""
+    for fn_id, node in [(nid, n) for nid, n in kg._nodes.items()  # NOTE(review): scans every node in kg,
+                        if n.node_type in ("function", "method")]:  # not only one file's — confirm intent
+        calls: list[str] = node.metadata.get("calls", [])
+        for callee_name in calls:
+            if callee_name in func_name_to_id:
+                callee_id = func_name_to_id[callee_name]
+                if callee_id != fn_id:  # no edge from a node to itself
+                    kg.add_edge(KGEdge("calls", fn_id, callee_id))
+
+
+# ── repo walker ───────────────────────────────────────────────────────────────
+
+def parse_repo(repo_path: str, exclude_dirs: set[str] | None = None) -> KnowledgeGraph:
+    """Walk repo_path recursively and return a KnowledgeGraph.
+
+    Each sub-directory becomes a ``package`` node contained by its parent directory's
+    package (the repo node at top level). Directories are walked in sorted order.
+
+    Parameters
+    ----------
+    repo_path : str
+        Absolute or relative path to the root of the repo.
+    exclude_dirs : set[str], optional
+        Directory names to skip (e.g. {"__pycache__", ".git", "tests"}).
+    """
+    if exclude_dirs is None:
+        exclude_dirs = {"__pycache__", ".git", ".venv", "venv", "env",
+                        "node_modules", ".mypy_cache", ".pytest_cache", "dist", "build"}
+
+    abs_root = str(Path(repo_path).resolve())
+    kg = KnowledgeGraph(repo_path=repo_path)
+
+    # Root repo node
+    repo_name = Path(abs_root).name
+    repo_id = _node_id("repo", "", repo_name)
+    kg.add_node(KGNode(node_id=repo_id, node_type="repo", name=repo_name, file_path=""))
+
+    # Walk directory tree
+    for dirpath, dirnames, filenames in os.walk(abs_root):
+        # Prune excluded dirs in-place (this steers os.walk) and fix the visiting order.
+        dirnames[:] = sorted(d for d in dirnames if d not in exclude_dirs)
+
+        rel_dir = os.path.relpath(dirpath, abs_root)
+        rel_dir = "" if rel_dir == "." else rel_dir
+
+        parent_id = repo_id
+        if rel_dir:
+            pkg_id = _node_id("package", rel_dir)
+            if pkg_id not in kg._nodes:
+                kg.add_node(KGNode(
+                    node_id=pkg_id,
+                    node_type="package",
+                    name=Path(rel_dir).name,
+                    file_path=rel_dir,
+                ))
+                # os.walk is top-down, so the enclosing directory's package already exists.
+                outer_dir = os.path.dirname(rel_dir)
+                outer_id = _node_id("package", outer_dir) if outer_dir else repo_id
+                kg.add_edge(KGEdge("contains", outer_id, pkg_id))
+            parent_id = pkg_id
+
+        for fname in sorted(filenames):
+            if not fname.endswith(".py"):
+                continue
+            rel_file = os.path.join(rel_dir, fname) if rel_dir else fname
+            abs_file = os.path.join(dirpath, fname)
+            _parse_file(rel_file, abs_file, kg, parent_id)
+
+    return kg
diff --git a/graphforge/repo_registry.py b/graphforge/repo_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..9312f3925055ed2fdd6bf48b1f6b05e09e5d16d4
--- /dev/null
+++ b/graphforge/repo_registry.py
@@ -0,0 +1,145 @@
+"""Registry of training repos with their clone URLs and source paths.
+
+Add a new repo by appending to REGISTRY. The pipeline will clone it,
+parse it, and auto-generate tasks from its doctests.
+
+Each entry:
+    name        short identifier used in task_ids
+    url         git clone URL (depth-1 clone)
+    src_hint    subdirectory containing the Python package
+                (tried as: <clone>/<hint>, <clone>/src/<hint>, <clone>)
+    n_tasks     max tasks to pull from this repo
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+@dataclass
+class RepoSpec:  # one clonable training repo (an entry of REGISTRY)
+    name: str  # short identifier; also the clone directory name under clone_root
+    url: str  # git clone URL
+    src_hint: str  # subdirectory containing the Python package
+    n_tasks: int = 6  # max tasks to pull from this repo
+
+
+REGISTRY: list[RepoSpec] = [  # used by load_all_tasks when no registry is passed
+    # ── string / text ────────────────────────────────────────────────────────
+    RepoSpec(
+        name="humanize",
+        url="https://github.com/jmoiron/humanize.git",
+        src_hint="src/humanize",
+        n_tasks=6,
+    ),
+    RepoSpec(
+        name="wcwidth",
+        url="https://github.com/jquast/wcwidth.git",
+        src_hint="wcwidth",
+        n_tasks=6,
+    ),
+    RepoSpec(
+        name="inflect",
+        url="https://github.com/jaraco/inflect.git",
+        src_hint="inflect",
+        n_tasks=4,
+    ),
+
+    # ── iteration / functional ───────────────────────────────────────────────
+    RepoSpec(
+        name="boltons",
+        url="https://github.com/mahmoud/boltons.git",
+        src_hint="boltons",
+        n_tasks=10,
+    ),
+    RepoSpec(
+        name="more-itertools",
+        url="https://github.com/more-itertools/more-itertools.git",
+        src_hint="more_itertools",
+        n_tasks=8,
+    ),
+    RepoSpec(
+        name="toolz",
+        url="https://github.com/pytoolz/toolz.git",
+        src_hint="toolz",
+        n_tasks=6,
+    ),
+
+    # ── data transformation / ETL ────────────────────────────────────────────
+    RepoSpec(
+        name="petl",
+        url="https://github.com/petl-developers/petl.git",
+        src_hint="src/petl",
+        n_tasks=8,
+    ),
+    RepoSpec(
+        name="pydash",
+        url="https://github.com/dgilland/pydash.git",
+        src_hint="src/pydash",
+        n_tasks=8,
+    ),
+
+]
+
+# Repos that were evaluated and produced 0 tasks (no literal-eval-able doctests):
+#   num2words, parse, dateutil — omitted from REGISTRY
+
+
+def _find_src(clone_dir: str, hint: str) -> str:
+    """Pick the source directory to parse inside a cloned repo.
+
+    Tries <clone>/<hint>, then <clone>/src/<hint>, then <clone>; the clone
+    root is also the result when none of them is an existing directory.
+    """
+    search_order = (f"{clone_dir}/{hint}", f"{clone_dir}/src/{hint}", clone_dir)
+    existing_dirs = (path for path in search_order if Path(path).is_dir())
+    return next(existing_dirs, clone_dir)
+
+
+def load_all_tasks(
+    clone_root: str = "/tmp/train_repos",
+    registry: list[RepoSpec] | None = None,
+    verbose: bool = True,
+) -> list:
+    """Clone every repo in the registry and return all generated tasks.
+
+    Args:
+        clone_root: Directory under which repos are cloned (created if absent).
+        registry:   Custom registry; only ``None`` falls back to REGISTRY, so
+                    an explicitly empty list yields no tasks.
+        verbose:    Print progress.
+
+    Returns:
+        Flat list of the tasks generate_tasks produced across all repos.
+    """
+    import subprocess
+    from graphforge.task_generator import generate_tasks
+
+    specs = REGISTRY if registry is None else registry
+    all_tasks = []
+    Path(clone_root).mkdir(parents=True, exist_ok=True)
+
+    for spec in specs:
+        clone_dir = str(Path(clone_root) / spec.name)
+        if not Path(clone_dir).exists():  # an existing directory is reused as-is
+            if verbose:
+                print(f"Cloning {spec.name} ...")
+            subprocess.check_call(  # a failed clone propagates CalledProcessError
+                ["git", "clone", "--depth", "1", "-q", spec.url, clone_dir]
+            )
+
+        src = _find_src(clone_dir, spec.src_hint)
+        try:
+            kg, tasks = generate_tasks(src, n_tasks=spec.n_tasks)
+            all_tasks.extend(tasks)
+            if verbose:
+                print(f"  {spec.name}: {len(tasks)} tasks  "
+                      f"(DAG {len(kg._nodes)} nodes)")
+        except Exception as exc:  # skip this repo and carry on with the rest
+            if verbose:
+                print(f"  {spec.name}: SKIPPED — {exc}")
+
+    if verbose:
+        print(f"\nTotal auto-tasks: {len(all_tasks)}")
+    return all_tasks
diff --git a/graphforge/reward/__init__.py b/graphforge/reward/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c0fa9699178a6fcd1554e700405c38c72660e82
--- /dev/null
+++ b/graphforge/reward/__init__.py
@@ -0,0 +1,45 @@
+"""Reward engine — see :mod:`graphforge.reward.engine`.
+
+Per-turn (dense, small) and terminal (sparse, large) reward computation
+following PROPOSAL.md §5.
+"""
+
+from graphforge.reward.engine import (
+    ActionOutcome,
+    ALL_BEHAVIORAL_BONUS,
+    ALL_STRUCTURAL_BONUS,
+    ALPHA_TOKEN_COST,
+    BEHAVIORAL_PER_PASS,
+    DUPLICATE_ACTION,
+    MATERIALIZE_FAIL_PENALTY,
+    MUTATION_FAIL,
+    PER_TURN_COST,
+    SCHEMA_REJECTION,
+    STRUCTURAL_PER_SAT,
+    TYPE_CHECK_BONUS,
+    TOKEN_EFFICIENCY_MAX,
+    TerminalReward,
+    TurnReward,
+    score_terminal,
+    score_turn,
+)
+
+__all__ = [  # public names, all re-exported from graphforge.reward.engine
+    "ALPHA_TOKEN_COST",
+    "ALL_BEHAVIORAL_BONUS",
+    "ALL_STRUCTURAL_BONUS",
+    "ActionOutcome",
+    "BEHAVIORAL_PER_PASS",
+    "DUPLICATE_ACTION",
+    "MATERIALIZE_FAIL_PENALTY",
+    "MUTATION_FAIL",
+    "PER_TURN_COST",
+    "SCHEMA_REJECTION",
+    "STRUCTURAL_PER_SAT",
+    "TOKEN_EFFICIENCY_MAX",
+    "TYPE_CHECK_BONUS",
+    "TerminalReward",
+    "TurnReward",
+    "score_terminal",
+    "score_turn",
+]
diff --git a/graphforge/reward/engine.py b/graphforge/reward/engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..581e5b4cce4942e24c3724fa58ef21501a8350b9
--- /dev/null
+++ b/graphforge/reward/engine.py
@@ -0,0 +1,211 @@
+"""Reward engine — per-turn (dense, small) and terminal (sparse, large).
+
+Implementation follows PROPOSAL.md §5 verbatim. The two halves are pure
+functions over lightweight envelopes so the server can call them without
+threading state through the reward module.
+
+Decisions worth flagging:
+
+* ``All-behavioral-passing`` bonus is awarded only when there is at least
+  one behavioral test. The gate for the token-efficiency bonus, however,
+  treats zero behavioral tests as vacuously satisfied (so a tier-0 task
+  with no behavioral tests can still earn token-efficiency reward).
+* ``type_checks_ok`` is tri-state: ``True`` / ``False`` / ``None``. ``None``
+  means the type-check gate didn't run (e.g. mypy isn't wired yet); the
+  +3 bonus is suppressed in that case.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from enum import Enum
+
+# Coefficients (PROPOSAL.md §5.1). Override at call time if you want.
+ALPHA_TOKEN_COST: float = 0.0008  # cost per returned token; default ``alpha`` of score_turn
+PER_TURN_COST: float = -0.1  # flat charge per turn; default ``per_turn_cost`` of score_turn
+MUTATION_FAIL: float = -2.0  # base reward for ActionOutcome.FAILURE
+SCHEMA_REJECTION: float = -2.0  # base reward for ActionOutcome.MALFORMED
+DUPLICATE_ACTION: float = -1.0  # added when the action is flagged as a duplicate
+
+# Terminal magnitudes (§5.2)
+STRUCTURAL_PER_SAT: float = 1.0  # per structural constraint satisfied
+BEHAVIORAL_PER_PASS: float = 3.0  # per behavioral test passing
+ALL_STRUCTURAL_BONUS: float = 5.0  # all structural constraints satisfied (at least one exists)
+ALL_BEHAVIORAL_BONUS: float = 5.0  # all behavioral tests passing (at least one exists)
+TYPE_CHECK_BONUS: float = 3.0  # only when type_checks_ok is True
+MATERIALIZE_FAIL_PENALTY: float = -8.0  # applied when materialization_ok is false
+TOKEN_EFFICIENCY_MAX: float = 5.0  # efficiency bonus when none of the budget is used
+
+
+# ---- per-turn -------------------------------------------------------
+
+
+class ActionOutcome(str, Enum):
+    """Coarse classification used by ``score_turn``.
+
+    ``SUCCESS``    — mutation or info action returned ``ok=True``; base reward 0.
+    ``FAILURE``    — handler raised :class:`ActionError` (rollback path); MUTATION_FAIL.
+    ``MALFORMED``  — pydantic schema rejected the action at parse time; SCHEMA_REJECTION.
+    """
+
+    SUCCESS = "success"
+    FAILURE = "failure"
+    MALFORMED = "malformed"
+
+
+@dataclass(frozen=True)
+class TurnReward:
+    """Immutable, itemised per-turn reward as produced by ``score_turn``."""
+
+    base: float          # outcome-dependent component
+    duplicate: float     # 0 or DUPLICATE_ACTION
+    per_turn: float      # PER_TURN_COST
+    token_cost: float    # alpha * tokens_returned, negated
+
+    @property
+    def total(self) -> float:
+        """Sum of the four components."""
+        return self.base + self.duplicate + self.per_turn + self.token_cost
+
+    def to_dict(self) -> dict[str, float]:
+        """Return every component plus ``total`` as a plain dict."""
+        # Key order mirrors the field order, with the derived total last.
+        names = ("base", "duplicate", "per_turn", "token_cost", "total")
+        return {name: getattr(self, name) for name in names}
+
+
+def score_turn(
+    *,
+    outcome: ActionOutcome,
+    is_duplicate: bool,
+    tokens_returned: int,
+    alpha: float = ALPHA_TOKEN_COST,
+    per_turn_cost: float = PER_TURN_COST,
+) -> TurnReward:
+    """Score one turn: outcome base + duplicate penalty + turn cost + token cost."""
+    # Anything that is not the SUCCESS or FAILURE member (MALFORMED included)
+    # is charged SCHEMA_REJECTION.
+    base = SCHEMA_REJECTION
+    if outcome is ActionOutcome.SUCCESS:
+        base = 0.0
+    elif outcome is ActionOutcome.FAILURE:
+        base = MUTATION_FAIL
+    duplicate = DUPLICATE_ACTION if is_duplicate else 0.0
+    token_cost = -alpha * max(0, tokens_returned)  # negative counts cost nothing
+    # TurnReward fields are positional: base, duplicate, per_turn, token_cost.
+    return TurnReward(base, duplicate, per_turn_cost, token_cost)
+
+
+# ---- terminal -------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class TerminalReward:
+    """Immutable, itemised end-of-episode reward as produced by ``score_terminal``."""
+
+    structural: float           # +1 per structural constraint satisfied
+    behavioral: float           # +3 per behavioral test passing
+    bonus_all_structural: float
+    bonus_all_behavioral: float
+    bonus_type_checks: float
+    penalty_materialize: float  # 0 or MATERIALIZE_FAIL_PENALTY
+    efficiency: float           # gated by all-structural AND all-behavioral
+
+    components: dict[str, object] = field(default_factory=dict)
+
+    @property
+    def total(self) -> float:
+        """Sum of the seven scalar parts (``components`` is not included)."""
+        # Accumulated left to right in field order.
+        subtotal = self.structural + self.behavioral
+        subtotal += self.bonus_all_structural
+        subtotal += self.bonus_all_behavioral
+        subtotal += self.bonus_type_checks
+        subtotal += self.penalty_materialize
+        return subtotal + self.efficiency
+
+    def to_dict(self) -> dict[str, object]:
+        """Return the scalar parts, ``total`` and ``components`` as a plain dict."""
+        # Key order: the fields first, then the derived total, then components.
+        scalar_keys = (
+            "structural", "behavioral", "bonus_all_structural",
+            "bonus_all_behavioral", "bonus_type_checks",
+            "penalty_materialize", "efficiency", "total",
+        )
+        payload = {key: getattr(self, key) for key in scalar_keys}
+        payload["components"] = self.components  # same dict object, not a copy
+        return payload
+
+
+def score_terminal(
+    *,
+    n_structural_satisfied: int,
+    n_structural_total: int,
+    n_behavioral_passing: int,
+    n_behavioral_total: int,
+    materialization_ok: bool,
+    type_checks_ok: bool | None,
+    tokens_used: int,
+    budget: int,
+) -> TerminalReward:
+    """Compute the itemised end-of-episode reward.
+
+    ``type_checks_ok`` is tri-state; only ``True`` earns TYPE_CHECK_BONUS. The
+    efficiency bonus needs all structural constraints satisfied (at least one
+    present) and no failing behavioral test; it lies in
+    ``[0, TOKEN_EFFICIENCY_MAX]``, scaled by the unused share of ``budget``.
+
+    Raises:
+        ValueError: if any count is negative or ``budget`` is not positive.
+    """
+    if n_structural_satisfied < 0 or n_structural_total < 0:
+        raise ValueError("structural counts must be non-negative")
+    if n_behavioral_passing < 0 or n_behavioral_total < 0:
+        raise ValueError("behavioral counts must be non-negative")
+    if budget <= 0:
+        raise ValueError("budget must be positive")
+
+    structural = STRUCTURAL_PER_SAT * n_structural_satisfied
+    behavioral = BEHAVIORAL_PER_PASS * n_behavioral_passing
+
+    # The "all ..." bonuses need at least one item of that kind to exist.
+    all_structural = 0 < n_structural_total == n_structural_satisfied
+    all_behavioral = 0 < n_behavioral_total == n_behavioral_passing
+    bonus_all_structural = ALL_STRUCTURAL_BONUS if all_structural else 0.0
+    bonus_all_behavioral = ALL_BEHAVIORAL_BONUS if all_behavioral else 0.0
+
+    # None (gate did not run) and False both suppress the type-check bonus.
+    bonus_type_checks = TYPE_CHECK_BONUS if type_checks_ok is True else 0.0
+    penalty_materialize = 0.0 if materialization_ok else MATERIALIZE_FAIL_PENALTY
+
+    # Efficiency bonus is gated on all-structural AND all-behavioral satisfied.
+    # When n_behavioral_total == 0 the behavioral half is vacuously satisfied
+    # for the gate's purposes (otherwise tier-0 tasks could never earn it).
+    behavioral_gate_ok = (
+        n_behavioral_total == 0 or n_behavioral_passing == n_behavioral_total
+    )
+    efficiency = 0.0
+    if all_structural and behavioral_gate_ok:
+        # Clamp usage to [0, budget] so the bonus stays in [0, TOKEN_EFFICIENCY_MAX].
+        tokens_charged = min(max(0, tokens_used), budget)
+        efficiency = TOKEN_EFFICIENCY_MAX * ((budget - tokens_charged) / budget)
+
+    return TerminalReward(
+        structural=structural,
+        behavioral=behavioral,
+        bonus_all_structural=bonus_all_structural,
+        bonus_all_behavioral=bonus_all_behavioral,
+        bonus_type_checks=bonus_type_checks,
+        penalty_materialize=penalty_materialize,
+        efficiency=efficiency,
+        components={
+            "n_structural_satisfied": n_structural_satisfied,
+            "n_structural_total": n_structural_total,
+            "n_behavioral_passing": n_behavioral_passing,
+            "n_behavioral_total": n_behavioral_total,
+            "materialization_ok": materialization_ok,
+            "type_checks_ok": type_checks_ok,
+            "tokens_used": tokens_used,
+            "budget": budget,
+        },
+    )
diff --git a/graphforge/sample_repos/humanize/__init__.py b/graphforge/sample_repos/humanize/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..318e5f69c69892e0d9a43c88f391cec6a926916d
--- /dev/null
+++ b/graphforge/sample_repos/humanize/__init__.py
@@ -0,0 +1,18 @@
+"""Humanize — convert numbers, file sizes, and times to human-readable strings."""
+from graphforge.sample_repos.humanize.filesize import naturalsize
+from graphforge.sample_repos.humanize.number import (
+    apnumber,
+    clamp,
+    fractional,
+    intcomma,
+    intword,
+    ordinal,
+    scientific,
+)
+from graphforge.sample_repos.humanize.time import (
+    naturaldate,
+    naturalday,
+    naturaldelta,
+    naturaltime,
+    precisedelta,
+)
diff --git a/graphforge/sample_repos/humanize/filesize.py b/graphforge/sample_repos/humanize/filesize.py
new file mode 100644
index 0000000000000000000000000000000000000000..26163fcd0fc31669dfaaf0e2f8789d99cdfd447b
--- /dev/null
+++ b/graphforge/sample_repos/humanize/filesize.py
@@ -0,0 +1,49 @@
+"""Bits and bytes related humanization."""
+
+suffixes = {
+    "decimal": ("kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"),
+    "binary": ("KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"),
+    "gnu": "KMGTPEZY",
+}
+
+
+def naturalsize(value, binary=False, gnu=False, format="%.1f"):
+    """Render a byte count as a human-readable file size (e.g. 10 kB).
+
+    By default, decimal suffixes (kB, MB) with base 1000 are used.
+
+    Args:
+        value: Number of bytes; anything ``float()`` accepts.
+        binary: Use binary suffixes (KiB, MiB) with base 1024.
+        gnu: Use single-letter suffixes with base 1024; wins over ``binary``.
+        format: %-style format applied to the scaled number.
+
+    Examples:
+        >>> naturalsize(3000000)
+        '3.0 MB'
+        >>> naturalsize(300, False, True)
+        '300B'
+        >>> naturalsize(3000, True)
+        '2.9 KiB'
+    """
+    style = "gnu" if gnu else ("binary" if binary else "decimal")
+    unit_names = suffixes[style]
+    base = 1000 if style == "decimal" else 1024
+    bytes_ = float(value)
+    abs_bytes = abs(bytes_)
+
+    # Less than one unit: print the plain byte count (%d truncates any
+    # fractional part).
+    if abs_bytes < base:
+        if gnu:
+            return "%dB" % bytes_
+        return ("%d Byte" if abs_bytes == 1 else "%d Bytes") % bytes_
+
+    # GNU style glues the suffix to the number; the other styles use a space.
+    template = format + ("%s" if gnu else " %s")
+    for exponent, name in enumerate(unit_names, start=2):
+        unit = base ** exponent
+        if abs_bytes < unit:
+            break
+    # Without a break the value exceeds every suffix; the largest one is used.
+    return template % ((base * bytes_ / unit), name)
diff --git a/graphforge/sample_repos/humanize/number.py b/graphforge/sample_repos/humanize/number.py
new file mode 100644
index 0000000000000000000000000000000000000000..547b65c563b12a952eaf1e891bc8cf7d23d74b3f
--- /dev/null
+++ b/graphforge/sample_repos/humanize/number.py
@@ -0,0 +1,198 @@
+"""Humanizing functions for numbers."""
+
+import math
+import re
+from fractions import Fraction
+
+powers = [10**x for x in (3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 100)]  # thresholds used by intword
+human_powers = (  # names parallel to ``powers``: human_powers[i] labels powers[i]
+    "thousand", "million", "billion", "trillion", "quadrillion",
+    "quintillion", "sextillion", "septillion", "octillion",
+    "nonillion", "decillion", "googol",
+)
+
+
+def ordinal(value):
+    """Return an integer as an ordinal string; input int() rejects passes through.
+
+    Examples:
+        >>> ordinal(1)
+        '1st'
+        >>> ordinal(12)
+        '12th'
+        >>> ordinal(103)
+        '103rd'
+    """
+    try:
+        number = int(value)
+    except (TypeError, ValueError):
+        return value
+    # 11, 12 and 13 are the irregular teens: always "th".
+    is_teen = number % 100 in (11, 12, 13)
+    suffix = "th" if is_teen else {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
+    return f"{number}{suffix}"
+
+
+def intcomma(value, ndigits=None):
+    """Insert a comma every three digits into the integer part of a number.
+
+    Values that ``float()`` rejects are returned unchanged; a truthy
+    ``ndigits`` first formats ``value`` with that many decimal places.
+
+    Examples:
+        >>> intcomma(1000000)
+        '1,000,000'
+        >>> intcomma(1234567.25)
+        '1,234,567.25'
+    """
+    # Validation only: the parsed float is discarded; commas in strings are ignored.
+    candidate = value.replace(",", "") if isinstance(value, str) else value
+    try:
+        float(candidate)
+    except (TypeError, ValueError):
+        return value
+
+    orig = "{0:.{1}f}".format(value, ndigits) if ndigits else str(value)
+
+    # Each pass inserts one comma, rightmost group first; recurse until stable.
+    grouped = re.sub(r"^(-?\d+)(\d{3})", r"\g<1>,\g<2>", orig)
+    if grouped != orig:
+        return intcomma(grouped)
+    return grouped
+
+
+def intword(value, format="%.1f"):
+    """Convert a large integer to a friendly text representation.
+
+    Values below 1000 or at/above the largest known power come back as
+    ``str(value)``; input ``int()`` rejects is returned unchanged. The power
+    name is never pluralised ('1.2 billion', not '1.2 billions').
+
+    Examples:
+        >>> intword(1000000)
+        '1.0 million'
+        >>> intword(1200000000)
+        '1.2 billion'
+    """
+    try:
+        value = int(value)
+    except (TypeError, ValueError):
+        return value
+    if value < powers[0]:
+        return str(value)
+    for ordinal_idx, power in enumerate(powers[1:], 1):
+        if value < power:
+            chopped = value / float(powers[ordinal_idx - 1])
+            label = human_powers[ordinal_idx - 1]
+            # If rounding shows 1000 of this unit, step up to the next one.
+            if float(format % chopped) == float(10**3):
+                chopped = value / float(powers[ordinal_idx])
+                label = human_powers[ordinal_idx]
+            return (format + " %s") % (chopped, label)
+    return str(value)
+
+
+def apnumber(value):
+    """Spell out the integers 0–9 AP-style; other integers become plain strings.
+
+    Input that ``int()`` cannot convert is returned unchanged.
+
+    Examples:
+        >>> apnumber(5)
+        'five'
+        >>> apnumber(10)
+        '10'
+    """
+    try:
+        number = int(value)
+    except (TypeError, ValueError):
+        return value
+    if number < 0 or number >= 10:
+        return str(number)
+    return "zero one two three four five six seven eight nine".split()[number]
+
+
+def fractional(value):
+    """Render a number as a fraction or mixed-fraction string (denominator <= 1000).
+
+    Examples:
+        >>> fractional(0.3)
+        '3/10'
+        >>> fractional(1.3)
+        '1 3/10'
+        >>> fractional(1)
+        '1'
+    """
+    try:
+        number = float(value)
+    except (TypeError, ValueError):
+        return value
+    whole = int(number)
+    remainder = Fraction(number - whole).limit_denominator(1000)
+    numerator, denominator = remainder.numerator, remainder.denominator
+    if not whole:
+        return f"{numerator:.0f}/{denominator:.0f}"
+    if numerator == 0 and denominator == 1:
+        return f"{whole:.0f}"
+    return f"{whole:.0f} {numerator:.0f}/{denominator:.0f}"
+
+
+def scientific(value, precision=2):
+    """Return a number in scientific notation (e.g. 5.00 x 10²).
+
+    The sign of ``value`` stays on the mantissa and the sign of the power of
+    ten on the exponent, so -500 gives '-5.00 x 10²'. Anything that cannot be
+    converted to a finite float (or a bad ``precision``) returns ``value``
+    unchanged.
+
+    Examples:
+        >>> scientific(500)
+        '5.00 x 10²'
+        >>> scientific(0.3)
+        '3.00 x 10⁻¹'
+    """
+    exponents = {
+        "0": "⁰", "1": "¹", "2": "²", "3": "³", "4": "⁴",
+        "5": "⁵", "6": "⁶", "7": "⁷", "8": "⁸", "9": "⁹",
+        "+": "⁺", "-": "⁻",
+    }
+
+    try:
+        number = float(value)
+        n = "{:.{}e}".format(number, int(precision))
+    except (ValueError, TypeError):
+        return value
+
+    if not math.isfinite(number):
+        # 'inf' / 'nan' have no mantissa-exponent form to split.
+        return value
+    part1, part2 = n.split("e")
+    # int() strips the '+' and the zero padding: '+02' -> 2, '-01' -> -1.
+    power = str(int(part2))
+    return part1 + " x 10" + "".join(exponents[char] for char in power)
+
+
+def clamp(value, format="{:}", floor=None, ceil=None, floor_token="<", ceil_token=">"):
+    """Format ``value``, clamped to ``floor``/``ceil`` with a token marking the clamp.
+
+    ``format`` is a ``str.format`` template or a callable; ``None`` yields ``None``.
+
+    Examples:
+        >>> clamp(123.456)
+        '123.456'
+        >>> clamp(0.001, floor=0.01)
+        '<0.01'
+        >>> clamp(999, ceil=100)
+        '>100'
+    """
+    if value is None:
+        return None
+    token = ""
+    if floor is not None and value < floor:
+        value, token = floor, floor_token
+    elif ceil is not None and value > ceil:
+        value, token = ceil, ceil_token
+    if not isinstance(format, str) and not callable(format):
+        raise ValueError("format must be a string or callable")
+    rendered = format.format(value) if isinstance(format, str) else format(value)
+    return token + rendered
diff --git a/graphforge/sample_repos/humanize/time.py b/graphforge/sample_repos/humanize/time.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2d89c2a7461250c344c90f4dfb0877b992b9b99
--- /dev/null
+++ b/graphforge/sample_repos/humanize/time.py
@@ -0,0 +1,225 @@
+"""Time humanizing functions."""
+
+import datetime as dt
+import math
+from enum import Enum
+from functools import total_ordering
+
+
+@total_ordering
+class Unit(Enum):
+    """Time units, ordered from smallest to largest by their integer value."""
+    MICROSECONDS = 0
+    MILLISECONDS = 1
+    SECONDS = 2
+    MINUTES = 3
+    HOURS = 4
+    DAYS = 5
+    MONTHS = 6
+    YEARS = 7
+
+    def __lt__(self, other):
+        same_enum = self.__class__ is other.__class__
+        return self.value < other.value if same_enum else NotImplemented
+
+
+def _now() -> dt.datetime:  # naive local time (no tzinfo)
+    return dt.datetime.now()
+
+
+def _abs_timedelta(delta):
+    """Return the absolute value of a timedelta (negated when its days are < 0)."""
+    if delta.days >= 0:
+        return delta
+    return (now := _now()) - (now + delta)
+
+
+def _date_and_delta(value, *, now=None):
+    """Return ``(date, abs_delta)`` for ``value`` relative to ``now``, or ``(None, value)``."""
+    now = now or _now()
+    if isinstance(value, dt.datetime):
+        date, delta = value, now - value
+    elif isinstance(value, dt.timedelta):
+        date, delta = now - value, value
+    else:
+        # Anything else is read as a number of seconds before ``now``.
+        try:
+            value = int(value)
+            delta = dt.timedelta(seconds=value)
+            date = now - delta
+        except (ValueError, TypeError):
+            return None, value
+    # The delta is always reported as a non-negative duration.
+    return date, _abs_timedelta(delta)
+
+
+def naturaldelta(value, months=True, minimum_unit="seconds") -> str:
+    """Return a natural representation of a timedelta or number of seconds.
+
+    Does not include tense (use naturaltime for past/future); the sign of the
+    delta is ignored. With ``months=False`` month counts are given in days.
+
+    Examples:
+        >>> import datetime as dt
+        >>> naturaldelta(dt.timedelta(seconds=90))
+        'a minute'
+        >>> naturaldelta(dt.timedelta(hours=2))
+        '2 hours'
+        >>> naturaldelta(dt.timedelta(days=400))
+        '1 year, 1 month'
+    """
+    tmp = Unit[minimum_unit.upper()]
+    if tmp not in (Unit.SECONDS, Unit.MILLISECONDS, Unit.MICROSECONDS):
+        raise ValueError(f"Minimum unit '{minimum_unit}' not supported")
+    minimum_unit = tmp
+
+    if isinstance(value, dt.timedelta):
+        delta = value
+    else:
+        try:
+            value = int(value)
+            delta = dt.timedelta(seconds=value)
+        except (ValueError, TypeError):
+            return value
+
+    # abs() first: a negative timedelta is stored as negative days plus
+    # positive seconds, so taking abs() of each field separately is wrong.
+    delta = abs(delta)
+    seconds = delta.seconds
+    years, days = divmod(delta.days, 365)
+    months_count = int(days // 30.5)
+
+    if not years and days < 1:
+        if seconds == 0:
+            return "a moment"
+        elif seconds == 1:
+            return "a second"
+        elif seconds < 60:
+            return f"{seconds} seconds"
+        elif seconds < 120:
+            return "a minute"
+        elif seconds < 3600:
+            return f"{seconds // 60} minutes"
+        elif seconds < 7200:
+            return "an hour"
+        return f"{seconds // 3600} hours"
+    elif years == 0:
+        if days == 1:
+            return "a day"
+        if not months or not months_count:
+            return f"{days} days"
+        elif months_count == 1:
+            return "a month"
+        return f"{months_count} months"
+    elif years == 1:
+        if not months_count and not days:
+            return "a year"
+        # months=False is honoured here too, as in the years == 0 branch.
+        elif not months or not months_count:
+            return f"1 year, {days} days" if days > 1 else "1 year, a day"
+        elif months_count == 1:
+            return "1 year, 1 month"
+        return f"1 year, {months_count} months"
+    return f"{years} years"
+
+
+def naturaltime(value, future=False, months=True, minimum_unit="seconds", when=None) -> str:
+    """Return a natural representation of a time relative to now.
+
+    Examples (``future`` is ignored for datetime/timedelta values):
+        >>> import datetime as dt
+        >>> naturaltime(dt.timedelta(seconds=30))
+        '30 seconds ago'
+        >>> naturaltime(3600, future=True)
+        'an hour from now'
+    """
+    now = when or _now()
+    date, delta = _date_and_delta(value, now=now)
+    if date is None:
+        return value
+    if isinstance(value, (dt.datetime, dt.timedelta)):
+        future = date > now
+    ago = "%s from now" if future else "%s ago"
+    delta_str = naturaldelta(delta, months, minimum_unit)
+    if delta_str == "a moment":
+        return "now"
+    return ago % delta_str
+
+
+def naturalday(value, format="%b %d") -> str:
+    """Return 'today', 'tomorrow', 'yesterday', or the date formatted with ``format``.
+
+    Objects lacking year/month/day attributes are returned unchanged.
+
+    Examples:
+        >>> import datetime as dt
+        >>> naturalday(dt.date.today())
+        'today'
+    """
+    try:
+        day = dt.date(value.year, value.month, value.day)
+    except (AttributeError, OverflowError, ValueError):
+        return value
+    offset = (day - dt.date.today()).days
+    # Only the three days around today get a name.
+    relative_names = {0: "today", 1: "tomorrow", -1: "yesterday"}
+    if offset in relative_names:
+        return relative_names[offset]
+    return day.strftime(format)
+
+
+def naturaldate(value) -> str:
+    """Like naturalday, but with the year added once the date is ~5 months or more away."""
+    try:
+        day = dt.date(value.year, value.month, value.day)
+    except (AttributeError, OverflowError, ValueError):
+        return value
+    distance = _abs_timedelta(day - dt.date.today())
+    # 5 * 365 / 12 days is roughly five months.
+    with_year = distance.days >= 5 * 365 / 12
+    return naturalday(day, "%b %d %Y") if with_year else naturalday(day)
+
+
+def precisedelta(value, minimum_unit="seconds", suppress=(), format="%0.2f") -> str:
+    """Return a precise, human-readable representation of a timedelta.
+
+    A year counts as 365 days, a month as 30.5 days (whole days only).
+    ``minimum_unit``/``suppress`` are only validated; ``format`` is unused.
+
+    Examples:
+        >>> import datetime as dt
+        >>> precisedelta(dt.timedelta(seconds=3633, days=2))
+        '2 days and 1 hour and 33 seconds'
+    """
+    date, delta = _date_and_delta(value)
+    if date is None:
+        return value
+
+    # Looked up only so that an unknown unit name raises KeyError.
+    suppress_units = {Unit[s.upper()] for s in suppress}
+    min_unit = Unit[minimum_unit.upper()]
+
+    years, days = divmod(delta.days, 365)
+    months_count = int(days // 30.5)
+    # Subtract exactly the days the months account for; ``days % 30`` dropped
+    # days whenever the two divisors disagreed (30 days came out as nothing).
+    days -= int(months_count * 30.5)
+    hours, secs = divmod(delta.seconds, 3600)
+    minutes, secs = divmod(secs, 60)
+
+    parts = []
+    for count, singular, plural in [
+        (years,         "year",   "years"),
+        (months_count,  "month",  "months"),
+        (days,          "day",    "days"),
+        (hours,         "hour",   "hours"),
+        (minutes,       "minute", "minutes"),
+        (secs,          "second", "seconds"),
+    ]:
+        if count > 0:
+            label = singular if count == 1 else plural
+            parts.append(f"{count} {label}")
+
+    if not parts:
+        return "0 seconds"
+    return " and ".join(parts)
diff --git a/graphforge/sample_repos/task_manager/__init__.py b/graphforge/sample_repos/task_manager/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a24c1a609997961ec5a2445158f638db528ff71
--- /dev/null
+++ b/graphforge/sample_repos/task_manager/__init__.py
@@ -0,0 +1 @@
+"""Task Manager — a small synthetic package used as the training repo."""
diff --git a/graphforge/sample_repos/task_manager/api.py b/graphforge/sample_repos/task_manager/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbd965018d69cac7a31c34bc7ca885cc76c1cd3c
--- /dev/null
+++ b/graphforge/sample_repos/task_manager/api.py
@@ -0,0 +1,48 @@
+"""High-level API layer that wires models, storage, and validators together."""
+
+from __future__ import annotations
+
+from graphforge.sample_repos.task_manager.models import Task
+from graphforge.sample_repos.task_manager.storage import TaskStore
+from graphforge.sample_repos.task_manager.validators import validate_priority, validate_tags, validate_title
+
+_store = TaskStore()  # module-level store shared by the functions below; reset_store() replaces it
+
+
+def create_task(
+    title: str,
+    priority: str = "medium",
+    tags: list[str] | None = None,
+) -> Task:
+    """Validate the inputs, build a Task, add it to the store and return it.
+
+    Raises ValueError if title or tags are invalid; priority is not checked.
+    """
+    if not validate_title(title):
+        raise ValueError(f"Invalid title: {title!r}")
+    tag_list = tags if tags else []
+    if not validate_tags(tag_list):
+        raise ValueError(f"Invalid tags: {tag_list!r}")
+    new_task = Task(title=title, priority=priority, tags=tag_list)
+    _store.add(new_task)
+    return new_task
+
+
+def get_all_tasks() -> list[Task]:
+    """Return every task in the module-level store, as given by its ``all()`` method."""
+    return _store.all()
+
+
+def complete_task(title: str) -> bool:
+    """Complete the first task titled ``title``; return whether one was found."""
+    found = _store.find_by_title(title)
+    if not found:
+        return False
+    found.complete()
+    return True
+
+
+def reset_store() -> None:
+    """Replace the module-level store with a new, empty TaskStore — used by tests between runs."""
+    global _store
+    _store = TaskStore()
diff --git a/graphforge/sample_repos/task_manager/models.py b/graphforge/sample_repos/task_manager/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..83566d175f358e01eb2a30fb439968f3010d7fb8
--- /dev/null
+++ b/graphforge/sample_repos/task_manager/models.py
@@ -0,0 +1,47 @@
+"""Domain models for the task manager."""
+
+from __future__ import annotations
+
+from datetime import date
+from typing import Optional
+
+
+class Task:
+    """One to-do item: title, priority, tags, optional due date and a done flag."""
+
+    def __init__(
+        self,
+        title: str,
+        priority: str,
+        tags: list[str],
+        due_date: Optional[date] = None,
+    ) -> None:
+        self.title, self.priority = title, priority  # priority expected: "low" | "medium" | "high"
+        self.tags, self.due_date = tags, due_date  # the tags list is stored as-is, not copied
+        self.done = False  # every task starts out not done
+
+    def complete(self) -> None:
+        """Flag the task as finished."""
+        self.done = True
+
+    def to_dict(self) -> dict:
+        """Return the fields as a plain dict; ``due_date`` becomes a string or None."""
+        due = str(self.due_date) if self.due_date else None
+        return {
+            "title": self.title,
+            "priority": self.priority,
+            "tags": self.tags,
+            "done": self.done,
+            "due_date": due,
+        }
+
+
+class User:
+    """A user record holding a username and an email address."""
+
+    def __init__(self, username: str, email: str) -> None:
+        self.username, self.email = username, email
+
+    def display(self) -> str:
+        """Return the user formatted as ``username <email>``."""
+        return "{} <{}>".format(self.username, self.email)
diff --git a/graphforge/sample_repos/task_manager/storage.py b/graphforge/sample_repos/task_manager/storage.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b896e96f42aab5f546f1d0ca988be86d8913e00
--- /dev/null
+++ b/graphforge/sample_repos/task_manager/storage.py
@@ -0,0 +1,37 @@
+"""In-memory task storage."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from graphforge.sample_repos.task_manager.models import Task
+
+
+class TaskStore:
+    """Keeps Task objects in a plain list, in the order they were added."""
+
+    def __init__(self) -> None:
+        self._tasks: list[Task] = []
+
+    def add(self, task: Task) -> None:
+        """Put ``task`` at the end of the store; duplicates are not checked."""
+        self._tasks.append(task)
+
+    def all(self) -> list[Task]:
+        """Give back a shallow copy of the stored tasks."""
+        return self._tasks[:]
+
+    def find_by_title(self, title: str) -> Optional[Task]:
+        """Look up the earliest-added task with exactly this title; None if absent."""
+        # Lazy scan: the generator stops at the first match.
+        hits = (task for task in self._tasks if task.title == title)
+        return next(hits, None)
+
+    def find_done(self) -> list[Task]:
+        """List every task whose ``done`` flag is truthy."""
+        finished = filter(lambda task: task.done, self._tasks)
+        return list(finished)
+
+    def find_pending(self) -> list[Task]:
+        """List every task whose ``done`` flag is falsy."""
+        return [task for task in self._tasks if not task.done]
diff --git a/graphforge/sample_repos/task_manager/validators.py b/graphforge/sample_repos/task_manager/validators.py
new file mode 100644
index 0000000000000000000000000000000000000000..4083f55b3b0a22c6d73ae960cf3551ac3c7bd446
--- /dev/null
+++ b/graphforge/sample_repos/task_manager/validators.py
@@ -0,0 +1,25 @@
+"""Input validation functions for the task manager."""
+
+from __future__ import annotations
+
+VALID_PRIORITIES = {"low", "medium", "high"}
+
+
+def validate_title(title: str) -> bool:
+    """Accept only ``str`` titles whose length is between 1 and 200 inclusive."""
+    return isinstance(title, str) and 1 <= len(title) <= 200
+
+
+def validate_tags(tags: object) -> bool:
+    """Accept a ``list`` whose items are all ``str``; an empty list passes."""
+    return isinstance(tags, list) and not any(not isinstance(tag, str) for tag in tags)
+
+
+def validate_email(email: str) -> bool:
+    """Loose check: a ``str`` with an "@" and a "." somewhere after the last "@"."""
+    return isinstance(email, str) and "@" in email and "." in email.rpartition("@")[2]
+
+
+def validate_priority(priority: str) -> bool:
+    """Return True if priority is the string 'low', 'medium', or 'high'; non-strings give False."""
+    return isinstance(priority, str) and priority in VALID_PRIORITIES
diff --git a/graphforge/server/__init__.py b/graphforge/server/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..33e126ae8dbab1d0c8872cf47255788f1f10e092
--- /dev/null
+++ b/graphforge/server/__init__.py
@@ -0,0 +1,19 @@
+"""FastAPI OpenEnv server.
+
+Endpoints (PROPOSAL.md §6.1):
+
+  POST /reset    -> create a fresh episode, return initial observation
+  POST /step     -> apply an Action, return (observation, reward, done, info)
+  GET  /state    -> snapshot the current episode state for debugging
+  POST /close    -> tear down the episode
+
+The server is a thin shell: it owns episode state (graph, task spec,
+action history, token counter, turn counter, materialization cache) and
+delegates the work to the dispatcher, reward engine, and validators.
+
+The training-side OpenEnv client calls this over HTTP at localhost:8000.
+"""
+
+from graphforge.server.app import app
+
+__all__ = ["app"]
diff --git a/graphforge/server/app.py b/graphforge/server/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..b32e8c89a3985abf5a9c71076fb24b895bbd7c1d
--- /dev/null
+++ b/graphforge/server/app.py
@@ -0,0 +1,124 @@
+"""FastAPI application — the OpenEnv server.
+
+Endpoints (PROPOSAL.md §6.1):
+
+  POST /reset   { task_id?: str | None, seed?: int }
+                -> { episode_id, observation }
+  POST /step    { episode_id, action: Action }
+                -> { observation, reward, done, info }
+  GET  /state?episode_id=...
+                -> { ... full snapshot ... }
+  POST /close   { episode_id }
+                -> { closed: bool }
+
+The handlers are thin: routing, request validation, episode lookup. The
+actual per-step orchestration lives in :mod:`graphforge.server.runner`.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+
+from graphforge.actions.schema import Action
+from graphforge.server.episode import GLOBAL_STORE, Episode, EpisodeStore
+from graphforge.server.runner import step as runner_step
+from graphforge.tasks import default_task, get_task
+
+app = FastAPI(
+    title="GraphForge OpenEnv server",
+    version="0.0.1",
+    description="See graphforge.server for the wire shape.",
+)
+
+
+# ---- request / response models --------------------------------------
+
+
+class ResetRequest(BaseModel):
+    task_id: Optional[str] = None  # None -> /reset falls back to default_task()
+    seed: Optional[int] = None  # reserved for variant generation, unused for tier-0
+
+
+class StepRequest(BaseModel):
+    episode_id: str  # id handed out by POST /reset
+    # ``Action`` is itself an Annotated discriminated union; the discriminator
+    # does not need to be re-declared on this field.
+    action: Action
+
+
+class CloseRequest(BaseModel):
+    episode_id: str  # episode to drop; /close answers closed=False when the id is unknown
+
+
+# ---- store wiring (overridable for tests) ---------------------------
+
+
+def _store() -> EpisodeStore:
+    return GLOBAL_STORE  # single lookup point for the handlers, so tests can swap the store
+
+
+# ---- helpers --------------------------------------------------------
+
+
+def _require_episode(episode_id: str) -> Episode:
+    """Fetch the episode stored under ``episode_id``, or fail with HTTP 404."""
+    if (found := _store().get(episode_id)) is not None:
+        return found
+    raise HTTPException(status_code=404, detail=f"unknown episode_id: {episode_id!r}")
+
+
+def _initial_observation(ep: Episode) -> dict[str, Any]:
+    """Build the first observation of an episode: task view plus zeroed counters."""
+    task = ep.task
+    zeroed = dict.fromkeys(("turns_total", "tokens_used_total"), 0)
+    return {
+        "episode_id": ep.id, "task": task.visible_payload(),
+        **zeroed,
+        "budget": task.budget, "episode_cap": task.episode_cap,
+    }
+
+
+# ---- endpoints ------------------------------------------------------
+
+
+@app.post("/reset")
+def reset(req: ResetRequest) -> dict:
+    # Start a new episode for the requested task (the default task when
+    # task_id is omitted), register it in the store, and return its id plus
+    # the initial observation. An unknown task_id gets HTTP 404; req.seed is unused.
+    if req.task_id is None:
+        task = default_task()
+    elif (task := get_task(req.task_id)) is None:
+        raise HTTPException(status_code=404, detail=f"unknown task_id: {req.task_id!r}")
+    episode = Episode.new(task=task)
+    _store().put(episode)
+    return {
+        "episode_id": episode.id,
+        "observation": _initial_observation(episode),
+    }
+
+
+@app.post("/step")
+def step(req: StepRequest) -> dict:
+    # Resolve the episode (404 if unknown), then let the runner apply the action.
+    return runner_step(_require_episode(req.episode_id), req.action)
+
+
+@app.get("/state")
+def state(episode_id: str) -> dict:
+    # ``episode_id`` is taken from the query string; an unknown id gives 404.
+    return _require_episode(episode_id).state_snapshot()
+
+
+@app.post("/close")
+def close(req: CloseRequest) -> dict:
+    # Never 404s: ``closed`` is simply False when the id was not in the store.
+    return {"closed": _store().drop(req.episode_id)}
+
+
+@app.get("/healthz")
+def healthz() -> dict:
+    return dict(status="ok", version=app.version)  # liveness probe; echoes the app version
diff --git a/graphforge/server/episode.py b/graphforge/server/episode.py
new file mode 100644
index 0000000000000000000000000000000000000000..3817525be8850498a2d3ff8c507c031876d7e0dd
--- /dev/null
+++ b/graphforge/server/episode.py
@@ -0,0 +1,171 @@
+"""Episode state — one per active OpenEnv session.
+
+The server holds episodes in an in-memory dict keyed by ``episode_id``.
+Episodes are entirely self-contained: they own a :class:`Graph`, a
+:class:`Task`, and the running history. There is no leakage between
+episodes (PROPOSAL.md §6.2 — "episode isolation").
+
+Token accounting is a server-side concern. We use a coarse character-based
+estimate (``len(json) // 4``) until a real tokenizer is wired in. The
+estimate is consistent across baseline and trained runs because both go
+through the same envelope.
+"""
+
+from __future__ import annotations
+
+import json
+import uuid
+from dataclasses import dataclass, field
+from typing import Any
+
+from graphforge.actions.dispatcher import ActionResult
+from graphforge.graph.schema import Graph
+from graphforge.reward.engine import ActionOutcome, TurnReward
+from graphforge.tasks.schema import Task
+
+
+# ---- token estimation -----------------------------------------------
+
+
+def estimate_tokens(payload: Any) -> int:
+    """Approximate the token cost of ``payload`` as serialized length // 4.
+
+    The payload is rendered with ``json.dumps(..., default=str)``; if that
+    raises, ``str(payload)`` is measured instead. Four characters per token
+    is a rule-of-thumb placeholder until a real tokenizer is wired in; it
+    only has to grow with payload size so that cheap queries score better
+    than expensive ones.
+    """
+    try:
+        rendered = json.dumps(payload, default=str)
+    except Exception:
+        rendered = str(payload)
+    return len(rendered) // 4
+
+
+# ---- history records ------------------------------------------------
+
+
+@dataclass
+class TurnRecord:
+    turn: int  # 0-based: Episode.record_turn stores the turn count before incrementing it
+    action_kind: str
+    action_args: dict[str, Any]
+    outcome: str       # ActionOutcome value
+    ok: bool
+    reward: float  # per-turn reward (TurnReward.total); the terminal reward lives on Episode
+    payload: dict[str, Any] = field(default_factory=dict)
+    is_duplicate: bool = False  # True when the same (kind, args) was already in the history
+    tokens_returned: int = 0  # token estimate for this turn's payload
+
+
+# ---- episode --------------------------------------------------------
+
+
+@dataclass
+class Episode:
+    """Everything the server tracks for one session: task, graph, turn log, counters."""
+
+    id: str
+    task: Task
+    graph: Graph = field(default_factory=Graph.empty)
+    history: list[TurnRecord] = field(default_factory=list)
+    tokens_used: int = 0
+    turns: int = 0
+    terminated: bool = False
+    terminal_reward: float | None = None
+    terminal_payload: dict[str, Any] | None = None
+
+    @classmethod
+    def new(cls, task: Task) -> "Episode":
+        """Create an episode for ``task`` under a random UUID4 id; the rest defaults."""
+        episode_id = str(uuid.uuid4())
+        return cls(id=episode_id, task=task)
+
+    # ----- duplicate detection ---------------------------------------
+
+    def is_duplicate(self, kind: str, args: dict[str, Any]) -> bool:
+        """Report whether this exact (kind, args) pair is already in the history."""
+        return any(
+            past.action_kind == kind and past.action_args == args
+            for past in self.history
+        )
+
+    # ----- bookkeeping -----------------------------------------------
+
+    def record_turn(
+        self,
+        kind: str,
+        args: dict[str, Any],
+        result: ActionResult,
+        outcome: ActionOutcome,
+        turn_reward: TurnReward,
+        is_duplicate: bool,
+        tokens_returned: int,
+    ) -> TurnRecord:
+        """Log one finished action and advance the counters.
+
+        The new record's ``turn`` is the turn count before the increment, so
+        turns are numbered from 0. ``tokens_returned`` is added to
+        ``tokens_used``. Returns the record that was appended to the history.
+        """
+        entry = TurnRecord(
+            self.turns, kind, args, outcome.value, result.ok, turn_reward.total,
+            payload=result.payload, is_duplicate=is_duplicate, tokens_returned=tokens_returned,
+        )
+        self.turns += 1
+        self.tokens_used += tokens_returned
+        self.history.append(entry)
+        return entry
+
+    # ----- snapshot --------------------------------------------------
+
+    def state_snapshot(self) -> dict[str, Any]:
+        """Serialize the episode: counters, the whole graph, and a slim turn log."""
+        task = self.task
+        snapshot: dict[str, Any] = {
+            "episode_id": self.id,
+            "task": task.visible_payload(),
+            "turns": self.turns,
+            "tokens_used": self.tokens_used,
+            "budget": task.budget,
+            "episode_cap": task.episode_cap,
+            "terminated": self.terminated,
+        }
+        snapshot["graph"] = {
+            part: [item.model_dump() for item in getattr(self.graph, part)]
+            for part in ("modules", "nodes", "edges")
+        }
+        snapshot["history"] = [
+            {"turn": r.turn, "action_kind": r.action_kind, "ok": r.ok, "reward": r.reward}
+            for r in self.history
+        ]
+        snapshot["terminal_reward"] = self.terminal_reward
+        return snapshot
+
+
+# ---- in-memory store ------------------------------------------------
+
+
+class EpisodeStore:
+    """Thin wrapper around a dict so we can swap in a TTL cache later."""
+
+    def __init__(self) -> None:
+        self._eps: dict[str, Episode] = {}  # episode id -> Episode; entries leave only via drop()
+
+    def put(self, ep: Episode) -> None:
+        self._eps[ep.id] = ep  # an existing entry with the same id is overwritten
+
+    def get(self, episode_id: str) -> Episode | None:
+        return self._eps.get(episode_id)  # None for unknown ids
+
+    def drop(self, episode_id: str) -> bool:
+        return self._eps.pop(episode_id, None) is not None  # True only if an episode was removed
+
+    def __len__(self) -> int:
+        return len(self._eps)  # number of episodes currently held
+
+
+# Singleton store. The server module holds onto this for the lifetime of
+# the process. Tests can construct their own EpisodeStore for isolation.
+GLOBAL_STORE = EpisodeStore()
diff --git a/graphforge/server/runner.py b/graphforge/server/runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea8499d03a84eabfa8cfac897aa8048e45cf33d3
--- /dev/null
+++ b/graphforge/server/runner.py
@@ -0,0 +1,144 @@
+"""Episode runner — the per-step orchestration the server endpoints use.
+
+Pulls together dispatcher, reward engine, constraint checker, and episode
+state. Kept separate from the FastAPI app so it can be unit-tested without
+spinning up an HTTP server.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from graphforge.actions import dispatch
+from graphforge.actions.schema import Action, Submit
+from graphforge.constraints import evaluate_all
+from graphforge.materializer import materialize
+from graphforge.reward.engine import (
+    ActionOutcome,
+    TurnReward,
+    score_terminal,
+    score_turn,
+)
+from graphforge.server.episode import (
+    Episode,
+    TurnRecord,
+    estimate_tokens,
+)
+from graphforge.validator import full_check
+
+
+def _classify_outcome(action: Action, ok: bool) -> ActionOutcome:
+    # Map handler success to SUCCESS/FAILURE. ``action`` is not consulted:
+    # schema-invalid actions never get this far because FastAPI's pydantic
+    # validation rejects them before the runner is called.
+    return ActionOutcome.FAILURE if not ok else ActionOutcome.SUCCESS
+
+
+def _render_observation(ep: Episode, turn_record: TurnRecord) -> dict[str, Any]:
+    """Combine one turn's record with episode-wide counters into an observation.
+
+    Remaining budget and remaining turns are clamped so they never go below 0.
+    """
+    per_turn = ("turn", "ok", "outcome", "payload", "reward", "is_duplicate", "tokens_returned")
+    observation: dict[str, Any] = {name: getattr(turn_record, name) for name in per_turn}
+    observation.update(
+        tokens_used_total=ep.tokens_used,
+        turns_total=ep.turns,
+        budget_remaining=max(0, ep.task.budget - ep.tokens_used),
+        episode_cap_remaining=max(0, ep.task.episode_cap - ep.turns),
+    )
+    return observation
+
+
+def step(ep: Episode, action: Action) -> dict[str, Any]:
+    """Run one action against ``ep`` and build the ``/step`` response.
+
+    The action is dispatched on the episode's graph, scored, and recorded.
+    The episode then ends if the action is a ``Submit``, or otherwise if the
+    turn count has reached the task's ``episode_cap``. On termination the
+    terminal score is stored on the episode, returned as ``info["terminal"]``,
+    and its ``"total"`` is added to this turn's reward. Stepping an episode
+    that already terminated changes nothing and reports an error in ``info``.
+
+    Returns a dict in the OpenEnv ``/step`` response shape:
+    ``{observation, reward, done, info}``.
+    """
+    if ep.terminated:
+        return {
+            "observation": {},
+            "reward": 0.0,
+            "done": True,
+            "info": {"error": "episode_already_terminated"},
+        }
+
+    kind = action.kind  # type: ignore[attr-defined]
+    args = action.model_dump(exclude={"kind"})
+    # Duplicate status must be read before this turn is added to the history.
+    seen_before = ep.is_duplicate(kind, args)
+
+    result = dispatch(ep.graph, action)
+    cost = estimate_tokens(result.payload)
+    outcome = _classify_outcome(action, result.ok)
+    rec = ep.record_turn(
+        kind=kind,
+        args=args,
+        result=result,
+        outcome=outcome,
+        turn_reward=score_turn(
+            outcome=outcome,
+            is_duplicate=seen_before,
+            tokens_returned=cost,
+        ),
+        is_duplicate=seen_before,
+        tokens_returned=cost,
+    )
+
+    submitted = isinstance(action, Submit)
+    # The cap is checked after recording, so ``ep.turns`` already counts this turn.
+    capped = not submitted and ep.turns >= ep.task.episode_cap
+    done = submitted or capped
+    info: dict[str, Any] = {}
+    terminal_total = 0.0
+    if done:
+        terminal = _score_terminal(ep)
+        ep.terminated = True
+        ep.terminal_reward = terminal_total = terminal["total"]
+        ep.terminal_payload = terminal
+        info["terminal"] = terminal
+    if capped:
+        info["reason"] = "episode_cap_reached"
+
+    return {
+        "observation": _render_observation(ep, rec),
+        "reward": rec.reward + terminal_total,
+        "done": done,
+        "info": info,
+    }
+
+
+def _score_terminal(ep: Episode) -> dict[str, Any]:
+    """Score the episode's current graph and return the result as a plain dict.
+
+    The dict is ``score_terminal(...).to_dict()`` plus a ``"satisfaction"`` entry.
+    """
+    satisfaction = evaluate_all(ep.graph, ep.task.all_constraints)
+    structural, behavioral = satisfaction.split_by_family()
+
+    # Materialization gate: any exception counts as a failed gate, not an error.
+    try:
+        materialization_ok = full_check(materialize(ep.graph)).ok
+    except Exception:
+        materialization_ok = False
+
+    payload = score_terminal(
+        n_structural_satisfied=len(structural.satisfied),
+        n_structural_total=structural.total,
+        n_behavioral_passing=len(behavioral.satisfied),
+        n_behavioral_total=behavioral.total,
+        materialization_ok=materialization_ok,
+        type_checks_ok=None,  # mypy not wired yet
+        tokens_used=ep.tokens_used,
+        budget=ep.task.budget,
+    ).to_dict()
+    payload["satisfaction"] = satisfaction.to_dict()
+    return payload
diff --git a/graphforge/task_generator.py b/graphforge/task_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..08eff48ddaf8b4409a1f4bcf9c66d1dca6dd32b4
--- /dev/null
+++ b/graphforge/task_generator.py
@@ -0,0 +1,227 @@
+"""Auto-generate training tasks from any Python repository.
+
+Pipeline
+--------
+1. Parse the repo with AST → KnowledgeGraph
+2. Find public functions that have doctest examples (>>> in docstring)
+3. Extract those examples as runnable assertions
+4. Replace the function body with `raise NotImplementedError` — the agent
+   must re-implement it from the docstring alone
+5. Return AutoTask objects ready for GRPO training — no hand-writing needed
+
+Usage
+-----
+    from graphforge.task_generator import generate_tasks
+    kg, tasks = generate_tasks("/tmp/humanize/src/humanize", n_tasks=6)
+    for t in tasks:
+        print(t.task_id, "→", t.description[:60])
+"""
+
+from __future__ import annotations
+
+import ast
+import doctest
+import textwrap
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from graphforge.knowledge_graph import KGNode, KnowledgeGraph
+from graphforge.repo_parser import parse_repo
+
+
+# ── Task dataclass (mirrors env.tasks.RepoTask but lives here to avoid circular import) ──
+
+@dataclass
+class AutoTask:
+    task_id: str            # generate_tasks uses "auto.<repo_name>.<function name>"
+    repo_name: str          # last path component of repo_path
+    repo_path: str          # absolute path to the repo source directory
+    description: str        # task prompt: "Implement the function ..." plus the docstring
+    test_code: str          # uses short import: from <repo_name>.<module> import <func>
+    stubbed_node_id: str    # the node whose body was replaced
+    original_source: str    # saved so env can restore on reset
+    max_turns: int = 12     # max turns per episode
+    difficulty: int = 0     # generate_tasks sets a value in 0..2 derived from the candidate score
+    hints: list[str] = field(default_factory=list)  # free-text hints (file to read, signature)
+
+
+# ── Doctest extraction ────────────────────────────────────────────────────────
+
+def _extract_all_examples(docstring: str) -> list[tuple[str, str]]:
+    """Parse doctest examples into (source, want) pairs; want is '' for setup lines."""
+    if not docstring:
+        return []
+
+    try:
+        parsed = doctest.DocTestParser().get_examples(docstring, name="<doc>")
+    except Exception:
+        return []
+    return [(example.source.strip(), example.want.strip()) for example in parsed]
+
+
+def _to_assertion(expr: str, expected: str) -> str | None:
+    """Turn one doctest example into an ``assert`` statement, or None to skip it.
+
+    Skipped: empty/``Traceback`` output, non-literal output, and sources that
+    are not one expression. ``_got`` captures the value so it is computed once.
+    """
+    if not expected or expected.startswith("Traceback"):
+        return None
+    try:
+        ast.parse(expr, mode="eval")  # a statement cannot go inside ``assert``
+        ast.literal_eval(expected)
+    except (ValueError, SyntaxError):
+        return None
+    sep = "\n" if "#" in expr else ""  # stop a trailing comment from eating the ")"
+    check = f"is {expected}" if expected in ("True", "False") else f"== ({expected})"
+    return f"assert (_got := ({sep}{expr}{sep})) {check}, f'got {{_got!r}}'"
+
+
+def _build_test_code(func_name: str, module_stem: str, repo_name: str,
+                     all_examples: list[tuple[str, str]]) -> str | None:
+    """Assemble a test script: the import line, then setup lines, then assertions.
+
+    Examples without expected output are kept verbatim as setup lines. Every
+    other example goes through ``_to_assertion`` and is kept only when the
+    resulting text contains ``func_name``. Returns None when fewer than two
+    assertions survive.
+    """
+    # Setup lines always come first, wherever they sat in the docstring.
+    setup = [source for source, want in all_examples if not want]
+    converted = (_to_assertion(source, want) for source, want in all_examples if want)
+    # Substring match, so an assertion that merely mentions the name also counts.
+    assertions = [line for line in converted if line and func_name in line]
+    if len(assertions) < 2:
+        return None
+
+    header = f"from {repo_name}.{module_stem} import {func_name}"
+    return "\n".join([header, *setup, *assertions])
+
+
+# ── Function stubbing ─────────────────────────────────────────────────────────
+
+def _stub_function(source: str) -> str:
+    """Replace a function body with ``raise NotImplementedError``.
+
+    Signature, decorators and docstring are kept verbatim. ``source`` is
+    dedented first; unparsable source, or source without a function, is
+    returned unchanged. The stub reuses the body's indentation; a body that
+    starts on the signature line (``def f(x): return x``) is cut at the colon.
+    """
+    dedented = textwrap.dedent(source)
+    try:
+        tree = ast.parse(dedented)
+    except SyntaxError:
+        return source
+    defs = (ast.FunctionDef, ast.AsyncFunctionDef)
+    node = next((n for n in ast.walk(tree) if isinstance(n, defs)), None)
+    if node is None:
+        return source
+
+    lines, first = dedented.splitlines(), node.body[0]
+    value = first.value if isinstance(first, ast.Expr) else None
+    has_doc = isinstance(value, ast.Constant) and isinstance(value.value, str)
+    # ``col_offset`` counts UTF-8 bytes, so slice the encoded line.
+    lead = lines[first.lineno - 1].encode()[: first.col_offset].decode()
+    if lead.strip():  # body shares a line with the signature
+        indent = "    " * (node.col_offset // 4 + 1)
+        kept = lines[: first.lineno - 1] + [lead.rstrip()]
+        kept += [indent + ast.get_source_segment(dedented, first)] if has_doc else []
+    else:
+        indent, kept = lead, lines[: first.end_lineno if has_doc else first.lineno - 1]
+    return "\n".join(kept).rstrip() + f"\n{indent}raise NotImplementedError\n"
+
+
+# ── Candidate selection ───────────────────────────────────────────────────────
+
+def _score_candidate(node: KGNode, examples: list) -> int:
+    """Rank a candidate: 3 points per example plus up to 10 for docstring length."""
+    return 3 * len(examples) + min(200, len(node.docstring or "")) // 20
+
+
+def _find_candidates(kg: KnowledgeGraph, repo_name: str) -> list[tuple[KGNode, str, int]]:
+    """Collect (node, test_code, score) for every usable function, best score first.
+
+    A function is usable when its name has no leading underscore, it has a
+    docstring, source text and a file path, and its doctest examples yield
+    test code from ``_build_test_code``.
+    """
+    found: list[tuple[KGNode, str, int]] = []
+    for node in kg.all_nodes("function"):
+        if node.name.startswith("_") or not (node.docstring and node.source):
+            continue
+        # The test import uses only the file's stem, i.e. <repo_name>.<stem>.
+        stem = Path(node.file_path).stem if node.file_path else None
+        examples = _extract_all_examples(node.docstring) if stem else []
+        if not examples:
+            continue
+
+        code = _build_test_code(node.name, stem, repo_name, examples)
+        if code:
+            ranked = (node, code, _score_candidate(node, examples))
+            found.append(ranked)
+
+    # list.sort is stable, so equal scores stay in discovery order.
+    found.sort(key=lambda entry: -entry[2])
+    return found
+
+
+# ── Main entry point ──────────────────────────────────────────────────────────
+
+def generate_tasks(
+    repo_source_dir: str,
+    n_tasks: int = 4,
+    max_turns: int = 12,
+) -> tuple[KnowledgeGraph, list[AutoTask]]:
+    """Parse a Python repo directory and auto-generate training tasks.
+
+    Args:
+        repo_source_dir: Path to the Python package source directory,
+            e.g. '/tmp/humanize/src/humanize'. Its resolved absolute form is
+            stored on each task and its last component is the repo name.
+        n_tasks: How many tasks to generate (picks highest-scoring candidates).
+        max_turns: Max turns per episode.
+
+    Returns:
+        (kg, tasks) — the Knowledge Graph and the list of AutoTask objects.
+
+    Raises:
+        ValueError: If no function yields usable doctest-based test code.
+    """
+    repo_source_dir = str(Path(repo_source_dir).resolve())
+    repo_name = Path(repo_source_dir).name
+    kg = parse_repo(repo_source_dir)
+
+    candidates = _find_candidates(kg, repo_name)
+    if not candidates:
+        raise ValueError(
+            f"No suitable candidates found in {repo_source_dir}. "
+            "Make sure functions have doctest examples (>>> in docstring)."
+        )
+
+    tasks: list[AutoTask] = []
+    for node, test_code, score in candidates[:n_tasks]:
+        # Plain concatenation on purpose: textwrap.dedent over a template that
+        # already contains a multi-line docstring computes the wrong margin.
+        doc = node.docstring.strip() if node.docstring else "No docstring available."
+        desc = f"Implement the function `{node.name}` in `{node.file_path}`.\n\n{doc}".strip()
+        # NOTE(review): no stubbed source is stored; only the node id and the original source.
+        task = AutoTask(
+            task_id=f"auto.{repo_name}.{node.name}",
+            repo_name=repo_name,
+            repo_path=repo_source_dir,
+            description=desc,
+            test_code=test_code,
+            stubbed_node_id=node.node_id,
+            original_source=node.source,
+            max_turns=max_turns,
+            difficulty=min(2, max(0, score // 8)),
+            hints=[
+                f"Look at {node.file_path} to understand the module style.",
+                f"The function signature is: {node.name}{node.metadata.get('signature', '(...)')}",
+            ],
+        )
+        tasks.append(task)
+
+    return kg, tasks
diff --git a/graphforge/tasks/__init__.py b/graphforge/tasks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e4c6af16cabc3ad690189c94f5ad961efdcd8c2
--- /dev/null
+++ b/graphforge/tasks/__init__.py
@@ -0,0 +1,10 @@
+"""Task bank and variant generator.
+
+Tier-0 ships one hand-written task. Tier-1+ tasks and parametric variant
+generation are TODO. See PROPOSAL.md §2.1, §2.3 for the full design.
+"""
+
+from graphforge.tasks.bank import default_task, get_task, list_tasks
+from graphforge.tasks.schema import Task
+
+__all__ = ["Task", "default_task", "get_task", "list_tasks"]
diff --git a/graphforge/tasks/bank.py b/graphforge/tasks/bank.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb3ae66bb274be9695ff591f7b65e58a35302036
--- /dev/null
+++ b/graphforge/tasks/bank.py
@@ -0,0 +1,71 @@
+"""Tier-0 task bank.
+
+A single hand-written task that exercises every implemented subsystem
+end-to-end: build a one-module ``validators`` package with an ``is_email``
+function attached to ``validate_with_regex(EMAIL)``. Tier-1+ tasks land in
+follow-up modules.
+
+Variant generation (PROPOSAL.md §2.3 — ~50 concrete variants per template
+× domain vocabulary) is also TODO; for now we hand-author tasks until the
+env's reward-signal shape is validated end-to-end.
+"""
+
+from __future__ import annotations
+
+from graphforge.constraints.schema import (
+    AcyclicImports,
+    Materializes,
+    ModuleCount,
+    ModuleResponsibility,
+    ModuleSizeMax,
+    NodeAbsent,
+    NodeExists,
+)
+from graphforge.tasks.schema import Task
+
+
+TIER_0_EMAIL_VALIDATOR = Task(
+    id="t0.email_validator",
+    tier=0,
+    description=(
+        "Build a tiny single-module package called 'validators'. It should "
+        "expose a function `is_email(s: str) -> bool` that returns True for "
+        "well-formed email addresses and False otherwise. Use the "
+        "`validate_with_regex` body template with the EMAIL pattern. The "
+        "module must materialize cleanly to runnable Python."
+    ),
+    visible_constraints=[
+        ModuleCount(n=1),
+        ModuleResponsibility(module="validators", responsibility="validation"),
+        NodeExists(name="is_email", module="validators"),
+        Materializes(),
+    ],
+    hidden_constraints=[
+        # The visible constraints already pin most of this; the hidden set
+        # adds shape constraints the agent must infer from the description.
+        ModuleSizeMax(module="validators", n=1),
+        NodeAbsent(name="main", module="validators"),
+        AcyclicImports(),
+    ],
+    behavioral_test_names=[],  # tier-0 has no behavioral tests
+    budget=4000,  # token allowance, measured against the server's per-turn token estimates
+    episode_cap=20,  # turn limit: the runner ends the episode once this many turns are recorded
+)
+
+
+_TASKS: dict[str, Task] = {
+    TIER_0_EMAIL_VALIDATOR.id: TIER_0_EMAIL_VALIDATOR,
+}
+
+
+def list_tasks() -> list[Task]:
+    return list(_TASKS.values())  # new list on every call, in registry insertion order
+
+
+def get_task(task_id: str) -> Task | None:
+    return _TASKS.get(task_id)  # None when the id is not registered
+
+
+def default_task() -> Task:
+    """Return the tier-0 email-validator task; `/reset` uses it when no ``task_id`` is given."""
+    return TIER_0_EMAIL_VALIDATOR
diff --git a/graphforge/tasks/schema.py b/graphforge/tasks/schema.py
new file mode 100644
index 0000000000000000000000000000000000000000..26d9a7b2e907371328790cd06f27339a64c7b250
--- /dev/null
+++ b/graphforge/tasks/schema.py
@@ -0,0 +1,45 @@
+"""Task data model.
+
+A *task* is the agent-facing unit of work. The visible portion is what the
+agent sees at reset — natural-language description plus the visible subset
+of constraints. The hidden portion drives reward but is invisible to the
+policy, forcing the agent to interpret the description rather than mechanically
+satisfying a fully-revealed checklist (PROPOSAL.md §2.1).
+"""
+
+from __future__ import annotations
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from graphforge.constraints.schema import Constraint
+
+
+class Task(BaseModel):
+    # Instances are immutable (frozen) and unknown fields are rejected.
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    id: str = Field(..., min_length=1)
+    tier: int = Field(..., ge=0, le=3)
+    description: str = Field(..., min_length=1)
+    visible_constraints: list[Constraint] = Field(default_factory=list)
+    hidden_constraints: list[Constraint] = Field(default_factory=list)
+    # Behavioral test names are visible to the agent at reset; bodies live in
+    # the test runner (TODO) and are hidden. Empty for tier-0.
+    behavioral_test_names: list[str] = Field(default_factory=list)
+    budget: int = Field(..., gt=0)
+    episode_cap: int = Field(..., gt=0)
+
+    @property
+    def all_constraints(self) -> list[Constraint]:
+        """Visible constraints followed by hidden ones, as a new list."""
+        return [*self.visible_constraints, *self.hidden_constraints]
+
+    def visible_payload(self) -> dict[str, object]:
+        """Agent-facing view of the task; ``hidden_constraints`` is left out."""
+        shown = [constraint.model_dump() for constraint in self.visible_constraints]
+        return {
+            "id": self.id, "tier": self.tier, "description": self.description,
+            "visible_constraints": shown,
+            "behavioral_test_names": [*self.behavioral_test_names],
+            "budget": self.budget, "episode_cap": self.episode_cap,
+        }
diff --git a/graphforge/templates/__init__.py b/graphforge/templates/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c91735af7c6c7bdcb2bd8b09144e09979c5001d
--- /dev/null
+++ b/graphforge/templates/__init__.py
@@ -0,0 +1,15 @@
+"""Body template library.
+
+Templates are the constrained building blocks for function bodies. See
+PROPOSAL.md §3.2. The seed set is in :mod:`graphforge.templates.registry`;
+the full ~25-entry library and codegen live in :mod:`library` (TODO).
+"""
+
+from graphforge.templates.registry import (
+    TemplateSpec,
+    get_template,
+    known_templates,
+    validate_args,
+)
+
+__all__ = ["TemplateSpec", "get_template", "known_templates", "validate_args"]
diff --git a/graphforge/templates/registry.py b/graphforge/templates/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ec30bf45955f231f9edc7c720081bee9b27b78f
--- /dev/null
+++ b/graphforge/templates/registry.py
@@ -0,0 +1,104 @@
+"""Body template registry.
+
+The full library is roughly 25 templates spanning common patterns
+(passthrough_call, sequential_calls, validate_with_regex, dispatch_on_type,
+try_call_with_fallback, accumulate, compose, ...). See PROPOSAL.md §3.2.
+
+This file holds a *seed* registry sufficient for the action dispatcher and
+its tests to exercise the attach_body code path. Each entry declares only
+the metadata the dispatcher needs:
+
+  * ``args_schema``  — required arg names and their JSON-shape hint
+  * ``edges_ok`` — predicate that the node has the right edges to
+    support this template (e.g., passthrough_call needs exactly one out-edge)
+
+Codegen (template -> Python source) and full type signatures live in
+:mod:`graphforge.templates.library` and are TODO.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Callable
+
+
+@dataclass(frozen=True)
+class TemplateSpec:
+    """Immutable metadata for one body template, as read by the action dispatcher."""
+    name: str
+    args_schema: dict[str, type]  # arg name -> expected Python type; all are required
+    description: str
+    # Attachability predicate over the host node's (out_degree, in_degree).
+    # Deeper checks (signature compatibility, type flow) belong to the type engine.
+    edges_ok: Callable[[int, int], bool] = lambda out_d, in_d: True  # noqa: E731
+
+
+_REGISTRY: dict[str, TemplateSpec] = {
+    "passthrough_call": TemplateSpec(
+        name="passthrough_call",
+        args_schema={},
+        description="Call exactly one downstream function and return its result.",
+        edges_ok=lambda out_d, in_d: out_d == 1,
+    ),
+    "sequential_calls": TemplateSpec(
+        name="sequential_calls",
+        args_schema={},
+        description="Call each downstream function in declaration order; return the last.",
+        edges_ok=lambda out_d, in_d: out_d >= 1,
+    ),
+    "validate_with_regex": TemplateSpec(
+        name="validate_with_regex",
+        args_schema={"pattern": str},
+        description="Apply a named regex pattern to the input; return bool.",
+        edges_ok=lambda out_d, in_d: out_d == 0,
+    ),
+    "early_return_guard": TemplateSpec(
+        name="early_return_guard",
+        args_schema={"condition": str},
+        description="Guard with an early-return; otherwise delegate to one downstream call.",
+        edges_ok=lambda out_d, in_d: out_d == 1,
+    ),
+    "try_call_with_fallback": TemplateSpec(
+        name="try_call_with_fallback",
+        args_schema={},
+        description="Try the first out-edge; on exception, delegate to the second.",
+        edges_ok=lambda out_d, in_d: out_d == 2,
+    ),
+    "leaf_constant": TemplateSpec(
+        name="leaf_constant",
+        args_schema={"value": object},
+        description="Return a literal constant. Leaf node.",
+        edges_ok=lambda out_d, in_d: out_d == 0,
+    ),
+}
+
+
+def known_templates() -> list[str]:
+    return sorted(_REGISTRY)  # registered template names, alphabetical
+
+
+def get_template(name: str) -> TemplateSpec | None:
+    return _REGISTRY.get(name)  # None when no template is registered under ``name``
+
+
+def validate_args(name: str, args: dict[str, object]) -> list[str]:
+    """Check ``args`` against the schema registered for template ``name``.
+
+    Returns human-readable problems: missing args, then unexpected args
+    (each group sorted by name), then type mismatches in schema order. An
+    empty list means the args satisfy the schema. An unregistered template
+    yields a single "unknown template" problem.
+    """
+    if name not in _REGISTRY:
+        return [f"unknown template: {name!r}"]
+    schema = _REGISTRY[name].args_schema
+    # Presence problems first, each group sorted so the output is stable.
+    report = [f"missing arg {key!r}" for key in sorted(set(schema) - set(args))]
+    report += [f"unexpected arg {key!r}" for key in sorted(set(args) - set(schema))]
+    for key, expected in schema.items():
+        # ``object`` in the schema accepts any value, so there is nothing to check.
+        if key not in args or expected is object or isinstance(args[key], expected):
+            continue
+        got = type(args[key]).__name__
+        report.append(f"arg {key!r} should be {expected.__name__}, got {got}")
+    return report
diff --git a/graphforge/training/__init__.py b/graphforge/training/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd90292153e559b37fe30ccae955be9f68d31eca
--- /dev/null
+++ b/graphforge/training/__init__.py
@@ -0,0 +1,56 @@
+"""Training: multi-turn rollout for GRPO / SFT.
+
+Public surface:
+
+    EnvClient                 — protocol; HttpEnvClient or InProcessEnvClient
+    Policy                    — protocol; ScriptedPolicy or HfPolicy
+    rollout(...)              — drive one episode, return Trajectory
+    Trajectory, TurnSample    — per-turn (prompt, completion, reward, return)
+
+The rollout is environment-agnostic and policy-agnostic — see
+PROPOSAL.md §7.2 for the GRPOTrainer integration story.
+"""
+
+from graphforge.training.client import (
+    EnvClient,
+    HttpEnvClient,
+    InProcessEnvClient,
+)
+from graphforge.training.policy import HfPolicy, Policy, ScriptedPolicy
+from graphforge.training.protocol import (
+    ParseFailure,
+    ParseSuccess,
+    parse_completion,
+    render_action,
+)
+from graphforge.training.rollout import (
+    Trajectory,
+    TurnSample,
+    rollout,
+    trajectory_summary,
+)
+
+__all__ = [
+    "EnvClient",
+    "HfPolicy",
+    "HttpEnvClient",
+    "InProcessEnvClient",
+    "ParseFailure",
+    "ParseSuccess",
+    "Policy",
+    "ScriptedPolicy",
+    "Trajectory",
+    "TurnSample",
+    "parse_completion",
+    "render_action",
+    "rollout",
+    "trajectory_summary",
+]
+
+
+def train_grpo(config: object) -> None:  # pragma: no cover — TODO
+    raise NotImplementedError("GRPO training TODO — see PROPOSAL.md §7")
+
+
+def train_sft(config: object) -> None:  # pragma: no cover — TODO
+    raise NotImplementedError("SFT plan B TODO — see PROPOSAL.md §7.4")
diff --git a/graphforge/training/client.py b/graphforge/training/client.py
new file mode 100644
index 0000000000000000000000000000000000000000..778a65ac69dcc63112a9f3a2e42b52289d238c83
--- /dev/null
+++ b/graphforge/training/client.py
@@ -0,0 +1,122 @@
+"""Env client adapters.
+
+Two implementations of the same contract:
+
+  * :class:`HttpEnvClient` — talks to a running FastAPI server over HTTP
+    (``localhost:8000`` during training).
+  * :class:`InProcessEnvClient` — drives the same FastAPI app via
+    ``fastapi.testclient.TestClient``, no socket required. Used by tests
+    and by single-process training jobs.
+
+Both expose the same three operations: ``reset``, ``step``, ``close``. The
+rollout code only depends on the protocol, so swapping transports doesn't
+ripple through anything else.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Protocol, runtime_checkable
+
+
+@runtime_checkable
+class EnvClient(Protocol):
+    """Structural interface for env transports: the three calls the rollout relies on."""
+    # Start an episode; both selectors are optional.
+    def reset(self, task_id: str | None = None, seed: int | None = None) -> dict[str, Any]: ...
+    # Apply one action dict to the given episode and return the env's response.
+    def step(self, episode_id: str, action: dict[str, Any]) -> dict[str, Any]: ...
+    # Close the given episode on the env side.
+    def close(self, episode_id: str) -> dict[str, Any]: ...
+
+
+# ---- HTTP transport --------------------------------------------------
+
+
+class HttpEnvClient:
+    """Env client over real HTTP (httpx), for an env server in another process.
+
+    Usable as a context manager: leaving the ``with`` block closes the
+    underlying ``httpx.Client``.
+    """
+
+    def __init__(self, base_url: str = "http://localhost:8000", timeout: float = 30.0) -> None:
+        import httpx  # lazy, so in-process-only users need not have httpx installed
+
+        self._client = httpx.Client(base_url=base_url, timeout=timeout)
+
+    def reset(self, task_id: str | None = None, seed: int | None = None) -> dict[str, Any]:
+        """POST ``/reset``; only the selectors that were actually given are sent."""
+        chosen = {"task_id": task_id, "seed": seed}
+        payload: dict[str, Any] = {k: v for k, v in chosen.items() if v is not None}
+        response = self._client.post("/reset", json=payload)
+        response.raise_for_status()
+        return response.json()
+
+    def step(self, episode_id: str, action: dict[str, Any]) -> dict[str, Any]:
+        """POST ``/step``; a 422 becomes a synthetic non-terminal response, not an error."""
+        request = {"episode_id": episode_id, "action": action}
+        response = self._client.post("/step", json=request)
+        if response.status_code != 422:
+            response.raise_for_status()
+            return response.json()
+        # 422 = malformed action payload. The agent is meant to learn from it,
+        # so report it as data; the caller overlays MALFORMED scoring on the 0.0.
+        rejection = {"error": "schema_rejection", "detail": response.json()}
+        return {"observation": {}, "reward": 0.0, "done": False, "info": rejection}
+
+    def close(self, episode_id: str) -> dict[str, Any]:
+        """POST ``/close`` for the episode; this does not close the HTTP client."""
+        response = self._client.post("/close", json={"episode_id": episode_id})
+        response.raise_for_status()
+        return response.json()
+
+    def __enter__(self) -> "HttpEnvClient":
+        return self
+
+    def __exit__(self, *_exc: object) -> None:
+        """Close the underlying httpx client."""
+        self._client.close()
+
+
+# ---- in-process transport -------------------------------------------
+
+
+class InProcessEnvClient:
+    """Env client that calls the FastAPI app through ``TestClient``.
+
+    Same operations as the HTTP client, without a real socket.
+    """
+
+    def __init__(self, app: object | None = None) -> None:
+        from fastapi.testclient import TestClient
+
+        if app is None:
+            from graphforge.server.app import app as default_app
+            app = default_app
+        self._client = TestClient(app)  # type: ignore[arg-type]
+
+    def reset(self, task_id: str | None = None, seed: int | None = None) -> dict[str, Any]:
+        """POST ``/reset``; only the selectors that were actually given are sent."""
+        chosen = {"task_id": task_id, "seed": seed}
+        payload: dict[str, Any] = {k: v for k, v in chosen.items() if v is not None}
+        response = self._client.post("/reset", json=payload)
+        response.raise_for_status()
+        return response.json()
+
+    def step(self, episode_id: str, action: dict[str, Any]) -> dict[str, Any]:
+        """POST ``/step``; a 422 becomes a synthetic non-terminal response, not an error."""
+        request = {"episode_id": episode_id, "action": action}
+        response = self._client.post("/step", json=request)
+        if response.status_code != 422:
+            response.raise_for_status()
+            return response.json()
+        # 422 = the action payload failed schema validation; hand it back as
+        # data (zero reward, not done) with the server's detail attached.
+        rejection = {"error": "schema_rejection", "detail": response.json()}
+        return {"observation": {}, "reward": 0.0, "done": False, "info": rejection}
+
+    def close(self, episode_id: str) -> dict[str, Any]:
+        """POST ``/close`` for the episode and return the server's JSON reply."""
+        response = self._client.post("/close", json={"episode_id": episode_id})
+        response.raise_for_status()
+        return response.json()
diff --git a/graphforge/training/policy.py b/graphforge/training/policy.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbbebc4c1a06ae147d99cb42e2768186b6757802
--- /dev/null
+++ b/graphforge/training/policy.py
@@ -0,0 +1,112 @@
+"""Policy interface and stub policies.
+
+A *policy* is anything that, given a list of messages, returns a single
+completion string. The rollout doesn't care whether that string came from
+a 0.5B Qwen sample, a hand-written script, or random noise — only that
+the contract is honored.
+
+This file provides:
+
+  * :class:`Policy` — a runtime-checkable Protocol.
+  * :class:`ScriptedPolicy` — yields a fixed list of completions in order.
+    Useful for tests and for building oracle trajectories during rejection-
+    sampling SFT (PROPOSAL.md §7.4 plan B).
+  * :class:`HfPolicy` — wraps an HF causal LM + tokenizer; the real thing.
+    Defined here so consumers can swap it in once we hook up Qwen; torch is
+    deliberately imported only inside ``sample``, not at module-load time.
+"""
+
+from __future__ import annotations
+
+from typing import Iterator, Protocol, runtime_checkable
+
+from graphforge.training.prompt import Message
+
+
+@runtime_checkable
+class Policy(Protocol):
+    def sample(self, messages: list[Message]) -> str: ...  # one completion string per call
+
+
+# ---- scripted -------------------------------------------------------
+
+
+class ScriptedPolicy:
+    """Replays a fixed script: each ``sample`` call returns the next completion.
+
+    The prompt is ignored. Asking for more turns than were scripted raises
+    :class:`StopIteration` — that's a test bug, not an env bug.
+    """
+
+    def __init__(self, completions: list[str]) -> None:
+        self._n = len(completions)  # script length (not read within this class)
+        self._iter: Iterator[str] = iter(completions)
+
+    def sample(self, _messages: list[Message]) -> str:
+        return next(self._iter)
+
+
+# ---- HF (lazy) ------------------------------------------------------
+
+
+class HfPolicy:
+    """A real LM-backed policy. torch is imported lazily, inside ``sample``.
+
+    Constructor args::
+
+        model           — a HF AutoModelForCausalLM
+        tokenizer       — the matching tokenizer
+        max_new_tokens  — generation cap per turn (PROPOSAL.md §7.1: 384)
+        temperature, top_p — sampling knobs
+    """
+
+    def __init__(
+        self,
+        model: object,
+        tokenizer: object,
+        *,
+        max_new_tokens: int = 384,
+        temperature: float = 0.7,
+        top_p: float = 0.95,
+    ) -> None:
+        self.model = model
+        self.tokenizer = tokenizer
+        self.max_new_tokens = max_new_tokens
+        self.temperature = temperature
+        self.top_p = top_p
+
+    def sample(self, messages: list[Message]) -> str:
+        """Sample one completion for ``messages``; returns only the newly generated text."""
+        import torch  # deferred heavy import; needed below for ``torch.no_grad``
+
+        # Critical for trained-eval correctness: ensure the model is in
+        # eval mode (no dropout) and that KV-cache is enabled (post-SFT,
+        # gradient checkpointing may have set use_cache=False).
+        self.model.eval()  # type: ignore[attr-defined]
+        if hasattr(self.model, "config"):
+            self.model.config.use_cache = True  # type: ignore[attr-defined]
+
+        tok = self.tokenizer
+        # Render to text first, then tokenize. The return type of
+        # ``apply_chat_template`` drifted across transformers versions (sometimes
+        # a raw tensor, sometimes a BatchEncoding); going through ``tok(text)``
+        # is the canonical pattern and works on all of them.
+        text = tok.apply_chat_template(  # type: ignore[attr-defined]
+            messages, add_generation_prompt=True, tokenize=False
+        )
+        inputs = tok(text, return_tensors="pt")  # type: ignore[operator]
+        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}  # type: ignore[attr-defined]
+
+        with torch.no_grad():
+            out_ids = self.model.generate(  # type: ignore[attr-defined]
+                **inputs,
+                max_new_tokens=self.max_new_tokens,
+                do_sample=True,
+                temperature=self.temperature,
+                top_p=self.top_p,
+                pad_token_id=tok.eos_token_id,  # type: ignore[attr-defined]
+                use_cache=True,
+            )
+        prompt_len = inputs["input_ids"].shape[-1]
+        gen = out_ids[0, prompt_len:]  # first sequence, tokens past the prompt length
+        return tok.decode(gen, skip_special_tokens=True)  # type: ignore[attr-defined]
diff --git a/graphforge/training/prompt.py b/graphforge/training/prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e1c80842e7809e4be0d22442e28cb217bcce435
--- /dev/null
+++ b/graphforge/training/prompt.py
@@ -0,0 +1,160 @@
+"""Prompt / conversation builder.
+
+Produces the message list the policy sees, in HF chat-template-compatible
+shape: ``[{"role": "system", "content": ...}, {"role": "user", ...}, ...]``.
+
+The system prompt is short and stable across episodes; the per-task user
+turn is the natural-language description plus the visible constraints
+(rendered compactly so we don't burn context on JSON).
+
+After each step, the env's observation is appended as a ``user`` turn —
+this is the role that's typically used for tool-result injection in the
+absence of a dedicated ``tool`` role in the chat template.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from graphforge.training.protocol import ACTION_CLOSE, ACTION_OPEN
+
+Message = dict[str, str]
+
+
+SYSTEM_PROMPT = f"""You are an agent that builds Python programs by mutating a typed function-call graph.
+
+You don't write source code directly. Instead, each turn you emit exactly one tool call.
+The environment applies the call to a graph, replies with an observation, and the cycle repeats.
+At the end, the graph is materialized into Python and scored against a hidden specification.
+
+# Tool call format
+
+Your reply each turn should end with one tool call like this:
+
+    {ACTION_OPEN}
+    {{"kind": "add_module", "name": "validators", "responsibility": "validation"}}
+    {ACTION_CLOSE}
+
+Reasoning before the call is fine; the parser takes the last <action> block.
+Malformed output (no tag, bad JSON, missing 'kind') costs reward.
+
+# Available tools
+
+Graph mutations:
+  add_module(name, responsibility)
+  remove_module(name)
+  add_node(name, module, signature, purity?, error_policy?)
+  remove_node(name, module)
+  set_node_module(name, current_module, new_module)
+  attach_body(name, module, template, args?)
+  add_edge(caller, callee, arg_mapping?)            # caller/callee are "<module>.<name>"
+  remove_edge(caller, callee)
+
+Information (cheap):
+  query_subgraph(scope)        # "module:<name>" | "neighbors:<qualified>" | "path:<from>:<to>"
+  query_spec(constraint_kind?) # how many constraints satisfied
+  query_types(scope)           # type view (TODO)
+
+Information (expensive — token cost):
+  materialize_and_validate()   # project graph to Python, parse-check
+  run_behavioral_tests()       # property tests (TODO)
+
+Terminal:
+  submit()                     # ends episode and triggers final scoring
+
+# Reward shape
+
+Per turn:
+  successful mutation         0
+  failed mutation            -2
+  malformed output           -2
+  duplicate of prior action  -1
+  per-turn cost              -0.1
+  token cost on response     -0.0008 * tokens
+
+Terminal:
+  +1 per structural constraint satisfied
+  +5 if all structural constraints satisfied
+  +5 * (budget_remaining / budget) if all satisfied  (token-efficiency bonus)
+  -8 if materialization fails
+
+Plan before you act. Failed actions and reading expensive responses cost reward."""
+
+
+def initial_messages(task_visible: dict[str, Any]) -> list[Message]:
+    """Seed the conversation for a fresh episode.
+
+    Produces two turns: the fixed system prompt, then a user turn rendered
+    from ``task_visible`` (the dict returned by ``Task.visible_payload()``).
+    """
+    system_turn: Message = {"role": "system", "content": SYSTEM_PROMPT}
+    task_turn: Message = {"role": "user", "content": _format_task_user_turn(task_visible)}
+    return [system_turn, task_turn]
+
+
+def append_observation(
+    messages: list[Message], observation: dict[str, Any]
+) -> list[Message]:
+    """Return a copy of ``messages`` extended with ``observation`` as a user turn."""
+    extended = list(messages)
+    extended.append({"role": "user", "content": _format_observation(observation)})
+    return extended
+
+
+def append_completion(messages: list[Message], completion: str) -> list[Message]:
+    return [*messages, {"role": "assistant", "content": completion}]  # new list; input untouched
+
+
+# ---- formatting -----------------------------------------------------
+
+
+def _format_task_user_turn(task_visible: dict[str, Any]) -> str:
+    """Render the opening user turn: description, visible constraints, and limits."""
+    get = task_visible.get
+    bullets = [f"  - {_format_constraint(c)}" for c in get("visible_constraints", [])]
+    checklist = "\n".join(bullets) or "  (none)"
+    return "".join(
+        (
+            f"# Task (tier {get('tier')})\n",
+            f"{get('description', '(no description)')}\n\n",
+            "# Visible constraints (the spec also has hidden constraints; you must ",
+            "interpret the description, not just satisfy this checklist)\n",
+            f"{checklist}\n\n",
+            "# Limits\n",
+            f"  episode_cap: {get('episode_cap')} turns\n",
+            f"  budget: {get('budget')} tokens\n",
+        )
+    )
+
+
+def _format_constraint(c: dict[str, Any]) -> str:
+    """Render a constraint dict compactly as ``kind(key=value, ...)``, or bare ``kind``."""
+    fields = [f"{key}={value!r}" for key, value in c.items() if key != "kind"]
+    label = c.get("kind", "?")
+    if fields:
+        return f"{label}({', '.join(fields)})"
+    return label
+
+
+def _format_observation(obs: dict[str, Any]) -> str:
+    """Render a /step observation as a terse multi-line block.
+
+    Emits the outcome fields and running counters one per line, followed by
+    the JSON payload. The payload text is cut off after 800 characters to
+    control token cost. Keys missing from ``obs`` render as ``None``.
+    """
+    payload_text = json.dumps(obs.get("payload", {}), indent=2, default=str)
+    # Bound the prompt growth caused by large payloads.
+    if len(payload_text) > 800:
+        payload_text = f"{payload_text[:800]}\n  …(truncated)"
+    # Display order of the scalar fields.
+    labels = (
+        "ok", "outcome", "duplicate", "reward", "turns_total",
+        "tokens_used_total", "budget_remaining", "episode_cap_remaining",
+    )
+    renamed = {"duplicate": "is_duplicate"}  # the only label that differs from its key
+    fields = "".join(
+        f"  {label}: {obs.get(renamed.get(label, label))}\n" for label in labels
+    )
+    return f"# Observation\n{fields}  payload: {payload_text}\n"
diff --git a/graphforge/training/protocol.py b/graphforge/training/protocol.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f25e829ae18623ba309039e60f968a7df474ca
--- /dev/null
+++ b/graphforge/training/protocol.py
@@ -0,0 +1,98 @@
+"""Tool-call wire format.
+
+The agent emits a single tool call per turn as a JSON object wrapped in
+``<action>...</action>`` tags::
+
+    Some optional reasoning text the model writes before the call.
+    <action>
+    {"kind": "add_module", "name": "validators", "responsibility": "validation"}
+    </action>
+
+Why this format and not OpenAI / Qwen native tool-calling:
+
+* It's tokenizer-agnostic. We don't depend on any chat-template's tool-call
+  hooks, so we can swap models freely.
+* It's easy for a 0.5B model to emit reliably with a few in-context examples.
+* It's easy to fail cleanly: malformed output produces a structured
+  ``ParseFailure`` that maps to MALFORMED in the reward engine.
+
+If the model emits multiple ``<action>`` blocks we take the *last* one; this
+matches "the agent reasoned, then committed to one action" and avoids
+rewarding an early stutter.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass
+
+ACTION_OPEN = "<action>"
+ACTION_CLOSE = "</action>"
+
+_ACTION_RE = re.compile(r"<action>\s*(.*?)\s*</action>", re.DOTALL)
+
+
+@dataclass(frozen=True)
+class ParseSuccess:
+    action: dict[str, object]  # parsed JSON object; parse_completion ensures it has "kind"
+    raw: str  # the JSON text we extracted, for debugging
+
+
+@dataclass(frozen=True)
+class ParseFailure:
+    code: str  # stable machine-readable reason, e.g. "invalid_json"
+    message: str  # human-readable explanation of the failure
+    raw: str  # the offending text: the whole completion or just the extracted body
+
+
+ParseResult = ParseSuccess | ParseFailure
+
+
+def parse_completion(text: str) -> ParseResult:
+    """Extract the last ``<action>`` tool call from a model completion.
+
+    Returns ``ParseSuccess`` whose ``action`` is a JSON dict suitable to
+    forward to ``/step``, or a ``ParseFailure`` with a stable code:
+    ``no_action_tag`` (open tag absent, or tags present but no block could
+    be extracted), ``unclosed_tag`` (open tag without close), ``invalid_json``,
+    ``not_an_object`` (JSON is not a dict), ``missing_kind`` (no ``kind`` key).
+
+    A stray or unclosed ``<action>`` before the final block (e.g. the tag
+    mentioned in reasoning) is tolerated: if the captured span is not JSON,
+    the text after its innermost open tag is tried before giving up.
+    """
+    if ACTION_OPEN not in text:
+        return ParseFailure("no_action_tag", "no <action> tag found", raw=text)
+    if ACTION_CLOSE not in text:
+        return ParseFailure("unclosed_tag", "<action> tag never closed", raw=text)
+
+    matches = _ACTION_RE.findall(text)
+    if not matches:
+        message = "<action> tags present but body could not be extracted"
+        return ParseFailure("no_action_tag", message, raw=text)
+    body = matches[-1].strip()  # take the last action emitted
+    try:
+        obj = json.loads(body)
+    except json.JSONDecodeError as e:
+        # The non-greedy regex starts at the *first* open tag before a close,
+        # so an earlier stray <action> gets captured too; retry without it.
+        tail = body.rpartition(ACTION_OPEN)[2].strip()
+        try:
+            obj, body = json.loads(tail), tail
+        except json.JSONDecodeError:
+            return ParseFailure("invalid_json", f"json error: {e.msg}", raw=body)
+
+    if not isinstance(obj, dict):
+        message = f"action body must be a JSON object, got {type(obj).__name__}"
+        return ParseFailure("not_an_object", message, raw=body)
+    if "kind" not in obj:
+        return ParseFailure("missing_kind", "action object lacks 'kind' field", raw=body)
+
+    return ParseSuccess(action=obj, raw=body)
+
+
+def render_action(action: dict[str, object]) -> str:
+    """Render an action dict in the on-the-wire format: open tag, JSON, close tag,
+    one per line. Used by tests and by scripted policies."""
+    return "\n".join((ACTION_OPEN, json.dumps(action), ACTION_CLOSE))
diff --git a/graphforge/training/rollout.py b/graphforge/training/rollout.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ec3c8d7c7c70e8cb03a07adce2217a0fdfdd5af
--- /dev/null
+++ b/graphforge/training/rollout.py
@@ -0,0 +1,238 @@
+"""Multi-turn rollout — the bridge between the env and a policy.
+
+For each turn:
+
+  1. The policy is sampled, given the conversation so far. It returns a
+     single text completion.
+  2. The completion is parsed to extract the tool call. If parsing fails,
+     a synthetic ``schema_rejection`` step is recorded with the reward
+     engine's MALFORMED magnitude and the loop continues.
+  3. The tool call is forwarded to the env via ``EnvClient.step``. The env
+     returns ``{observation, reward, done, info}``.
+  4. The observation is appended to the conversation as a user turn.
+  5. We stop on ``done`` or when ``episode_cap`` is reached.
+
+After the loop we compute discounted returns from each turn and produce a
+list of ``TurnSample(prompt_messages, completion_text, reward, return_)``
+tuples — exactly the shape ``trl.GRPOTrainer`` consumes when wrapped with
+a custom reward function.
+
+The rollout is environment-agnostic via :class:`EnvClient` and
+policy-agnostic via :class:`Policy`. Both come from sibling modules; the
+rollout function never imports torch or httpx directly.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from graphforge.reward.engine import (
+    DUPLICATE_ACTION,
+    PER_TURN_COST,
+    SCHEMA_REJECTION,
+)
+from graphforge.training.client import EnvClient
+from graphforge.training.policy import Policy
+from graphforge.training.prompt import (
+    Message,
+    append_completion,
+    append_observation,
+    initial_messages,
+)
+from graphforge.training.protocol import (
+    ParseFailure,
+    ParseSuccess,
+    parse_completion,
+)
+
+
+# ---- per-turn record -------------------------------------------------
+
+
+@dataclass
+class TurnSample:
+    """Single (prompt, completion, reward, return) tuple for the trainer.
+
+    ``prompt_messages`` is the conversation up to (but not including) the
+    assistant's completion at this turn.
+    """
+
+    turn: int  # 0-based index of the turn within the episode
+    prompt_messages: list[Message]
+    completion_text: str  # raw policy output, before any parsing
+    reward: float  # immediate reward recorded for this turn
+    return_: float = 0.0  # discounted return; filled in after the episode
+
+    # Diagnostics; not consumed by the trainer.
+    parse_ok: bool = True  # False for parse failures and env schema rejections
+    parse_failure_code: str | None = None  # ParseFailure code or "env_schema_rejection"
+    env_response: dict[str, Any] = field(default_factory=dict)  # raw env.step reply, if any
+    done: bool = False  # the env's ``done`` flag for this turn
+
+
+@dataclass
+class Trajectory:
+    episode_id: str
+    task_id: str
+    samples: list[TurnSample] = field(default_factory=list)  # one entry per turn taken
+    terminated_naturally: bool = False  # rollout stores the env's last ``done`` flag here
+    terminal_total: float | None = None  # rollout stores info["terminal"]["total"] here
+
+    @property
+    def total_reward(self) -> float:
+        return sum((s.reward for s in self.samples), 0.0)  # 0.0 start: empty -> float, not int 0
+
+    def __len__(self) -> int:
+        return len(self.samples)  # number of recorded turns
+
+
+# ---- rollout ---------------------------------------------------------
+
+
+def rollout(
+    *,
+    policy: Policy,
+    env: EnvClient,
+    task_id: str | None = None,
+    seed: int | None = None,
+    gamma: float = 0.97,
+    max_turns: int | None = None,
+    auto_close: bool = True,
+) -> Trajectory:
+    """Run one episode end-to-end. Returns a :class:`Trajectory`.
+
+    ``task_id`` and ``seed`` are forwarded to ``env.reset``. ``max_turns``,
+    when given, replaces the task's ``episode_cap`` as the turn limit (``0``
+    means take no turns). ``gamma`` discounts the per-turn returns.
+    ``auto_close`` calls ``env.close`` when the episode ends, best-effort.
+
+    Unparseable completions and env-side schema rejections both score
+    ``SCHEMA_REJECTION + PER_TURN_COST`` and get a synthetic "malformed" observation.
+    """
+    reset_resp = env.reset(task_id=task_id, seed=seed)
+    episode_id = reset_resp["episode_id"]
+    task_visible = reset_resp["observation"]["task"]
+    # ``is not None`` rather than ``or``: an explicit max_turns=0 must be honored.
+    cap = max_turns if max_turns is not None else task_visible["episode_cap"]
+
+    messages = initial_messages(task_visible)
+    samples: list[TurnSample] = []
+    done = False
+    terminal_total: float | None = None
+    malformed_reward = SCHEMA_REJECTION + PER_TURN_COST
+    last_obs: dict[str, Any] = {}  # latest non-empty env observation; source of counters
+
+    def malformed_observation(turn_idx: int, payload: dict[str, Any]) -> dict[str, Any]:
+        # Synthetic observation for a turn the env did not accept. Counters carry
+        # forward from the last real observation instead of resetting to zero.
+        counters = {
+            "turns_total": turn_idx + 1,
+            "tokens_used_total": last_obs.get("tokens_used_total", 0),
+            "budget_remaining": last_obs.get("budget_remaining", task_visible["budget"]),
+            "episode_cap_remaining": cap - (turn_idx + 1),
+        }
+        status = {"ok": False, "outcome": "malformed", "is_duplicate": False}
+        return {**status, "reward": malformed_reward, "payload": payload, **counters}
+
+    for turn_idx in range(cap):
+        # 1. Sample the policy.
+        completion = policy.sample(messages)
+        prompt_at_turn = list(messages)  # snapshot before appending the assistant turn
+
+        # 2. Parse the tool call.
+        parsed = parse_completion(completion)
+        if isinstance(parsed, ParseFailure):
+            # Synthetic step: the env never sees the action, so no token cost applies.
+            sample = TurnSample(
+                turn=turn_idx,
+                prompt_messages=prompt_at_turn,
+                completion_text=completion,
+                reward=malformed_reward,
+                parse_ok=False,
+                parse_failure_code=parsed.code,
+            )
+            samples.append(sample)
+            messages = append_completion(messages, completion)
+            payload = {"error": parsed.code, "message": parsed.message}
+            messages = append_observation(messages, malformed_observation(turn_idx, payload))
+            continue
+
+        # 3. Forward to env.
+        assert isinstance(parsed, ParseSuccess)
+        env_resp = env.step(episode_id, parsed.action)
+        info = env_resp.get("info", {})
+        # A FastAPI 422 comes back from the client as a synthetic schema_rejection.
+        is_schema_rejection = info.get("error") == "schema_rejection"
+        if is_schema_rejection:
+            # Replace the client's empty placeholder observation with one that says why.
+            reward = malformed_reward
+            payload = {"error": "schema_rejection", "detail": info.get("detail")}
+            obs = malformed_observation(turn_idx, payload)
+        else:
+            reward = float(env_resp.get("reward", 0.0))
+            obs = env_resp.get("observation", {})
+            last_obs = obs or last_obs
+        done = bool(env_resp.get("done", False))
+
+        sample = TurnSample(
+            turn=turn_idx,
+            prompt_messages=prompt_at_turn,
+            completion_text=completion,
+            reward=reward,
+            env_response=env_resp,
+            done=done,
+            parse_ok=not is_schema_rejection,
+            parse_failure_code="env_schema_rejection" if is_schema_rejection else None,
+        )
+        samples.append(sample)
+        messages = append_completion(messages, completion)
+        messages = append_observation(messages, obs)
+
+        if done:
+            terminal_total = info.get("terminal", {}).get("total")
+            break
+
+    if auto_close:
+        try:
+            env.close(episode_id)
+        except Exception:  # best-effort: a failed close must not lose the trajectory
+            pass
+
+    _fill_returns(samples, gamma=gamma)
+    return Trajectory(
+        episode_id=episode_id,
+        task_id=task_visible.get("id", ""),
+        samples=samples,
+        terminated_naturally=done,
+        terminal_total=terminal_total,
+    )
+
+
+# ---- discounted returns ---------------------------------------------
+
+
+def _fill_returns(samples: list[TurnSample], *, gamma: float) -> None:
+    """Set each sample's ``return_`` to its discounted reward-to-go, in place.
+
+    Walks the turns backwards using G_t = r_t + gamma * G_{t+1}, with G = 0
+    after the final turn.
+    """
+    future = 0.0
+    for sample in samples[::-1]:
+        future = sample.return_ = sample.reward + gamma * future
+
+
+# ---- helper for stub-policy demo ------------------------------------
+
+
+def trajectory_summary(traj: Trajectory) -> dict[str, Any]:
+    """Condense a trajectory into a flat dict of its headline numbers."""
+    failed_parses = [s for s in traj.samples if not s.parse_ok]
+    summary: dict[str, Any] = {"episode_id": traj.episode_id, "task_id": traj.task_id}
+    summary["n_turns"] = len(traj)
+    summary["total_reward"] = traj.total_reward
+    summary["terminated_naturally"] = traj.terminated_naturally
+    summary["terminal_total"] = traj.terminal_total
+    summary["parse_failures"] = len(failed_parses)
+    return summary
diff --git a/graphforge/types/__init__.py b/graphforge/types/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e520b4e44b6afec1abeccf4423c1ffad459c470
--- /dev/null
+++ b/graphforge/types/__init__.py
@@ -0,0 +1,42 @@
+"""Type engine.
+
+Responsibilities (PROPOSAL.md §3.1, §4.1, §5.2):
+
+  * Parse function signatures into a typed parameter list.
+  * Validate that every edge's ``arg_mapping`` has type-compatible flow
+    between caller's available bindings and callee's parameter types.
+  * Validate that every body template's expected types match the host
+    node's signature and outgoing edges.
+  * Detect ``Any`` usage for the ``no_any_types`` constraint.
+  * Surface a typed view of the graph for ``query_types``.
+
+The cheap signature parser at :mod:`graphforge.actions.signature` extracts
+parameter names; once implemented, this module will subsume it with full
+annotation parsing via ``ast.parse`` over a synthetic ``def``, so that we get
+Python's own grammar for free.
+
+Public surface (TODO):
+
+    parse_typed_signature(sig: str) -> TypedSignature
+    edge_type_flow(graph, edge) -> list[TypeIssue]
+    type_view(graph, scope) -> dict
+    has_any(graph) -> list[str]
+"""
+
+from __future__ import annotations
+
+
+def parse_typed_signature(sig: str) -> object:  # pragma: no cover — TODO: planned to return TypedSignature
+    raise NotImplementedError("type engine — parse_typed_signature TODO")
+
+
+def edge_type_flow(graph: object, edge: object) -> list[object]:  # pragma: no cover — TODO: planned to return list[TypeIssue]
+    raise NotImplementedError("type engine — edge_type_flow TODO")
+
+
+def type_view(graph: object, scope: str) -> dict[str, object]:  # pragma: no cover — TODO: typed graph view for ``query_types``
+    raise NotImplementedError("type engine — type_view TODO")
+
+
+def has_any(graph: object) -> list[str]:  # pragma: no cover — TODO: ``Any`` detection for ``no_any_types``
+    raise NotImplementedError("type engine — has_any TODO")
diff --git a/graphforge/validator/__init__.py b/graphforge/validator/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f9577ad74f05a2ba88e8dc992db1c0b39f2c15d
--- /dev/null
+++ b/graphforge/validator/__init__.py
@@ -0,0 +1,25 @@
+"""Materialization validator: parse + import + mypy --strict.
+
+Responsibilities (PROPOSAL.md §6.2 — "subprocess validation is bounded"):
+
+  * ``parse_check`` — call ``compile(source, ...)`` per file; report errors.
+  * ``import_check`` — write to a temp directory, attempt
+    ``importlib.import_module`` in a subprocess; report errors. (TODO)
+  * ``mypy_check`` — invoke ``mypy --strict`` in a subprocess against the
+    materialized tree, capturing exit code and parsed errors. (TODO)
+  * Hard timeouts: 8s per type-check, 12s for behavioral runs.
+  * Cache results keyed on ``Graph.structural_hash`` so we don't re-run
+    mypy after non-structural changes.
+
+Currently only parse-checking is implemented; ``full_check`` will be extended
+in place as the deeper gates land.
+"""
+
+from graphforge.validator.parse import (
+    ParseError,
+    ValidationReport,
+    full_check,
+    parse_check,
+)
+
+__all__ = ["ParseError", "ValidationReport", "full_check", "parse_check"]
diff --git a/graphforge/validator/parse.py b/graphforge/validator/parse.py
new file mode 100644
index 0000000000000000000000000000000000000000..119a90a21fc5fc29afa1e8d70aa8a77e04ec6dca
--- /dev/null
+++ b/graphforge/validator/parse.py
@@ -0,0 +1,75 @@
+"""Parse-only validator.
+
+Calls Python's own ``compile()`` on each materialized file and reports any
+syntax / lexer errors. This is the cheapest gate; the agent receives this
+feedback as part of ``materialize_and_validate``.
+
+Heavier checks (import-resolution, ``mypy --strict``, behavioral tests) live
+elsewhere in :mod:`graphforge.validator` and :mod:`graphforge.behavioral` —
+intentionally separated so the agent can pay for verification incrementally.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+
+@dataclass(frozen=True)
+class ParseError:  # immutable record of one file that failed to compile
+    filename: str  # name of the offending file (its key in the checked mapping)
+    line: int | None  # line number reported with the error, when available
+    column: int | None  # column offset reported with the error, when available
+    message: str  # the compiler's error text
+
+
+@dataclass
+class ValidationReport:
+    """Aggregated validator outcome; today it only carries parse errors."""
+
+    parse_errors: list[ParseError] = field(default_factory=list)
+
+    @property
+    def ok(self) -> bool:
+        """Whether the report is free of parse errors."""
+        return not self.parse_errors
+
+    def to_dict(self) -> dict[str, object]:
+        """Render the report as nested plain dicts and lists."""
+        passed = self.ok
+        rows: list[dict[str, object]] = []
+        for err in self.parse_errors:
+            row: dict[str, object] = {"filename": err.filename, "line": err.line}
+            row["column"] = err.column
+            row["message"] = err.message
+            rows.append(row)
+        return {"ok": passed, "parse_errors": rows}
+
+
+def parse_check(files: dict[str, str]) -> list[ParseError]:
+    """Compile each ``files[name]`` source. Return collected errors.
+
+    An empty list means every file parsed cleanly. Sources that ``compile()``
+    rejects with ``ValueError`` (e.g. NUL bytes before 3.12) are reported too.
+    """
+    errors: list[ParseError] = []
+    for filename, source in files.items():
+        try:
+            # dont_inherit: don't leak this module's ``__future__`` flags into the check.
+            compile(source, filename, "exec", dont_inherit=True)
+        except SyntaxError as e:
+            line, column, message = e.lineno, e.offset, e.msg
+        except ValueError as e:
+            line, column, message = None, None, str(e)
+        else:
+            continue
+        errors.append(ParseError(filename=filename, line=line, column=column, message=message))
+    return errors
+
+
+def full_check(files: dict[str, str]) -> ValidationReport:
+    """Run every validator gate implemented so far and bundle the results.
+
+    Only the parse gate exists today, so the report carries just its errors.
+    """
+    syntax_problems = parse_check(files)
+    return ValidationReport(parse_errors=syntax_problems)
diff --git a/openenv.yaml b/openenv.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..25126be5f116f9e5e11fafe7f2c634240ccd4425
--- /dev/null
+++ b/openenv.yaml
@@ -0,0 +1,50 @@
+# OpenEnv manifest — required by the hackathon and by the OpenEnv spec.
+
+name: repo-edit
+version: 0.3.0
+description: >
+  Multi-turn repository-editing environment for long-horizon RL.
+  An LLM agent receives a Knowledge Graph of a real Python repo
+  (nodes: repo / package / module / class / function / method;
+   edges: contains / calls / imports / inherits — all parsed from AST)
+  and must navigate it across multiple turns to apply a code change.
+  Reward is sparse: only granted when submit() passes all unit tests.
+  Designed to push agents beyond shallow reasoning toward structured
+  planning, graph navigation, and durable state tracking.
+
+client:
+  class_name: RepoEditEnv
+  module: env.client
+
+action:
+  class_name: RepoEditAction
+  module: env.actions
+
+observation:
+  class_name: RepoEditObservation
+  module: env.models
+
+state:
+  class_name: RepoEditState
+  module: env.models
+
+environment:
+  class_name: RepoEditEnvironment
+  module: env.environment
+
+default_image: ast-code-edit:latest
+spec_version: 1
+
+tags:
+  - code-generation
+  - ast
+  - dag
+  - grpo
+  - lora
+  - rl
+  - python
+  - multi-turn
+
+author: "Naga Nithin"
+license: MIT
+homepage: "https://github.com/nithin062006/scaler"
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..61507d7a0f9a5492a5aef2e4add8b1b0aa00db0b
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,76 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "graphforge"
+version = "0.0.1"
+description = "Graph-first code generation environment for long-horizon RL planning."
+readme = "README.md"
+requires-python = ">=3.10"
+license = { text = "MIT" }
+authors = [{ name = "Naga", email = "naganithin@poshmark.com" }]
+dependencies = [
+    "pydantic>=2.6",
+    "fastapi>=0.110",
+    "uvicorn[standard]>=0.27",
+    "httpx>=0.27",
+    "typing-extensions>=4.10",
+    "openenv-core>=0.1.0",
+    "pyyaml>=6.0",
+]
+
+[project.optional-dependencies]
+validation = [
+    "mypy>=1.10",
+]
+behavioral = [
+    "hypothesis>=6.100",
+]
+training = [
+    "torch>=2.2",
+    "transformers>=4.42",
+    "trl>=0.12",
+    "peft>=0.11",
+    "accelerate>=0.30",
+    "datasets>=2.18",
+    "matplotlib>=3.8",
+    "numpy>=1.26",
+]
+demo = [
+    "gradio>=4.30",
+]
+dev = [
+    "pytest>=8.0",
+    "pytest-cov>=5.0",
+    "ruff>=0.4",
+    "mypy>=1.10",
+    "hypothesis>=6.100",
+]
+
+[tool.setuptools.packages.find]
+include = ["graphforge*", "env*"]
+exclude = ["tests*", "notebooks*", "training*", "plots*", "space*"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]
+addopts = "-ra --strict-markers"
+markers = [
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+    "subprocess: tests that spawn subprocesses (mypy, behavioral)",
+]
+
+[tool.mypy]
+python_version = "3.10"
+strict = true
+ignore_missing_imports = true
+exclude = ["build/", "dist/"]
+
+[tool.ruff]
+line-length = 100
+target-version = "py310"
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "B", "UP", "SIM"]
+ignore = ["E501"]