Spaces:

openenv-community
/

optigami

Sleeping

sissississi Claude Opus 4.6 committed on Mar 8

Commit

2c8a058

1 Parent(s): 85a3e59

Add GRPO training notebook + Trackio integration + SpatialThinker support

- train_origami.ipynb: Full Colab/Northflank notebook for GRPO training
with model selection (SpatialThinker vs vanilla Qwen2.5-VL), per-component
reward logging, evaluation harness, and A/B comparison
- train.py: Switch from W&B to Trackio, add VL model auto-detection
for FastVisionModel vs FastLanguageModel
- trainer/: Updated reward functions and prompts for env/ system

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (6) hide show

train.py +29 -6
train_origami.ipynb +184 -0
trainer/mock_env.py +16 -0
trainer/prompts.py +108 -66
trainer/rewards.py +339 -14
trainer/train.py +11 -4

train.py CHANGED Viewed

@@ -2,9 +2,14 @@
 OrigamiRL — GRPO Training Script
 Code-as-policy: model generates complete fold sequence, gets terminal reward.
 Usage:
     python train.py
-    python train.py --model unsloth/Qwen2.5-7B-Instruct --epochs 3 --output origami-grpo
 """
 import argparse
 import json
@@ -13,10 +18,19 @@ import random
 from pathlib import Path
 from typing import Optional
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model', default='unsloth/Qwen2.5-7B-Instruct')
     parser.add_argument('--max_seq_length', type=int, default=2048)
     parser.add_argument('--epochs', type=int, default=3)
     parser.add_argument('--batch_size', type=int, default=2)
@@ -148,20 +162,29 @@ def main():
         return
     # Load model via unsloth
     try:
-        from unsloth import FastLanguageModel
     except ImportError:
         print("ERROR: unsloth not installed. Run: pip install unsloth")
         print("Or run with --dry_run to test the reward function without a model.")
         return
-    model, tokenizer = FastLanguageModel.from_pretrained(
         model_name=args.model,
         max_seq_length=args.max_seq_length,
         load_in_4bit=True,
     )
-    model = FastLanguageModel.get_peft_model(
         model,
         r=32,
         target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
@@ -193,7 +216,7 @@ def main():
         num_generations=args.n_generations,
         temperature=1.0,
         logging_steps=1,
-        report_to="wandb",
         run_name="origami-grpo",
     )

 OrigamiRL — GRPO Training Script
 Code-as-policy: model generates complete fold sequence, gets terminal reward.
+Base model: SpatialThinker (Qwen2.5-VL-7B fine-tuned for spatial reasoning)
+or any Unsloth-compatible model.
 Usage:
     python train.py
+    python train.py --model unsloth/Qwen2.5-VL-7B-Instruct --epochs 3
+    python train.py --model OX-PIXL/SpatialThinker-Qwen2.5-VL-7B --epochs 3
+    python train.py --dry_run  # test rewards without GPU
 """
 import argparse
 import json
 from pathlib import Path
 from typing import Optional
+# VL (vision-language) model identifiers — use FastVisionModel for these
+_VL_MODEL_PATTERNS = ['VL', 'vl', 'Vision', 'vision', 'SpatialThinker', 'SpaceThinker']
+def _is_vl_model(model_name: str) -> bool:
+    return any(p in model_name for p in _VL_MODEL_PATTERNS)
 def parse_args():
     parser = argparse.ArgumentParser()
+    parser.add_argument('--model', default='unsloth/Qwen2.5-VL-7B-Instruct',
+                        help='Base model. Use unsloth/Qwen2.5-VL-7B-Instruct or '
+                             'OX-PIXL/SpatialThinker-Qwen2.5-VL-7B for spatial reasoning')
     parser.add_argument('--max_seq_length', type=int, default=2048)
     parser.add_argument('--epochs', type=int, default=3)
     parser.add_argument('--batch_size', type=int, default=2)
         return
     # Load model via unsloth
+    # VL models (SpatialThinker, Qwen2.5-VL) use FastVisionModel
+    # Text-only models use FastLanguageModel
+    is_vl = _is_vl_model(args.model)
     try:
+        if is_vl:
+            from unsloth import FastVisionModel as ModelLoader
+            print(f"Loading VL model (vision-language): {args.model}")
+        else:
+            from unsloth import FastLanguageModel as ModelLoader
+            print(f"Loading text model: {args.model}")
     except ImportError:
         print("ERROR: unsloth not installed. Run: pip install unsloth")
         print("Or run with --dry_run to test the reward function without a model.")
         return
+    model, tokenizer = ModelLoader.from_pretrained(
         model_name=args.model,
         max_seq_length=args.max_seq_length,
         load_in_4bit=True,
     )
+    model = ModelLoader.get_peft_model(
         model,
         r=32,
         target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
         num_generations=args.n_generations,
         temperature=1.0,
         logging_steps=1,
+        report_to="trackio",
         run_name="origami-grpo",
     )

train_origami.ipynb ADDED Viewed

	@@ -0,0 +1,184 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "8smrrb11v84",
+   "source": "# Optigami — Origami RL Training (GRPO)\n\n**Train an LLM to generate valid origami fold sequences using verifiable geometric rewards.**\n\nArchitecture:\n- **Environment**: `env/` — CreaseGraph + Kawasaki/Maekawa/BLB verifiers + target matching\n- **Policy model**: SpatialThinker (Qwen2.5-VL-7B) or vanilla Qwen2.5-VL-7B\n- **Training**: Unsloth GRPO — model generates complete fold sequences, gets terminal reward\n- **Tracking**: Trackio — real-time reward curves on HF Spaces\n\n| Reward Component | Weight | What it measures |\n|---|---|---|\n| `progress` | 0.45 | Geometric crease coverage vs target |\n| `economy` | 0.10 | Penalty for excess creases |\n| `kawasaki` | 0.08 | Kawasaki theorem satisfaction |\n| `maekawa` | 0.07 | Maekawa theorem satisfaction |\n| `blb` | 0.05 | Big-Little-Big lemma |\n| `anchored` | 0.05 | Valid anchor point usage |\n| `completion` | +10.0 | Bonus when target reached |",
+   "metadata": {}
+  },
+  {
+   "cell_type": "markdown",
+   "id": "kn1k9d357j",
+   "source": "## 1. Setup\n\n**GPU**: H100 80GB (Northflank/CoreWeave) or A100/T4 (Colab)\n\nInstall dependencies. Unsloth handles efficient model loading + LoRA.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "d10vqzep5b6",
+   "source": "!pip install -q unsloth trackio shapely numpy datasets\n!pip install -q --upgrade trl transformers\n\n# Check GPU — total_memory is reported in bytes\nimport torch\nprint(f\"GPU: {torch.cuda.get_device_name(0)}\")\nprint(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "y6wsagz8h",
+   "source": "## 2. Configuration\n\nChoose base model and hyperparameters. Two options:\n- **SpatialThinker** (`OX-PIXL/SpatialThinker-Qwen2.5-VL-7B`): Pre-trained for spatial reasoning via RL\n- **Vanilla Qwen2.5-VL** (`unsloth/Qwen2.5-VL-7B-Instruct`): Standard vision-language model\n\nWe'll compare both to see which learns origami folding faster.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "dh1zapl0w5s",
+   "source": "# ── Config ──────────────────────────────────────────────────────────────────\n# Toggle MODEL_NAME to switch between SpatialThinker and vanilla Qwen2.5-VL\n\nMODEL_NAME = \"OX-PIXL/SpatialThinker-Qwen2.5-VL-7B\"\n# MODEL_NAME = \"unsloth/Qwen2.5-VL-7B-Instruct\"  # uncomment for vanilla\n\nMAX_SEQ_LENGTH = 2048\nLORA_R = 32\nLORA_ALPHA = 32\nEPOCHS = 3\nBATCH_SIZE = 2\nGRAD_ACCUM = 4\nLR = 5e-6\nN_GENERATIONS = 8       # completions sampled per prompt (GRPO group size)\nMAX_FOLDS = 8           # max folds per episode\nLEVEL = 1               # target difficulty (1=simple, 2=medium, 3=hard)\nMAX_COMPLETION_LEN = 512\nOUTPUT_DIR = \"origami-grpo\"\n\n# Trackio — set your HF Space ID for live dashboard\nTRACKIO_SPACE_ID = None  # e.g. \"your-username/optigami-training\"\n\nprint(f\"Model: {MODEL_NAME}\")\nprint(f\"Config: {EPOCHS} epochs, batch={BATCH_SIZE}, grad_accum={GRAD_ACCUM}, lr={LR}\")\nprint(f\"GRPO: {N_GENERATIONS} generations, max_folds={MAX_FOLDS}, level={LEVEL}\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "o5hhfbp0wb",
+   "source": "## 3. Clone Repo & Test Environment\n\nClone the optigami repo (skip if running locally) and verify the environment works.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "94cemjucczl",
+   "source": "import os\n\n# Clone repo if not already present (Colab/Northflank)\nif not os.path.exists(\"env/environment.py\"):\n    !git clone https://huggingface.co/spaces/openenv-community/optigami /content/optigami 2>/dev/null || true\n    os.chdir(\"/content/optigami\")\n\n# Verify env/ is accessible\nfrom env.environment import OrigamiEnvironment\nfrom env.rewards import compute_reward\nfrom env.prompts import parse_fold_list\n\nenv = OrigamiEnvironment(mode=\"code_as_policy\", max_steps=MAX_FOLDS)\nprint(f\"Available targets: {env.available_targets()}\")\nprint(f\"Environment mode: {env.mode}\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "2j9mccejyfx",
+   "source": "# ── Dry run: test reward function ───────────────────────────────────────────\n# Verify rewards work before loading the model\n\nimport copy\n\ndef make_reward_fn(env_template, max_folds):\n    \"\"\"Reward function: clone env, run completion, return total reward.\"\"\"\n    def reward_fn(completions, prompts=None, **kwargs):\n        rewards = []\n        target_names = kwargs.get(\"target_names\", [None] * len(completions))\n        for completion, target_name in zip(completions, target_names):\n            try:\n                e = env_template.clone()\n                e.reset(target_name=target_name)\n                _, reward_dict, _, _ = e.step(completion)\n                rewards.append(float(reward_dict[\"total\"]))\n            except Exception:\n                rewards.append(-0.1)\n        return rewards\n    return reward_fn\n\nreward_fn = make_reward_fn(env, MAX_FOLDS)\n\ntest_completions = [\n    '<folds>[{\"instruction\": \"Valley fold along horizontal center\", \"from\": [0, 0.5], \"to\": [1, 0.5], \"assignment\": \"V\"}]</folds>',\n    '<folds>[{\"instruction\": \"Bad fold\", \"from\": [0.3, 0.3], \"to\": [0.7, 0.7], \"assignment\": \"V\"}]</folds>',\n    'not valid JSON',\n]\ntarget_names = [\"half_horizontal\"] * 3\nrewards = reward_fn(test_completions, target_names=target_names)\n\nfor comp, r in zip([\"perfect fold\", \"partial fold\", \"garbage\"], rewards):\n    print(f\"  {comp}: reward = {r:.3f}\")\nprint(\"\\nReward function OK ✓\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "46gs2p1cy4",
+   "source": "## 4. Load Model + LoRA\n\nLoad the VL model with Unsloth's `FastVisionModel` (4-bit quantized) and apply LoRA adapters.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "82f76od6d2k",
+   "source": "from unsloth import FastVisionModel\n\nmodel, tokenizer = FastVisionModel.from_pretrained(\n    model_name=MODEL_NAME,\n    max_seq_length=MAX_SEQ_LENGTH,\n    load_in_4bit=True,\n)\n\nmodel = FastVisionModel.get_peft_model(\n    model,\n    r=LORA_R,\n    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n                     \"gate_proj\", \"up_proj\", \"down_proj\"],\n    lora_alpha=LORA_ALPHA,\n    lora_dropout=0,\n    use_gradient_checkpointing=\"unsloth\",\n)\n\nprint(f\"Model loaded: {MODEL_NAME}\")\nprint(f\"LoRA rank: {LORA_R}, alpha: {LORA_ALPHA}\")\nprint(f\"Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "67dyfrj23y",
+   "source": "## 5. Build Dataset\n\nGenerate prompts from all level-appropriate targets. Each prompt embeds the target crease pattern description and asks the model to output `<folds>[...]</folds>`.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "1msqpzj5fwu",
+   "source": "import random\nfrom datasets import Dataset\n\ndef build_dataset(env, level=1):\n    \"\"\"Build training dataset of prompts from env targets.\"\"\"\n    all_names = env.available_targets()\n    level_names = [\n        n for n in all_names\n        if env._targets[n].get(\"level\", 1) == level\n    ]\n    if not level_names:\n        level_names = all_names\n\n    items = []\n    for name in level_names:\n        obs = env.reset(target_name=name)\n        items.append({\"prompt\": obs[\"prompt\"], \"target_name\": name})\n\n    # Repeat each target 10x; ensure at least 50 examples\n    repeat = max(10, (50 + len(items) - 1) // len(items))\n    items = items * repeat\n    random.shuffle(items)\n    return items\n\ndataset_items = build_dataset(env, level=LEVEL)\nhf_dataset = Dataset.from_list(dataset_items)\n\nprint(f\"Dataset: {len(dataset_items)} examples\")\nprint(f\"Targets in dataset: {sorted(set(d['target_name'] for d in dataset_items))}\")\nprint(f\"\\nSample prompt (first 300 chars):\\n{dataset_items[0]['prompt'][:300]}...\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7n3r3nsw8ae",
+   "source": "## 6. Trackio Setup\n\nInitialize Trackio for real-time training visualization. Trackio is a free W&B alternative that deploys a Gradio dashboard to HF Spaces.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "bgru9rsw95b",
+   "source": "import trackio\n\n# Initialize Trackio run (trackio.init takes `project` and `name`)\ntrackio_kwargs = {\n    \"project\": \"optigami\",\n    \"name\": f\"grpo-{MODEL_NAME.split('/')[-1]}-level{LEVEL}\",\n}\nif TRACKIO_SPACE_ID:\n    trackio_kwargs[\"space_id\"] = TRACKIO_SPACE_ID\n\ntrackio.init(**trackio_kwargs)\nprint(f\"Trackio initialized: {trackio_kwargs['name']}\")\nif TRACKIO_SPACE_ID:\n    print(f\"Dashboard: https://huggingface.co/spaces/{TRACKIO_SPACE_ID}\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "n8aqymlszo",
+   "source": "## 7. GRPO Training\n\nRun GRPO with Trackio logging. The trainer samples `N_GENERATIONS` completions per prompt, computes rewards via the environment, and updates the policy using group-relative advantages.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "ci4imd9ws7v",
+   "source": "from trl import GRPOConfig, GRPOTrainer\n\n# ── Per-component reward functions for detailed logging ─────────────────────\nREWARD_COMPONENTS = [\"kawasaki\", \"maekawa\", \"blb\", \"progress\", \"economy\", \"completion\"]\n\ndef make_component_fn(env_template, component):\n    \"\"\"Create a reward function that returns a single component's value.\"\"\"\n    def component_fn(completions, target_name=None, **kwargs):\n        target_names = target_name if isinstance(target_name, list) else [target_name] * len(completions)\n        rewards = []\n        for completion, tn in zip(completions, target_names):\n            try:\n                e = env_template.clone()\n                e.reset(target_name=tn)\n                _, reward_dict, _, _ = e.step(completion)\n                rewards.append(float(reward_dict.get(component, 0.0)))\n            except Exception:\n                rewards.append(0.0)\n        return rewards\n    component_fn.__name__ = f\"reward_{component}\"\n    return component_fn\n\n# Main reward function (returns total reward)\ndef wrapped_reward_fn(completions, target_name=None, **kwargs):\n    \"\"\"Main reward function — extracts target_name from batch columns.\"\"\"\n    target_names = target_name if isinstance(target_name, list) else [target_name] * len(completions)\n    return reward_fn(completions, target_names=target_names)\n\n# Build list of all reward functions: [total, kawasaki, maekawa, blb, progress, economy, completion]\nall_reward_fns = [wrapped_reward_fn] + [\n    make_component_fn(env, c) for c in REWARD_COMPONENTS\n]\n\n# ── GRPO Config ─────────────────────────────────────────────────────────────\nconfig = GRPOConfig(\n    output_dir=OUTPUT_DIR,\n    num_train_epochs=EPOCHS,\n    per_device_train_batch_size=BATCH_SIZE,\n    gradient_accumulation_steps=GRAD_ACCUM,\n    learning_rate=LR,\n    max_completion_length=MAX_COMPLETION_LEN,\n    num_generations=N_GENERATIONS,\n    temperature=1.0,\n    
logging_steps=1,\n    save_steps=50,\n    report_to=\"trackio\",\n    run_name=f\"grpo-{MODEL_NAME.split('/')[-1]}-level{LEVEL}\",\n)\n\ntrainer = GRPOTrainer(\n    model=model,\n    config=config,\n    train_dataset=hf_dataset,\n    reward_funcs=all_reward_fns,\n    tokenizer=tokenizer,\n)\n\nprint(f\"Trainer ready. Starting GRPO training...\")\nprint(f\"  Model: {MODEL_NAME}\")\nprint(f\"  Dataset: {len(hf_dataset)} examples\")\nprint(f\"  Reward functions: total + {REWARD_COMPONENTS}\")\nprint(f\"  Logging to: Trackio\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "mwy1i7d509",
+   "source": "# ── Train! ──────────────────────────────────────────────────────────────────\ntrainer.train()\n\n# Save model + tokenizer\nmodel.save_pretrained(OUTPUT_DIR)\ntokenizer.save_pretrained(OUTPUT_DIR)\nprint(f\"\\nTraining complete. Model saved to {OUTPUT_DIR}/\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "54iu59w1ipb",
+   "source": "## 8. Evaluation\n\nRun the trained model on all targets and measure solve rates + average rewards.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "7loiy5mrsi3",
+   "source": "# ── Evaluate trained model on all targets ───────────────────────────────────\nfrom transformers import TextStreamer\n\nFastVisionModel.for_inference(model)\n\neval_results = {}\nn_samples = 5  # generations per target\n\nfor target_name in env.available_targets():\n    obs = env.reset(target_name=target_name)\n    prompt = obs[\"prompt\"]\n\n    # Tokenize prompt\n    messages = [{\"role\": \"user\", \"content\": prompt}]\n    input_ids = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(model.device)\n\n    target_rewards = []\n    target_solved = 0\n\n    for _ in range(n_samples):\n        outputs = model.generate(\n            input_ids=input_ids,\n            max_new_tokens=MAX_COMPLETION_LEN,\n            temperature=0.7,\n            do_sample=True,\n        )\n        completion = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)\n\n        # Score completion\n        e = env.clone()\n        e.reset(target_name=target_name)\n        try:\n            _, reward_dict, _, _ = e.step(completion)\n            total = float(reward_dict[\"total\"])\n            solved = reward_dict.get(\"completion\", 0) > 0\n        except Exception:\n            total = -0.1\n            solved = False\n\n        target_rewards.append(total)\n        if solved:\n            target_solved += 1\n\n    eval_results[target_name] = {\n        \"avg_reward\": sum(target_rewards) / len(target_rewards),\n        \"max_reward\": max(target_rewards),\n        \"solve_rate\": target_solved / n_samples,\n    }\n    print(f\"  {target_name:20s}  avg={eval_results[target_name]['avg_reward']:.3f}  \"\n          f\"max={eval_results[target_name]['max_reward']:.3f}  \"\n          f\"solved={target_solved}/{n_samples}\")\n\n# Summary\navg_solve = sum(r[\"solve_rate\"] for r in eval_results.values()) / len(eval_results)\navg_reward = sum(r[\"avg_reward\"] for r in eval_results.values()) / len(eval_results)\nprint(f\"\\nOverall: 
avg_reward={avg_reward:.3f}, solve_rate={avg_solve:.1%}\")\n\n# Log to Trackio\ntrackio.log({\"eval/avg_reward\": avg_reward, \"eval/solve_rate\": avg_solve})\nfor name, res in eval_results.items():\n    trackio.log({f\"eval/{name}_reward\": res[\"avg_reward\"], f\"eval/{name}_solved\": res[\"solve_rate\"]})",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c9dr4ht05r",
+   "source": "## 9. A/B Comparison: SpatialThinker vs Vanilla Qwen2.5-VL\n\nTo compare both models, run this notebook twice:\n1. First run with `MODEL_NAME = \"OX-PIXL/SpatialThinker-Qwen2.5-VL-7B\"`\n2. Second run with `MODEL_NAME = \"unsloth/Qwen2.5-VL-7B-Instruct\"`\n\nBoth runs log to the same Trackio project (`optigami`) with different run names, so you can overlay the reward curves directly in the dashboard.\n\nThe cell below loads saved eval results from both runs for comparison (run after both trainings complete).",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "qwwd4wyuhnq",
+   "source": "# ── Save eval results for comparison ────────────────────────────────────────\nimport json\n\nmodel_tag = MODEL_NAME.split(\"/\")[-1]\neval_path = f\"eval_results_{model_tag}_level{LEVEL}.json\"\n\nwith open(eval_path, \"w\") as f:\n    json.dump(eval_results, f, indent=2)\nprint(f\"Eval results saved to {eval_path}\")\n\n# ── Compare (run after both models are trained) ────────────────────────────\nspatial_path = f\"eval_results_SpatialThinker-Qwen2.5-VL-7B_level{LEVEL}.json\"\nvanilla_path = f\"eval_results_Qwen2.5-VL-7B-Instruct_level{LEVEL}.json\"\n\nif os.path.exists(spatial_path) and os.path.exists(vanilla_path):\n    with open(spatial_path) as f:\n        spatial = json.load(f)\n    with open(vanilla_path) as f:\n        vanilla = json.load(f)\n\n    print(f\"\\n{'Target':<22} {'SpatialThinker':>16} {'Vanilla Qwen':>16} {'Delta':>10}\")\n    print(\"-\" * 66)\n    for target in sorted(set(list(spatial.keys()) + list(vanilla.keys()))):\n        s_r = spatial.get(target, {}).get(\"avg_reward\", 0)\n        v_r = vanilla.get(target, {}).get(\"avg_reward\", 0)\n        delta = s_r - v_r\n        print(f\"  {target:<20} {s_r:>14.3f}   {v_r:>14.3f}   {delta:>+8.3f}\")\n\n    s_avg = sum(r[\"avg_reward\"] for r in spatial.values()) / len(spatial)\n    v_avg = sum(r[\"avg_reward\"] for r in vanilla.values()) / len(vanilla)\n    print(f\"\\n  {'OVERALL':<20} {s_avg:>14.3f}   {v_avg:>14.3f}   {s_avg - v_avg:>+8.3f}\")\n\n    s_solve = sum(r[\"solve_rate\"] for r in spatial.values()) / len(spatial)\n    v_solve = sum(r[\"solve_rate\"] for r in vanilla.values()) / len(vanilla)\n    print(f\"  {'Solve Rate':<20} {s_solve:>13.1%}   {v_solve:>13.1%}   {s_solve - v_solve:>+7.1%}\")\nelse:\n    print(f\"Run both models to compare. Looking for:\\n  {spatial_path}\\n  {vanilla_path}\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "812csd43vxk",
+   "source": "## 10. Push to HuggingFace Hub (optional)\n\nUpload the trained LoRA adapter to HF for deployment or further fine-tuning.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "h38kp70n16q",
+   "source": "# ── Push to HF Hub (uncomment and set your repo) ───────────────────────────\n# from huggingface_hub import login\n# login(token=\"hf_...\")  # or use HF_TOKEN env var\n#\n# HF_REPO = \"your-username/optigami-grpo-spatialthinker\"\n# model.push_to_hub(HF_REPO)\n# tokenizer.push_to_hub(HF_REPO)\n# print(f\"Model pushed to https://huggingface.co/{HF_REPO}\")\n\ntrackio.finish()\nprint(\"Done! Check your Trackio dashboard for training curves.\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  },
+  "colab": {
+   "provenance": [],
+   "gpuType": "A100"
+  },
+  "accelerator": "GPU"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

trainer/mock_env.py CHANGED Viewed

@@ -135,6 +135,22 @@ def apply_fold_mock(state: PaperState, fold: dict) -> tuple[PaperState, str | No
     if fold_type not in ("valley", "mountain"):
         return state, f"Unknown fold type: {fold_type}"
     if not (0 < angle_deg <= 180):
         return state, f"Angle must be in (0, 180], got {angle_deg}"

     if fold_type not in ("valley", "mountain"):
         return state, f"Unknown fold type: {fold_type}"
+    # angle=0 means "no fold" — return unchanged copy
+    if angle_deg == 0:
+        return PaperState(
+            vertices=state.vertices.copy(), edges=state.edges.copy(),
+            faces=[f[:] for f in state.faces],
+            assignments=state.assignments[:], fold_angles=state.fold_angles.copy(),
+            rest_lengths=state.rest_lengths.copy(), strain=state.strain.copy(),
+            energy=state.energy, face_orders=state.face_orders[:],
+            num_layers=state.num_layers, material=state.material,
+            bounding_box=state.bounding_box.copy(),
+            deployment_ratio=state.deployment_ratio, is_valid=state.is_valid,
+            kawasaki_violation=state.kawasaki_violation,
+            maekawa_violation=state.maekawa_violation,
+            self_intersections=state.self_intersections,
+        ), None
     if not (0 < angle_deg <= 180):
         return state, f"Angle must be in (0, 180], got {angle_deg}"

trainer/prompts.py CHANGED Viewed

@@ -1,49 +1,99 @@
 """
 Prompt templates for origami fold strategy generation.
-The LLM receives a task description and paper state, then generates
-a fold_strategy(paper_state) function that returns fold operations.
 """
 SYSTEM_PROMPT = """\
-You are an origami engineer. You design fold patterns for real-world applications \
-like solar panel packing, deployable shelters, and medical stents.
-You will be given a folding task with material constraints. Write a Python function \
-`fold_strategy(paper_state)` that returns a list of fold operations to achieve the goal.
 Rules:
 - Only use native Python (no imports except math, itertools, functools)
 - Each fold: {"type": "valley"|"mountain", "line": {"start": [x,y], "end": [x,y]}, "angle": 0-180}
-- Fold lines must intersect the paper boundaries
 - Fewer folds is better (efficiency matters)
-- Respect material strain limits
-- Output ONLY the function in ```python ... ``` backticks\
 """
 TASK_TEMPLATES = {
     "half_fold": {
         "name": "half_fold",
         "prompt": """\
 TASK: Fold a {width}m x {height}m {material} sheet in half to minimize one dimension.
 MATERIAL: {material} (thickness: {thickness_mm}mm, max strain: {max_strain_pct}%)
 CONSTRAINTS: Maximum {max_folds} fold operations.
-TARGET: Deployment ratio <= 0.5 (folded area is half or less of original)
-CURRENT STATE:
-  Sheet: {width}m x {height}m, flat (0 folds applied)
-  Bounding box: {width}m x {height}m x 0.0m
-Write a fold_strategy(paper_state) function that returns a list of fold operations.
-Each fold: {{"type": "valley"|"mountain", "line": {{"start": [x,y], "end": [x,y]}}, "angle": 0-180}}
-```python
-def fold_strategy(paper_state):
-    # Your code here
-    return [...]
-```""",
         "target_ratio": 0.5,
         "max_folds": 3,
     },
@@ -53,21 +103,14 @@ def fold_strategy(paper_state):
         "prompt": """\
 TASK: Fold a {width}m x {height}m {material} sheet into thirds (like a letter).
 MATERIAL: {material} (thickness: {thickness_mm}mm, max strain: {max_strain_pct}%)
 CONSTRAINTS: Maximum {max_folds} fold operations.
-TARGET: Deployment ratio <= 0.33
-CURRENT STATE:
-  Sheet: {width}m x {height}m, flat (0 folds applied)
-Write a fold_strategy(paper_state) function that returns a list of fold operations.
-Each fold: {{"type": "valley"|"mountain", "line": {{"start": [x,y], "end": [x,y]}}, "angle": 0-180}}
-```python
-def fold_strategy(paper_state):
-    # Your code here
-    return [...]
-```""",
         "target_ratio": 0.33,
         "max_folds": 5,
     },
@@ -78,30 +121,22 @@ def fold_strategy(paper_state):
 TASK: Fold a {width}m x {height}m Mylar sheet to minimize packed volume for a solar panel.
 The folded panel must be deployable (unfold cleanly to near-original area).
 MATERIAL: Mylar (thickness: 0.05mm, Young's modulus: 4 GPa, max strain: 3%)
 CONSTRAINTS:
   - Maximum {max_folds} fold operations
   - Must pack into bounding box <= 15cm x 15cm x 5cm
-  - Must deploy to >= 80% of original area
   - No self-intersections
-TARGET: Deployment ratio <= 0.05 (95% volume reduction)
-CURRENT STATE:
-  Sheet: {width}m x {height}m, flat (0 folds applied)
-  Bounding box: {width}m x {height}m x 0.0m
-HINT: Consider tessellated patterns like Miura-ori — alternating mountain and valley
-folds in a grid create a highly compact, single-DOF deployable structure.
-Write a fold_strategy(paper_state) function that returns a list of fold operations.
-Each fold: {{"type": "valley"|"mountain", "line": {{"start": [x,y], "end": [x,y]}}, "angle": 0-180}}
-```python
-def fold_strategy(paper_state):
-    # Your code here
-    return [...]
-```""",
         "target_ratio": 0.05,
         "max_folds": 20,
     },
@@ -111,29 +146,26 @@ def fold_strategy(paper_state):
         "prompt": """\
 TASK: Fold a {width}m x {height}m Nitinol sheet into a compact cylinder for a medical stent.
 MATERIAL: Nitinol (thickness: 0.1mm, Young's modulus: 75 GPa, max strain: 8%)
 CONSTRAINTS:
   - Maximum {max_folds} fold operations
-  - Compressed diameter: 3mm
-  - Deployed diameter: 10mm
-  - Must be radially deployable
-TARGET: Minimize packed cross-section while maintaining deployability.
-Write a fold_strategy(paper_state) function that returns a list of fold operations.
-```python
-def fold_strategy(paper_state):
-    # Your code here
-    return [...]
-```""",
         "target_ratio": 0.1,
         "max_folds": 15,
     },
 }
-# Default task configs for each level
 TASK_CONFIGS = {
     "half_fold": {
         "width": 1.0, "height": 1.0, "material": "paper",
@@ -158,6 +190,16 @@ def build_prompt(task_name: str = "half_fold", **overrides) -> str:
     """Build a complete user prompt for a given task."""
     task = TASK_TEMPLATES[task_name]
     config = {**TASK_CONFIGS[task_name], **overrides}
     return task["prompt"].format(**config)

 """
 Prompt templates for origami fold strategy generation.
+Inspired by SpatialThinker (arXiv 2511.07403): the model must produce
+a structured spatial representation BEFORE generating code.
+Output format (4 stages):
+  <observe>  — Describe the paper geometry and constraints
+  <plan>     — Structured fold plan with coordinates and reasoning
+  <code>     — The fold_strategy() function
+  <verify>   — Predict expected outcome (deployment ratio, fold count)
+Dense rewards check each stage independently, not just code execution.
 """
+# ---------------------------------------------------------------------------
+# System prompt — defines the structured output format
+# ---------------------------------------------------------------------------
 # Keep this text in sync with trainer/rewards.py:
 #   - the four section tags below are the names listed in REQUIRED_TAGS;
 #   - consistency_reward reads the "deployment ratio:" and "fold count:"
 #     lines of <verify> with regexes, so that wording must stay parseable.
 # The JSON example uses single braces, unlike text that goes through
 # str.format() in build_prompt, so formatting this string would fail.
 # NOTE(review): confirm no caller applies str.format() to SYSTEM_PROMPT.
 SYSTEM_PROMPT = """\
+You are an origami engineer specializing in computational fold design.
+You solve folding tasks by reasoning spatially about paper geometry.
+You MUST respond in exactly this 4-stage format:
+<observe>
+Describe the paper: dimensions, material, coordinate system.
+Identify key geometric features (center, edges, diagonals, symmetry axes).
+Note constraints (max strain, max folds, target ratio).
+</observe>
+<plan>
+{
+  "strategy": "description of overall approach",
+  "folds": [
+    {
+      "description": "what this fold does",
+      "type": "valley or mountain",
+      "line_start": [x, y],
+      "line_end": [x, y],
+      "angle": 180,
+      "reasoning": "why these coordinates"
+    }
+  ],
+  "expected_ratio": 0.5,
+  "expected_folds": 1
+}
+</plan>
+<code>
+```python
+def fold_strategy(paper_state):
+    # Implementation matching the plan above
+    return [...]
+```
+</code>
+<verify>
+Expected deployment ratio: X.XX
+Expected fold count: N
+Expected max strain: X.XXXX
+Potential issues: ...
+</verify>
 Rules:
 - Only use native Python (no imports except math, itertools, functools)
 - Each fold: {"type": "valley"|"mountain", "line": {"start": [x,y], "end": [x,y]}, "angle": 0-180}
+- Fold lines must cross the paper boundary (intersect at least 2 edges)
+- Valley = fold toward you (+Z), Mountain = fold away (-Z)
+- angle=180 = fully folded, smaller = partial fold
+- Each fold changes the geometry — later folds operate on already-folded paper
 - Fewer folds is better (efficiency matters)
+- Respect material strain limits\
 """
+# ---------------------------------------------------------------------------
+# Task templates — each includes spatial context
+# ---------------------------------------------------------------------------
 TASK_TEMPLATES = {
     "half_fold": {
         "name": "half_fold",
         "prompt": """\
 TASK: Fold a {width}m x {height}m {material} sheet in half to minimize one dimension.
+PAPER GEOMETRY:
+  Corners: (0,0), ({width},0), ({width},{height}), (0,{height})
+  Center: ({cx},{cy})
+  Horizontal midline: y={cy} from (0,{cy}) to ({width},{cy})
+  Vertical midline: x={cx} from ({cx},0) to ({cx},{height})
+  Diagonals: (0,0)→({width},{height}) and ({width},0)→(0,{height})
 MATERIAL: {material} (thickness: {thickness_mm}mm, max strain: {max_strain_pct}%)
 CONSTRAINTS: Maximum {max_folds} fold operations.
+TARGET: Deployment ratio <= 0.5""",
         "target_ratio": 0.5,
         "max_folds": 3,
     },
         "prompt": """\
 TASK: Fold a {width}m x {height}m {material} sheet into thirds (like a letter).
+PAPER GEOMETRY:
+  Corners: (0,0), ({width},0), ({width},{height}), (0,{height})
+  Third lines: y={t1:.4f} and y={t2:.4f}
+  Center: ({cx},{cy})
 MATERIAL: {material} (thickness: {thickness_mm}mm, max strain: {max_strain_pct}%)
 CONSTRAINTS: Maximum {max_folds} fold operations.
+TARGET: Deployment ratio <= 0.33""",
         "target_ratio": 0.33,
         "max_folds": 5,
     },
 TASK: Fold a {width}m x {height}m Mylar sheet to minimize packed volume for a solar panel.
 The folded panel must be deployable (unfold cleanly to near-original area).
+PAPER GEOMETRY:
+  Corners: (0,0), ({width},0), ({width},{height}), (0,{height})
+  Center: ({cx},{cy})
+  Area: {area}m²
 MATERIAL: Mylar (thickness: 0.05mm, Young's modulus: 4 GPa, max strain: 3%)
 CONSTRAINTS:
   - Maximum {max_folds} fold operations
   - Must pack into bounding box <= 15cm x 15cm x 5cm
   - No self-intersections
+TARGET: Deployment ratio <= 0.05 (95% area reduction)
+HINT: Tessellated patterns (alternating M/V folds in a grid) achieve high
+compaction with single-DOF deployment. Consider dividing the sheet into
+a regular grid of panels.""",
         "target_ratio": 0.05,
         "max_folds": 20,
     },
         "prompt": """\
 TASK: Fold a {width}m x {height}m Nitinol sheet into a compact cylinder for a medical stent.
+PAPER GEOMETRY:
+  Corners: (0,0), ({width},0), ({width},{height}), (0,{height})
+  Center: ({cx},{cy})
 MATERIAL: Nitinol (thickness: 0.1mm, Young's modulus: 75 GPa, max strain: 8%)
 CONSTRAINTS:
   - Maximum {max_folds} fold operations
+  - Compressed diameter: 3mm, Deployed diameter: 10mm
+TARGET: Deployment ratio <= 0.1""",
         "target_ratio": 0.1,
         "max_folds": 15,
     },
 }
+# ---------------------------------------------------------------------------
+# Config and builders
+# ---------------------------------------------------------------------------
 TASK_CONFIGS = {
     "half_fold": {
         "width": 1.0, "height": 1.0, "material": "paper",
     """Build a complete user prompt for a given task."""
     task = TASK_TEMPLATES[task_name]
     config = {**TASK_CONFIGS[task_name], **overrides}
+    # Add computed geometry values
+    w = config["width"]
+    h = config["height"]
+    config["cx"] = w / 2
+    config["cy"] = h / 2
+    config["area"] = w * h
+    config["t1"] = h / 3
+    config["t2"] = 2 * h / 3
     return task["prompt"].format(**config)

trainer/rewards.py CHANGED Viewed

@@ -1,17 +1,22 @@
 """
 Reward functions for origami GRPO training.
-Three reward functions following the 2048 pattern:
-  1. code_valid     — Does the generated code parse and produce fold instructions?
-  2. physically_valid — Are the folds geometrically/physically valid?
-  3. fold_quality   — How good is the folding solution (compactness, efficiency)?
-Lexicographic gating (from SpatialThinker): if code doesn't parse,
-all downstream rewards are 0. This prevents reward hacking.
 """
 import ast
 import sys
 import math
 import traceback
 from typing import Callable
@@ -60,23 +65,57 @@ except ImportError:
 # ---------------------------------------------------------------------------
 def extract_function(text: str) -> str | None:
-    """Extract a Python function from triple-backtick code blocks."""
-    if text.count("```") < 2:
         return None
-    first = text.find("```") + 3
-    second = text.find("```", first)
-    fx = text[first:second].strip()
-    fx = fx.removeprefix("python\n").removeprefix("python\r\n")
     # Find the def statement
-    def_idx = fx.find("def ")
     if def_idx == -1:
         return None
-    fx = fx[def_idx:]
     if fx.startswith("def fold_strategy("):
         return fx
     return None
 def check_imports_stdlib_only(code: str) -> tuple[bool, str]:
     """Check that code only imports from Python stdlib."""
     try:
@@ -386,3 +425,289 @@ def fold_quality(completions, **kwargs) -> list[float]:
             scores.append(-3.0)
     return scores

 """
 Reward functions for origami GRPO training.
+SpatialThinker-style dense rewards (arXiv 2511.07403):
+  1. format_reward     (0.10) — All 4 tags present, valid JSON plan, valid function
+  2. spatial_reward    (0.20) — Fold coordinates in plan are within bounds, lines valid
+  3. execution_reward  (0.50) — Physical validity + fold quality (code execution)
+  4. consistency_reward(0.20) — Plan matches code, verify matches actual results
+Plus legacy rewards for backwards compatibility:
+  - code_valid, physically_valid, fold_quality
+Lexicographic gating: if code doesn't parse, downstream rewards are 0.
 """
 import ast
+import re
 import sys
+import json
 import math
 import traceback
 from typing import Callable
 # ---------------------------------------------------------------------------
 def extract_function(text: str) -> str | None:
+    """Extract fold_strategy() from <code> blocks or triple-backtick code blocks."""
+    # Try <code> block first (SpatialThinker format)
+    code_match = re.search(r'<code>(.*?)</code>', text, re.DOTALL)
+    if code_match:
+        code_block = code_match.group(1).strip()
+    elif text.count("```") >= 2:
+        first = text.find("```") + 3
+        second = text.find("```", first)
+        code_block = text[first:second].strip()
+    else:
         return None
+    code_block = code_block.removeprefix("```python\n").removeprefix("```python\r\n")
+    code_block = code_block.removeprefix("python\n").removeprefix("python\r\n")
+    code_block = code_block.rstrip("`").strip()
     # Find the def statement
+    def_idx = code_block.find("def ")
     if def_idx == -1:
         return None
+    fx = code_block[def_idx:]
     if fx.startswith("def fold_strategy("):
         return fx
     return None
+def extract_section(text: str, tag: str) -> str | None:
+    """Extract content between <tag>...</tag>."""
+    match = re.search(rf'<{tag}>(.*?)</{tag}>', text, re.DOTALL)
+    return match.group(1).strip() if match else None
def extract_plan_json(text: str) -> dict | None:
    """
    Extract and parse the JSON fold plan from the <plan> block.

    The whole section is parsed first. If that is not valid JSON, the span
    from the first "{" to the last "}" is tried, which tolerates prose or a
    code fence wrapped around the object.

    Returns:
        The plan as a dict, or None when the section is missing or empty,
        cannot be parsed, or parses to something other than a JSON object.
    """
    plan_text = extract_section(text, "plan")
    if not plan_text:
        return None

    try:
        parsed = json.loads(plan_text)
    except json.JSONDecodeError:
        brace_start = plan_text.find("{")
        brace_end = plan_text.rfind("}")
        if brace_start < 0 or brace_end <= brace_start:
            return None
        try:
            parsed = json.loads(plan_text[brace_start:brace_end + 1])
        except json.JSONDecodeError:
            return None

    # json.loads also accepts arrays, strings, numbers and literals; callers
    # use dict methods on the result, so anything else counts as "no plan".
    return parsed if isinstance(parsed, dict) else None
 def check_imports_stdlib_only(code: str) -> tuple[bool, str]:
     """Check that code only imports from Python stdlib."""
     try:
             scores.append(-3.0)
     return scores
+# ---------------------------------------------------------------------------
+# SpatialThinker Dense Rewards (weight 0.10 + 0.20 + 0.50 + 0.20 = 1.0)
+# ---------------------------------------------------------------------------
REQUIRED_TAGS = ["observe", "plan", "code", "verify"]


def format_reward(completions, **kwargs) -> list[float]:
    """
    SpatialThinker format reward (weight: 0.10).

    Scores structural compliance only; nothing is executed:
      - 0.15 per required tag pair present (0.60 for all four)
      - 0.20 if <plan> holds a JSON object
      - 0.05 more if that object has a list under "folds"
      - 0.15 if a fold_strategy() function can be extracted

    Args:
        completions: One entry per sample; the response text is read from
            ``completion[0]["content"]``.
        **kwargs: Accepted and ignored.

    Returns:
        One score per completion, in [0.0, 1.0].
    """
    scores = []
    for completion in completions:
        response = completion[0]["content"]

        tags_present = sum(
            extract_section(response, tag) is not None for tag in REQUIRED_TAGS
        )
        score = 0.15 * tags_present

        plan = extract_plan_json(response)
        # Only a JSON object is a usable plan; checking the type here keeps
        # a string or number in <plan> from raising on the key lookup.
        if isinstance(plan, dict):
            score += 0.20
            if isinstance(plan.get("folds"), list):
                score += 0.05

        if extract_function(response) is not None:
            score += 0.15

        # The parts sum to 1.0 on paper; the clamp keeps float rounding from
        # nudging a perfect response past the documented upper bound.
        scores.append(min(1.0, score))
    return scores
def spatial_reward(completions, **kwargs) -> list[float]:
    """
    SpatialThinker spatial plan quality reward (weight: 0.20).

    Inspects the fold list in <plan> without executing any code. Each fold
    earns 0.25 for each of four checks:
      - "type" is "valley" or "mountain"
      - "line_start" and "line_end" are both [x, y] pairs of numbers
      - both endpoints lie within the sheet (0.01 tolerance)
      - the endpoints are more than 0.01 apart (not a degenerate line)
    A fold counts as valid when it earns more than 0.5, i.e. passes at least
    three checks. The score is the fraction of valid folds, plus 0.1 when
    "expected_ratio" is a number in (0, 1], capped at 1.0.

    Whether a line actually crosses the sheet is not checked here.
    Malformed plan content (wrong types, missing keys) lowers the score
    instead of raising.

    Args:
        completions: One entry per sample; the response text is read from
            ``completion[0]["content"]``.
        **kwargs: Accepted and ignored.

    Returns:
        One score per completion, in [0.0, 1.0].
    """
    w = _current_task["width"]
    h = _current_task["height"]
    tol = 0.01  # slack allowed on the sheet bounds

    def _is_number(value) -> bool:
        # bool is a subclass of int; JSON true/false is not a measurement.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def _is_point(value) -> bool:
        return (
            isinstance(value, list)
            and len(value) == 2
            and all(_is_number(coord) for coord in value)
        )

    scores = []
    for completion in completions:
        response = completion[0]["content"]
        plan = extract_plan_json(response)
        folds = plan.get("folds") if isinstance(plan, dict) else None
        if not isinstance(folds, list) or not folds:
            scores.append(0.0)
            continue

        valid_folds = 0
        for fold in folds:
            if not isinstance(fold, dict):
                continue  # still counted in the denominator below
            fold_score = 0.0
            if fold.get("type") in ("valley", "mountain"):
                fold_score += 0.25
            start = fold.get("line_start")
            end = fold.get("line_end")
            if _is_point(start) and _is_point(end):
                fold_score += 0.25
                sx, sy = start
                ex, ey = end
                in_bounds = (
                    -tol <= sx <= w + tol and -tol <= sy <= h + tol and
                    -tol <= ex <= w + tol and -tol <= ey <= h + tol
                )
                if in_bounds:
                    fold_score += 0.25
                if math.hypot(ex - sx, ey - sy) > 0.01:
                    fold_score += 0.25
            if fold_score > 0.5:
                valid_folds += 1

        score = valid_folds / len(folds)

        expected = plan.get("expected_ratio")
        if _is_number(expected) and 0.0 < expected <= 1.0:
            score += 0.1
        scores.append(min(1.0, score))
    return scores
def execution_reward(completions, **kwargs) -> list[float]:
    """
    SpatialThinker execution/accuracy reward (weight: 0.50).

    Runs the extracted fold_strategy() against the current task and combines
    three parts into one score:
      - physical validity, starting at 0.3 and reduced for Kawasaki/Maekawa
        violations, self-intersections and strain above the material limit
      - quality, 0.5 * (1 - deployment_ratio)
      - target bonus, 0.2 when deployment_ratio meets the task target

    The score is 0.0 when no function can be extracted, the import check
    fails, the function cannot be built, execution reports an error or
    applies no folds, any step raises, or the combined score is not finite.

    Args:
        completions: One entry per sample; the response text is read from
            ``completion[0]["content"]``.
        **kwargs: Accepted and ignored.

    Returns:
        One score per completion, in [0.0, 1.0].
    """
    scores = []
    for completion in completions:
        response = completion[0]["content"]
        function_code = extract_function(response)

        # Gate: no function → 0
        if function_code is None:
            scores.append(0.0)
            continue

        ok, _ = check_imports_stdlib_only(function_code)
        if not ok:
            scores.append(0.0)
            continue

        try:
            strategy_fn = create_sandboxed_function(function_code)
        except Exception:
            scores.append(0.0)
            continue

        # Deliberately broad: a failure anywhere in generated code or the
        # simulation must score 0 rather than abort the training step.
        try:
            paper = _create_sheet(
                _current_task["width"],
                _current_task["height"],
                _current_task["material"],
            )
            # NOTE(review): same object as `paper`; this assumes
            # execute_fold_strategy returns a new state and leaves its
            # input untouched — confirm.
            original = paper
            final_state, applied, error = execute_fold_strategy(
                strategy_fn, paper, _current_task["max_folds"]
            )
            if error or len(applied) == 0:
                scores.append(0.0)
                continue

            val = validate_paper(final_state)
            metrics = compute_metrics(final_state, original)
            deploy_ratio = metrics.get("deployment_ratio", 1.0)
            max_strain = metrics.get("max_strain", 0.0)

            # Physical validity component (0-0.3)
            phys = 0.3
            if not val.is_valid:
                phys -= 0.1 * val.kawasaki_violation
                phys -= 0.1 * val.maekawa_violation
                if val.self_intersection_count > 0:
                    phys -= 0.15
            mat_limit = _current_task["material"].max_strain
            if max_strain > mat_limit:
                phys -= 0.05
            phys = max(0.0, phys)

            # Quality component (0-0.5 for ratios in [0, 1])
            quality = 0.5 * (1.0 - deploy_ratio)

            # Target bonus (0 or 0.2)
            target = 0.2 if deploy_ratio <= _current_task["target_ratio"] else 0.0

            score = phys + quality + target
            if not math.isfinite(score):
                # min(1.0, nan) evaluates to 1.0, which would hand a NaN
                # metric the maximum reward.
                scores.append(0.0)
                continue
            # A ratio above 1 makes `quality` negative; clamp both ends so
            # the documented range holds.
            scores.append(max(0.0, min(1.0, score)))
        except Exception:
            scores.append(0.0)
    return scores
def consistency_reward(completions, **kwargs) -> list[float]:
    """
    SpatialThinker consistency reward (weight: 0.20).

    Checks that <plan> agrees with <code>, and <verify> with the outcome of
    actually running the code:
      - plan fold count vs. applied fold count: 0.4 exact, 0.2 if off by one
      - predicted deployment ratio vs. actual: 0.4 within 0.05,
        0.2 within 0.15, 0.1 within 0.3 (absolute difference)
      - predicted fold count vs. applied: 0.2 exact, 0.1 if off by one

    The score is 0.0 when the plan or the function is missing, the import
    check fails, execution reports an error or applies no folds, or any
    step raises.

    Args:
        completions: One entry per sample; the response text is read from
            ``completion[0]["content"]``.
        **kwargs: Accepted and ignored.

    Returns:
        One score per completion, in [0.0, 1.0].
    """
    scores = []
    for completion in completions:
        response = completion[0]["content"]
        plan = extract_plan_json(response)
        verify = extract_section(response, "verify")
        function_code = extract_function(response)

        # Need at least a plan object and code to compare anything.
        if not isinstance(plan, dict) or function_code is None:
            scores.append(0.0)
            continue

        # Same gate as execution_reward: code that fails the import check
        # is never executed here either.
        ok, _ = check_imports_stdlib_only(function_code)
        if not ok:
            scores.append(0.0)
            continue

        plan_folds = plan.get("folds", [])
        # A "folds" value that is not a list describes no countable folds.
        plan_count = len(plan_folds) if isinstance(plan_folds, list) else 0

        score = 0.0
        # Deliberately broad: a failure anywhere in generated code or the
        # simulation must score 0 rather than abort the training step.
        try:
            strategy_fn = create_sandboxed_function(function_code)
            paper = _create_sheet(
                _current_task["width"],
                _current_task["height"],
                _current_task["material"],
            )
            # NOTE(review): same object as `paper`; this assumes
            # execute_fold_strategy returns a new state and leaves its
            # input untouched — confirm.
            original = paper
            final_state, applied, error = execute_fold_strategy(
                strategy_fn, paper, _current_task["max_folds"]
            )
            if error or len(applied) == 0:
                scores.append(0.0)
                continue
            actual_count = len(applied)

            # 1. Plan fold count vs code fold count (0.4)
            if plan_count == actual_count:
                score += 0.4
            elif abs(plan_count - actual_count) <= 1:
                score += 0.2

            # 2. Verify predictions vs actual (0.6)
            if verify:
                metrics = compute_metrics(final_state, original)
                actual_ratio = metrics.get("deployment_ratio", 1.0)

                # Capture one well-formed decimal only, so a sentence-ending
                # period ("ratio: 0.5.") is not swallowed into the number
                # and then rejected by float().
                ratio_match = re.search(
                    r'deployment\s*ratio[:\s]*(\d+(?:\.\d+)?|\.\d+)',
                    verify, re.IGNORECASE)
                if ratio_match:
                    predicted_ratio = float(ratio_match.group(1))
                    ratio_error = abs(predicted_ratio - actual_ratio)
                    if ratio_error < 0.05:
                        score += 0.4
                    elif ratio_error < 0.15:
                        score += 0.2
                    elif ratio_error < 0.3:
                        score += 0.1

                count_match = re.search(
                    r'fold\s*count[:\s]*(\d+)', verify, re.IGNORECASE)
                if count_match:
                    predicted_count = int(count_match.group(1))
                    if predicted_count == actual_count:
                        score += 0.2
                    elif abs(predicted_count - actual_count) <= 1:
                        score += 0.1
        except Exception:
            scores.append(0.0)
            continue

        scores.append(min(1.0, score))
    return scores

trainer/train.py CHANGED Viewed

@@ -19,7 +19,10 @@ if PROJECT_ROOT not in sys.path:
     sys.path.insert(0, PROJECT_ROOT)
 from trainer.prompts import build_prompt, SYSTEM_PROMPT, get_task_target_ratio, get_task_max_folds
-from trainer.rewards import code_valid, physically_valid, fold_quality, set_task_config
 try:
     from engine.materials import get_material
@@ -167,14 +170,18 @@ def main():
     # ========================================================================
     # 6. Create trainer and start training
     # ========================================================================
     trainer = GRPOTrainer(
         model=model,
         processing_class=tokenizer,
         reward_funcs=[
-            code_valid,          # Reward 1: valid Python?
-            physically_valid,    # Reward 2: physically possible folds?
-            fold_quality,        # Reward 3: how good is the solution?
         ],
         args=training_args,
         train_dataset=dataset,
     )

     sys.path.insert(0, PROJECT_ROOT)
 from trainer.prompts import build_prompt, SYSTEM_PROMPT, get_task_target_ratio, get_task_max_folds
+from trainer.rewards import (
+    code_valid, physically_valid, fold_quality, set_task_config,
+    format_reward, spatial_reward, execution_reward, consistency_reward,
+)
 try:
     from engine.materials import get_material
     # ========================================================================
     # 6. Create trainer and start training
     # ========================================================================
+    # SpatialThinker dense rewards (weighted: 0.10 + 0.20 + 0.50 + 0.20)
+    # These replace the legacy 3-reward setup with structured spatial reasoning
     trainer = GRPOTrainer(
         model=model,
         processing_class=tokenizer,
         reward_funcs=[
+            format_reward,       # 0.10 — 4-stage format compliance
+            spatial_reward,      # 0.20 — fold plan geometric validity
+            execution_reward,    # 0.50 — code execution + physical quality
+            consistency_reward,  # 0.20 — plan↔code↔verify agreement
         ],
+        reward_weights=[0.10, 0.20, 0.50, 0.20],
         args=training_args,
         train_dataset=dataset,
     )