Spaces:
Running
Running
Commit ·
aa44758
1
Parent(s): 61b2b6b
Add GRPO trainer scaffold with mock environment
Browse files- trainer/mock_env.py: PaperState, flat sheet creation, mock fold execution
- trainer/prompts.py: system prompt + 4 task templates (half_fold → stent_fold)
- trainer/rewards.py: 3 reward functions (code_valid, physically_valid, fold_quality)
- trainer/train.py: full GRPO training script (Unsloth + TRL + Trackio)
- RESEARCH_NOTES.md: comprehensive research notes from 10 sources
- .gitignore: exclude .reference/ directory
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- .gitignore +3 -0
- RESEARCH_NOTES.md +1053 -0
- trainer/__init__.py +0 -0
- trainer/mock_env.py +233 -0
- trainer/prompts.py +169 -0
- trainer/rewards.py +340 -0
- trainer/train.py +201 -0
.gitignore
CHANGED
|
@@ -21,3 +21,6 @@
|
|
| 21 |
npm-debug.log*
|
| 22 |
yarn-debug.log*
|
| 23 |
yarn-error.log*
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
npm-debug.log*
|
| 22 |
yarn-debug.log*
|
| 23 |
yarn-error.log*
|
| 24 |
+
|
| 25 |
+
# Reference repos (not pushed to HF)
|
| 26 |
+
.reference/
|
RESEARCH_NOTES.md
ADDED
|
@@ -0,0 +1,1053 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Optigami Research Notes
|
| 2 |
+
|
| 3 |
+
Comprehensive notes on all sources, tools, and architecture for the Optigami project.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Table of Contents
|
| 8 |
+
|
| 9 |
+
1. [Project Architecture Overview](#1-project-architecture-overview)
|
| 10 |
+
2. [Paper: OrigamiSpace (2511.18450)](#2-paper-origamispace-251118450)
|
| 11 |
+
3. [Paper: SpatialThinker (2511.07403)](#3-paper-spatialthinker-251107403)
|
| 12 |
+
4. [Paper: Automating Rigid Origami Design (2211.13219)](#4-paper-automating-rigid-origami-design-221113219)
|
| 13 |
+
5. [Tool: FOLD Format (edemaine/fold)](#5-tool-fold-format)
|
| 14 |
+
6. [Tool: Origami Simulator](#6-tool-origami-simulator)
|
| 15 |
+
7. [Tool: GamiBench](#7-tool-gamibench)
|
| 16 |
+
8. [Tool: SpatialThinker Codebase](#8-tool-spatialthinker-codebase)
|
| 17 |
+
9. [Tool: Trackio](#9-tool-trackio)
|
| 18 |
+
10. [Tool: Unsloth + GRPO Training](#10-tool-unsloth--grpo-training)
|
| 19 |
+
11. [Unsloth ART / GRPO Trainer Plan](#11-unsloth-art--grpo-trainer-plan)
|
| 20 |
+
12. [Current Project State](#12-current-project-state)
|
| 21 |
+
|
| 22 |
+
---
|
| 23 |
+
|
| 24 |
+
## 1. Project Architecture Overview
|
| 25 |
+
|
| 26 |
+
```
|
| 27 |
+
+---------------------------------------------------+
|
| 28 |
+
| OpenEnv Server |
|
| 29 |
+
| +-----------+ +----------+ +--------------+ |
|
| 30 |
+
| | State | | Action | | Reward | |
|
| 31 |
+
| | (FOLD JSON| | (LLM | | (Dense, | |
|
| 32 |
+
| | + target)| | output) | | verifiable) | |
|
| 33 |
+
| +-----------+ +----------+ +--------------+ |
|
| 34 |
+
| | | | |
|
| 35 |
+
| v v v |
|
| 36 |
+
| +-----------------------------------------------+|
|
| 37 |
+
| | Paper Geometry Engine (Python) ||
|
| 38 |
+
| | - Polygon state (Shapely) ||
|
| 39 |
+
| | - Fold operations (reflection across line) ||
|
| 40 |
+
| | - Kawasaki/Maekawa constraint checks ||
|
| 41 |
+
| | - Layer tracking ||
|
| 42 |
+
| | - FOLD format import/export ||
|
| 43 |
+
| +-----------------------------------------------+|
|
| 44 |
+
| | |
|
| 45 |
+
| v |
|
| 46 |
+
| +-----------------------------------------------+|
|
| 47 |
+
| | Three.js Visualizer (Demo only) ||
|
| 48 |
+
| | - 3D fold animation ||
|
| 49 |
+
| | - Strain heatmap ||
|
| 50 |
+
| | - Instruction stream ||
|
| 51 |
+
| +-----------------------------------------------+|
|
| 52 |
+
+---------------------------------------------------+
|
| 53 |
+
| ^
|
| 54 |
+
v |
|
| 55 |
+
+---------------------------------------------------+
|
| 56 |
+
| Unsloth ART / GRPO Trainer |
|
| 57 |
+
| - Qwen2.5-VL-7B or Qwen3-4B base model |
|
| 58 |
+
| - LoRA/QLoRA for efficient training |
|
| 59 |
+
| - Multi-turn rollouts |
|
| 60 |
+
+---------------------------------------------------+
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
**Three major components:**
|
| 64 |
+
1. **OpenEnv Server** - RL environment serving state/action/reward for origami folding
|
| 65 |
+
2. **Paper Geometry Engine** - Python-based origami math (Shapely polygons, fold reflections, constraint checking)
|
| 66 |
+
3. **Unsloth ART / GRPO Trainer** - RL fine-tuning of vision-language models for origami reasoning
|
| 67 |
+
|
| 68 |
+
**Current focus:** Unsloth ART / GRPO Trainer
|
| 69 |
+
|
| 70 |
+
---
|
| 71 |
+
|
| 72 |
+
## 2. Paper: OrigamiSpace (2511.18450)
|
| 73 |
+
|
| 74 |
+
**Title:** ORIGAMISPACE: Benchmarking Multimodal LLMs in Multi-Step Spatial Reasoning with Mathematical Constraints
|
| 75 |
+
**Authors:** Rui Xu, Dakuan Lu, Zicheng Zhao, Xiaoyu Tan, Xintao Wang, Siyu Yuan, Jiangjie Chen, Yinghui Xu
|
| 76 |
+
**Date:** November 23, 2025
|
| 77 |
+
**Venue:** arXiv (cs.AI)
|
| 78 |
+
|
| 79 |
+
### Dataset
|
| 80 |
+
- **350 primary instances** + 471 auxiliary (without folding processes)
|
| 81 |
+
- Each instance: CP diagram, compiled flat pattern, folding process (multi-step images), final 3D shape
|
| 82 |
+
- Complexity: Easy (3-9 steps), Medium (10-19), Hard (20-30), avg 8.2 steps
|
| 83 |
+
- **1,620 total questions** across 4 tasks
|
| 84 |
+
|
| 85 |
+
### Four Evaluation Tasks
|
| 86 |
+
|
| 87 |
+
| Task | Questions | Description |
|
| 88 |
+
|------|-----------|-------------|
|
| 89 |
+
| Pattern Prediction | 350 | CP diagram -> predict final 3D shape (multiple choice) |
|
| 90 |
+
| Multi-step Spatial Reasoning | 250 | Shuffled fold images -> correct chronological sequence |
|
| 91 |
+
| Spatial Relationship Prediction | 900 | 3 subtypes: pose localization, layering analysis, geometric change |
|
| 92 |
+
| End-to-End CP Code Generation | 120 | Flat layout + folded shape -> generate CP code |
|
| 93 |
+
|
| 94 |
+
### Compiler Architecture (Critical for OpenEnv)
|
| 95 |
+
Four-category error feedback system:
|
| 96 |
+
1. **CSE (CP Code Syntax Error):** Validates vertices, edges, faces, crease types; checks Euler's formula V-E+F=2
|
| 97 |
+
2. **GIF (Geometrically Impossible Fold):** Maekawa's theorem |M-V|=2, Kawasaki's theorem sum(alpha_i)=2pi, Big-Little-Big angle constraint
|
| 98 |
+
3. **PSI (Paper Self-Intersection):** Cyclic layering, collision detection (discrete + CCD), octrees/BVHs
|
| 99 |
+
4. **AFS (Ambiguous Folding State):** Multiple valid M/V assignments, non-unique stacking
|
| 100 |
+
|
| 101 |
+
### CP Code Evaluation (4 dimensions, 0.25 weight each)
|
| 102 |
+
1. **Topological Structure Similarity (TSS):** Vertex/edge/face count comparison, s_v = e^(-0.5|V_gen - V_ref| / min(V_gen, V_ref))
|
| 103 |
+
2. **Geometric Similarity (GS):** Hausdorff distance, s_p = e^(-5 * d_H), dihedral angle distribution, aspect ratio
|
| 104 |
+
3. **Constraint Satisfaction (CS):** Taco-Taco, Taco-Tortilla, transitivity, Maekawa/Kawasaki
|
| 105 |
+
4. **Final Folded State (FFS):** Shape similarity, layering comparison, stacking order
|
| 106 |
+
|
| 107 |
+
### Learning Approaches
|
| 108 |
+
- **In-Context Learning:** Single-pass, detailed instructions + examples
|
| 109 |
+
- **Environmental Learning:** Iterative model<->compiler loop, max 10 rounds, performance saturates after 8-10
|
| 110 |
+
- **Reinforcement Learning (TRICO/PPO-based):**
|
| 111 |
+
- Training data: 471 instances from environmental learning
|
| 112 |
+
- Model: Qwen2.5-VL-32B
|
| 113 |
+
- **Rewards:** Intermediate (success bonus + quality progress), step penalty, final evaluation score
|
| 114 |
+
- Result: RL-trained 32B exceeded 72B baseline
|
| 115 |
+
|
| 116 |
+
### Key Results
|
| 117 |
+
- Best closed-source: GPT-4o (42.71% pattern), Gemini2.5-pro (53.45% multi-step)
|
| 118 |
+
- Best open-source: Qwen2.5-VL-72B (36.29% pattern, 39.10% multi-step)
|
| 119 |
+
- Expert human: 98.45% pattern, 100% multi-step
|
| 120 |
+
- **Constraint satisfaction is the primary bottleneck** (~30% for top models)
|
| 121 |
+
- Human-model gap: 20-45 percentage points
|
| 122 |
+
|
| 123 |
+
### Relevance to Optigami
|
| 124 |
+
- **Direct blueprint for our OpenEnv server**: the compiler architecture with 4 error types is exactly what we need
|
| 125 |
+
- The CP code evaluation framework (TSS/GS/CS/FFS) can be our reward function
|
| 126 |
+
- Environmental learning approach maps to multi-turn rollouts in GRPO
|
| 127 |
+
- Confirms Qwen2.5-VL as viable base model (they used 32B, we target 7B)
|
| 128 |
+
|
| 129 |
+
---
|
| 130 |
+
|
| 131 |
+
## 3. Paper: SpatialThinker (2511.07403)
|
| 132 |
+
|
| 133 |
+
**Title:** SpatialThinker: Reinforcing 3D Reasoning in Multimodal LLMs via Spatial Rewards
|
| 134 |
+
**Authors:** Hunar Batra, Haoqin Tu, Hardy Chen, Yuanze Lin, Cihang Xie, Ronald Clark
|
| 135 |
+
**Date:** November 10, 2025
|
| 136 |
+
**Venue:** NeurIPS 2025 Workshops (SpaVLE, EWM, ARLET, SEA)
|
| 137 |
+
|
| 138 |
+
### Core Innovation
|
| 139 |
+
Dense spatial rewards + GRPO for training Qwen2.5-VL on spatial reasoning tasks. Key insight: **sparse rewards lead to reward hacking; dense multi-objective rewards with lexicographic gating prevent this.**
|
| 140 |
+
|
| 141 |
+
### GRPO Training Configuration
|
| 142 |
+
- **Rollouts:** 8 samples per query, temperature 1.0
|
| 143 |
+
- **Batch size:** rollout=512, global=128
|
| 144 |
+
- **Training:** 75 steps (~5 episodes)
|
| 145 |
+
- **Hardware:** 4x NVIDIA H100 80GB
|
| 146 |
+
- **Time:** ~13h (3B), ~15h (7B)
|
| 147 |
+
- **Advantage:** A(i) = (r(i) - mu) / (sigma + epsilon), epsilon=1e-6
|
| 148 |
+
- **Loss:** PPO-style with clip(epsilon_l=0.2, epsilon_h=0.3), KL penalty beta=0.01
|
| 149 |
+
|
| 150 |
+
### Dense Spatial Reward Design (CRITICAL - template for our rewards)
|
| 151 |
+
|
| 152 |
+
**4-component reward with lexicographic gating:**
|
| 153 |
+
|
| 154 |
+
```
|
| 155 |
+
R_total = I[R_format=1] * (w_format*R_f + w_count*R_c + w_accuracy*R_a + I[R_accuracy=1]*w_spatial*R_s)
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
| Component | Weight | Description |
|
| 159 |
+
|-----------|--------|-------------|
|
| 160 |
+
| Format (R_f) | 0.1 | JSON-parseable scene graph with required fields |
|
| 161 |
+
| Count (R_c) | 0.2 | Penalizes deviation in object/relation counts (lambda_obj=0.7, lambda_rel=0.3) |
|
| 162 |
+
| Accuracy (R_a) | 0.5 | Binary exact string match |
|
| 163 |
+
| Spatial (R_s) | 0.2 | Hungarian matching with CIoU, activated ONLY when answer correct |
|
| 164 |
+
|
| 165 |
+
**Lexicographic gating is essential:** format compliance gates all rewards; spatial rewards only activate on correct answers. Without gating, severe reward hacking occurs (74.9% -> 23.7% with naive spatial rewards).
|
| 166 |
+
|
| 167 |
+
### STVQA-7K Dataset
|
| 168 |
+
- 7,587 spatial VQA pairs from Visual Genome scene graphs
|
| 169 |
+
- Generated by Claude Sonnet, validated by GPT-4o pass@2
|
| 170 |
+
- 9 spatial categories, 34 additional spatial predicates beyond standard VG150
|
| 171 |
+
- 90/10 train/val split
|
| 172 |
+
|
| 173 |
+
### Key Results
|
| 174 |
+
- SpatialThinker-7B surpasses GPT-4o on 3DSRBench by +12.1%
|
| 175 |
+
- Dense reward RL: +7.2% avg across 12 benchmarks (1.8x the +4.0% from sparse GRPO)
|
| 176 |
+
- Outperforms models trained on millions of samples (trained on only 7K)
|
| 177 |
+
|
| 178 |
+
### Relevance to Optigami
|
| 179 |
+
- **Direct template for our GRPO training pipeline**
|
| 180 |
+
- Dense reward design with lexicographic gating prevents reward hacking
|
| 181 |
+
- Proves Qwen2.5-VL-7B is excellent base for spatial reasoning RL
|
| 182 |
+
- veRL/EasyR1 framework for training infrastructure
|
| 183 |
+
- Shows 7K samples sufficient for strong results
|
| 184 |
+
|
| 185 |
+
---
|
| 186 |
+
|
| 187 |
+
## 4. Paper: Automating Rigid Origami Design (2211.13219)
|
| 188 |
+
|
| 189 |
+
**Title:** Automating Rigid Origami Design
|
| 190 |
+
**Authors:** Jeremia Geiger, Karolis Martinkus, Oliver Richter, Roger Wattenhofer
|
| 191 |
+
**Date:** November 2022 (revised April 2023)
|
| 192 |
+
**Venue:** IJCAI 2023 AI, Arts & Creativity Special Track
|
| 193 |
+
|
| 194 |
+
### Core Contribution
|
| 195 |
+
- Formulates rigid origami design as discrete optimization: the **"rigid origami game"**
|
| 196 |
+
- Based on "three units method" principle
|
| 197 |
+
- Framework supports diverse objectives via abstract reward functions
|
| 198 |
+
- Generates optimized, application-specific crease patterns
|
| 199 |
+
|
| 200 |
+
### Methodology
|
| 201 |
+
- Multiple search methods within optimization framework
|
| 202 |
+
- Flexible objective definition for application-specific requirements
|
| 203 |
+
- Can approximate target shapes and produce functional designs
|
| 204 |
+
|
| 205 |
+
### Relevance to Optigami
|
| 206 |
+
- Validates the "origami as game/environment" paradigm we're building
|
| 207 |
+
- Their reward formulation approach (function-based, abstract) aligns with our OpenEnv design
|
| 208 |
+
- Discrete optimization over crease patterns = the action space for our RL agent
|
| 209 |
+
|
| 210 |
+
---
|
| 211 |
+
|
| 212 |
+
## 5. Tool: FOLD Format
|
| 213 |
+
|
| 214 |
+
**Repo:** https://github.com/edemaine/fold
|
| 215 |
+
**Authors:** Erik Demaine (MIT), Jason Ku (MIT), Robert Lang
|
| 216 |
+
**License:** MIT
|
| 217 |
+
|
| 218 |
+
### What It Is
|
| 219 |
+
FOLD (Flexible Origami List Datastructure) - JSON-based file format (.fold) for representing origami models. The **standard interchange format** for computational origami.
|
| 220 |
+
|
| 221 |
+
### Data Structure
|
| 222 |
+
```json
|
| 223 |
+
{
|
| 224 |
+
"vertices_coords": [[x,y], ...], // 2D or 3D coordinates
|
| 225 |
+
"edges_vertices": [[v1,v2], ...], // Edge endpoints
|
| 226 |
+
"edges_assignment": ["M","V",...], // Mountain/Valley/Boundary/Flat/Unassigned
|
| 227 |
+
"faces_vertices": [[v1,v2,v3], ...], // Face vertex lists
|
| 228 |
+
"faceOrders": [[f1,f2,order], ...], // Stacking/layering order
|
| 229 |
+
"frame_*": ... // Multiple frames (folding states)
|
| 230 |
+
}
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
### JavaScript API
|
| 234 |
+
```javascript
|
| 235 |
+
// Browser
|
| 236 |
+
<script src="https://edemaine.github.io/fold/dist/fold.js"></script>
|
| 237 |
+
|
| 238 |
+
// Node.js
|
| 239 |
+
npm install --save fold
|
| 240 |
+
|
| 241 |
+
// Usage: FOLD.moduleName.functionName
|
| 242 |
+
FOLD.filter.collapseNearbyVertices(foldObject)
|
| 243 |
+
```
|
| 244 |
+
|
| 245 |
+
### CLI Tools
|
| 246 |
+
- `fold-convert`: ORIPA .opx -> .fold conversion
|
| 247 |
+
- `fold-convert --flat-fold`: Compute flat-folded state
|
| 248 |
+
|
| 249 |
+
### Supported Software Ecosystem
|
| 250 |
+
OrigamiSimulator, Freeform Origami (Tachi), Rabbit Ear (Kraft), ORIPA, Crease Pattern Editor, Rhino Grasshopper
|
| 251 |
+
|
| 252 |
+
### Relevance to Optigami
|
| 253 |
+
- **Core data format for OpenEnv state representation**
|
| 254 |
+
- JSON = easy Python/JS interop
|
| 255 |
+
- Stacking order (faceOrders) = layer tracking
|
| 256 |
+
- edges_assignment = mountain/valley fold type
|
| 257 |
+
- Import/export between geometry engine and visualizer
|
| 258 |
+
|
| 259 |
+
---
|
| 260 |
+
|
| 261 |
+
## 6. Tool: Origami Simulator
|
| 262 |
+
|
| 263 |
+
**Repo:** https://github.com/amandaghassaei/OrigamiSimulator
|
| 264 |
+
**URL:** origamisimulator.org
|
| 265 |
+
**Author:** Amanda Ghassaei
|
| 266 |
+
**License:** MIT
|
| 267 |
+
**Stack:** JavaScript (68.4%), Three.js, GPU fragment shaders
|
| 268 |
+
|
| 269 |
+
### Capabilities
|
| 270 |
+
- Real-time GPU-accelerated folding simulation
|
| 271 |
+
- Folds ALL creases simultaneously (not sequential)
|
| 272 |
+
- Realistic bending simulation between creases
|
| 273 |
+
- Strain visualization (internal stress during folding)
|
| 274 |
+
- Fold Percent slider: 0% (flat) to 100% (fully folded) to -100% (inverted)
|
| 275 |
+
|
| 276 |
+
### File Formats
|
| 277 |
+
- **Input:** SVG, FOLD
|
| 278 |
+
- **Export:** FOLD, STL, OBJ
|
| 279 |
+
|
| 280 |
+
### Physics Engine
|
| 281 |
+
- **Stiffness-based finite element approach:** Triangulated faces are rigid panels connected by rotational hinges along fold lines
|
| 282 |
+
- Each fold edge has a **target angle** (+/-pi for mountain/valley), driven by angular spring forces
|
| 283 |
+
- Solver computes nodal displacements at each timestep to reach equilibrium
|
| 284 |
+
- **Fold stiffness:** Controls how strongly hinges drive toward target angle
|
| 285 |
+
- **Face stiffness:** Controls rigidity of triangulated faces (resistance to bending/deformation)
|
| 286 |
+
- **Damping:** Controls oscillation decay rate
|
| 287 |
+
- **Strain metric:** Per-triangle deviation of edge lengths from rest lengths (flat state)
|
| 288 |
+
- Self-intersection is NOT prevented (folds through itself if geometry demands it)
|
| 289 |
+
- Based on Schenk & Guest structural engineering approach
|
| 290 |
+
- Tomohiro Tachi's freeform origami variations
|
| 291 |
+
- Ruling-aware triangulation for curved creases
|
| 292 |
+
- GPU fragment shaders for parallel computation
|
| 293 |
+
|
| 294 |
+
### Programmatic Usage
|
| 295 |
+
- Core simulation can be driven **headlessly** (without UI) by importing solver module
|
| 296 |
+
- Feed FOLD JSON data -> step simulation programmatically
|
| 297 |
+
- FOLD is JSON, so easy to generate crease patterns from Python and pass to simulator
|
| 298 |
+
- Can embed in other web pages as a component
|
| 299 |
+
|
| 300 |
+
### Dependencies
|
| 301 |
+
- Three.js (3D rendering)
|
| 302 |
+
- FOLD API (internal data structure)
|
| 303 |
+
- Earcut + cdt2d (polygon triangulation)
|
| 304 |
+
- numeric.js (linear algebra)
|
| 305 |
+
- CCapture (GIF/WebM export)
|
| 306 |
+
|
| 307 |
+
### Relevance to Optigami
|
| 308 |
+
- **Direct integration for Three.js Visualizer component**
|
| 309 |
+
- Strain heatmap capability already built in
|
| 310 |
+
- FOLD format native support
|
| 311 |
+
- Can be used for visual verification of generated fold patterns
|
| 312 |
+
- Export to STL/OBJ for 3D shape comparison in rewards
|
| 313 |
+
|
| 314 |
+
---
|
| 315 |
+
|
| 316 |
+
## 7. Tool: GamiBench
|
| 317 |
+
|
| 318 |
+
**Repo:** https://github.com/stvngo/GamiBench
|
| 319 |
+
**Dataset:** https://huggingface.co/datasets/stvngo/GamiBench
|
| 320 |
+
**Paper:** arXiv 2512.22207
|
| 321 |
+
**License:** MIT
|
| 322 |
+
|
| 323 |
+
### Benchmark Design
|
| 324 |
+
- 186 valid + 186 impossible crease patterns
|
| 325 |
+
- 6 viewpoints per pattern (top, bottom, front, back, right, left)
|
| 326 |
+
- **777 total samples** in HuggingFace dataset (45.4 MB)
|
| 327 |
+
- 186 label classes (named origami patterns)
|
| 328 |
+
|
| 329 |
+
### Task Types
|
| 330 |
+
1. Standard tasks (2D CP -> 3D prediction)
|
| 331 |
+
2. Alternative-view tasks
|
| 332 |
+
3. Impossible tasks (validity checking)
|
| 333 |
+
|
| 334 |
+
### Dataset Schema
|
| 335 |
+
```python
|
| 336 |
+
{
|
| 337 |
+
"image": PIL.Image, # Origami pattern/fold image
|
| 338 |
+
"label": int, # 0-185 class label
|
| 339 |
+
"split": str # Split identifier
|
| 340 |
+
}
|
| 341 |
+
```
|
| 342 |
+
|
| 343 |
+
### Loading
|
| 344 |
+
```python
|
| 345 |
+
from datasets import load_dataset
|
| 346 |
+
dataset = load_dataset("stvngo/GamiBench")
|
| 347 |
+
```
|
| 348 |
+
|
| 349 |
+
### Model Support
|
| 350 |
+
- OpenAI (GPT-4, GPT-4o-mini)
|
| 351 |
+
- Anthropic (Claude 4.5 Sonnet)
|
| 352 |
+
- Google (Gemini)
|
| 353 |
+
- xAI (Grok)
|
| 354 |
+
- OpenRouter models
|
| 355 |
+
|
| 356 |
+
### Code Structure
|
| 357 |
+
```
|
| 358 |
+
models/ # Model wrappers & factory
|
| 359 |
+
evaluators/ # BaseEvaluator: evaluate(), evaluate_single()
|
| 360 |
+
benchmarks/ # Benchmark implementations
|
| 361 |
+
configs/ # YAML/JSON configuration
|
| 362 |
+
utils/ # Shared helpers
|
| 363 |
+
pipeline.py # Orchestration
|
| 364 |
+
run.py # Entry point
|
| 365 |
+
```
|
| 366 |
+
|
| 367 |
+
### Relevance to Optigami
|
| 368 |
+
- **Evaluation benchmark for our trained model**
|
| 369 |
+
- 186 origami patterns = potential training/eval data
|
| 370 |
+
- Impossible patterns useful for constraint satisfaction testing
|
| 371 |
+
- Multi-view evaluation tests true 3D understanding
|
| 372 |
+
- Config-driven, reproducible evaluation pipeline
|
| 373 |
+
|
| 374 |
+
---
|
| 375 |
+
|
| 376 |
+
## 8. Tool: SpatialThinker Codebase
|
| 377 |
+
|
| 378 |
+
**Repo:** https://github.com/hunarbatra/SpatialThinker
|
| 379 |
+
**Paper:** arXiv 2511.07403
|
| 380 |
+
|
| 381 |
+
### Architecture
|
| 382 |
+
- Built on Qwen2.5-VL (3B and 7B variants)
|
| 383 |
+
- Uses veRL/EasyR1 for RL training
|
| 384 |
+
- vLLM 0.8.0 for inference during rollouts
|
| 385 |
+
|
| 386 |
+
### Code Structure
|
| 387 |
+
```
|
| 388 |
+
scripts/ # Training bash scripts per model size
|
| 389 |
+
evaluation/ # 18+ benchmark evaluation suite
|
| 390 |
+
data_gen/ # Data synthesis pipeline
|
| 391 |
+
verl/ # RL training framework (GRPO)
|
| 392 |
+
```
|
| 393 |
+
|
| 394 |
+
### Data Generation Pipeline
|
| 395 |
+
1. Generate raw QA pairs (12K-56K options)
|
| 396 |
+
2. Balance/filter with 50% spatial relations focus
|
| 397 |
+
3. Validate via GPT-4o (~75% pass rate)
|
| 398 |
+
4. Upload to HuggingFace
|
| 399 |
+
|
| 400 |
+
### Requirements
|
| 401 |
+
- Python 3.9+
|
| 402 |
+
- Transformers >= 4.49.0
|
| 403 |
+
- Flash-Attn >= 2.4.3
|
| 404 |
+
- vLLM >= 0.7.3
|
| 405 |
+
|
| 406 |
+
### Relevance to Optigami
|
| 407 |
+
- **Reference implementation for our GRPO training setup**
|
| 408 |
+
- veRL/EasyR1 framework = our training infrastructure
|
| 409 |
+
- Dense reward design directly applicable
|
| 410 |
+
- Data generation pipeline can be adapted for origami QA pairs
|
| 411 |
+
|
| 412 |
+
---
|
| 413 |
+
|
| 414 |
+
## 9. Tool: Trackio
|
| 415 |
+
|
| 416 |
+
**Repo:** https://github.com/gradio-app/trackio
|
| 417 |
+
**Author:** Hugging Face / Gradio team
|
| 418 |
+
**License:** MIT
|
| 419 |
+
|
| 420 |
+
### What It Is
|
| 421 |
+
Lightweight, local-first experiment tracking (Weights & Biases alternative). API-compatible with wandb.
|
| 422 |
+
|
| 423 |
+
### Key Features
|
| 424 |
+
- `import trackio as wandb` - drop-in W&B replacement
|
| 425 |
+
- Non-blocking `log()` with background queue (0.5s drain interval)
|
| 426 |
+
- SQLite local storage at `~/.cache/huggingface/trackio`
|
| 427 |
+
- Optional HuggingFace Spaces deployment for dashboards
|
| 428 |
+
- Slack/Discord webhook alerts (INFO/WARN/ERROR)
|
| 429 |
+
- 2,000 logs/8s single run; 32,000 logs/14s with 32 threads
|
| 430 |
+
|
| 431 |
+
### Usage
|
| 432 |
+
```python
|
| 433 |
+
import trackio
|
| 434 |
+
|
| 435 |
+
trackio.init(project="optigami-grpo", config={"lr": 1e-6, "model": "Qwen2.5-VL-7B"})
|
| 436 |
+
trackio.log({"step": step, "reward": reward, "loss": loss})
|
| 437 |
+
trackio.alert(title="Training spike", text="...", level=trackio.AlertLevel.WARN)
|
| 438 |
+
trackio.finish()
|
| 439 |
+
|
| 440 |
+
# Dashboard
|
| 441 |
+
trackio.show(project="optigami-grpo")
|
| 442 |
+
trackio.sync(project="optigami-grpo", space_id="openenv-community/optigami-training")
|
| 443 |
+
```
|
| 444 |
+
|
| 445 |
+
### Relevance to Optigami
|
| 446 |
+
- **Training metrics dashboard for GRPO training runs**
|
| 447 |
+
- Can deploy live dashboard to HF Spaces
|
| 448 |
+
- Track reward components, loss, constraint satisfaction rates
|
| 449 |
+
- Alert on training anomalies (reward hacking, loss spikes)
|
| 450 |
+
|
| 451 |
+
---
|
| 452 |
+
|
| 453 |
+
## 10. Tool: Unsloth + GRPO Training
|
| 454 |
+
|
| 455 |
+
**Repo:** https://github.com/unslothai/unsloth
|
| 456 |
+
**Docs:** https://unsloth.ai/docs
|
| 457 |
+
|
| 458 |
+
### GRPO Algorithm in Unsloth
|
| 459 |
+
1. Generate N responses per prompt (8+ recommended)
|
| 460 |
+
2. Score each with custom reward functions
|
| 461 |
+
3. Z-score normalize rewards across group -> advantages
|
| 462 |
+
4. PPO-style policy update (no value model or reward model needed)
|
| 463 |
+
|
| 464 |
+
### Memory Efficiency
|
| 465 |
+
- **90% less VRAM** vs standard GRPO
|
| 466 |
+
- 20K context, 8 generations, Llama 8B: 54.3GB (vs 510.8GB standard)
|
| 467 |
+
- QLoRA 4-bit: model params (GB) ~ VRAM needed
|
| 468 |
+
- Shared GPU memory with vLLM inference engine
|
| 469 |
+
|
| 470 |
+
### Vision Model Support
|
| 471 |
+
- Qwen2.5-VL-7B directly supported
|
| 472 |
+
- Qwen3-VL-8B, Gemma 3 (4B) also available
|
| 473 |
+
- `FastVisionModel.get_peft_model()` with granular layer control:
|
| 474 |
+
- `finetune_vision_layers`, `finetune_language_layers`
|
| 475 |
+
- `finetune_attention_modules`, `finetune_mlp_modules`
|
| 476 |
+
|
| 477 |
+
### LoRA Configuration
|
| 478 |
+
```python
|
| 479 |
+
model = FastVisionModel.get_peft_model(
|
| 480 |
+
model,
|
| 481 |
+
r=16, # LoRA rank
|
| 482 |
+
lora_alpha=16, # alpha == r recommended
|
| 483 |
+
lora_dropout=0,
|
| 484 |
+
finetune_vision_layers=True,
|
| 485 |
+
finetune_language_layers=True,
|
| 486 |
+
finetune_attention_modules=True,
|
| 487 |
+
finetune_mlp_modules=True,
|
| 488 |
+
)
|
| 489 |
+
```
|
| 490 |
+
|
| 491 |
+
### GRPOConfig Options
|
| 492 |
+
```python
|
| 493 |
+
GRPOConfig(
|
| 494 |
+
loss_type='grpo', # or 'gspo', 'dr_grpo'
|
| 495 |
+
epsilon=0.2,
|
| 496 |
+
epsilon_high=0.28,
|
| 497 |
+
delta=1.5,
|
| 498 |
+
# ... standard training args
|
| 499 |
+
)
|
| 500 |
+
```
|
| 501 |
+
|
| 502 |
+
### vLLM Integration
|
| 503 |
+
- Shared memory between Unsloth and vLLM saves 3-5GB
|
| 504 |
+
- A100 40GB: ~4000 tokens/sec, T4 16GB: ~300 tokens/sec
|
| 505 |
+
- `fast_inference=True` enables vLLM backend
|
| 506 |
+
|
| 507 |
+
### Training Requirements
|
| 508 |
+
- Minimum 300 steps before meaningful progress
|
| 509 |
+
- 500+ data rows recommended (works with 10+)
|
| 510 |
+
- Models >= 1.5B parameters for reasoning tokens
|
| 511 |
+
- Steps = rows x epochs; increase generations (8->16) for more data
|
| 512 |
+
|
| 513 |
+
### Vision Data Format
|
| 514 |
+
```python
|
| 515 |
+
[
|
| 516 |
+
{"role": "user", "content": [
|
| 517 |
+
{"type": "text", "text": "instruction"},
|
| 518 |
+
{"type": "image", "image": pil_image}
|
| 519 |
+
]},
|
| 520 |
+
{"role": "assistant", "content": [
|
| 521 |
+
{"type": "text", "text": "response"}
|
| 522 |
+
]}
|
| 523 |
+
]
|
| 524 |
+
```
|
| 525 |
+
|
| 526 |
+
### GRPO vs PPO vs DPO Comparison
|
| 527 |
+
|
| 528 |
+
| Aspect | PPO | DPO | GRPO |
|
| 529 |
+
|--------|-----|-----|------|
|
| 530 |
+
| Critic/Value model | Required (same size as policy) | Not needed | **Not needed** |
|
| 531 |
+
| Reference model | Required | Required | Required (old policy) |
|
| 532 |
+
| Training data | Online rollouts | Offline preference pairs | **Online rollouts + group scoring** |
|
| 533 |
+
| Reward signal | Scalar per token/step | Implicit from preferences | **Verifiable/explicit** |
|
| 534 |
+
| VRAM overhead | ~2x (policy + critic) | ~2x (policy + ref) | **~1.5x (no critic)** |
|
| 535 |
+
|
| 536 |
+
### GRPO Advantage Estimation
|
| 537 |
+
```
|
| 538 |
+
A_i = (r_i - mean(r_1..r_G)) / std(r_1..r_G)
|
| 539 |
+
```
|
| 540 |
+
By sampling G completions and normalizing rewards within the group, GRPO creates its own baseline without a value network - halving VRAM vs PPO.
|
| 541 |
+
|
| 542 |
+
### Complete Unsloth GRPO Code Example
|
| 543 |
+
```python
|
| 544 |
+
from unsloth import FastLanguageModel, PatchFastRL
|
| 545 |
+
PatchFastRL("GRPO", FastLanguageModel) # Patch TRL with Unsloth optimizations
|
| 546 |
+
|
| 547 |
+
from trl import GRPOConfig, GRPOTrainer
|
| 548 |
+
|
| 549 |
+
# Load model with QLoRA
|
| 550 |
+
model, tokenizer = FastLanguageModel.from_pretrained(
|
| 551 |
+
model_name="unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
|
| 552 |
+
max_seq_length=4096,
|
| 553 |
+
load_in_4bit=True,
|
| 554 |
+
dtype=None,
|
| 555 |
+
)
|
| 556 |
+
|
| 557 |
+
# Add LoRA adapters
|
| 558 |
+
model = FastLanguageModel.get_peft_model(
|
| 559 |
+
model,
|
| 560 |
+
r=64, # Higher rank for reasoning tasks
|
| 561 |
+
target_modules=[
|
| 562 |
+
"q_proj", "k_proj", "v_proj", "o_proj",
|
| 563 |
+
"gate_proj", "up_proj", "down_proj",
|
| 564 |
+
],
|
| 565 |
+
lora_alpha=64, # alpha == r recommended
|
| 566 |
+
lora_dropout=0, # Unsloth recommends 0
|
| 567 |
+
bias="none",
|
| 568 |
+
use_gradient_checkpointing="unsloth", # Unsloth's optimized GC
|
| 569 |
+
random_state=3407,
|
| 570 |
+
)
|
| 571 |
+
|
| 572 |
+
# Reward functions (TRL accepts a list, scores are summed)
|
| 573 |
+
def correctness_reward(completions, ground_truth, **kwargs):
|
| 574 |
+
rewards = []
|
| 575 |
+
for completion, gt in zip(completions, ground_truth):
|
| 576 |
+
answer_match = re.search(r'</think>\s*(.*?)$', completion, re.DOTALL)
|
| 577 |
+
if answer_match and answer_match.group(1).strip() == gt.strip():
|
| 578 |
+
rewards.append(1.0)
|
| 579 |
+
else:
|
| 580 |
+
rewards.append(0.0)
|
| 581 |
+
return rewards
|
| 582 |
+
|
| 583 |
+
def format_reward(completions, **kwargs):
|
| 584 |
+
return [0.5 if ("<think>" in c and "</think>" in c) else 0.0 for c in completions]
|
| 585 |
+
|
| 586 |
+
# GRPO Config
|
| 587 |
+
config = GRPOConfig(
|
| 588 |
+
output_dir="./grpo_output",
|
| 589 |
+
num_generations=8, # Group size G
|
| 590 |
+
max_completion_length=2048,
|
| 591 |
+
per_device_train_batch_size=1,
|
| 592 |
+
gradient_accumulation_steps=4,
|
| 593 |
+
num_train_epochs=1,
|
| 594 |
+
learning_rate=5e-6,
|
| 595 |
+
lr_scheduler_type="cosine",
|
| 596 |
+
warmup_ratio=0.1,
|
| 597 |
+
beta=0.04, # KL penalty coefficient
|
| 598 |
+
max_grad_norm=0.1,
|
| 599 |
+
logging_steps=1,
|
| 600 |
+
save_steps=250,
|
| 601 |
+
bf16=True,
|
| 602 |
+
loss_type='grpo', # or 'gspo', 'dr_grpo'
|
| 603 |
+
)
|
| 604 |
+
|
| 605 |
+
trainer = GRPOTrainer(
|
| 606 |
+
model=model,
|
| 607 |
+
config=config,
|
| 608 |
+
train_dataset=dataset,
|
| 609 |
+
reward_funcs=[correctness_reward, format_reward],
|
| 610 |
+
tokenizer=tokenizer,
|
| 611 |
+
)
|
| 612 |
+
trainer.train()
|
| 613 |
+
|
| 614 |
+
# Save LoRA adapter
|
| 615 |
+
model.save_pretrained("./grpo_lora_adapter")
|
| 616 |
+
# Optional: merge and push
|
| 617 |
+
# model.save_pretrained_merged("./grpo_merged", tokenizer)
|
| 618 |
+
# model.push_to_hub_merged("username/model-name", tokenizer)
|
| 619 |
+
```
|
| 620 |
+
|
| 621 |
+
### Vision GRPO with Qwen2.5-VL
|
| 622 |
+
```python
|
| 623 |
+
from unsloth import FastVisionModel, PatchFastRL
|
| 624 |
+
PatchFastRL("GRPO", FastVisionModel)
|
| 625 |
+
|
| 626 |
+
model, tokenizer = FastVisionModel.from_pretrained(
|
| 627 |
+
"unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit",
|
| 628 |
+
max_seq_length=4096,
|
| 629 |
+
load_in_4bit=True,
|
| 630 |
+
)
|
| 631 |
+
|
| 632 |
+
# For VLMs: typically freeze vision encoder, train language layers
|
| 633 |
+
model = FastVisionModel.get_peft_model(
|
| 634 |
+
model,
|
| 635 |
+
r=16, # Lower rank often sufficient for VLMs
|
| 636 |
+
lora_alpha=16,
|
| 637 |
+
lora_dropout=0,
|
| 638 |
+
bias="none",
|
| 639 |
+
use_gradient_checkpointing="unsloth",
|
| 640 |
+
finetune_vision_layers=False, # Keep vision encoder frozen
|
| 641 |
+
finetune_language_layers=True,
|
| 642 |
+
finetune_attention_modules=True,
|
| 643 |
+
finetune_mlp_modules=True,
|
| 644 |
+
)
|
| 645 |
+
```
|
| 646 |
+
|
| 647 |
+
### Unsloth ART (Agentic Reasoning Training)
|
| 648 |
+
|
| 649 |
+
ART extends GRPO for multi-turn agentic tasks:
|
| 650 |
+
|
| 651 |
+
1. **Multi-turn rollouts:** Model interacts with environment over multiple turns (actions + observations)
|
| 652 |
+
2. **Environment integration:** Custom env provides observations and final rewards
|
| 653 |
+
3. **Verifiable rewards:** Emphasizes automatically verifiable outcomes
|
| 654 |
+
|
| 655 |
+
**Multi-turn pattern:**
|
| 656 |
+
```
|
| 657 |
+
Turn 1: User prompt -> Model <think> + action -> Environment observation
|
| 658 |
+
Turn 2: Observation -> Model <think> + action -> Environment observation
|
| 659 |
+
Turn 3: Observation -> Model final answer -> Reward computed
|
| 660 |
+
```
|
| 661 |
+
|
| 662 |
+
**Implementation options for multi-turn:**
|
| 663 |
+
1. **Single-generation (simpler):** Model outputs full plan/sequence in one generation; reward function evaluates the whole sequence
|
| 664 |
+
2. **Custom rollout loop (advanced):** Alternate model generation and env response, collect full trajectory, compute GRPO gradients on combined trajectory
|
| 665 |
+
|
| 666 |
+
### Key Hyperparameters Reference
|
| 667 |
+
|
| 668 |
+
| Parameter | Range | Notes |
|
| 669 |
+
|-----------|-------|-------|
|
| 670 |
+
| `num_generations` (G) | 4-16 | 8 common. More = better advantages, more VRAM |
|
| 671 |
+
| `beta` (KL penalty) | 0.01-0.1 | 0.04 default. Higher = stay closer to reference |
|
| 672 |
+
| `learning_rate` | 1e-6 to 1e-5 | Lower than SFT. 5e-6 starting point |
|
| 673 |
+
| `max_completion_length` | 512-4096 | Task-dependent |
|
| 674 |
+
| `r` (LoRA rank) | 16-128 | 64 for reasoning, 16 for VLM |
|
| 675 |
+
| `gradient_accumulation_steps` | 4-16 | Effective batch = per_device * accum * GPUs |
|
| 676 |
+
| `max_grad_norm` | 0.1-1.0 | 0.1 for stability |
|
| 677 |
+
| `warmup_ratio` | 0.05-0.1 | Important for RL stability |
|
| 678 |
+
| `epsilon` (clip) | 0.2 | PPO-style clipping |
|
| 679 |
+
| `epsilon_high` | 0.28 | Asymmetric upper clip |
|
| 680 |
+
|
| 681 |
+
### Qwen2.5-VL-7B Model Specifics
|
| 682 |
+
- Vision encoder: ViT with 2D-RoPE (handles arbitrary image resolutions via dynamic patching)
|
| 683 |
+
- LLM backbone: 28 layers, 3584 hidden dim, 28 attn heads, GQA with 4 KV heads
|
| 684 |
+
- Context: up to 32K tokens (128K with YaRN)
|
| 685 |
+
- Supports: single image, multi-image, video frames
|
| 686 |
+
- Unsloth IDs: `unsloth/Qwen2.5-VL-7B-Instruct`, `unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit`
|
| 687 |
+
|
| 688 |
+
### Qwen3-4B Model Specifics
|
| 689 |
+
- Hybrid thinking: can switch between `<think>` mode and direct response
|
| 690 |
+
- ~4B parameters, efficient for RL training
|
| 691 |
+
- MoE variants also available
|
| 692 |
+
- Unsloth IDs: `unsloth/Qwen3-4B`, `unsloth/Qwen3-4B-bnb-4bit`
|
| 693 |
+
|
| 694 |
+
---
|
| 695 |
+
|
| 696 |
+
## 11. Unsloth ART / GRPO Trainer Plan
|
| 697 |
+
|
| 698 |
+
### Phase 1: Data Preparation
|
| 699 |
+
|
| 700 |
+
**Training Data Sources:**
|
| 701 |
+
1. OrigamiSpace dataset (471 auxiliary instances) - CP diagrams, fold sequences, 3D shapes
|
| 702 |
+
2. GamiBench dataset (777 samples, 186 patterns) - crease patterns with multi-view 3D
|
| 703 |
+
3. Synthetic data generation pipeline (following SpatialThinker approach):
|
| 704 |
+
- Generate origami QA pairs with Claude/GPT
|
| 705 |
+
- Validate with GPT-4o pass@2
|
| 706 |
+
- Balance across difficulty levels
|
| 707 |
+
|
| 708 |
+
**Data Format for GRPO:**
|
| 709 |
+
```python
|
| 710 |
+
# Each training example = a prompt with origami task
|
| 711 |
+
{
|
| 712 |
+
"prompt": [
|
| 713 |
+
{"role": "user", "content": [
|
| 714 |
+
{"type": "image", "image": cp_diagram_image},
|
| 715 |
+
{"type": "text", "text": "Given this crease pattern, describe the folding sequence and predict the final 3D shape. Output your answer as a FOLD JSON."}
|
| 716 |
+
]}
|
| 717 |
+
]
|
| 718 |
+
}
|
| 719 |
+
```
|
| 720 |
+
|
| 721 |
+
### Phase 2: Reward Function Design
|
| 722 |
+
|
| 723 |
+
**Following SpatialThinker's lexicographic gating pattern, adapted for origami:**
|
| 724 |
+
|
| 725 |
+
```python
|
| 726 |
+
def origami_reward(prompt, response, ground_truth):
|
| 727 |
+
# Component 1: Format reward (gate)
|
| 728 |
+
r_format = check_valid_fold_json(response) # 0 or 1
|
| 729 |
+
|
| 730 |
+
# Component 2: Constraint satisfaction
|
| 731 |
+
r_constraints = check_origami_constraints(response)
|
| 732 |
+
# - Maekawa's theorem: |M-V| = 2
|
| 733 |
+
# - Kawasaki's theorem: sum(alpha_i) = 2*pi
|
| 734 |
+
# - Euler's formula: V - E + F = 2
|
| 735 |
+
# - No self-intersection
|
| 736 |
+
|
| 737 |
+
# Component 3: Topological similarity
|
| 738 |
+
r_topology = compute_tss(response, ground_truth)
|
| 739 |
+
# Vertex/edge/face counts, connectivity
|
| 740 |
+
|
| 741 |
+
# Component 4: Geometric similarity
|
| 742 |
+
r_geometry = compute_hausdorff_similarity(response, ground_truth)
|
| 743 |
+
|
| 744 |
+
# Component 5: Final shape match
|
| 745 |
+
r_shape = compute_folded_state_similarity(response, ground_truth)
|
| 746 |
+
|
| 747 |
+
# Lexicographic gating
|
| 748 |
+
if r_format == 0:
|
| 749 |
+
return 0.0
|
| 750 |
+
|
| 751 |
+
total = (0.1 * r_format +
|
| 752 |
+
0.25 * r_constraints +
|
| 753 |
+
0.2 * r_topology +
|
| 754 |
+
0.2 * r_geometry +
|
| 755 |
+
0.25 * r_shape)
|
| 756 |
+
|
| 757 |
+
return total
|
| 758 |
+
```
|
| 759 |
+
|
| 760 |
+
### Phase 3: Training Infrastructure
|
| 761 |
+
|
| 762 |
+
**Option A: Unsloth (simpler, less VRAM)**
|
| 763 |
+
```python
|
| 764 |
+
from unsloth import FastVisionModel
|
| 765 |
+
from trl import GRPOConfig, GRPOTrainer
|
| 766 |
+
|
| 767 |
+
model, tokenizer = FastVisionModel.from_pretrained(
|
| 768 |
+
"unsloth/Qwen2.5-VL-7B-Instruct",
|
| 769 |
+
load_in_4bit=True,
|
| 770 |
+
fast_inference=True,
|
| 771 |
+
)
|
| 772 |
+
|
| 773 |
+
model = FastVisionModel.get_peft_model(model, r=16, lora_alpha=16)
|
| 774 |
+
|
| 775 |
+
config = GRPOConfig(
|
| 776 |
+
loss_type="grpo",
|
| 777 |
+
num_generations=8,
|
| 778 |
+
max_new_tokens=2048,
|
| 779 |
+
per_device_train_batch_size=1,
|
| 780 |
+
gradient_accumulation_steps=16,
|
| 781 |
+
num_train_epochs=3,
|
| 782 |
+
learning_rate=1e-6,
|
| 783 |
+
)
|
| 784 |
+
|
| 785 |
+
trainer = GRPOTrainer(
|
| 786 |
+
model=model,
|
| 787 |
+
config=config,
|
| 788 |
+
train_dataset=dataset,
|
| 789 |
+
reward_funcs=[origami_reward],
|
| 790 |
+
)
|
| 791 |
+
|
| 792 |
+
trainer.train()
|
| 793 |
+
```
|
| 794 |
+
|
| 795 |
+
**Option B: veRL/EasyR1 (following SpatialThinker, more control)**
|
| 796 |
+
- Uses veRL framework with GRPO
|
| 797 |
+
- vLLM backend for fast rollouts
|
| 798 |
+
- More complex but battle-tested for spatial reasoning
|
| 799 |
+
- Better for multi-turn rollouts
|
| 800 |
+
|
| 801 |
+
### Phase 4: Multi-Turn Rollouts
|
| 802 |
+
|
| 803 |
+
Following OrigamiSpace's environmental learning approach:
|
| 804 |
+
1. Model generates CP code / fold sequence
|
| 805 |
+
2. OpenEnv compiler validates and returns error feedback
|
| 806 |
+
3. Model refines based on error type (CSE/GIF/PSI/AFS)
|
| 807 |
+
4. Repeat up to 10 rounds
|
| 808 |
+
5. Final reward based on best attempt
|
| 809 |
+
|
| 810 |
+
**Environment class pattern:**
|
| 811 |
+
```python
|
| 812 |
+
class OrigamiEnv:
|
| 813 |
+
def __init__(self, task):
|
| 814 |
+
self.task = task
|
| 815 |
+
self.state = task["initial_state"] # FOLD JSON
|
| 816 |
+
self.steps = 0
|
| 817 |
+
self.max_steps = 10
|
| 818 |
+
self.history = []
|
| 819 |
+
|
| 820 |
+
def step(self, action: str):
|
| 821 |
+
"""Process model's fold action, return compiler feedback."""
|
| 822 |
+
self.steps += 1
|
| 823 |
+
# Validate through compiler (CSE/GIF/PSI/AFS checks)
|
| 824 |
+
result = self.compile_and_validate(action)
|
| 825 |
+
observation = f"Step {self.steps}: {result['error_type']}: {result['message']}"
|
| 826 |
+
self.state = result.get("new_state", self.state)
|
| 827 |
+
self.history.append((action, observation))
|
| 828 |
+
done = self.steps >= self.max_steps or result.get("valid", False)
|
| 829 |
+
reward = self.compute_reward() if done else 0.0
|
| 830 |
+
return observation, reward, done
|
| 831 |
+
|
| 832 |
+
def compute_reward(self):
|
| 833 |
+
"""4-dimensional evaluation: TSS + GS + CS + FFS."""
|
| 834 |
+
return (0.25 * tss(self.state, self.task["target"]) +
|
| 835 |
+
0.25 * gs(self.state, self.task["target"]) +
|
| 836 |
+
0.25 * cs(self.state) +
|
| 837 |
+
0.25 * ffs(self.state, self.task["target"]))
|
| 838 |
+
|
| 839 |
+
def multi_turn_reward(completions, prompts, **kwargs):
|
| 840 |
+
"""Wrap environment interaction into GRPO reward function."""
|
| 841 |
+
rewards = []
|
| 842 |
+
for completion, prompt in zip(completions, prompts):
|
| 843 |
+
env = OrigamiEnv(extract_task(prompt))
|
| 844 |
+
actions = parse_actions(completion)
|
| 845 |
+
total_reward = 0.0
|
| 846 |
+
for action in actions:
|
| 847 |
+
obs, reward, done = env.step(action)
|
| 848 |
+
total_reward += reward
|
| 849 |
+
if done:
|
| 850 |
+
break
|
| 851 |
+
rewards.append(total_reward)
|
| 852 |
+
return rewards
|
| 853 |
+
```
|
| 854 |
+
|
| 855 |
+
### Phase 5: Evaluation
|
| 856 |
+
|
| 857 |
+
1. **GamiBench** - standard origami spatial reasoning benchmark
|
| 858 |
+
2. **OrigamiSpace tasks** - 4-task evaluation suite
|
| 859 |
+
3. **Custom metrics:**
|
| 860 |
+
- Constraint satisfaction rate (Maekawa/Kawasaki)
|
| 861 |
+
- Compilation success rate
|
| 862 |
+
- Topological/geometric similarity scores
|
| 863 |
+
|
| 864 |
+
### Phase 6: Monitoring with Trackio
|
| 865 |
+
|
| 866 |
+
```python
|
| 867 |
+
import trackio
|
| 868 |
+
|
| 869 |
+
trackio.init(
|
| 870 |
+
project="optigami-grpo",
|
| 871 |
+
space_id="openenv-community/optigami-training",
|
| 872 |
+
config={
|
| 873 |
+
"model": "Qwen2.5-VL-7B",
|
| 874 |
+
"lora_r": 16,
|
| 875 |
+
"num_generations": 8,
|
| 876 |
+
"learning_rate": 1e-6,
|
| 877 |
+
}
|
| 878 |
+
)
|
| 879 |
+
|
| 880 |
+
# In training loop
|
| 881 |
+
trackio.log({
|
| 882 |
+
"step": step,
|
| 883 |
+
"reward/total": total_reward,
|
| 884 |
+
"reward/format": format_reward,
|
| 885 |
+
"reward/constraints": constraint_reward,
|
| 886 |
+
"reward/topology": topology_reward,
|
| 887 |
+
"reward/geometry": geometry_reward,
|
| 888 |
+
"reward/shape": shape_reward,
|
| 889 |
+
"loss": loss,
|
| 890 |
+
"compilation_rate": compilation_rate,
|
| 891 |
+
})
|
| 892 |
+
```
|
| 893 |
+
|
| 894 |
+
---
|
| 895 |
+
|
| 896 |
+
## 12. GitHub Reference Repo (ianalin123/optigami)
|
| 897 |
+
|
| 898 |
+
Located at `.reference/optigami-github/` (gitignored, not pushed to HF).
|
| 899 |
+
|
| 900 |
+
### What It Contains
|
| 901 |
+
A complete research repository with detailed architecture docs and a reference 2048 GRPO implementation.
|
| 902 |
+
|
| 903 |
+
### Key Files
|
| 904 |
+
|
| 905 |
+
| File | Contents |
|
| 906 |
+
|------|----------|
|
| 907 |
+
| `research/plan/architecture.md` | **Full architecture spec**: action space, state, physics engine, reward functions, OpenEnv integration, rendering pipeline, project structure, implementation order |
|
| 908 |
+
| `research/openenv/2048_example.py` | **636-line reference implementation** of OpenEnv + GRPO for 2048 game (Unsloth + TRL) |
|
| 909 |
+
| `research/openenv/overview.md` | OpenEnv framework API, types, project structure, deployment to HF Spaces |
|
| 910 |
+
| `research/origami/fold_types_deep.md` | All fold operations, Huzita-Justin axioms, crane step-by-step, compression patterns |
|
| 911 |
+
| `research/origami/math_physics_deep.md` | Kawasaki/Maekawa theorems with code, bar-and-hinge model, energy formulas |
|
| 912 |
+
| `research/origami/rendering_research.md` | Rendering options comparison |
|
| 913 |
+
| `research/origami/fold_format.md` | FOLD file format details |
|
| 914 |
+
|
| 915 |
+
### Architecture Decisions (from GitHub repo)
|
| 916 |
+
|
| 917 |
+
| Decision | Choice |
|
| 918 |
+
|----------|--------|
|
| 919 |
+
| LLM interaction | **Code-as-policy** (LLM writes `fold_strategy()` function) |
|
| 920 |
+
| Action space | Named fold ops (valley/mountain + fold line + angle) |
|
| 921 |
+
| State format | FOLD-compatible JSON |
|
| 922 |
+
| Physics engine | Bar-and-hinge model (NumPy port of Ghassaei) |
|
| 923 |
+
| Validation | Kawasaki + Maekawa + triangle-triangle intersection |
|
| 924 |
+
| Primary task | Solar panel packing (Miura-ori discovery) |
|
| 925 |
+
| Training | GRPO via TRL + Unsloth |
|
| 926 |
+
| Deployment | Docker Space on HF Spaces |
|
| 927 |
+
|
| 928 |
+
### Action Space (Code-as-Policy)
|
| 929 |
+
The LLM generates a `fold_strategy(paper_state)` function returning fold instructions:
|
| 930 |
+
```python
|
| 931 |
+
def fold_strategy(paper_state: dict) -> list[dict]:
|
| 932 |
+
# paper_state contains: vertices, edges, assignments, fold_angles, material, etc.
|
| 933 |
+
return [
|
| 934 |
+
{"type": "valley", "line": {"start": [0,0.5], "end": [1,0.5]}, "angle": 180},
|
| 935 |
+
{"type": "mountain", "line": {"start": [0.5,0], "end": [0.5,0.5]}, "angle": 180},
|
| 936 |
+
]
|
| 937 |
+
```
|
| 938 |
+
|
| 939 |
+
### Reward Functions (3 from 2048 pattern, adapted for origami)
|
| 940 |
+
|
| 941 |
+
1. **`code_valid`**: +1.0 valid function, -0.5 exec fails, -2.0 syntax error
|
| 942 |
+
2. **`physically_valid`**: +1.0 all valid, -2.0 per Kawasaki/Maekawa violation, -5.0 self-intersection
|
| 943 |
+
3. **`fold_quality`**: +20.0 * compactness, +10.0 meets volume target, +5.0 deployable, -0.5 per fold
|
| 944 |
+
|
| 945 |
+
### Physics Engine (Bar-and-Hinge Model)
|
| 946 |
+
```python
|
| 947 |
+
E_total = E_bar + E_facet + E_fold
|
| 948 |
+
E_bar = sum (1/2) * k_axial * (L - L0)^2 # stretching
|
| 949 |
+
E_facet = sum (1/2) * k_facet * l * (theta-pi)^2 # panel bending
|
| 950 |
+
E_fold = sum (1/2) * k_fold * l * (rho-rho_t)^2 # crease folding
|
| 951 |
+
```
|
| 952 |
+
|
| 953 |
+
### Planned Project Structure
|
| 954 |
+
```
|
| 955 |
+
engine/ # Core simulation (numpy/scipy)
|
| 956 |
+
paper.py # Paper data structure, FOLD I/O
|
| 957 |
+
fold_engine.py # Apply folds (quaternion rotation)
|
| 958 |
+
physics.py # Bar-and-hinge energy, strain
|
| 959 |
+
validation.py # Kawasaki, Maekawa, self-intersection
|
| 960 |
+
metrics.py # Deployment ratio, compactness
|
| 961 |
+
materials.py # Material definitions
|
| 962 |
+
|
| 963 |
+
environment/ # OpenEnv server
|
| 964 |
+
models.py # Action, Observation, State
|
| 965 |
+
origami_environment.py # Environment (reset/step/state)
|
| 966 |
+
tasks.py # Task pool / curriculum
|
| 967 |
+
app.py # create_app()
|
| 968 |
+
Dockerfile
|
| 969 |
+
|
| 970 |
+
client/ # OpenEnv client + training bridge
|
| 971 |
+
reward_functions.py # code_valid, physically_valid, fold_quality
|
| 972 |
+
|
| 973 |
+
training/ # Colab notebook
|
| 974 |
+
train_origami.ipynb # GRPO training (Unsloth + TRL)
|
| 975 |
+
prompts.py # LLM prompt templates
|
| 976 |
+
```
|
| 977 |
+
|
| 978 |
+
### Implementation Order (from architecture.md)
|
| 979 |
+
1. **Phase 1: Engine** - paper.py, fold_engine.py, validation.py, metrics.py
|
| 980 |
+
2. **Phase 2: OpenEnv Server** - models.py, origami_environment.py, app.py, Dockerfile
|
| 981 |
+
3. **Phase 3: Reward + Training** - reward_functions.py, prompts.py, train_origami.ipynb
|
| 982 |
+
4. **Phase 4: Rendering + Demo** - matplotlib headless, React + R3F app
|
| 983 |
+
|
| 984 |
+
### 2048 Reference Implementation (Key Patterns)
|
| 985 |
+
The `2048_example.py` shows the exact Unsloth + OpenEnv + GRPO pattern:
|
| 986 |
+
- `PatchFastRL` not used (text model, not vision) - for our VLM use `FastVisionModel`
|
| 987 |
+
- `extract_function()` parses code from ```python blocks
|
| 988 |
+
- `create_locked_down_function()` sandboxes execution
|
| 989 |
+
- `check_python_modules()` prevents non-stdlib imports
|
| 990 |
+
- `execute_with_time_limit(5)` wraps strategy execution
|
| 991 |
+
- Dataset: 1000x replicated prompt, `report_to="trackio"`
|
| 992 |
+
- GRPOConfig: temp=1.0, lr=2e-4, max_steps=600, num_generations=2
|
| 993 |
+
- Three reward functions passed as list to `GRPOTrainer`
|
| 994 |
+
|
| 995 |
+
---
|
| 996 |
+
|
| 997 |
+
## 13. Current Project State
|
| 998 |
+
|
| 999 |
+
### Repository
|
| 1000 |
+
- **Location:** HuggingFace Space `openenv-community/optigami`
|
| 1001 |
+
- **Framework:** Create React App (React 19.1.0)
|
| 1002 |
+
- **Status:** Fresh scaffold - default CRA boilerplate
|
| 1003 |
+
- **Build:** `npm run build` -> `build/index.html` (HF Spaces static SDK)
|
| 1004 |
+
|
| 1005 |
+
### File Structure
|
| 1006 |
+
```
|
| 1007 |
+
optigami/
|
| 1008 |
+
package.json # React app dependencies
|
| 1009 |
+
README.md # CRA default + HF Space metadata
|
| 1010 |
+
public/ # Static assets (favicon, manifest)
|
| 1011 |
+
src/
|
| 1012 |
+
App.js # Default CRA component (placeholder)
|
| 1013 |
+
App.css
|
| 1014 |
+
index.js # Entry point
|
| 1015 |
+
index.css
|
| 1016 |
+
logo.svg
|
| 1017 |
+
reportWebVitals.js
|
| 1018 |
+
setupTests.js
|
| 1019 |
+
App.test.js
|
| 1020 |
+
```
|
| 1021 |
+
|
| 1022 |
+
### What Needs to Be Built
|
| 1023 |
+
1. **Python backend** - Paper Geometry Engine with Shapely, FOLD import/export, constraint checking
|
| 1024 |
+
2. **GRPO training scripts** - Unsloth or veRL-based, with origami reward functions
|
| 1025 |
+
3. **Data pipeline** - Load/process OrigamiSpace + GamiBench datasets
|
| 1026 |
+
4. **Three.js frontend** - Replace CRA boilerplate with origami visualizer (possibly integrate OrigamiSimulator)
|
| 1027 |
+
5. **OpenEnv server** - API connecting geometry engine to trainer
|
| 1028 |
+
|
| 1029 |
+
---
|
| 1030 |
+
|
| 1031 |
+
## Key Takeaways for Immediate Work (GRPO Trainer)
|
| 1032 |
+
|
| 1033 |
+
1. **Use Unsloth for simplicity** - 90% VRAM savings, built-in vLLM, QLoRA support for Qwen2.5-VL-7B
|
| 1034 |
+
2. **Dense rewards with lexicographic gating** - format gate -> constraints -> topology -> geometry -> shape match (SpatialThinker pattern)
|
| 1035 |
+
3. **OrigamiSpace's 4-error compiler** is the gold standard for reward signal generation
|
| 1036 |
+
4. **Start with 500+ origami examples** - GamiBench (777) + OrigamiSpace (471) = 1248 examples
|
| 1037 |
+
5. **8 generations per prompt**, temperature 1.0, 300+ training steps minimum
|
| 1038 |
+
6. **Multi-turn: max 10 rounds** with compiler feedback (performance saturates after 8-10)
|
| 1039 |
+
7. **Track with Trackio** - deploy dashboard to HF Spaces for real-time monitoring
|
| 1040 |
+
8. **Evaluate on GamiBench** for standardized comparison against other MLLMs
|
| 1041 |
+
|
| 1042 |
+
---
|
| 1043 |
+
|
| 1044 |
+
## Cross-Reference: Tool Compatibility Matrix
|
| 1045 |
+
|
| 1046 |
+
| Component | FOLD | OrigamiSim | GamiBench | SpatialThinker | Unsloth | Trackio |
|
| 1047 |
+
|-----------|------|------------|-----------|----------------|---------|---------|
|
| 1048 |
+
| State representation | Core | Import | - | - | - | - |
|
| 1049 |
+
| Visualization | Export | Core | - | - | - | - |
|
| 1050 |
+
| Training data | - | - | Core | Augment | - | - |
|
| 1051 |
+
| RL training | - | - | Eval | Template | Core | Monitor |
|
| 1052 |
+
| Reward functions | Validate | Strain | - | Template | Integrate | Log |
|
| 1053 |
+
| Constraint checking | Structure | Physics | Impossible set | - | - | - |
|
trainer/__init__.py
ADDED
|
File without changes
|
trainer/mock_env.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Mock origami environment for trainer development.
|
| 3 |
+
|
| 4 |
+
Returns fake PaperState responses so we can iterate on the GRPO loop
|
| 5 |
+
without waiting for the real physics engine. The mock applies geometric
|
| 6 |
+
transforms (vertex rotations around fold lines) but skips energy/strain
|
| 7 |
+
computation — those return plausible dummy values.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import math
|
| 11 |
+
import numpy as np
|
| 12 |
+
from dataclasses import dataclass, field
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
class Material:
    """Sheet-material parameters attached to a PaperState.

    In this mock module only ``thickness_mm`` is consumed by the fold
    engine (it pads the z-extent of the bounding box, one thickness per
    layer); ``name`` and ``youngs_modulus_gpa`` are only serialized by
    ``PaperState.to_dict``. ``max_strain`` is not referenced anywhere in
    this module — presumably reserved for the real physics engine (TODO
    confirm once that engine lands).
    """

    # Human-readable material identifier (serialized to the strategy dict).
    name: str = "paper"
    # Sheet thickness in millimetres; converted to metres by the mock engine.
    thickness_mm: float = 0.1
    # Stiffness in GPa; serialized only, not used by the mock.
    youngs_modulus_gpa: float = 2.0
    max_strain: float = 0.03  # 3%
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class PaperState:
    """Complete mock state of a (partially) folded sheet.

    Holds the FOLD-style mesh (vertices/edges/faces), crease metadata,
    mock physics quantities, and the summary metrics the reward functions
    read. Instances are treated as immutable snapshots: ``apply_fold_mock``
    builds a new PaperState rather than mutating this one.
    """

    vertices: np.ndarray  # (N, 3) 3D vertex positions
    edges: np.ndarray  # (E, 2) vertex-index pairs
    faces: list[list[int]]  # vertex-index loops, one list per face
    assignments: list[str]  # M/V/B per edge ("F" used for flat diagonals)
    fold_angles: np.ndarray  # (E,) degrees

    rest_lengths: np.ndarray  # (E,) edge lengths in the flat (unfolded) state
    strain: np.ndarray  # (N,) per-vertex strain; mock fills this with random values
    energy: float = 0.0  # accumulated mock fold energy

    # Layer ordering between overlapping faces; never populated by the mock.
    face_orders: list[tuple] = field(default_factory=list)
    num_layers: int = 1  # incremented by one on every applied fold

    material: Material = field(default_factory=Material)

    # (x, y, z) axis-aligned extents of the current geometry.
    bounding_box: np.ndarray = field(default_factory=lambda: np.array([1.0, 1.0, 0.0]))
    deployment_ratio: float = 1.0  # 1.0 = fully flat; folds shrink it toward 0
    is_valid: bool = True
    # Constraint-violation metrics: always zero in the mock; the real engine
    # is expected to populate them (Kawasaki/Maekawa checks, intersection count).
    kawasaki_violation: float = 0.0
    maekawa_violation: float = 0.0
    self_intersections: int = 0

    def to_dict(self) -> dict:
        """Serialize into the JSON-friendly dict handed to fold_strategy().

        Note: ``width``/``height`` are taken from the *current* bounding box,
        not the original sheet dimensions.
        """
        return {
            "width": float(self.bounding_box[0]),
            "height": float(self.bounding_box[1]),
            "material": {
                "name": self.material.name,
                "thickness_mm": self.material.thickness_mm,
                "youngs_modulus_gpa": self.material.youngs_modulus_gpa,
            },
            "vertices": self.vertices.tolist(),
            "edges": self.edges.tolist(),
            "assignments": self.assignments,
            "fold_angles": self.fold_angles.tolist(),
            "num_layers_at_center": self.num_layers,
            "bounding_box": {
                "x": float(self.bounding_box[0]),
                "y": float(self.bounding_box[1]),
                "z": float(self.bounding_box[2]),
            },
        }
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def create_flat_sheet(width: float = 1.0, height: float = 1.0,
                      material: Material | None = None) -> PaperState:
    """Create a flat rectangular sheet with 4 vertices, 5 edges (incl diagonal), 2 faces."""
    # Corner layout: counter-clockwise from the origin, all in the z=0 plane.
    corners = np.array([
        [0.0, 0.0, 0.0],
        [width, 0.0, 0.0],
        [width, height, 0.0],
        [0.0, height, 0.0],
    ])

    # Four boundary edges plus one diagonal that triangulates the quad.
    edge_list = np.array([[0, 1], [1, 2], [2, 3], [3, 0], [0, 2]], dtype=int)

    # Rest lengths computed in one vectorized pass over the edge endpoints.
    lengths = np.linalg.norm(
        corners[edge_list[:, 1]] - corners[edge_list[:, 0]], axis=1
    )

    return PaperState(
        vertices=corners,
        edges=edge_list,
        faces=[[0, 1, 2], [0, 2, 3]],
        assignments=["B", "B", "B", "B", "F"],  # boundary edges + flat diagonal
        fold_angles=np.zeros(len(edge_list)),
        rest_lengths=lengths,
        strain=np.zeros(len(corners)),
        material=material or Material(),
        bounding_box=np.array([width, height, 0.0]),
    )
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def _rotate_points(points: np.ndarray, axis_point: np.ndarray,
                   axis_dir: np.ndarray, angle_rad: float) -> np.ndarray:
    """Rotate an (N, 3) array of points about an arbitrary 3D axis.

    Uses Rodrigues' rotation formula:
        v' = v*cos(a) + (k x v)*sin(a) + k*(k . v)*(1 - cos(a))
    where k is the unit axis direction and the axis passes through
    ``axis_point``.
    """
    unit_axis = axis_dir / np.linalg.norm(axis_dir)
    rel = points - axis_point
    c = math.cos(angle_rad)
    s = math.sin(angle_rad)
    # Component of each point along the axis, as a column for broadcasting.
    axial = np.dot(rel, unit_axis).reshape(-1, 1)
    rotated_rel = rel * c + np.cross(unit_axis, rel) * s + unit_axis * axial * (1 - c)
    return rotated_rel + axis_point
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def apply_fold_mock(state: PaperState, fold: dict) -> tuple[PaperState, str | None]:
    """
    Apply a single fold operation to the paper state (mock version).

    fold = {
        "type": "valley" | "mountain",
        "line": {"start": [x, y], "end": [x, y]},
        "angle": 0-180
    }

    Returns (new_state, error_string_or_None). On any validation error the
    original state is returned unchanged alongside the error message.
    """
    fold_type = fold.get("type", "valley")
    line = fold.get("line", {})
    angle_deg = fold.get("angle", 180)

    start = np.array(line.get("start", [0, 0]), dtype=float)
    end = np.array(line.get("end", [0, 0]), dtype=float)

    if np.allclose(start, end):
        return state, "Fold line has zero length"

    if fold_type not in ("valley", "mountain"):
        return state, f"Unknown fold type: {fold_type}"

    if not (0 < angle_deg <= 180):
        return state, f"Angle must be in (0, 180], got {angle_deg}"

    # Fold direction: valley folds up (+z), mountain folds down (-z)
    sign = 1.0 if fold_type == "valley" else -1.0
    angle_rad = sign * math.radians(angle_deg)

    # Vertices strictly on the positive side of the fold line (w.r.t. the
    # left-hand normal of the line direction) rotate about the line; vertices
    # on the line or on the negative side stay put.
    line_dir_2d = end - start
    normal_2d = np.array([-line_dir_2d[1], line_dir_2d[0]])  # perpendicular

    new_verts = state.vertices.copy()
    # Signed distance proxy for every vertex at once (replaces the previous
    # per-vertex Python loop; _rotate_points is batch-capable).
    side = (new_verts[:, :2] - start) @ normal_2d
    moving = side > 1e-9
    if np.any(moving):
        axis_point = np.array([start[0], start[1], 0.0])
        axis_dir = np.array([line_dir_2d[0], line_dir_2d[1], 0.0])
        new_verts[moving] = _rotate_points(new_verts[moving], axis_point,
                                           axis_dir, angle_rad)

    # Update bounding box (clamp near-zero values from floating point)
    bb = np.ptp(new_verts, axis=0)  # max - min per axis
    bb = np.where(np.abs(bb) < 1e-10, 0.0, bb)
    # Add minimum thickness per layer (material thickness)
    thickness = state.material.thickness_mm / 1000.0  # convert mm to m
    num_layers = state.num_layers + 1
    bb[2] = max(bb[2], thickness * num_layers)

    # Mock strain: small random value per vertex (real engine will compute it)
    new_strain = np.random.uniform(0, 0.01, len(new_verts))

    # Mock energy: grows with fold magnitude
    new_energy = state.energy + 0.1 * angle_deg / 180.0

    # NOTE: the mock keeps the crease graph fixed — no new edge or "M"/"V"
    # assignment is inserted for the fold line. The real engine is expected
    # to split faces and append the new crease here.
    new_assignments = state.assignments.copy()

    # Deployment ratio estimate: each full fold (180°) halves the area in one direction.
    # Partial folds reduce proportionally. This is a mock approximation —
    # the real engine will compute from actual face overlaps.
    fold_factor = angle_deg / 180.0  # 1.0 for full fold, 0.5 for 90°, etc.
    deploy_ratio = state.deployment_ratio * (1.0 - 0.5 * fold_factor)

    new_state = PaperState(
        vertices=new_verts,
        edges=state.edges.copy(),
        faces=state.faces.copy(),
        assignments=new_assignments,
        fold_angles=state.fold_angles.copy(),
        rest_lengths=state.rest_lengths.copy(),
        strain=new_strain,
        energy=new_energy,
        material=state.material,
        bounding_box=bb,
        deployment_ratio=deploy_ratio,
        num_layers=num_layers,
        is_valid=True,
        kawasaki_violation=0.0,
        maekawa_violation=0.0,
        self_intersections=0,
    )
    return new_state, None
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def execute_fold_strategy(strategy_fn, paper_state: PaperState,
                          max_folds: int = 20) -> tuple[PaperState, list[dict], str | None]:
    """
    Run a generated fold_strategy function against the mock environment.

    The strategy receives the paper state as a plain dict and must return a
    list of fold dicts. At most ``max_folds`` folds are applied; any extras
    are silently ignored.

    Returns (final_state, applied_folds, error_or_None).
    """
    # The strategy only ever sees a plain-dict view of the state.
    try:
        requested = strategy_fn(paper_state.to_dict())
    except Exception as e:
        return paper_state, [], f"Strategy function raised: {e}"

    if not isinstance(requested, list):
        return paper_state, [], "Strategy must return a list of fold dicts"

    state = paper_state
    applied: list[dict] = []
    for idx, fold in enumerate(requested[:max_folds]):
        if not isinstance(fold, dict):
            return state, applied, f"Fold {idx} is not a dict"

        state, error = apply_fold_mock(state, fold)
        if error:
            return state, applied, f"Fold {idx} failed: {error}"
        applied.append(fold)

    return state, applied, None
|
trainer/prompts.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Prompt templates for origami fold strategy generation.
|
| 3 |
+
|
| 4 |
+
The LLM receives a task description and paper state, then generates
|
| 5 |
+
a fold_strategy(paper_state) function that returns fold operations.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
SYSTEM_PROMPT = """\
|
| 9 |
+
You are an origami engineer. You design fold patterns for real-world applications \
|
| 10 |
+
like solar panel packing, deployable shelters, and medical stents.
|
| 11 |
+
|
| 12 |
+
You will be given a folding task with material constraints. Write a Python function \
|
| 13 |
+
`fold_strategy(paper_state)` that returns a list of fold operations to achieve the goal.
|
| 14 |
+
|
| 15 |
+
Rules:
|
| 16 |
+
- Only use native Python (no imports except math, itertools, functools)
|
| 17 |
+
- Each fold: {"type": "valley"|"mountain", "line": {"start": [x,y], "end": [x,y]}, "angle": 0-180}
|
| 18 |
+
- Fold lines must intersect the paper boundaries
|
| 19 |
+
- Fewer folds is better (efficiency matters)
|
| 20 |
+
- Respect material strain limits
|
| 21 |
+
- Output ONLY the function in ```python ... ``` backticks\
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
TASK_TEMPLATES = {
|
| 26 |
+
"half_fold": {
|
| 27 |
+
"name": "half_fold",
|
| 28 |
+
"prompt": """\
|
| 29 |
+
TASK: Fold a {width}m x {height}m {material} sheet in half to minimize one dimension.
|
| 30 |
+
|
| 31 |
+
MATERIAL: {material} (thickness: {thickness_mm}mm, max strain: {max_strain_pct}%)
|
| 32 |
+
CONSTRAINTS: Maximum {max_folds} fold operations.
|
| 33 |
+
TARGET: Deployment ratio <= 0.5 (folded area is half or less of original)
|
| 34 |
+
|
| 35 |
+
CURRENT STATE:
|
| 36 |
+
Sheet: {width}m x {height}m, flat (0 folds applied)
|
| 37 |
+
Bounding box: {width}m x {height}m x 0.0m
|
| 38 |
+
|
| 39 |
+
Write a fold_strategy(paper_state) function that returns a list of fold operations.
|
| 40 |
+
Each fold: {{"type": "valley"|"mountain", "line": {{"start": [x,y], "end": [x,y]}}, "angle": 0-180}}
|
| 41 |
+
|
| 42 |
+
```python
|
| 43 |
+
def fold_strategy(paper_state):
|
| 44 |
+
# Your code here
|
| 45 |
+
return [...]
|
| 46 |
+
```""",
|
| 47 |
+
"target_ratio": 0.5,
|
| 48 |
+
"max_folds": 3,
|
| 49 |
+
},
|
| 50 |
+
|
| 51 |
+
"letter_fold": {
|
| 52 |
+
"name": "letter_fold",
|
| 53 |
+
"prompt": """\
|
| 54 |
+
TASK: Fold a {width}m x {height}m {material} sheet into thirds (like a letter).
|
| 55 |
+
|
| 56 |
+
MATERIAL: {material} (thickness: {thickness_mm}mm, max strain: {max_strain_pct}%)
|
| 57 |
+
CONSTRAINTS: Maximum {max_folds} fold operations.
|
| 58 |
+
TARGET: Deployment ratio <= 0.33
|
| 59 |
+
|
| 60 |
+
CURRENT STATE:
|
| 61 |
+
Sheet: {width}m x {height}m, flat (0 folds applied)
|
| 62 |
+
|
| 63 |
+
Write a fold_strategy(paper_state) function that returns a list of fold operations.
|
| 64 |
+
Each fold: {{"type": "valley"|"mountain", "line": {{"start": [x,y], "end": [x,y]}}, "angle": 0-180}}
|
| 65 |
+
|
| 66 |
+
```python
|
| 67 |
+
def fold_strategy(paper_state):
|
| 68 |
+
# Your code here
|
| 69 |
+
return [...]
|
| 70 |
+
```""",
|
| 71 |
+
"target_ratio": 0.33,
|
| 72 |
+
"max_folds": 5,
|
| 73 |
+
},
|
| 74 |
+
|
| 75 |
+
"solar_panel": {
|
| 76 |
+
"name": "solar_panel",
|
| 77 |
+
"prompt": """\
|
| 78 |
+
TASK: Fold a {width}m x {height}m Mylar sheet to minimize packed volume for a solar panel.
|
| 79 |
+
The folded panel must be deployable (unfold cleanly to near-original area).
|
| 80 |
+
|
| 81 |
+
MATERIAL: Mylar (thickness: 0.05mm, Young's modulus: 4 GPa, max strain: 3%)
|
| 82 |
+
CONSTRAINTS:
|
| 83 |
+
- Maximum {max_folds} fold operations
|
| 84 |
+
- Must pack into bounding box <= 15cm x 15cm x 5cm
|
| 85 |
+
- Must deploy to >= 80% of original area
|
| 86 |
+
- No self-intersections
|
| 87 |
+
|
| 88 |
+
TARGET: Deployment ratio <= 0.05 (95% volume reduction)
|
| 89 |
+
|
| 90 |
+
CURRENT STATE:
|
| 91 |
+
Sheet: {width}m x {height}m, flat (0 folds applied)
|
| 92 |
+
Bounding box: {width}m x {height}m x 0.0m
|
| 93 |
+
|
| 94 |
+
HINT: Consider tessellated patterns like Miura-ori — alternating mountain and valley
|
| 95 |
+
folds in a grid create a highly compact, single-DOF deployable structure.
|
| 96 |
+
|
| 97 |
+
Write a fold_strategy(paper_state) function that returns a list of fold operations.
|
| 98 |
+
Each fold: {{"type": "valley"|"mountain", "line": {{"start": [x,y], "end": [x,y]}}, "angle": 0-180}}
|
| 99 |
+
|
| 100 |
+
```python
|
| 101 |
+
def fold_strategy(paper_state):
|
| 102 |
+
# Your code here
|
| 103 |
+
return [...]
|
| 104 |
+
```""",
|
| 105 |
+
"target_ratio": 0.05,
|
| 106 |
+
"max_folds": 20,
|
| 107 |
+
},
|
| 108 |
+
|
| 109 |
+
"stent_fold": {
|
| 110 |
+
"name": "stent_fold",
|
| 111 |
+
"prompt": """\
|
| 112 |
+
TASK: Fold a {width}m x {height}m Nitinol sheet into a compact cylinder for a medical stent.
|
| 113 |
+
|
| 114 |
+
MATERIAL: Nitinol (thickness: 0.1mm, Young's modulus: 75 GPa, max strain: 8%)
|
| 115 |
+
CONSTRAINTS:
|
| 116 |
+
- Maximum {max_folds} fold operations
|
| 117 |
+
- Compressed diameter: 3mm
|
| 118 |
+
- Deployed diameter: 10mm
|
| 119 |
+
- Must be radially deployable
|
| 120 |
+
|
| 121 |
+
TARGET: Minimize packed cross-section while maintaining deployability.
|
| 122 |
+
|
| 123 |
+
Write a fold_strategy(paper_state) function that returns a list of fold operations.
|
| 124 |
+
|
| 125 |
+
```python
|
| 126 |
+
def fold_strategy(paper_state):
|
| 127 |
+
# Your code here
|
| 128 |
+
return [...]
|
| 129 |
+
```""",
|
| 130 |
+
"target_ratio": 0.1,
|
| 131 |
+
"max_folds": 15,
|
| 132 |
+
},
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
# Default task configs for each level
|
| 137 |
+
TASK_CONFIGS = {
|
| 138 |
+
"half_fold": {
|
| 139 |
+
"width": 1.0, "height": 1.0, "material": "paper",
|
| 140 |
+
"thickness_mm": 0.1, "max_strain_pct": 3, "max_folds": 3,
|
| 141 |
+
},
|
| 142 |
+
"letter_fold": {
|
| 143 |
+
"width": 1.0, "height": 1.0, "material": "paper",
|
| 144 |
+
"thickness_mm": 0.1, "max_strain_pct": 3, "max_folds": 5,
|
| 145 |
+
},
|
| 146 |
+
"solar_panel": {
|
| 147 |
+
"width": 1.0, "height": 1.0, "material": "mylar",
|
| 148 |
+
"thickness_mm": 0.05, "max_strain_pct": 3, "max_folds": 20,
|
| 149 |
+
},
|
| 150 |
+
"stent_fold": {
|
| 151 |
+
"width": 0.1, "height": 0.03, "material": "nitinol",
|
| 152 |
+
"thickness_mm": 0.1, "max_strain_pct": 8, "max_folds": 15,
|
| 153 |
+
},
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def build_prompt(task_name: str = "half_fold", **overrides) -> str:
    """Build the full user prompt for ``task_name``.

    Keyword overrides take precedence over the defaults in TASK_CONFIGS.
    """
    params = dict(TASK_CONFIGS[task_name])
    params.update(overrides)
    template = TASK_TEMPLATES[task_name]["prompt"]
    return template.format(**params)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def get_task_target_ratio(task_name: str) -> float:
    """Return the deployment-ratio target for the named task."""
    task = TASK_TEMPLATES[task_name]
    return task["target_ratio"]
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def get_task_max_folds(task_name: str) -> int:
    """Return the fold-count budget for the named task."""
    task = TASK_TEMPLATES[task_name]
    return task["max_folds"]
|
trainer/rewards.py
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Reward functions for origami GRPO training.
|
| 3 |
+
|
| 4 |
+
Three reward functions following the 2048 pattern:
|
| 5 |
+
1. code_valid — Does the generated code parse and produce fold instructions?
|
| 6 |
+
2. physically_valid — Are the folds geometrically/physically valid?
|
| 7 |
+
3. fold_quality — How good is the folding solution (compactness, efficiency)?
|
| 8 |
+
|
| 9 |
+
Lexicographic gating (from SpatialThinker): if code doesn't parse,
|
| 10 |
+
all downstream rewards are 0. This prevents reward hacking.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import ast
|
| 14 |
+
import sys
|
| 15 |
+
import math
|
| 16 |
+
import traceback
|
| 17 |
+
from typing import Callable
|
| 18 |
+
|
| 19 |
+
from trainer.mock_env import (
|
| 20 |
+
PaperState, create_flat_sheet, execute_fold_strategy, Material
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ---------------------------------------------------------------------------
|
| 25 |
+
# Helpers
|
| 26 |
+
# ---------------------------------------------------------------------------
|
| 27 |
+
|
| 28 |
+
def extract_function(text: str) -> str | None:
|
| 29 |
+
"""Extract a Python function from triple-backtick code blocks."""
|
| 30 |
+
if text.count("```") < 2:
|
| 31 |
+
return None
|
| 32 |
+
first = text.find("```") + 3
|
| 33 |
+
second = text.find("```", first)
|
| 34 |
+
fx = text[first:second].strip()
|
| 35 |
+
fx = fx.removeprefix("python\n").removeprefix("python\r\n")
|
| 36 |
+
# Find the def statement
|
| 37 |
+
def_idx = fx.find("def ")
|
| 38 |
+
if def_idx == -1:
|
| 39 |
+
return None
|
| 40 |
+
fx = fx[def_idx:]
|
| 41 |
+
if fx.startswith("def fold_strategy("):
|
| 42 |
+
return fx
|
| 43 |
+
return None
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def check_imports_stdlib_only(code: str) -> tuple[bool, str]:
    """Statically verify that ``code`` only imports whitelisted stdlib modules.

    Returns (ok, info): info is "ok" on success, or a human-readable reason
    ("syntax error: ..." / "non-stdlib import: ...") on failure.

    Relative imports (``from . import x``) are rejected explicitly: they carry
    ``module=None`` in the AST, which would otherwise bypass the check.
    """
    try:
        tree = ast.parse(code)
    except SyntaxError as e:
        return False, f"syntax error: {e}"

    # Keep in sync with the sandbox importer in create_sandboxed_function.
    ALLOWED_MODULES = {
        "math", "itertools", "functools", "collections", "copy",
        "operator", "typing", "random", "heapq", "bisect",
    }

    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                root = alias.name.split(".")[0]
                if root not in ALLOWED_MODULES:
                    return False, f"non-stdlib import: {alias.name}"
        elif isinstance(node, ast.ImportFrom):
            # level > 0 means a relative import; module may be None, and a
            # relative import is never a stdlib module.
            if node.level:
                return False, f"non-stdlib import: relative (level={node.level})"
            if node.module:
                root = node.module.split(".")[0]
                if root not in ALLOWED_MODULES:
                    return False, f"non-stdlib import: {node.module}"

    return True, "ok"
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def create_sandboxed_function(code: str) -> Callable:
    """
    Execute generated function code in a restricted namespace and return the
    fold_strategy function object.

    Only a whitelist of builtins is exposed, and ``__import__`` is wrapped so
    a runtime ``__import__("os")`` call is rejected too — the AST-level check
    in check_imports_stdlib_only cannot see dynamic imports, so this is the
    second line of defense.

    Raises ValueError if the code defines no fold_strategy; exceptions raised
    by the code itself propagate to the caller.
    """
    import builtins as _builtins

    allowed_builtins = {
        "range", "len", "int", "float", "str", "list", "dict", "tuple",
        "set", "bool", "abs", "min", "max", "sum", "sorted", "reversed",
        "enumerate", "zip", "map", "filter", "round", "isinstance",
        "True", "False", "None", "print",
    }
    safe_builtins = {
        name: getattr(_builtins, name)
        for name in allowed_builtins
        if hasattr(_builtins, name)
    }

    # Keep in sync with ALLOWED_MODULES in check_imports_stdlib_only.
    allowed_modules = {
        "math", "itertools", "functools", "collections", "copy",
        "operator", "typing", "random", "heapq", "bisect",
    }

    def _limited_import(name, globals=None, locals=None, fromlist=(), level=0):
        # Reject relative imports and anything outside the stdlib whitelist,
        # including dynamic __import__("os")-style escapes.
        if level != 0 or name.split(".")[0] not in allowed_modules:
            raise ImportError(f"import of {name!r} is not allowed in the sandbox")
        return _builtins.__import__(name, globals, locals, fromlist, level)

    safe_builtins["__import__"] = _limited_import

    namespace = {"__builtins__": safe_builtins}
    # Deliberately executing (pre-screened) model output inside the sandbox.
    exec(code, namespace)

    if "fold_strategy" not in namespace:
        raise ValueError("No fold_strategy function defined")

    return namespace["fold_strategy"]
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# ---------------------------------------------------------------------------
|
| 101 |
+
# State for strategy execution
|
| 102 |
+
# ---------------------------------------------------------------------------
|
| 103 |
+
|
| 104 |
+
# Current task config (set by train.py before training starts)
|
| 105 |
+
_current_task = {
    "width": 1.0,            # sheet width in metres
    "height": 1.0,           # sheet height in metres
    "material": Material(),  # default material properties
    "target_ratio": 0.5,     # deployment ratio the task must reach
    "max_folds": 3,          # hard cap on folds applied per strategy
}

# fold_quality prints one sampled strategy every PRINT_EVERY evaluations;
# _print_counter is the shared counter it increments.
PRINT_EVERY = 5
_print_counter = 0
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def set_task_config(width=1.0, height=1.0, material=None,
                    target_ratio=0.5, max_folds=3):
    """Replace the module-level task configuration read by the reward functions.

    Called once by train.py before training starts. ``material`` defaults to
    a fresh Material() when not supplied.
    """
    global _current_task
    _current_task = dict(
        width=width,
        height=height,
        material=material or Material(),
        target_ratio=target_ratio,
        max_folds=max_folds,
    )
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
# ---------------------------------------------------------------------------
|
| 130 |
+
# Reward 1: code_valid
|
| 131 |
+
# ---------------------------------------------------------------------------
|
| 132 |
+
|
| 133 |
+
def code_valid(completions, **kwargs) -> list[float]:
    """
    Reward 1: does the completion contain valid, executable Python?

    +1.0  — fold_strategy extracted and instantiated in the sandbox
    -0.5  — correct structure but exec/sandbox fails
    -2.0  — no function found or syntax error
    -20.0 — non-stdlib imports (heavy penalty)
    """
    def grade(text: str) -> float:
        code = extract_function(text)
        if code is None:
            return -2.0

        ok, info = check_imports_stdlib_only(code)
        if not ok:
            # Syntax errors get the same mild penalty as a missing function;
            # forbidden imports are punished far harder.
            return -2.0 if "syntax error" in info else -20.0

        try:
            create_sandboxed_function(code)
        except Exception:
            return -0.5
        return 1.0

    return [grade(completion[0]["content"]) for completion in completions]
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
# ---------------------------------------------------------------------------
|
| 169 |
+
# Reward 2: physically_valid
|
| 170 |
+
# ---------------------------------------------------------------------------
|
| 171 |
+
|
| 172 |
+
def physically_valid(completions, **kwargs) -> list[float]:
    """
    Reward 2: are the folds physically possible?

    +1.0 — all folds valid, no violations
    -2.0 — per Kawasaki/Maekawa violation
    -5.0 — any self-intersection
    -1.0 — strain exceeds material limit
    0.0  — function broken / can't run
    """
    def grade(text: str) -> float:
        code = extract_function(text)
        if code is None:
            return 0.0

        ok, _ = check_imports_stdlib_only(code)
        if not ok:
            return 0.0

        try:
            strategy_fn = create_sandboxed_function(code)
        except Exception:
            return 0.0

        try:
            sheet = create_flat_sheet(
                _current_task["width"],
                _current_task["height"],
                _current_task["material"],
            )
            final_state, applied, error = execute_fold_strategy(
                strategy_fn, sheet, _current_task["max_folds"]
            )
            if error or not applied:
                return 0.0

            # Start from a perfect score and subtract per violation class.
            total = 1.0
            total -= 2.0 * final_state.kawasaki_violation
            total -= 2.0 * final_state.maekawa_violation
            if final_state.self_intersections > 0:
                total -= 5.0
            peak_strain = (float(final_state.strain.max())
                           if len(final_state.strain) > 0 else 0.0)
            if peak_strain > _current_task["material"].max_strain:
                total -= 1.0
            return total

        except TimeoutError:
            return -1.0
        except Exception:
            return 0.0

    return [grade(completion[0]["content"]) for completion in completions]
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
# ---------------------------------------------------------------------------
|
| 241 |
+
# Reward 3: fold_quality
|
| 242 |
+
# ---------------------------------------------------------------------------
|
| 243 |
+
|
| 244 |
+
def fold_quality(completions, **kwargs) -> list[float]:
    """
    How good is the folding solution?

    +20.0 * compactness — main reward (1 - deployment_ratio)
    +10.0 bonus — if meets target ratio
    -0.5 per fold — efficiency penalty
    -3.0 * overstrain — material stress penalty
    -1.0 — timeout
    -3.0 — exception
    0.0 — function broken
    """
    global _print_counter
    scores = []

    for completion in completions:
        response = completion[0]["content"]
        function_code = extract_function(response)

        # Periodic debug printing: one sample out of every PRINT_EVERY
        # evaluations gets its strategy and metrics dumped to stdout.
        should_print = (_print_counter % PRINT_EVERY == 0)
        _print_counter += 1

        if should_print:
            print(f"\n--- Strategy (sample {_print_counter}) ---")
            print(function_code if function_code else "[no function extracted]")

        # Lexicographic gating: unparseable or forbidden code earns 0 here
        # (code_valid already applied its own penalty).
        if function_code is None:
            scores.append(0.0)
            continue

        ok, info = check_imports_stdlib_only(function_code)
        if not ok:
            scores.append(0.0)
            continue

        try:
            strategy_fn = create_sandboxed_function(function_code)
        except Exception:
            scores.append(0.0)
            continue

        try:
            paper = create_flat_sheet(
                _current_task["width"],
                _current_task["height"],
                _current_task["material"],
            )
            final_state, applied, error = execute_fold_strategy(
                strategy_fn, paper, _current_task["max_folds"]
            )

            if error:
                if should_print:
                    print(f"Error: {error}")
                scores.append(0.0)
                continue

            num_folds = len(applied)
            if num_folds == 0:
                scores.append(0.0)
                continue

            # Compactness: main reward signal
            compactness = 1.0 - final_state.deployment_ratio
            score = 20.0 * compactness

            # Bonus for meeting target
            if final_state.deployment_ratio <= _current_task["target_ratio"]:
                score += 10.0

            # Fold efficiency penalty
            score -= 0.5 * num_folds

            # Strain penalty: scales with how far past the material limit we are
            max_strain = float(final_state.strain.max()) if len(final_state.strain) > 0 else 0.0
            mat_limit = _current_task["material"].max_strain
            if max_strain > mat_limit:
                score -= 3.0 * (max_strain / mat_limit)

            if should_print:
                print(f"Folds: {num_folds}, Ratio: {final_state.deployment_ratio:.3f}, "
                      f"Compactness: {compactness:.3f}, Score: {score:.2f}")
                bb = final_state.bounding_box
                print(f"BBox: {bb[0]:.3f} x {bb[1]:.3f} x {bb[2]:.3f}")

            scores.append(score)

        except TimeoutError:
            if should_print:
                print("Timeout!")
            scores.append(-1.0)
        except Exception as e:
            if should_print:
                print(f"Exception: {e}")
            scores.append(-3.0)

    return scores
|
trainer/train.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Origami GRPO Training Script
|
| 3 |
+
|
| 4 |
+
Usage (Colab with T4/A100):
|
| 5 |
+
python trainer/train.py
|
| 6 |
+
|
| 7 |
+
Or in a notebook:
|
| 8 |
+
%run trainer/train.py
|
| 9 |
+
|
| 10 |
+
Requires: unsloth, trl>=0.22.2, transformers>=4.56.2, trackio, datasets
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import os
|
| 14 |
+
import sys
|
| 15 |
+
|
| 16 |
+
# Ensure project root is on path
|
| 17 |
+
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 18 |
+
if PROJECT_ROOT not in sys.path:
|
| 19 |
+
sys.path.insert(0, PROJECT_ROOT)
|
| 20 |
+
|
| 21 |
+
from trainer.prompts import build_prompt, SYSTEM_PROMPT, get_task_target_ratio, get_task_max_folds
|
| 22 |
+
from trainer.rewards import code_valid, physically_valid, fold_quality, set_task_config
|
| 23 |
+
from trainer.mock_env import Material
|
| 24 |
+
|
| 25 |
+
# ============================================================================
|
| 26 |
+
# Config
|
| 27 |
+
# ============================================================================
|
| 28 |
+
|
| 29 |
+
MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct"
|
| 30 |
+
MAX_SEQ_LENGTH = 2048
|
| 31 |
+
LORA_RANK = 4
|
| 32 |
+
|
| 33 |
+
# Start with the simplest task
|
| 34 |
+
TASK_NAME = "half_fold"
|
| 35 |
+
|
| 36 |
+
# GRPO hyperparameters (from 2048 reference, adapted for origami)
|
| 37 |
+
LEARNING_RATE = 2e-4
|
| 38 |
+
MAX_STEPS = 600
|
| 39 |
+
NUM_GENERATIONS = 2
|
| 40 |
+
TEMPERATURE = 1.0
|
| 41 |
+
BATCH_SIZE = 1
|
| 42 |
+
GRAD_ACCUM = 1
|
| 43 |
+
DATASET_SIZE = 1000 # replicated prompt dataset
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def main():
|
| 47 |
+
# ========================================================================
|
| 48 |
+
# 1. Load model with Unsloth
|
| 49 |
+
# ========================================================================
|
| 50 |
+
from unsloth import FastLanguageModel
|
| 51 |
+
|
| 52 |
+
print(f"Loading model: {MODEL_NAME}")
|
| 53 |
+
model, tokenizer = FastLanguageModel.from_pretrained(
|
| 54 |
+
model_name=MODEL_NAME,
|
| 55 |
+
load_in_4bit=True,
|
| 56 |
+
max_seq_length=MAX_SEQ_LENGTH,
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
# ========================================================================
|
| 60 |
+
# 2. Apply LoRA adapters
|
| 61 |
+
# ========================================================================
|
| 62 |
+
model = FastLanguageModel.get_peft_model(
|
| 63 |
+
model,
|
| 64 |
+
r=LORA_RANK,
|
| 65 |
+
target_modules=[
|
| 66 |
+
"q_proj", "k_proj", "v_proj", "o_proj",
|
| 67 |
+
"gate_proj", "up_proj", "down_proj",
|
| 68 |
+
],
|
| 69 |
+
lora_alpha=LORA_RANK * 2,
|
| 70 |
+
use_gradient_checkpointing="unsloth",
|
| 71 |
+
random_state=3407,
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
# ========================================================================
|
| 75 |
+
# 3. Build prompt and dataset
|
| 76 |
+
# ========================================================================
|
| 77 |
+
user_prompt = build_prompt(TASK_NAME)
|
| 78 |
+
target_ratio = get_task_target_ratio(TASK_NAME)
|
| 79 |
+
max_folds = get_task_max_folds(TASK_NAME)
|
| 80 |
+
|
| 81 |
+
# Configure reward functions with task parameters
|
| 82 |
+
set_task_config(
|
| 83 |
+
width=1.0,
|
| 84 |
+
height=1.0,
|
| 85 |
+
material=Material(),
|
| 86 |
+
target_ratio=target_ratio,
|
| 87 |
+
max_folds=max_folds,
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
# Create replicated prompt dataset (same pattern as 2048)
|
| 91 |
+
from datasets import Dataset
|
| 92 |
+
|
| 93 |
+
dataset = Dataset.from_list([
|
| 94 |
+
{
|
| 95 |
+
"prompt": [
|
| 96 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 97 |
+
{"role": "user", "content": user_prompt},
|
| 98 |
+
],
|
| 99 |
+
}
|
| 100 |
+
] * DATASET_SIZE)
|
| 101 |
+
|
| 102 |
+
# Calculate prompt token length for max_completion_length
|
| 103 |
+
prompt_tokens = tokenizer.apply_chat_template(
|
| 104 |
+
[
|
| 105 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 106 |
+
{"role": "user", "content": user_prompt},
|
| 107 |
+
],
|
| 108 |
+
add_generation_prompt=True,
|
| 109 |
+
tokenize=True,
|
| 110 |
+
)
|
| 111 |
+
max_prompt_length = len(prompt_tokens) + 1
|
| 112 |
+
max_completion_length = MAX_SEQ_LENGTH - max_prompt_length
|
| 113 |
+
print(f"Prompt tokens: {max_prompt_length}, Max completion: {max_completion_length}")
|
| 114 |
+
|
| 115 |
+
# ========================================================================
|
| 116 |
+
# 4. Test inference before training
|
| 117 |
+
# ========================================================================
|
| 118 |
+
print("\n=== Pre-training inference test ===")
|
| 119 |
+
text = tokenizer.apply_chat_template(
|
| 120 |
+
[
|
| 121 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 122 |
+
{"role": "user", "content": user_prompt},
|
| 123 |
+
],
|
| 124 |
+
tokenize=False,
|
| 125 |
+
add_generation_prompt=True,
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
from transformers import TextStreamer
|
| 129 |
+
_ = model.generate(
|
| 130 |
+
**tokenizer(text, return_tensors="pt").to("cuda"),
|
| 131 |
+
temperature=TEMPERATURE,
|
| 132 |
+
max_new_tokens=min(512, max_completion_length),
|
| 133 |
+
streamer=TextStreamer(tokenizer, skip_prompt=True),
|
| 134 |
+
)
|
| 135 |
+
|
| 136 |
+
# ========================================================================
|
| 137 |
+
# 5. Configure GRPO training
|
| 138 |
+
# ========================================================================
|
| 139 |
+
from trl import GRPOConfig, GRPOTrainer
|
| 140 |
+
|
| 141 |
+
training_args = GRPOConfig(
|
| 142 |
+
temperature=TEMPERATURE,
|
| 143 |
+
learning_rate=LEARNING_RATE,
|
| 144 |
+
weight_decay=0.001,
|
| 145 |
+
warmup_ratio=0.1,
|
| 146 |
+
lr_scheduler_type="linear",
|
| 147 |
+
optim="adamw_8bit",
|
| 148 |
+
logging_steps=1,
|
| 149 |
+
per_device_train_batch_size=BATCH_SIZE,
|
| 150 |
+
gradient_accumulation_steps=GRAD_ACCUM,
|
| 151 |
+
num_generations=NUM_GENERATIONS,
|
| 152 |
+
max_prompt_length=max_prompt_length,
|
| 153 |
+
max_completion_length=max_completion_length,
|
| 154 |
+
max_steps=MAX_STEPS,
|
| 155 |
+
save_steps=100,
|
| 156 |
+
report_to="trackio",
|
| 157 |
+
output_dir="outputs",
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
# ========================================================================
|
| 161 |
+
# 6. Create trainer and start training
|
| 162 |
+
# ========================================================================
|
| 163 |
+
trainer = GRPOTrainer(
|
| 164 |
+
model=model,
|
| 165 |
+
processing_class=tokenizer,
|
| 166 |
+
reward_funcs=[
|
| 167 |
+
code_valid, # Reward 1: valid Python?
|
| 168 |
+
physically_valid, # Reward 2: physically possible folds?
|
| 169 |
+
fold_quality, # Reward 3: how good is the solution?
|
| 170 |
+
],
|
| 171 |
+
args=training_args,
|
| 172 |
+
train_dataset=dataset,
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
print(f"\n=== Starting GRPO training: {TASK_NAME} ===")
|
| 176 |
+
print(f"Steps: {MAX_STEPS}, Generations: {NUM_GENERATIONS}, LR: {LEARNING_RATE}")
|
| 177 |
+
trainer.train()
|
| 178 |
+
|
| 179 |
+
# ========================================================================
|
| 180 |
+
# 7. Post-training inference
|
| 181 |
+
# ========================================================================
|
| 182 |
+
print("\n=== Post-training inference ===")
|
| 183 |
+
_ = model.generate(
|
| 184 |
+
**tokenizer(text, return_tensors="pt").to("cuda"),
|
| 185 |
+
temperature=TEMPERATURE,
|
| 186 |
+
max_new_tokens=min(1024, max_completion_length),
|
| 187 |
+
streamer=TextStreamer(tokenizer, skip_prompt=True),
|
| 188 |
+
)
|
| 189 |
+
|
| 190 |
+
# ========================================================================
|
| 191 |
+
# 8. Save model (optional)
|
| 192 |
+
# ========================================================================
|
| 193 |
+
save_path = "outputs/origami-fold-lora"
|
| 194 |
+
print(f"\nSaving LoRA adapter to {save_path}")
|
| 195 |
+
model.save_pretrained(save_path)
|
| 196 |
+
tokenizer.save_pretrained(save_path)
|
| 197 |
+
print("Done!")
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
# Script entry point: run the full GRPO training pipeline (defined in main()
# above) only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
|