Refresh with step119432 resume checkpoint
Browse files
- COMMANDS.txt +4 -3
- COMMANDS_resumed.txt +4 -3
- ENVIRONMENT.txt +4 -4
- MANIFEST.txt +4 -0
- README.md +58 -134
- README.md.bak +165 -0
- all_config.yaml +4 -4
- dataset-metadata.json +9 -0
- model.ckpt +2 -2
COMMANDS.txt
CHANGED
|
@@ -3,7 +3,8 @@ python3 -m torch.distributed.run --nproc_per_node 8 --rdzv_backend=c10d --rdzv_e
|
|
| 3 |
data_paths="[data/arc2concept-aug-1000]" \
|
| 4 |
arch.L_layers=2 \
|
| 5 |
arch.H_cycles=3 arch.L_cycles=4 \
|
| 6 |
-
+run_name=
|
| 7 |
checkpoint_every_eval=True \
|
| 8 |
-
epochs=
|
| 9 |
-
|
|
|
|
|
|
| 3 |
data_paths="[data/arc2concept-aug-1000]" \
|
| 4 |
arch.L_layers=2 \
|
| 5 |
arch.H_cycles=3 arch.L_cycles=4 \
|
| 6 |
+
+run_name=trm_arc2_8gpu_resume_step115815_plus100k_v2 ema=True \
|
| 7 |
checkpoint_every_eval=True \
|
| 8 |
+
epochs=24000 eval_interval=100 \
|
| 9 |
+
global_batch_size=768 \
|
| 10 |
+
+load_checkpoint="/workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815"
|
COMMANDS_resumed.txt
CHANGED
|
@@ -3,7 +3,8 @@ python3 -m torch.distributed.run --nproc_per_node 8 --rdzv_backend=c10d --rdzv_e
|
|
| 3 |
data_paths="[data/arc2concept-aug-1000]" \
|
| 4 |
arch.L_layers=2 \
|
| 5 |
arch.H_cycles=3 arch.L_cycles=4 \
|
| 6 |
-
+run_name=
|
| 7 |
checkpoint_every_eval=True \
|
| 8 |
-
epochs=
|
| 9 |
-
|
|
|
|
|
|
| 3 |
data_paths="[data/arc2concept-aug-1000]" \
|
| 4 |
arch.L_layers=2 \
|
| 5 |
arch.H_cycles=3 arch.L_cycles=4 \
|
| 6 |
+
+run_name=trm_arc2_8gpu_resume_step115815_plus100k_v2 ema=True \
|
| 7 |
checkpoint_every_eval=True \
|
| 8 |
+
epochs=24000 eval_interval=100 \
|
| 9 |
+
global_batch_size=768 \
|
| 10 |
+
+load_checkpoint="/workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815"
|
ENVIRONMENT.txt
CHANGED
|
@@ -21,20 +21,20 @@ arch:
|
|
| 21 |
beta1: 0.9
|
| 22 |
beta2: 0.95
|
| 23 |
checkpoint_every_eval: true
|
| 24 |
-
checkpoint_path: checkpoints/Arc2concept-aug-1000-ACT-torch/
|
| 25 |
data_paths:
|
| 26 |
- data/arc2concept-aug-1000
|
| 27 |
data_paths_test: []
|
| 28 |
ema: true
|
| 29 |
ema_rate: 0.999
|
| 30 |
-
epochs:
|
| 31 |
eval_interval: 100
|
| 32 |
eval_save_outputs: []
|
| 33 |
evaluators:
|
| 34 |
- name: arc@ARC
|
| 35 |
freeze_weights: false
|
| 36 |
global_batch_size: 768
|
| 37 |
-
load_checkpoint: checkpoints/Arc2concept-aug-1000-ACT-torch/
|
| 38 |
lr: 0.0001
|
| 39 |
lr_min_ratio: 1.0
|
| 40 |
lr_warmup_steps: 2000
|
|
@@ -42,6 +42,6 @@ min_eval_interval: 0
|
|
| 42 |
project_name: Arc2concept-aug-1000-ACT-torch
|
| 43 |
puzzle_emb_lr: 0.01
|
| 44 |
puzzle_emb_weight_decay: 0.1
|
| 45 |
-
run_name:
|
| 46 |
seed: 0
|
| 47 |
weight_decay: 0.1
|
|
|
|
| 21 |
beta1: 0.9
|
| 22 |
beta2: 0.95
|
| 23 |
checkpoint_every_eval: true
|
| 24 |
+
checkpoint_path: checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_step115815_plus100k_v2
|
| 25 |
data_paths:
|
| 26 |
- data/arc2concept-aug-1000
|
| 27 |
data_paths_test: []
|
| 28 |
ema: true
|
| 29 |
ema_rate: 0.999
|
| 30 |
+
epochs: 24000
|
| 31 |
eval_interval: 100
|
| 32 |
eval_save_outputs: []
|
| 33 |
evaluators:
|
| 34 |
- name: arc@ARC
|
| 35 |
freeze_weights: false
|
| 36 |
global_batch_size: 768
|
| 37 |
+
load_checkpoint: /workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815
|
| 38 |
lr: 0.0001
|
| 39 |
lr_min_ratio: 1.0
|
| 40 |
lr_warmup_steps: 2000
|
|
|
|
| 42 |
project_name: Arc2concept-aug-1000-ACT-torch
|
| 43 |
puzzle_emb_lr: 0.01
|
| 44 |
puzzle_emb_weight_decay: 0.1
|
| 45 |
+
run_name: trm_arc2_8gpu_resume_step115815_plus100k_v2
|
| 46 |
seed: 0
|
| 47 |
weight_decay: 0.1
|
MANIFEST.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CHECKPOINT_STEP=119432
|
| 2 |
+
CHECKPOINT_SOURCE=checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_step115815_plus100k_v2/step_119432
|
| 3 |
+
PACKAGED_AT=2025-10-28T23:20:00Z
|
| 4 |
+
SHA256=2bc8bb3a5a85cd73e169a6fd285f9138427db894bd157edc20e92a58ed8ee33e
|
README.md
CHANGED
|
@@ -7,171 +7,95 @@ tags:
|
|
| 7 |
- program-synthesis
|
| 8 |
- tiny-recursive-models
|
| 9 |
- recursive-reasoning
|
| 10 |
-
-
|
| 11 |
- act
|
| 12 |
- reproducibility
|
| 13 |
datasets:
|
| 14 |
- arc-prize-2025
|
| 15 |
model-index:
|
| 16 |
-
- name: Tiny Recursive Models — ARC-AGI-2
|
| 17 |
results:
|
| 18 |
- task:
|
| 19 |
type: program-synthesis
|
| 20 |
-
name: ARC Prize 2025
|
| 21 |
dataset:
|
| 22 |
name: ARC Prize 2025 Public Evaluation
|
| 23 |
type: arc-prize-2025
|
| 24 |
split: evaluation
|
| 25 |
metrics:
|
|
|
|
|
|
|
|
|
|
| 26 |
- type: accuracy
|
| 27 |
name: ARC Task Solve Rate (pass@2)
|
| 28 |
-
value: 0.
|
| 29 |
- type: accuracy
|
| 30 |
-
name: ARC Task Solve Rate (pass@
|
| 31 |
-
value: 0.
|
| 32 |
- type: accuracy
|
| 33 |
-
name: pass@
|
| 34 |
-
value: 0.
|
| 35 |
---
|
| 36 |
|
| 37 |
-
# Tiny Recursive Models — ARC-AGI-2 (8
|
| 38 |
-
|
| 39 |
-
**Abstract.** This release packages the complete paper-faithful Tiny Recursive Models (TRM) checkpoint achieving **2.92% task solve rate (pass@2)** on ARC-AGI-2, the official ARC Prize 2025 competition metric. The model was trained for the full 100,000 steps (step counter displays 72,385 due to training restarts). With increased sampling, the model achieves 8.19% at pass@100. The repository bundles the model weights, Hydra configs, training commands, and Weights & Biases metrics so researchers can reproduce ARC Prize 2025 evaluations or fine-tune TRM for downstream ARC-style reasoning tasks.
|
| 40 |
|
| 41 |
-
**
|
| 42 |
|
| 43 |
-
**
|
| 44 |
|
| 45 |
-
##
|
| 46 |
-
- **
|
| 47 |
-
|
| 48 |
-
- **
|
| 49 |
-
- **
|
| 50 |
-
- **
|
| 51 |
-
- **
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
| 54 |
|
| 55 |
## Files Included
|
| 56 |
| Path | Description |
|
| 57 |
| --- | --- |
|
| 58 |
-
| `model.ckpt` | PyTorch checkpoint (
|
| 59 |
-
| `
|
| 60 |
-
| `
|
| 61 |
-
| `
|
| 62 |
-
| `TRM_COMMIT.txt` |
|
| 63 |
-
| `all_config.yaml` |
|
| 64 |
-
| `
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
- **
|
| 69 |
-
- **
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
##
|
| 74 |
-
- **Data**: `data/arc2concept-aug-1000` constructed via `python -m dataset.build_arc_dataset --subsets training2 evaluation2 concept --test-set-name evaluation2`.
|
| 75 |
-
- **Hardware**: 8× NVIDIA H100 (80 GB) GPUs, torch distributed launch with gradient accumulation to reach batch size 768.
|
| 76 |
-
- **Precision**: Mixed bfloat16 compute with fp32 master weights; EMA enabled (`ema_rate=0.999`).
|
| 77 |
-
- **Duration**: 72,385 optimizer steps (~85,900 s runtime) from resume checkpoint `step_62976`.
|
| 78 |
-
- **Scheduler**: Constant LR 1e-4 (warmup complete at resume), cosine decay disabled (`lr_min_ratio=1.0`).
|
| 79 |
-
|
| 80 |
-
### Key Training Metrics (Weights & Biases)
|
| 81 |
-
- `all/accuracy`: **0.704**
|
| 82 |
-
- `all/lm_loss`: **1.70**
|
| 83 |
-
- `all/q_halt_accuracy`: **0.799**
|
| 84 |
-
- `ARC/pass@1`: **1.67 %**
|
| 85 |
-
- `ARC/pass@10`: **5.83 %**
|
| 86 |
-
- `ARC/pass@100`: **8.19 %**
|
| 87 |
-
- `ARC/pass@1000`: **13.75 %**
|
| 88 |
-
|
| 89 |
-
## Evaluation
|
| 90 |
-
|
| 91 |
-
### ARC-AGI-2 Task Solve Rates
|
| 92 |
-
**These are the real puzzle-solving performance metrics:**
|
| 93 |
-
- **pass@1**: 1.67% (single attempt per task)
|
| 94 |
-
- **pass@2**: **2.92%** (official ARC Prize 2025 competition metric)
|
| 95 |
-
- **pass@10**: 5.83%
|
| 96 |
-
- **pass@100**: 8.19%
|
| 97 |
-
- **pass@1000**: 13.75%
|
| 98 |
-
|
| 99 |
-
### Model-Level Metrics
|
| 100 |
-
**These measure internal model behavior, not task success:**
|
| 101 |
-
- Token-level accuracy: 62.83% (not indicative of puzzle-solving)
|
| 102 |
-
- LM Loss: 2.0186
|
| 103 |
-
- Halt accuracy: 90.7% (ACT controller stopping mechanism)
|
| 104 |
-
|
| 105 |
-
### Evaluation Details
|
| 106 |
-
- Evaluator script: `TinyRecursiveModels/evaluators/arc.py` with default two-attempt submission writer
|
| 107 |
-
- Submission artifact: `/kaggle/working/trm_eval_outputs/evaluator_ARC_step_72385/submission.json`
|
| 108 |
-
|
| 109 |
-
## How to Use
|
| 110 |
-
Install TinyRecursiveModels (commit above) and load the checkpoint via PyTorch:
|
| 111 |
-
|
| 112 |
-
```python
|
| 113 |
-
from pathlib import Path
|
| 114 |
-
import torch
|
| 115 |
-
|
| 116 |
-
from recursive_reasoning.trm import TinyRecursiveReasoningModel_ACTV1
|
| 117 |
-
from recursive_reasoning.utils.checkpoint import load_trm_checkpoint
|
| 118 |
-
|
| 119 |
-
def load_trm(weights_path: str):
|
| 120 |
-
ckpt = torch.load(weights_path, map_location="cpu")
|
| 121 |
-
model_cfg = ckpt["hyperparameters"]["arch"]
|
| 122 |
-
model = TinyRecursiveReasoningModel_ACTV1(**model_cfg)
|
| 123 |
-
load_trm_checkpoint(model, ckpt, strict=True)
|
| 124 |
-
model.eval()
|
| 125 |
-
return model
|
| 126 |
-
|
| 127 |
-
weights = Path("model.ckpt") # replace with hf_hub_download path if needed
|
| 128 |
-
model = load_trm(weights)
|
| 129 |
-
```
|
| 130 |
-
|
| 131 |
-
To fetch the checkpoint programmatically:
|
| 132 |
-
|
| 133 |
```python
|
| 134 |
from huggingface_hub import hf_hub_download
|
|
|
|
| 135 |
|
| 136 |
-
ckpt_path = hf_hub_download(
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
repo_type="model",
|
| 140 |
-
)
|
| 141 |
-
```
|
| 142 |
-
|
| 143 |
-
For Kaggle inference, reuse `kaggle/trm_arc2_inference_notebook.py` (packaged separately) and replace the dataset mount with `hf_hub_download`.
|
| 144 |
-
|
| 145 |
-
## Reproducibility Checklist
|
| 146 |
-
- ✅ ARC-AGI-2 data builder command versioned in repository.
|
| 147 |
-
- ✅ Training invocation and config saved (`COMMANDS.txt`, `COMMANDS_resumed.txt`, `ENVIRONMENT.txt`, `all_config.yaml`).
|
| 148 |
-
- ✅ Upstream commit recorded (`TRM_COMMIT.txt`).
|
| 149 |
-
- ✅ W&B metrics exported for independent verification.
|
| 150 |
-
- ✅ Checkpoint archive (`step_72385.zip`) matches `model.ckpt` contents (torch + EMA).
|
| 151 |
-
|
| 152 |
-
## Citation & Acknowledgements
|
| 153 |
-
If you use this model, please cite the Tiny Recursive Models paper and the ARC Prize competition:
|
| 154 |
-
|
| 155 |
```
|
| 156 |
-
@inproceedings{shridhar2025trm,
|
| 157 |
-
title = {Tiny Recursive Models},
|
| 158 |
-
author = {Shridhar, Mohit and others},
|
| 159 |
-
year = {2025},
|
| 160 |
-
booktitle = {arXiv preprint arXiv:2502.12345}
|
| 161 |
-
}
|
| 162 |
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
- **Bias**: The ARC-AGI corpus reflects synthetic puzzle distributions; extrapolation to human-generated tasks may degrade.
|
| 174 |
-
- **Safety**: No harmful content is generated, but downstream automation (e.g., code execution) should be sandboxed.
|
| 175 |
-
- **Data Privacy**: Training and evaluation use public ARC datasets; no personal data involved.
|
| 176 |
|
| 177 |
-
|
|
|
|
| 7 |
- program-synthesis
|
| 8 |
- tiny-recursive-models
|
| 9 |
- recursive-reasoning
|
| 10 |
+
- resume-training
|
| 11 |
- act
|
| 12 |
- reproducibility
|
| 13 |
datasets:
|
| 14 |
- arc-prize-2025
|
| 15 |
model-index:
|
| 16 |
+
- name: Tiny Recursive Models — ARC-AGI-2 (Resume Step 119432)
|
| 17 |
results:
|
| 18 |
- task:
|
| 19 |
type: program-synthesis
|
| 20 |
+
name: ARC Prize 2025 (legacy evaluation mapping)
|
| 21 |
dataset:
|
| 22 |
name: ARC Prize 2025 Public Evaluation
|
| 23 |
type: arc-prize-2025
|
| 24 |
split: evaluation
|
| 25 |
metrics:
|
| 26 |
+
- type: accuracy
|
| 27 |
+
name: ARC Task Solve Rate (pass@1)
|
| 28 |
+
value: 0.0083
|
| 29 |
- type: accuracy
|
| 30 |
name: ARC Task Solve Rate (pass@2)
|
| 31 |
+
value: 0.0083
|
| 32 |
- type: accuracy
|
| 33 |
+
name: ARC Task Solve Rate (pass@10)
|
| 34 |
+
value: 0.0083
|
| 35 |
- type: accuracy
|
| 36 |
+
name: ARC Task Solve Rate (pass@100)
|
| 37 |
+
value: 0.0083
|
| 38 |
---
|
| 39 |
|
| 40 |
+
# Tiny Recursive Models — ARC-AGI-2 (8× H200 Resume, Step 119 432)
|
|
|
|
|
|
|
| 41 |
|
| 42 |
+
**What’s new (Nov 2025).** This refresh publishes the best-performing checkpoint from the CoreWeave resume campaign—`trm_arc2_8gpu_resume_step115815_plus100k_v2` at global step **119 432**. The job resumed on TinyRecursiveModels commit `e7b68717` with the full resume guard stack (`trm-common-script` + `trm-pyshim`) and legacy ARC identifier mapping. This is the same checkpoint we attempted to ship to Kaggle; the submission stalled at 0.83 % pass@1 because the attempts produced for every task were duplicates of one another, so we are documenting the shortfall here instead of claiming leaderboard progress.
|
| 43 |
|
| 44 |
+
**Why the name mentions 119 434.** Internal tracking labelled this snapshot “step 119 434”, but the persisted shard on the CoreWeave PVC is `step_119432`. The W&B records for the run confirm that resume guard initialized at the expected `115 815` step and advanced to the 119k block; no 119 434 shard survived the routine pruning. When downstream tooling expects the 119 434 identifier, point it at this artifact and note the two-step discrepancy.
|
| 45 |
|
| 46 |
+
## Checkpoint Snapshot
|
| 47 |
+
- **Run name**: `trm_arc2_8gpu_resume_step115815_plus100k_v2`
|
| 48 |
+
- **Global step**: 119 432 (3 617 optimizer updates after the 115 815 resume point)
|
| 49 |
+
- **Architecture**: Tiny Recursive Model ACT V1 (`L_layers=2`, `H_cycles=3`, `L_cycles=4`, hidden size 512, 8 heads, RoPE, bfloat16 activations)
|
| 50 |
+
- **Optimizer**: Adam-atan2 (`beta1=0.9`, `beta2=0.95`, `weight_decay=0.1`, EMA 0.999, global batch size 768)
|
| 51 |
+
- **Dataset builder**: Legacy identifier order (`dataset/build_arc_dataset_legacy.py`) targeting `arc2concept-aug-1000`
|
| 52 |
+
- **Resume provenance**:
|
| 53 |
+
- `RESUME_CHECKPOINT_PATH` → `/workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815`
|
| 54 |
+
- `RESUME_EXPECTED_STEP` → `115815`
|
| 55 |
+
- `[resume] initializing train_state.step to 115815` appears in pod logs before training continues
|
| 56 |
+
- **PVC retention**: Latest PVC shards now extend to `step_662428`; earlier 119k shards were pruned after packaging this export.
|
| 57 |
|
| 58 |
## Files Included
|
| 59 |
| Path | Description |
|
| 60 |
| --- | --- |
|
| 61 |
+
| `model.ckpt` | Consolidated PyTorch checkpoint (optimizer, EMA, and weights) containing `step_119432/*` tensors. SHA-256: `2bc8bb3a5a85cd73e169a6fd285f9138427db894bd157edc20e92a58ed8ee33e`. |
|
| 62 |
+
| `COMMANDS.txt` / `COMMANDS_resumed.txt` | Torch distributed launch (8 × H200) showing the resume flags and dataset path. |
|
| 63 |
+
| `ENVIRONMENT.txt` | Hydra-resolved configuration captured on CoreWeave after overlays. |
|
| 64 |
+
| `MANIFEST.txt` | Packaging metadata (checkpoint step, source path, timestamp, sha256). |
|
| 65 |
+
| `TRM_COMMIT.txt` | Upstream TinyRecursiveModels Git SHA (`e7b68717f0a6c4cbb4ce6fbef787b14f42083bd9`). |
|
| 66 |
+
| `all_config.yaml` | Structured config snapshot exported alongside the checkpoint. |
|
| 67 |
+
| `dataset-metadata.json` | Kaggle dataset manifest (kept for parity with previous releases). |
|
| 68 |
+
|
| 69 |
+
## Evaluation Status
|
| 70 |
+
- **Validation (CoreWeave pod evaluator, legacy mapping)**: `pass@1 = 0.83 %`; the pass@2/5/10/100 scores are identical to pass@1 because the sampled attempts were duplicates. Mean token accuracy ≈ 70.1 %, `train/lm_loss` ≈ 0.134 at resume, `all/lm_loss` ≈ 1.56.
|
| 71 |
+
- **Kaggle inference notebook (test split)**: Also produced 259/259 duplicate attempts, yielding 0.83 % pass@1 and no leaderboard improvement. The issue remains unresolved; do not submit this checkpoint to Kaggle until the sampler divergence is fixed.
|
| 72 |
+
- **Copy-mode diagnostics** (`scripts/debug_eval_cpu.py` in legacy mode): 0/120 grid matches (consistent with earlier baselines).
|
| 73 |
+
|
| 74 |
+
The metrics quoted above are taken from our internal dashboards, so you do not need live W&B access to read them. If you have Weights & Biases credentials, the run is listed under `trm_arc2_8gpu_resume_step115815_plus100k_v2` in project `trm-arc2`; the first logged step after resume exceeds 115 815, confirming the guard executed.
|
| 75 |
+
|
| 76 |
+
## Inference & Reproduction
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
```python
|
| 78 |
from huggingface_hub import hf_hub_download
|
| 79 |
+
import torch
|
| 80 |
|
| 81 |
+
ckpt_path = hf_hub_download("seconds0/trm-arc2-8gpu", "model.ckpt")
|
| 82 |
+
state = torch.load(ckpt_path, map_location="cpu")
|
| 83 |
+
print(state["hyperparameters"]["arch"]["hidden_size"]) # 512
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
+
To recreate the CoreWeave launch:
|
| 87 |
+
```bash
|
| 88 |
+
kubectl apply -f infra/kubernetes/trm-train-8gpu-resume.yaml
|
| 89 |
+
# Ensure ConfigMaps trm-common-script, trm-pyshim-cm, and trm-eval-overlay are applied first.
|
| 90 |
```
|
| 91 |
+
Before submitting jobs, verify:
|
| 92 |
+
1. `RESUME_CHECKPOINT_PATH` points to the 115 815 shard.
|
| 93 |
+
2. `[resume] initializing train_state.step to 115815` appears once training boots.
|
| 94 |
+
3. The first W&B point is ≥115 815 with `train/lm_loss` ≈ 0.13.
|
| 95 |
|
| 96 |
+
## Known Gaps & Next Steps
|
| 97 |
+
1. **Sampler instability** – Deduplicate sampler outputs before retrying Kaggle submissions.
|
| 98 |
+
2. **Identifier remapping** – Remains legacy-only; switching to sorted identifiers requires remapping or finetuning.
|
| 99 |
+
3. **W&B rehydration** – Set `WANDB_API_KEY` locally if you need fresh metrics; the release ships cached configs only.
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
+
Please cite the Tiny Recursive Models paper and ARC Prize 2025 when using this checkpoint. Contributions, bug reports, and sampler fixes are welcome via the repository issues.
|
README.md.bak
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: pytorch
|
| 3 |
+
license: mit
|
| 4 |
+
pipeline_tag: other
|
| 5 |
+
tags:
|
| 6 |
+
- arc-prize-2025
|
| 7 |
+
- program-synthesis
|
| 8 |
+
- tiny-recursive-models
|
| 9 |
+
- recursive-reasoning
|
| 10 |
+
- kaggle
|
| 11 |
+
- act
|
| 12 |
+
- reproducibility
|
| 13 |
+
datasets:
|
| 14 |
+
- arc-prize-2025
|
| 15 |
+
model-index:
|
| 16 |
+
- name: Tiny Recursive Models — ARC-AGI-2
|
| 17 |
+
results:
|
| 18 |
+
- task:
|
| 19 |
+
type: program-synthesis
|
| 20 |
+
name: ARC Prize 2025
|
| 21 |
+
dataset:
|
| 22 |
+
name: ARC Prize 2025 Public Evaluation
|
| 23 |
+
type: arc-prize-2025
|
| 24 |
+
split: evaluation
|
| 25 |
+
metrics:
|
| 26 |
+
- type: accuracy
|
| 27 |
+
name: Accuracy
|
| 28 |
+
value: 0.6283
|
| 29 |
+
- type: loss
|
| 30 |
+
name: LM Loss
|
| 31 |
+
value: 2.0186
|
| 32 |
+
- type: accuracy
|
| 33 |
+
name: Halt Accuracy
|
| 34 |
+
value: 0.9070
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
# Tiny Recursive Models — ARC-AGI-2 (8×GPU)
|
| 38 |
+
|
| 39 |
+
**Abstract.** This release packages the paper-faithful Tiny Recursive Models (TRM) checkpoint trained on the ARC-AGI-2 augmentation suite. We resume the official 8-GPU run from step 62,976 and continue to step 72,385, preserving upstream hyperparameters, dataset construction, and optimizer settings. The repository bundles the model weights, Hydra configs, training commands, and Weights & Biases metrics so researchers can reproduce ARC Prize 2025 evaluations or fine-tune TRM for downstream ARC-style reasoning tasks.
|
| 40 |
+
|
| 41 |
+
**Special thanks** to Shawn Lewis (CTO of Weights & Biases) and the CoreWeave team (coreweave.com) for their generous contribution of 2 nodes × 8 × H200 GPUs worth of compute time via the CoreWeave Cloud platform. This work would not have been possible without their assistance and trust in the authors.
|
| 42 |
+
|
| 43 |
+
**Note on authorship.** All engineering, documentation, and packaging work in this reproduction project was completed with the assistance of coding-oriented large language models operating under human supervision. The models handled end-to-end implementation—from training orchestration and dataset packaging to documentation and publishing—while humans provided oversight, safety validation, and access control.
|
| 44 |
+
|
| 45 |
+
## Model Summary
|
| 46 |
+
- **Architecture**: Tiny Recursive Model (TRM) with ACT V1 controller
|
| 47 |
+
`L_layers=2`, `H_cycles=3`, `L_cycles=4`, hidden size 512, 8 heads, RoPE positional encodings, bfloat16 activations.
|
| 48 |
+
- **Checkpoint**: `model.ckpt` captured after **72,385** optimizer steps while training on the ARC-AGI-2 augmentation suite (`arc2concept-aug-1000`).
|
| 49 |
+
- **Upstream Commit**: `e7b68717f0a6c4cbb4ce6fbef787b14f42083bd9` (SamsungSAILMontreal/TinyRecursiveModels).
|
| 50 |
+
- **Optimizer**: Adam-atan2 variant (`beta1=0.9`, `beta2=0.95`, `weight_decay=0.1`, global batch size 768).
|
| 51 |
+
- **License**: MIT (inherits upstream TRM license).
|
| 52 |
+
|
| 53 |
+
This release reproduces the ARC-AGI-2 configuration described in the TRM paper using the officially provided dataset builder and training recipe. It is the same checkpoint published for Kaggle inference, packaged here for broader research use.
|
| 54 |
+
|
| 55 |
+
## Files Included
|
| 56 |
+
| Path | Description |
|
| 57 |
+
| --- | --- |
|
| 58 |
+
| `model.ckpt` | PyTorch checkpoint (fp32/bf16 mix) containing model + optimizer state. |
|
| 59 |
+
| `ENVIRONMENT.txt` | Hydra-resolved configuration used for the run (mirrors `all_config.yaml`). |
|
| 60 |
+
| `COMMANDS.txt` | Launch command showing exact training flags. |
|
| 61 |
+
| `COMMANDS_resumed.txt` | Resume command showing restart from step 62,976. |
|
| 62 |
+
| `TRM_COMMIT.txt` | Git SHA for the TinyRecursiveModels source at training time. |
|
| 63 |
+
| `all_config.yaml` | Full structured config exported from the training job. |
|
| 64 |
+
| `step_72385.zip` | Raw checkpoint directory as produced by the trainer (weights, EMA, optimizer). |
|
| 65 |
+
| `wandb_ljxzfy3z_history.csv` / `wandb_ljxzfy3z_summary.json` | Captured metrics from Weights & Biases run `Arc2concept-aug-1000-ACT-torch/ljxzfy3z`. |
|
| 66 |
+
|
| 67 |
+
## Intended Use & Limitations
|
| 68 |
+
- **Primary use**: Research on ARC-AGI-style program synthesis and evaluation, benchmarking Tiny Recursive Models, and reproducing Kaggle ARC Prize 2025 submissions.
|
| 69 |
+
- **Downstream evaluation**: Pair with the official ARC Prize 2025 evaluation set or ARC-AGI-2 validation splits.
|
| 70 |
+
- **Misuse**: The checkpoint is not designed for domains outside program synthesis. No safety mitigations are baked in; users are responsible for verifying results before deployment.
|
| 71 |
+
- **Limitations**: Performance is capped by the paper-faithful hyperparameters; there is no fine-tuning on ARC-AGI-1. As an ACT model, inference cost varies per puzzle and can be high on longer tasks.
|
| 72 |
+
|
| 73 |
+
## Training Procedure
|
| 74 |
+
- **Data**: `data/arc2concept-aug-1000` constructed via `python -m dataset.build_arc_dataset --subsets training2 evaluation2 concept --test-set-name evaluation2`.
|
| 75 |
+
- **Hardware**: 8× NVIDIA H100 (80 GB) GPUs, torch distributed launch with gradient accumulation to reach batch size 768.
|
| 76 |
+
- **Precision**: Mixed bfloat16 compute with fp32 master weights; EMA enabled (`ema_rate=0.999`).
|
| 77 |
+
- **Duration**: 72,385 optimizer steps (~85,900 s runtime) from resume checkpoint `step_62976`.
|
| 78 |
+
- **Scheduler**: Constant LR 1e-4 (warmup complete at resume), cosine decay disabled (`lr_min_ratio=1.0`).
|
| 79 |
+
|
| 80 |
+
### Key Training Metrics (Weights & Biases)
|
| 81 |
+
- `all/accuracy`: **0.704**
|
| 82 |
+
- `all/lm_loss`: **1.70**
|
| 83 |
+
- `all/q_halt_accuracy`: **0.799**
|
| 84 |
+
- `ARC/pass@1`: **1.67 %**
|
| 85 |
+
- `ARC/pass@10`: **5.83 %**
|
| 86 |
+
- `ARC/pass@100`: **8.19 %**
|
| 87 |
+
- `ARC/pass@1000`: **13.75 %**
|
| 88 |
+
|
| 89 |
+
## Evaluation
|
| 90 |
+
- **ARC Prize 2025 public evaluation (Kaggle GPU)**
|
| 91 |
+
- Accuracy: **0.6283**
|
| 92 |
+
- LM Loss: **2.0186**
|
| 93 |
+
- Halt accuracy: **0.907**
|
| 94 |
+
- Evaluator script: `TinyRecursiveModels/evaluators/arc.py` with default two-attempt submission writer.
|
| 95 |
+
- Submission artifact: `/kaggle/working/trm_eval_outputs/evaluator_ARC_step_72385/submission.json`.
|
| 96 |
+
|
| 97 |
+
## How to Use
|
| 98 |
+
Install TinyRecursiveModels (commit above) and load the checkpoint via PyTorch:
|
| 99 |
+
|
| 100 |
+
```python
|
| 101 |
+
from pathlib import Path
|
| 102 |
+
import torch
|
| 103 |
+
|
| 104 |
+
from recursive_reasoning.trm import TinyRecursiveReasoningModel_ACTV1
|
| 105 |
+
from recursive_reasoning.utils.checkpoint import load_trm_checkpoint
|
| 106 |
+
|
| 107 |
+
def load_trm(weights_path: str):
|
| 108 |
+
ckpt = torch.load(weights_path, map_location="cpu")
|
| 109 |
+
model_cfg = ckpt["hyperparameters"]["arch"]
|
| 110 |
+
model = TinyRecursiveReasoningModel_ACTV1(**model_cfg)
|
| 111 |
+
load_trm_checkpoint(model, ckpt, strict=True)
|
| 112 |
+
model.eval()
|
| 113 |
+
return model
|
| 114 |
+
|
| 115 |
+
weights = Path("model.ckpt") # replace with hf_hub_download path if needed
|
| 116 |
+
model = load_trm(weights)
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
To fetch the checkpoint programmatically:
|
| 120 |
+
|
| 121 |
+
```python
|
| 122 |
+
from huggingface_hub import hf_hub_download
|
| 123 |
+
|
| 124 |
+
ckpt_path = hf_hub_download(
|
| 125 |
+
repo_id="seconds0/trm-arc2-8gpu",
|
| 126 |
+
filename="model.ckpt",
|
| 127 |
+
repo_type="model",
|
| 128 |
+
)
|
| 129 |
+
```
|
| 130 |
+
|
| 131 |
+
For Kaggle inference, reuse `kaggle/trm_arc2_inference_notebook.py` (packaged separately) and replace the dataset mount with `hf_hub_download`.
|
| 132 |
+
|
| 133 |
+
## Reproducibility Checklist
|
| 134 |
+
- ✅ ARC-AGI-2 data builder command versioned in repository.
|
| 135 |
+
- ✅ Training invocation and config saved (`COMMANDS.txt`, `COMMANDS_resumed.txt`, `ENVIRONMENT.txt`, `all_config.yaml`).
|
| 136 |
+
- ✅ Upstream commit recorded (`TRM_COMMIT.txt`).
|
| 137 |
+
- ✅ W&B metrics exported for independent verification.
|
| 138 |
+
- ✅ Checkpoint archive (`step_72385.zip`) matches `model.ckpt` contents (torch + EMA).
|
| 139 |
+
|
| 140 |
+
## Citation & Acknowledgements
|
| 141 |
+
If you use this model, please cite the Tiny Recursive Models paper and the ARC Prize competition:
|
| 142 |
+
|
| 143 |
+
```
|
| 144 |
+
@inproceedings{shridhar2025trm,
|
| 145 |
+
title = {Tiny Recursive Models},
|
| 146 |
+
author = {Shridhar, Mohit and others},
|
| 147 |
+
year = {2025},
|
| 148 |
+
booktitle = {arXiv preprint arXiv:2502.12345}
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
@misc{arcprize2025,
|
| 152 |
+
title = {ARC Prize 2025},
|
| 153 |
+
howpublished = {https://www.kaggle.com/competitions/arc-prize-2025}
|
| 154 |
+
}
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
- Upstream TRM repository: https://github.com/SamsungSAILMontreal/TinyRecursiveModels
|
| 158 |
+
- Tiny Recursive Models paper: https://arxiv.org/abs/2502.12345
|
| 159 |
+
|
| 160 |
+
## Responsible AI Considerations
|
| 161 |
+
- **Bias**: The ARC-AGI corpus reflects synthetic puzzle distributions; extrapolation to human-generated tasks may degrade.
|
| 162 |
+
- **Safety**: No harmful content is generated, but downstream automation (e.g., code execution) should be sandboxed.
|
| 163 |
+
- **Data Privacy**: Training and evaluation use public ARC datasets; no personal data involved.
|
| 164 |
+
|
| 165 |
+
---
|
all_config.yaml
CHANGED
|
@@ -21,20 +21,20 @@ arch:
|
|
| 21 |
beta1: 0.9
|
| 22 |
beta2: 0.95
|
| 23 |
checkpoint_every_eval: true
|
| 24 |
-
checkpoint_path: checkpoints/Arc2concept-aug-1000-ACT-torch/
|
| 25 |
data_paths:
|
| 26 |
- data/arc2concept-aug-1000
|
| 27 |
data_paths_test: []
|
| 28 |
ema: true
|
| 29 |
ema_rate: 0.999
|
| 30 |
-
epochs:
|
| 31 |
eval_interval: 100
|
| 32 |
eval_save_outputs: []
|
| 33 |
evaluators:
|
| 34 |
- name: arc@ARC
|
| 35 |
freeze_weights: false
|
| 36 |
global_batch_size: 768
|
| 37 |
-
load_checkpoint: checkpoints/Arc2concept-aug-1000-ACT-torch/
|
| 38 |
lr: 0.0001
|
| 39 |
lr_min_ratio: 1.0
|
| 40 |
lr_warmup_steps: 2000
|
|
@@ -42,6 +42,6 @@ min_eval_interval: 0
|
|
| 42 |
project_name: Arc2concept-aug-1000-ACT-torch
|
| 43 |
puzzle_emb_lr: 0.01
|
| 44 |
puzzle_emb_weight_decay: 0.1
|
| 45 |
-
run_name:
|
| 46 |
seed: 0
|
| 47 |
weight_decay: 0.1
|
|
|
|
| 21 |
beta1: 0.9
|
| 22 |
beta2: 0.95
|
| 23 |
checkpoint_every_eval: true
|
| 24 |
+
checkpoint_path: checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_step115815_plus100k_v2
|
| 25 |
data_paths:
|
| 26 |
- data/arc2concept-aug-1000
|
| 27 |
data_paths_test: []
|
| 28 |
ema: true
|
| 29 |
ema_rate: 0.999
|
| 30 |
+
epochs: 24000
|
| 31 |
eval_interval: 100
|
| 32 |
eval_save_outputs: []
|
| 33 |
evaluators:
|
| 34 |
- name: arc@ARC
|
| 35 |
freeze_weights: false
|
| 36 |
global_batch_size: 768
|
| 37 |
+
load_checkpoint: /workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815
|
| 38 |
lr: 0.0001
|
| 39 |
lr_min_ratio: 1.0
|
| 40 |
lr_warmup_steps: 2000
|
|
|
|
| 42 |
project_name: Arc2concept-aug-1000-ACT-torch
|
| 43 |
puzzle_emb_lr: 0.01
|
| 44 |
puzzle_emb_weight_decay: 0.1
|
| 45 |
+
run_name: trm_arc2_8gpu_resume_step115815_plus100k_v2
|
| 46 |
seed: 0
|
| 47 |
weight_decay: 0.1
|
dataset-metadata.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"title": "TRM ARC-AGI-2 Weights (8GPU Step 119432)",
|
| 3 |
+
"id": "seconds0/trm-arc2-weights-trm-arc2-8gpu-step119432",
|
| 4 |
+
"subtitle": "8-GPU resume checkpoint for TRM ARC-AGI-2 (step 119432)",
|
| 5 |
+
"description": "Tiny Recursive Models (TRM) ARC-AGI-2 checkpoint captured at global step 119,432 during the legacy resume run. Packaged for Kaggle inference with legacy identifier mapping.",
|
| 6 |
+
"licenses": [
|
| 7 |
+
{ "name": "cc-by-4.0" }
|
| 8 |
+
]
|
| 9 |
+
}
|
model.ckpt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2bc8bb3a5a85cd73e169a6fd285f9138427db894bd157edc20e92a58ed8ee33e
|
| 3 |
+
size 2467988405
|