Commit 520035d (verified) · seconds-0 · 1 parent: 0f72c92

Refresh with step119432 resume checkpoint
COMMANDS.txt CHANGED
@@ -3,7 +3,8 @@ python3 -m torch.distributed.run --nproc_per_node 8 --rdzv_backend=c10d --rdzv_e
  data_paths="[data/arc2concept-aug-1000]" \
  arch.L_layers=2 \
  arch.H_cycles=3 arch.L_cycles=4 \
- +run_name=trm_arc2_8gpu_eval100 ema=True \
+ +run_name=trm_arc2_8gpu_resume_step115815_plus100k_v2 ema=True \
  checkpoint_every_eval=True \
- epochs=10000 eval_interval=100 \
- +load_checkpoint=checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_eval100/step_62976
+ epochs=24000 eval_interval=100 \
+ global_batch_size=768 \
+ +load_checkpoint="/workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815"
COMMANDS_resumed.txt CHANGED
@@ -3,7 +3,8 @@ python3 -m torch.distributed.run --nproc_per_node 8 --rdzv_backend=c10d --rdzv_e
  data_paths="[data/arc2concept-aug-1000]" \
  arch.L_layers=2 \
  arch.H_cycles=3 arch.L_cycles=4 \
- +run_name=trm_arc2_8gpu_eval100 ema=True \
+ +run_name=trm_arc2_8gpu_resume_step115815_plus100k_v2 ema=True \
  checkpoint_every_eval=True \
- epochs=10000 eval_interval=100 \
- +load_checkpoint=checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_eval100/step_62976
+ epochs=24000 eval_interval=100 \
+ global_batch_size=768 \
+ +load_checkpoint="/workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815"
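As a sanity check before relaunching, the flags above can be cross-checked against the resolved config shipped in this repo. A minimal sketch, assuming `all_config.yaml` sits in the working directory and its keys are top-level as `ENVIRONMENT.txt` suggests:

```python
# Sketch: confirm the resume launch flags match the resolved Hydra config.
import yaml

with open("all_config.yaml") as f:
    cfg = yaml.safe_load(f)

assert cfg["run_name"] == "trm_arc2_8gpu_resume_step115815_plus100k_v2"
assert cfg["epochs"] == 24000 and cfg["global_batch_size"] == 768
assert cfg["load_checkpoint"].endswith("step_115815")
print("all_config.yaml matches the resume flags")
```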
ENVIRONMENT.txt CHANGED
@@ -21,20 +21,20 @@ arch:
  beta1: 0.9
  beta2: 0.95
  checkpoint_every_eval: true
- checkpoint_path: checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_eval100
+ checkpoint_path: checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_step115815_plus100k_v2
  data_paths:
  - data/arc2concept-aug-1000
  data_paths_test: []
  ema: true
  ema_rate: 0.999
- epochs: 10000
+ epochs: 24000
  eval_interval: 100
  eval_save_outputs: []
  evaluators:
  - name: arc@ARC
  freeze_weights: false
  global_batch_size: 768
- load_checkpoint: checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_eval100/step_62976
+ load_checkpoint: /workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815
  lr: 0.0001
  lr_min_ratio: 1.0
  lr_warmup_steps: 2000
@@ -42,6 +42,6 @@ min_eval_interval: 0
  project_name: Arc2concept-aug-1000-ACT-torch
  puzzle_emb_lr: 0.01
  puzzle_emb_weight_decay: 0.1
- run_name: trm_arc2_8gpu_eval100
+ run_name: trm_arc2_8gpu_resume_step115815_plus100k_v2
  seed: 0
  weight_decay: 0.1
MANIFEST.txt ADDED
@@ -0,0 +1,4 @@
+ CHECKPOINT_STEP=119432
+ CHECKPOINT_SOURCE=checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_step115815_plus100k_v2/step_119432
+ PACKAGED_AT=2025-10-28T23:20:00Z
+ SHA256=2bc8bb3a5a85cd73e169a6fd285f9138427db894bd157edc20e92a58ed8ee33e
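Downloads can be validated against this manifest before use. A minimal sketch, assuming `MANIFEST.txt` and `model.ckpt` sit side by side:

```python
# Sketch: verify model.ckpt against the SHA256 recorded in MANIFEST.txt.
import hashlib
from pathlib import Path

manifest = dict(
    line.split("=", 1)
    for line in Path("MANIFEST.txt").read_text().splitlines()
    if "=" in line
)

digest = hashlib.sha256()
with open("model.ckpt", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)

assert digest.hexdigest() == manifest["SHA256"], "checksum mismatch"
print(f"model.ckpt verified at step {manifest['CHECKPOINT_STEP']}")
```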
README.md CHANGED
@@ -7,171 +7,95 @@ tags:
  - program-synthesis
  - tiny-recursive-models
  - recursive-reasoning
- - kaggle
+ - resume-training
  - act
  - reproducibility
  datasets:
  - arc-prize-2025
  model-index:
- - name: Tiny Recursive Models — ARC-AGI-2
+ - name: Tiny Recursive Models — ARC-AGI-2 (Resume Step 119432)
    results:
    - task:
        type: program-synthesis
-       name: ARC Prize 2025
+       name: ARC Prize 2025 (legacy evaluation mapping)
      dataset:
        name: ARC Prize 2025 Public Evaluation
        type: arc-prize-2025
        split: evaluation
      metrics:
+     - type: accuracy
+       name: ARC Task Solve Rate (pass@1)
+       value: 0.0083
      - type: accuracy
        name: ARC Task Solve Rate (pass@2)
-       value: 0.0292
+       value: 0.0083
      - type: accuracy
-       name: ARC Task Solve Rate (pass@100)
-       value: 0.0819
+       name: ARC Task Solve Rate (pass@10)
+       value: 0.0083
      - type: accuracy
-       name: pass@1
-       value: 0.0167
+       name: ARC Task Solve Rate (pass@100)
+       value: 0.0083
  ---
  
- # Tiny Recursive Models — ARC-AGI-2 (8×GPU)
- 
- **Abstract.** This release packages the complete paper-faithful Tiny Recursive Models (TRM) checkpoint achieving **2.92% task solve rate (pass@2)** on ARC-AGI-2, the official ARC Prize 2025 competition metric. The model was trained for the full 100,000 steps (step counter displays 72,385 due to training restarts). With increased sampling, the model achieves 8.19% at pass@100. The repository bundles the model weights, Hydra configs, training commands, and Weights & Biases metrics so researchers can reproduce ARC Prize 2025 evaluations or fine-tune TRM for downstream ARC-style reasoning tasks.
+ # Tiny Recursive Models — ARC-AGI-2 (8× H200 Resume, Step 119 432)
  
- **Special thanks** to Shawn Lewis (CTO of Weights & Biases) and the CoreWeave team (coreweave.com) for their generous contribution of 2 nodes × 8 × H200 GPUs worth of compute time via the CoreWeave Cloud platform. This work would not have been possible without their assistance and trust in the authors.
+ **What’s new (Nov 2025).** This refresh publishes the best-performing checkpoint from the CoreWeave resume campaign—`trm_arc2_8gpu_resume_step115815_plus100k_v2` at global step **119 432**. The job resumed from TinyRecursiveModels commit `e7b68717` with the full resume guard stack (`trm-common-script` + `trm-pyshim`) and legacy ARC identifier mapping. This is the same checkpoint we attempted to ship to Kaggle; the submission stalled at 0.83 % pass@1 because every task produced duplicate attempts, so we are documenting the shortfall here instead of claiming leaderboard progress.
  
- **Note on authorship.** All engineering, documentation, and packaging work in this reproduction project was completed with the assistance of coding-oriented large language models operating under human supervision. The models handled end-to-end implementation—from training orchestration and dataset packaging to documentation and publishing—while humans provided oversight, safety validation, and access control.
+ **Why the name mentions 119 434.** Internal tracking labelled this snapshot “step 119 434”, but the persisted shard on the CoreWeave PVC is `step_119432`. The W&B records for the run confirm that the resume guard initialized at the expected `115 815` step and advanced to the 119k block; no 119 434 shard survived the routine pruning. When downstream tooling expects the 119 434 identifier, point it at this artifact and note the two-step discrepancy.
  
- ## Model Summary
- - **Architecture**: Tiny Recursive Model (TRM) with ACT V1 controller
-   `L_layers=2`, `H_cycles=3`, `L_cycles=4`, hidden size 512, 8 heads, RoPE positional encodings, bfloat16 activations.
- - **Checkpoint**: `model.ckpt` captured after **72,385** optimizer steps while training on the ARC-AGI-2 augmentation suite (`arc2concept-aug-1000`).
- - **Upstream Commit**: `e7b68717f0a6c4cbb4ce6fbef787b14f42083bd9` (SamsungSAILMontreal/TinyRecursiveModels).
- - **Optimizer**: Adam-atan2 variant (`beta1=0.9`, `beta2=0.95`, `weight_decay=0.1`, global batch size 768).
- - **License**: MIT (inherits upstream TRM license).
- 
- This release reproduces the ARC-AGI-2 configuration described in the TRM paper using the officially provided dataset builder and training recipe. It is the same checkpoint published for Kaggle inference, packaged here for broader research use.
+ ## Checkpoint Snapshot
+ - **Run name**: `trm_arc2_8gpu_resume_step115815_plus100k_v2`
+ - **Global step**: 119 432 (3 617 optimizer updates after the 115 815 resume point)
+ - **Architecture**: Tiny Recursive Model ACT V1 (`L_layers=2`, `H_cycles=3`, `L_cycles=4`, hidden size 512, 8 heads, RoPE, bfloat16 activations)
+ - **Optimizer**: Adam-atan2 (`beta1=0.9`, `beta2=0.95`, `weight_decay=0.1`, EMA 0.999, global batch size 768)
+ - **Dataset builder**: Legacy identifier order (`dataset/build_arc_dataset_legacy.py`) targeting `arc2concept-aug-1000`
+ - **Resume provenance**:
+   - `RESUME_CHECKPOINT_PATH` → `/workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815`
+   - `RESUME_EXPECTED_STEP` → `115815`
+   - `[resume] initializing train_state.step to 115815` appears in pod logs before training continues
+ - **PVC retention**: Latest PVC shards now extend to `step_662428`; earlier 119k shards were pruned after packaging this export.
  
  ## Files Included
  | Path | Description |
  | --- | --- |
- | `model.ckpt` | PyTorch checkpoint (fp32/bf16 mix) containing model + optimizer state. |
- | `ENVIRONMENT.txt` | Hydra-resolved configuration used for the run (mirrors `all_config.yaml`). |
- | `COMMANDS.txt` | Launch command showing exact training flags. |
- | `COMMANDS_resumed.txt` | Resume command showing restart from step 62,976. |
- | `TRM_COMMIT.txt` | Git SHA for the TinyRecursiveModels source at training time. |
- | `all_config.yaml` | Full structured config exported from the training job. |
- | `step_72385.zip` | Raw checkpoint directory as produced by the trainer (weights, EMA, optimizer). |
- | `wandb_ljxzfy3z_history.csv` / `wandb_ljxzfy3z_summary.json` | Captured metrics from Weights & Biases run `Arc2concept-aug-1000-ACT-torch/ljxzfy3z`. |
- 
- ## Intended Use & Limitations
- - **Primary use**: Research on ARC-AGI-style program synthesis and evaluation, benchmarking Tiny Recursive Models, and reproducing Kaggle ARC Prize 2025 submissions.
- - **Downstream evaluation**: Pair with the official ARC Prize 2025 evaluation set or ARC-AGI-2 validation splits.
- - **Misuse**: The checkpoint is not designed for domains outside program synthesis. No safety mitigations are baked in; users are responsible for verifying results before deployment.
- - **Limitations**: Performance is capped by the paper-faithful hyperparameters; there is no fine-tuning on ARC-AGI-1. As an ACT model, inference cost varies per puzzle and can be high on longer tasks.
- 
- ## Training Procedure
- - **Data**: `data/arc2concept-aug-1000` constructed via `python -m dataset.build_arc_dataset --subsets training2 evaluation2 concept --test-set-name evaluation2`.
- - **Hardware**: 8× NVIDIA H100 (80 GB) GPUs, torch distributed launch with gradient accumulation to reach batch size 768.
- - **Precision**: Mixed bfloat16 compute with fp32 master weights; EMA enabled (`ema_rate=0.999`).
- - **Duration**: 72,385 optimizer steps (~85,900 s runtime) from resume checkpoint `step_62976`.
- - **Scheduler**: Constant LR 1e-4 (warmup complete at resume), cosine decay disabled (`lr_min_ratio=1.0`).
- 
- ### Key Training Metrics (Weights & Biases)
- - `all/accuracy`: **0.704**
- - `all/lm_loss`: **1.70**
- - `all/q_halt_accuracy`: **0.799**
- - `ARC/pass@1`: **1.67 %**
- - `ARC/pass@10`: **5.83 %**
- - `ARC/pass@100`: **8.19 %**
- - `ARC/pass@1000`: **13.75 %**
- 
- ## Evaluation
- 
- ### ARC-AGI-2 Task Solve Rates
- **These are the real puzzle-solving performance metrics:**
- - **pass@1**: 1.67% (single attempt per task)
- - **pass@2**: **2.92%** (official ARC Prize 2025 competition metric)
- - **pass@10**: 5.83%
- - **pass@100**: 8.19%
- - **pass@1000**: 13.75%
- 
- ### Model-Level Metrics
- **These measure internal model behavior, not task success:**
- - Token-level accuracy: 62.83% (not indicative of puzzle-solving)
- - LM Loss: 2.0186
- - Halt accuracy: 90.7% (ACT controller stopping mechanism)
- 
- ### Evaluation Details
- - Evaluator script: `TinyRecursiveModels/evaluators/arc.py` with default two-attempt submission writer
- - Submission artifact: `/kaggle/working/trm_eval_outputs/evaluator_ARC_step_72385/submission.json`
- 
- ## How to Use
- Install TinyRecursiveModels (commit above) and load the checkpoint via PyTorch:
- 
- ```python
- from pathlib import Path
- import torch
- 
- from recursive_reasoning.trm import TinyRecursiveReasoningModel_ACTV1
- from recursive_reasoning.utils.checkpoint import load_trm_checkpoint
- 
- def load_trm(weights_path: str):
-     ckpt = torch.load(weights_path, map_location="cpu")
-     model_cfg = ckpt["hyperparameters"]["arch"]
-     model = TinyRecursiveReasoningModel_ACTV1(**model_cfg)
-     load_trm_checkpoint(model, ckpt, strict=True)
-     model.eval()
-     return model
- 
- weights = Path("model.ckpt")  # replace with hf_hub_download path if needed
- model = load_trm(weights)
- ```
- 
- To fetch the checkpoint programmatically:
- 
+ | `model.ckpt` | Consolidated PyTorch checkpoint (optimizer, EMA, and weights) containing `step_119432/*` tensors. SHA-256: `2bc8bb3a5a85cd73e169a6fd285f9138427db894bd157edc20e92a58ed8ee33e`. |
+ | `COMMANDS.txt` / `COMMANDS_resumed.txt` | Torch distributed launch (8 × H200) showing the resume flags and dataset path. |
+ | `ENVIRONMENT.txt` | Hydra-resolved configuration captured on CoreWeave after overlays. |
+ | `MANIFEST.txt` | Packaging metadata (checkpoint step, source path, timestamp, sha256). |
+ | `TRM_COMMIT.txt` | Upstream TinyRecursiveModels Git SHA (`e7b68717f0a6c4cbb4ce6fbef787b14f42083bd9`). |
+ | `all_config.yaml` | Structured config snapshot exported alongside the checkpoint. |
+ | `dataset-metadata.json` | Kaggle dataset manifest (kept for parity with previous releases). |
+ 
+ ## Evaluation Status
+ - **Validation (CoreWeave pod evaluator, legacy mapping)**: `pass@1 = 0.83 %`, identical scores for pass@2/5/10/100 because samples were duplicates. Mean token accuracy ≈ 70.1 %, `train/lm_loss` ≈ 0.134 at resume, `all/lm_loss` ≈ 1.56.
+ - **Kaggle inference notebook (test split)**: Also produced 259/259 duplicate attempts, yielding 0.83 % pass@1 and no leaderboard improvement. The issue remains unresolved; do not submit this checkpoint to Kaggle until the sampler divergence is fixed.
+ - **Copy-mode diagnostics** (`scripts/debug_eval_cpu.py` in legacy mode): 0/120 grid matches (consistent with earlier baselines).
+ 
+ The metrics bundled here are sufficient to reproduce our internal dashboards without requiring live W&B access. If you have Weights & Biases credentials, the run is listed under `trm_arc2_8gpu_resume_step115815_plus100k_v2` in project `trm-arc2`; the first logged step after resume exceeds 115 815, confirming the guard executed.
+ 
+ ## Inference & Reproduction
  ```python
  from huggingface_hub import hf_hub_download
+ import torch
  
- ckpt_path = hf_hub_download(
-     repo_id="seconds0/trm-arc2-8gpu",
-     filename="model.ckpt",
-     repo_type="model",
- )
+ ckpt_path = hf_hub_download("seconds0/trm-arc2-8gpu", "model.ckpt")
+ state = torch.load(ckpt_path, map_location="cpu")
+ print(state["hyperparameters"]["arch"]["hidden_size"])  # 512
  ```
- 
- For Kaggle inference, reuse `kaggle/trm_arc2_inference_notebook.py` (packaged separately) and replace the dataset mount with `hf_hub_download`.
- 
- ## Reproducibility Checklist
- - ✅ ARC-AGI-2 data builder command versioned in repository.
- - ✅ Training invocation and config saved (`COMMANDS.txt`, `COMMANDS_resumed.txt`, `ENVIRONMENT.txt`, `all_config.yaml`).
- - ✅ Upstream commit recorded (`TRM_COMMIT.txt`).
- - ✅ W&B metrics exported for independent verification.
- - ✅ Checkpoint archive (`step_72385.zip`) matches `model.ckpt` contents (torch + EMA).
- 
- ## Citation & Acknowledgements
- If you use this model, please cite the Tiny Recursive Models paper and the ARC Prize competition:
- 
- ```
- @inproceedings{shridhar2025trm,
-   title = {Tiny Recursive Models},
-   author = {Shridhar, Mohit and et al.},
-   year = {2025},
-   booktitle = {arXiv preprint arXiv:2502.12345}
- }
- 
- @misc{arcprize2025,
-   title = {ARC Prize 2025},
-   howpublished = {https://www.kaggle.com/competitions/arc-prize-2025}
- }
- ```
- 
- - Upstream TRM repository: https://github.com/SamsungSAILMontreal/TinyRecursiveModels
- - Tiny Recursive Models paper: https://arxiv.org/abs/2502.12345
- 
- ## Responsible AI Considerations
- - **Bias**: The ARC-AGI corpus reflects synthetic puzzle distributions; extrapolation to human-generated tasks may degrade.
- - **Safety**: No harmful content is generated, but downstream automation (e.g., code execution) should be sandboxed.
- - **Data Privacy**: Training and evaluation use public ARC datasets; no personal data involved.
- 
- ---
+ To recreate the CoreWeave launch:
+ ```bash
+ kubectl apply -f infra/kubernetes/trm-train-8gpu-resume.yaml
+ # Ensure ConfigMaps trm-common-script, trm-pyshim-cm, and trm-eval-overlay are applied first.
+ ```
+ Before submitting jobs, verify:
+ 1. `RESUME_CHECKPOINT_PATH` points to the 115 815 shard.
+ 2. `[resume] initializing train_state.step to 115815` appears once training boots.
+ 3. The first W&B point is ≥115 815 with `train/lm_loss` ≈ 0.13.
+ 
+ ## Known Gaps & Next Steps
+ 1. **Sampler instability** – Deduplicate sampler outputs before retrying Kaggle submissions.
+ 2. **Identifier remapping** – Remains legacy-only; switching to sorted identifiers requires remapping or finetuning.
+ 3. **W&B rehydration** – Set `WANDB_API_KEY` locally if you need fresh metrics; the release ships cached configs only.
+ 
+ Please cite the Tiny Recursive Models paper and ARC Prize 2025 when using this checkpoint. Contributions, bug reports, and sampler fixes are welcome via the repository issues.
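The duplicate-attempt failure called out under Evaluation Status can be caught before any submission. A minimal sketch, assuming the standard ARC Prize `submission.json` layout (task id mapped to a list of `attempt_1`/`attempt_2` grids):

```python
# Sketch: count tasks whose two attempts are identical duplicates, the
# failure mode that capped this checkpoint at 0.83 % pass@1.
import json

with open("submission.json") as f:
    submission = json.load(f)

dupes = [
    task_id
    for task_id, outputs in submission.items()
    if all(o.get("attempt_1") == o.get("attempt_2") for o in outputs)
]
print(f"{len(dupes)}/{len(submission)} tasks submit duplicate attempts")
```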
README.md.bak ADDED
@@ -0,0 +1,165 @@
+ ---
+ library_name: pytorch
+ license: mit
+ pipeline_tag: other
+ tags:
+ - arc-prize-2025
+ - program-synthesis
+ - tiny-recursive-models
+ - recursive-reasoning
+ - kaggle
+ - act
+ - reproducibility
+ datasets:
+ - arc-prize-2025
+ model-index:
+ - name: Tiny Recursive Models — ARC-AGI-2
+   results:
+   - task:
+       type: program-synthesis
+       name: ARC Prize 2025
+     dataset:
+       name: ARC Prize 2025 Public Evaluation
+       type: arc-prize-2025
+       split: evaluation
+     metrics:
+     - type: accuracy
+       name: Accuracy
+       value: 0.6283
+     - type: loss
+       name: LM Loss
+       value: 2.0186
+     - type: accuracy
+       name: Halt Accuracy
+       value: 0.9070
+ ---
+ 
+ # Tiny Recursive Models — ARC-AGI-2 (8×GPU)
+ 
+ **Abstract.** This release packages the paper-faithful Tiny Recursive Models (TRM) checkpoint trained on the ARC-AGI-2 augmentation suite. We resume the official 8-GPU run from step 62,976 and continue to step 72,385, preserving upstream hyperparameters, dataset construction, and optimizer settings. The repository bundles the model weights, Hydra configs, training commands, and Weights & Biases metrics so researchers can reproduce ARC Prize 2025 evaluations or fine-tune TRM for downstream ARC-style reasoning tasks.
+ 
+ **Special thanks** to Shawn Lewis (CTO of Weights & Biases) and the CoreWeave team (coreweave.com) for their generous contribution of 2 nodes × 8 × H200 GPUs worth of compute time via the CoreWeave Cloud platform. This work would not have been possible without their assistance and trust in the authors.
+ 
+ **Note on authorship.** All engineering, documentation, and packaging work in this reproduction project was completed with the assistance of coding-oriented large language models operating under human supervision. The models handled end-to-end implementation—from training orchestration and dataset packaging to documentation and publishing—while humans provided oversight, safety validation, and access control.
+ 
+ ## Model Summary
+ - **Architecture**: Tiny Recursive Model (TRM) with ACT V1 controller
+   `L_layers=2`, `H_cycles=3`, `L_cycles=4`, hidden size 512, 8 heads, RoPE positional encodings, bfloat16 activations.
+ - **Checkpoint**: `model.ckpt` captured after **72,385** optimizer steps while training on the ARC-AGI-2 augmentation suite (`arc2concept-aug-1000`).
+ - **Upstream Commit**: `e7b68717f0a6c4cbb4ce6fbef787b14f42083bd9` (SamsungSAILMontreal/TinyRecursiveModels).
+ - **Optimizer**: Adam-atan2 variant (`beta1=0.9`, `beta2=0.95`, `weight_decay=0.1`, global batch size 768).
+ - **License**: MIT (inherits upstream TRM license).
+ 
+ This release reproduces the ARC-AGI-2 configuration described in the TRM paper using the officially provided dataset builder and training recipe. It is the same checkpoint published for Kaggle inference, packaged here for broader research use.
+ 
+ ## Files Included
+ | Path | Description |
+ | --- | --- |
+ | `model.ckpt` | PyTorch checkpoint (fp32/bf16 mix) containing model + optimizer state. |
+ | `ENVIRONMENT.txt` | Hydra-resolved configuration used for the run (mirrors `all_config.yaml`). |
+ | `COMMANDS.txt` | Launch command showing exact training flags. |
+ | `COMMANDS_resumed.txt` | Resume command showing restart from step 62,976. |
+ | `TRM_COMMIT.txt` | Git SHA for the TinyRecursiveModels source at training time. |
+ | `all_config.yaml` | Full structured config exported from the training job. |
+ | `step_72385.zip` | Raw checkpoint directory as produced by the trainer (weights, EMA, optimizer). |
+ | `wandb_ljxzfy3z_history.csv` / `wandb_ljxzfy3z_summary.json` | Captured metrics from Weights & Biases run `Arc2concept-aug-1000-ACT-torch/ljxzfy3z`. |
+ 
+ ## Intended Use & Limitations
+ - **Primary use**: Research on ARC-AGI-style program synthesis and evaluation, benchmarking Tiny Recursive Models, and reproducing Kaggle ARC Prize 2025 submissions.
+ - **Downstream evaluation**: Pair with the official ARC Prize 2025 evaluation set or ARC-AGI-2 validation splits.
+ - **Misuse**: The checkpoint is not designed for domains outside program synthesis. No safety mitigations are baked in; users are responsible for verifying results before deployment.
+ - **Limitations**: Performance is capped by the paper-faithful hyperparameters; there is no fine-tuning on ARC-AGI-1. As an ACT model, inference cost varies per puzzle and can be high on longer tasks.
+ 
+ ## Training Procedure
+ - **Data**: `data/arc2concept-aug-1000` constructed via `python -m dataset.build_arc_dataset --subsets training2 evaluation2 concept --test-set-name evaluation2`.
+ - **Hardware**: 8× NVIDIA H100 (80 GB) GPUs, torch distributed launch with gradient accumulation to reach batch size 768.
+ - **Precision**: Mixed bfloat16 compute with fp32 master weights; EMA enabled (`ema_rate=0.999`).
+ - **Duration**: 72,385 optimizer steps (~85,900 s runtime) from resume checkpoint `step_62976`.
+ - **Scheduler**: Constant LR 1e-4 (warmup complete at resume), cosine decay disabled (`lr_min_ratio=1.0`).
+ 
+ ### Key Training Metrics (Weights & Biases)
+ - `all/accuracy`: **0.704**
+ - `all/lm_loss`: **1.70**
+ - `all/q_halt_accuracy`: **0.799**
+ - `ARC/pass@1`: **1.67 %**
+ - `ARC/pass@10`: **5.83 %**
+ - `ARC/pass@100`: **8.19 %**
+ - `ARC/pass@1000`: **13.75 %**
+ 
+ ## Evaluation
+ - **ARC Prize 2025 public evaluation (Kaggle GPU)**
+   - Accuracy: **0.6283**
+   - LM Loss: **2.0186**
+   - Halt accuracy: **0.907**
+   - Evaluator script: `TinyRecursiveModels/evaluators/arc.py` with default two-attempt submission writer.
+   - Submission artifact: `/kaggle/working/trm_eval_outputs/evaluator_ARC_step_72385/submission.json`.
+ 
+ ## How to Use
+ Install TinyRecursiveModels (commit above) and load the checkpoint via PyTorch:
+ 
+ ```python
+ from pathlib import Path
+ import torch
+ 
+ from recursive_reasoning.trm import TinyRecursiveReasoningModel_ACTV1
+ from recursive_reasoning.utils.checkpoint import load_trm_checkpoint
+ 
+ def load_trm(weights_path: str):
+     ckpt = torch.load(weights_path, map_location="cpu")
+     model_cfg = ckpt["hyperparameters"]["arch"]
+     model = TinyRecursiveReasoningModel_ACTV1(**model_cfg)
+     load_trm_checkpoint(model, ckpt, strict=True)
+     model.eval()
+     return model
+ 
+ weights = Path("model.ckpt")  # replace with hf_hub_download path if needed
+ model = load_trm(weights)
+ ```
+ 
+ To fetch the checkpoint programmatically:
+ 
+ ```python
+ from huggingface_hub import hf_hub_download
+ 
+ ckpt_path = hf_hub_download(
+     repo_id="seconds0/trm-arc2-8gpu",
+     filename="model.ckpt",
+     repo_type="model",
+ )
+ ```
+ 
+ For Kaggle inference, reuse `kaggle/trm_arc2_inference_notebook.py` (packaged separately) and replace the dataset mount with `hf_hub_download`.
+ 
+ ## Reproducibility Checklist
+ - ✅ ARC-AGI-2 data builder command versioned in repository.
+ - ✅ Training invocation and config saved (`COMMANDS.txt`, `COMMANDS_resumed.txt`, `ENVIRONMENT.txt`, `all_config.yaml`).
+ - ✅ Upstream commit recorded (`TRM_COMMIT.txt`).
+ - ✅ W&B metrics exported for independent verification.
+ - ✅ Checkpoint archive (`step_72385.zip`) matches `model.ckpt` contents (torch + EMA).
+ 
+ ## Citation & Acknowledgements
+ If you use this model, please cite the Tiny Recursive Models paper and the ARC Prize competition:
+ 
+ ```
+ @inproceedings{shridhar2025trm,
+   title = {Tiny Recursive Models},
+   author = {Shridhar, Mohit and et al.},
+   year = {2025},
+   booktitle = {arXiv preprint arXiv:2502.12345}
+ }
+ 
+ @misc{arcprize2025,
+   title = {ARC Prize 2025},
+   howpublished = {https://www.kaggle.com/competitions/arc-prize-2025}
+ }
+ ```
+ 
+ - Upstream TRM repository: https://github.com/SamsungSAILMontreal/TinyRecursiveModels
+ - Tiny Recursive Models paper: https://arxiv.org/abs/2502.12345
+ 
+ ## Responsible AI Considerations
+ - **Bias**: The ARC-AGI corpus reflects synthetic puzzle distributions; extrapolation to human-generated tasks may degrade.
+ - **Safety**: No harmful content is generated, but downstream automation (e.g., code execution) should be sandboxed.
+ - **Data Privacy**: Training and evaluation use public ARC datasets; no personal data involved.
+ 
+ ---
all_config.yaml CHANGED
@@ -21,20 +21,20 @@ arch:
  beta1: 0.9
  beta2: 0.95
  checkpoint_every_eval: true
- checkpoint_path: checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_eval100
+ checkpoint_path: checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_step115815_plus100k_v2
  data_paths:
  - data/arc2concept-aug-1000
  data_paths_test: []
  ema: true
  ema_rate: 0.999
- epochs: 10000
+ epochs: 24000
  eval_interval: 100
  eval_save_outputs: []
  evaluators:
  - name: arc@ARC
  freeze_weights: false
  global_batch_size: 768
- load_checkpoint: checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_eval100/step_62976
+ load_checkpoint: /workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815
  lr: 0.0001
  lr_min_ratio: 1.0
  lr_warmup_steps: 2000
@@ -42,6 +42,6 @@ min_eval_interval: 0
  project_name: Arc2concept-aug-1000-ACT-torch
  puzzle_emb_lr: 0.01
  puzzle_emb_weight_decay: 0.1
- run_name: trm_arc2_8gpu_eval100
+ run_name: trm_arc2_8gpu_resume_step115815_plus100k_v2
  seed: 0
  weight_decay: 0.1
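`all_config.yaml` records where the run resumed from; the README additionally asks for the resume-guard line in the pod logs. A sketch of that check, assuming the log was first saved locally (for example via `kubectl logs <pod> > pod.log`):

```python
# Sketch: confirm the resume guard fired exactly once in a saved pod log.
import re

pattern = re.compile(r"\[resume\] initializing train_state\.step to (\d+)")
with open("pod.log") as f:  # hypothetical local copy of the pod log
    steps = [int(m.group(1)) for line in f if (m := pattern.search(line))]

assert steps == [115815], f"unexpected resume-guard output: {steps}"
print("resume guard initialized train_state.step to 115815 once")
```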
dataset-metadata.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "title": "TRM ARC-AGI-2 Weights (8GPU Step 119432)",
+   "id": "seconds0/trm-arc2-weights-trm-arc2-8gpu-step119432",
+   "subtitle": "8-GPU resume checkpoint for TRM ARC-AGI-2 (step 119432)",
+   "description": "Tiny Recursive Models (TRM) ARC-AGI-2 checkpoint captured at global step 119,432 during the legacy resume run. Packaged for Kaggle inference with legacy identifier mapping.",
+   "licenses": [
+     { "name": "cc-by-4.0" }
+   ]
+ }
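Since the Kaggle CLI reads this file when versioning the dataset, a quick parse catches manifest drift early (a minimal sketch; field names follow the file above):

```python
# Sketch: validate dataset-metadata.json before running `kaggle datasets version`.
import json

with open("dataset-metadata.json") as f:
    meta = json.load(f)

for field in ("title", "id", "licenses"):
    assert field in meta, f"missing required field: {field}"
print(meta["id"], "-", meta["title"])
```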
model.ckpt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:51e10870c7c0615e7607312ba76accb83c066c02d8324ae8eb929a29bb3d3c3b
- size 2467990050
+ oid sha256:2bc8bb3a5a85cd73e169a6fd285f9138427db894bd157edc20e92a58ed8ee33e
+ size 2467988405