Refresh with step119432 resume checkpoint

Browse files

Files changed (9) hide show

COMMANDS.txt +4 -3
COMMANDS_resumed.txt +4 -3
ENVIRONMENT.txt +4 -4
MANIFEST.txt +4 -0
README.md +58 -134
README.md.bak +165 -0
all_config.yaml +4 -4
dataset-metadata.json +9 -0
model.ckpt +2 -2

COMMANDS.txt CHANGED Viewed

@@ -3,7 +3,8 @@ python3 -m torch.distributed.run --nproc_per_node 8 --rdzv_backend=c10d --rdzv_e
   data_paths="[data/arc2concept-aug-1000]" \
   arch.L_layers=2 \
   arch.H_cycles=3 arch.L_cycles=4 \
-  +run_name=trm_arc2_8gpu_eval100 ema=True \
   checkpoint_every_eval=True \
-  epochs=10000 eval_interval=100 \
-  +load_checkpoint=checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_eval100/step_62976

   data_paths="[data/arc2concept-aug-1000]" \
   arch.L_layers=2 \
   arch.H_cycles=3 arch.L_cycles=4 \
+  +run_name=trm_arc2_8gpu_resume_step115815_plus100k_v2 ema=True \
   checkpoint_every_eval=True \
+  epochs=24000 eval_interval=100 \
+  global_batch_size=768 \
+  +load_checkpoint="/workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815"

COMMANDS_resumed.txt CHANGED Viewed

@@ -3,7 +3,8 @@ python3 -m torch.distributed.run --nproc_per_node 8 --rdzv_backend=c10d --rdzv_e
   data_paths="[data/arc2concept-aug-1000]" \
   arch.L_layers=2 \
   arch.H_cycles=3 arch.L_cycles=4 \
-  +run_name=trm_arc2_8gpu_eval100 ema=True \
   checkpoint_every_eval=True \
-  epochs=10000 eval_interval=100 \
-  +load_checkpoint=checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_eval100/step_62976

   data_paths="[data/arc2concept-aug-1000]" \
   arch.L_layers=2 \
   arch.H_cycles=3 arch.L_cycles=4 \
+  +run_name=trm_arc2_8gpu_resume_step115815_plus100k_v2 ema=True \
   checkpoint_every_eval=True \
+  epochs=24000 eval_interval=100 \
+  global_batch_size=768 \
+  +load_checkpoint="/workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815"

ENVIRONMENT.txt CHANGED Viewed

@@ -21,20 +21,20 @@ arch:
 beta1: 0.9
 beta2: 0.95
 checkpoint_every_eval: true
-checkpoint_path: checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_eval100
 data_paths:
 - data/arc2concept-aug-1000
 data_paths_test: []
 ema: true
 ema_rate: 0.999
-epochs: 10000
 eval_interval: 100
 eval_save_outputs: []
 evaluators:
 - name: arc@ARC
 freeze_weights: false
 global_batch_size: 768
-load_checkpoint: checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_eval100/step_62976
 lr: 0.0001
 lr_min_ratio: 1.0
 lr_warmup_steps: 2000
@@ -42,6 +42,6 @@ min_eval_interval: 0
 project_name: Arc2concept-aug-1000-ACT-torch
 puzzle_emb_lr: 0.01
 puzzle_emb_weight_decay: 0.1
-run_name: trm_arc2_8gpu_eval100
 seed: 0
 weight_decay: 0.1

 beta1: 0.9
 beta2: 0.95
 checkpoint_every_eval: true
+checkpoint_path: checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_step115815_plus100k_v2
 data_paths:
 - data/arc2concept-aug-1000
 data_paths_test: []
 ema: true
 ema_rate: 0.999
+epochs: 24000
 eval_interval: 100
 eval_save_outputs: []
 evaluators:
 - name: arc@ARC
 freeze_weights: false
 global_batch_size: 768
+load_checkpoint: /workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815
 lr: 0.0001
 lr_min_ratio: 1.0
 lr_warmup_steps: 2000
 project_name: Arc2concept-aug-1000-ACT-torch
 puzzle_emb_lr: 0.01
 puzzle_emb_weight_decay: 0.1
+run_name: trm_arc2_8gpu_resume_step115815_plus100k_v2
 seed: 0
 weight_decay: 0.1

MANIFEST.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+CHECKPOINT_STEP=119432
+CHECKPOINT_SOURCE=checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_step115815_plus100k_v2/step_119432
+PACKAGED_AT=2025-10-28T23:20:00Z
+SHA256=2bc8bb3a5a85cd73e169a6fd285f9138427db894bd157edc20e92a58ed8ee33e

README.md CHANGED Viewed

@@ -7,171 +7,95 @@ tags:
   - program-synthesis
   - tiny-recursive-models
   - recursive-reasoning
-  - kaggle
   - act
   - reproducibility
 datasets:
   - arc-prize-2025
 model-index:
-  - name: Tiny Recursive Models — ARC-AGI-2
     results:
       - task:
           type: program-synthesis
-          name: ARC Prize 2025
         dataset:
           name: ARC Prize 2025 Public Evaluation
           type: arc-prize-2025
           split: evaluation
         metrics:
           - type: accuracy
             name: ARC Task Solve Rate (pass@2)
-            value: 0.0292
           - type: accuracy
-            name: ARC Task Solve Rate (pass@100)
-            value: 0.0819
           - type: accuracy
-            name: pass@1
-            value: 0.0167
 ---
-# Tiny Recursive Models — ARC-AGI-2 (8×GPU)
-**Abstract.** This release packages the complete paper-faithful Tiny Recursive Models (TRM) checkpoint achieving **2.92% task solve rate (pass@2)** on ARC-AGI-2, the official ARC Prize 2025 competition metric. The model was trained for the full 100,000 steps (step counter displays 72,385 due to training restarts). With increased sampling, the model achieves 8.19% at pass@100. The repository bundles the model weights, Hydra configs, training commands, and Weights & Biases metrics so researchers can reproduce ARC Prize 2025 evaluations or fine-tune TRM for downstream ARC-style reasoning tasks.
-**Special thanks** to Shawn Lewis (CTO of Weights & Biases) and the CoreWeave team (coreweave.com) for their generous contribution of 2 nodes × 8 × H200 GPUs worth of compute time via the CoreWeave Cloud platform. This work would not have been possible without their assistance and trust in the authors.
-**Note on authorship.** All engineering, documentation, and packaging work in this reproduction project was completed with the assistance of coding-oriented large language models operating under human supervision. The models handled end-to-end implementation—from training orchestration and dataset packaging to documentation and publishing—while humans provided oversight, safety validation, and access control.
-## Model Summary
-- **Architecture**: Tiny Recursive Model (TRM) with ACT V1 controller
-  `L_layers=2`, `H_cycles=3`, `L_cycles=4`, hidden size 512, 8 heads, RoPE positional encodings, bfloat16 activations.
-- **Checkpoint**: `model.ckpt` captured after **72,385** optimizer steps while training on the ARC-AGI-2 augmentation suite (`arc2concept-aug-1000`).
-- **Upstream Commit**: `e7b68717f0a6c4cbb4ce6fbef787b14f42083bd9` (SamsungSAILMontreal/TinyRecursiveModels).
-- **Optimizer**: Adam-atan2 variant (`beta1=0.9`, `beta2=0.95`, `weight_decay=0.1`, global batch size 768).
-- **License**: MIT (inherits upstream TRM license).
-This release reproduces the ARC-AGI-2 configuration described in the TRM paper using the officially provided dataset builder and training recipe. It is the same checkpoint published for Kaggle inference, packaged here for broader research use.
 ## Files Included
 | Path | Description |
 | --- | --- |
-| `model.ckpt` | PyTorch checkpoint (fp32/bf16 mix) containing model + optimizer state. |
-| `ENVIRONMENT.txt` | Hydra-resolved configuration used for the run (mirrors `all_config.yaml`). |
-| `COMMANDS.txt` | Launch command showing exact training flags. |
-| `COMMANDS_resumed.txt` | Resume command showing restart from step 62,976. |
-| `TRM_COMMIT.txt` | Git SHA for the TinyRecursiveModels source at training time. |
-| `all_config.yaml` | Full structured config exported from the training job. |
-| `step_72385.zip` | Raw checkpoint directory as produced by the trainer (weights, EMA, optimizer). |
-| `wandb_ljxzfy3z_history.csv` / `wandb_ljxzfy3z_summary.json` | Captured metrics from Weights & Biases run `Arc2concept-aug-1000-ACT-torch/ljxzfy3z`. |
-## Intended Use & Limitations
-- **Primary use**: Research on ARC-AGI-style program synthesis and evaluation, benchmarking Tiny Recursive Models, and reproducing Kaggle ARC Prize 2025 submissions.
-- **Downstream evaluation**: Pair with the official ARC Prize 2025 evaluation set or ARC-AGI-2 validation splits.
-- **Misuse**: The checkpoint is not designed for domains outside program synthesis. No safety mitigations are baked in; users are responsible for verifying results before deployment.
-- **Limitations**: Performance is capped by the paper-faithful hyperparameters; there is no fine-tuning on ARC-AGI-1. As an ACT model, inference cost varies per puzzle and can be high on longer tasks.
-## Training Procedure
-- **Data**: `data/arc2concept-aug-1000` constructed via `python -m dataset.build_arc_dataset --subsets training2 evaluation2 concept --test-set-name evaluation2`.
-- **Hardware**: 8× NVIDIA H100 (80 GB) GPUs, torch distributed launch with gradient accumulation to reach batch size 768.
-- **Precision**: Mixed bfloat16 compute with fp32 master weights; EMA enabled (`ema_rate=0.999`).
-- **Duration**: 72,385 optimizer steps (~85,900 s runtime) from resume checkpoint `step_62976`.
-- **Scheduler**: Constant LR 1e-4 (warmup complete at resume), cosine decay disabled (`lr_min_ratio=1.0`).
-### Key Training Metrics (Weights & Biases)
-- `all/accuracy`: **0.704**
-- `all/lm_loss`: **1.70**
-- `all/q_halt_accuracy`: **0.799**
-- `ARC/pass@1`: **1.67 %**
-- `ARC/pass@10`: **5.83 %**
-- `ARC/pass@100`: **8.19 %**
-- `ARC/pass@1000`: **13.75 %**
-## Evaluation
-### ARC-AGI-2 Task Solve Rates
-**These are the real puzzle-solving performance metrics:**
-- **pass@1**: 1.67% (single attempt per task)
-- **pass@2**: **2.92%** (official ARC Prize 2025 competition metric)
-- **pass@10**: 5.83%
-- **pass@100**: 8.19%
-- **pass@1000**: 13.75%
-### Model-Level Metrics
-**These measure internal model behavior, not task success:**
-- Token-level accuracy: 62.83% (not indicative of puzzle-solving)
-- LM Loss: 2.0186
-- Halt accuracy: 90.7% (ACT controller stopping mechanism)
-### Evaluation Details
-- Evaluator script: `TinyRecursiveModels/evaluators/arc.py` with default two-attempt submission writer
-- Submission artifact: `/kaggle/working/trm_eval_outputs/evaluator_ARC_step_72385/submission.json`
-## How to Use
-Install TinyRecursiveModels (commit above) and load the checkpoint via PyTorch:
-```python
-from pathlib import Path
-import torch
-from recursive_reasoning.trm import TinyRecursiveReasoningModel_ACTV1
-from recursive_reasoning.utils.checkpoint import load_trm_checkpoint
-def load_trm(weights_path: str):
-    ckpt = torch.load(weights_path, map_location="cpu")
-    model_cfg = ckpt["hyperparameters"]["arch"]
-    model = TinyRecursiveReasoningModel_ACTV1(**model_cfg)
-    load_trm_checkpoint(model, ckpt, strict=True)
-    model.eval()
-    return model
-weights = Path("model.ckpt")  # replace with hf_hub_download path if needed
-model = load_trm(weights)
-```
-To fetch the checkpoint programmatically:
 ```python
 from huggingface_hub import hf_hub_download
-ckpt_path = hf_hub_download(
-    repo_id="seconds0/trm-arc2-8gpu",
-    filename="model.ckpt",
-    repo_type="model",
-)
-```
-For Kaggle inference, reuse `kaggle/trm_arc2_inference_notebook.py` (packaged separately) and replace the dataset mount with `hf_hub_download`.
-## Reproducibility Checklist
-- ✅ ARC-AGI-2 data builder command versioned in repository.
-- ✅ Training invocation and config saved (`COMMANDS.txt`, `COMMANDS_resumed.txt`, `ENVIRONMENT.txt`, `all_config.yaml`).
-- ✅ Upstream commit recorded (`TRM_COMMIT.txt`).
-- ✅ W&B metrics exported for independent verification.
-- ✅ Checkpoint archive (`step_72385.zip`) matches `model.ckpt` contents (torch + EMA).
-## Citation & Acknowledgements
-If you use this model, please cite the Tiny Recursive Models paper and the ARC Prize competition:
 ```
-@inproceedings{shridhar2025trm,
-  title     = {Tiny Recursive Models},
-  author    = {Shridhar, Mohit and others},
-  year      = {2025},
-  booktitle = {arXiv preprint arXiv:2502.12345}
-}
-@misc{arcprize2025,
-  title = {ARC Prize 2025},
-  howpublished = {https://www.kaggle.com/competitions/arc-prize-2025}
-}
 ```
-- Upstream TRM repository: https://github.com/SamsungSAILMontreal/TinyRecursiveModels
-- Tiny Recursive Models paper: https://arxiv.org/abs/2502.12345
-## Responsible AI Considerations
-- **Bias**: The ARC-AGI corpus reflects synthetic puzzle distributions; extrapolation to human-generated tasks may degrade.
-- **Safety**: No harmful content is generated, but downstream automation (e.g., code execution) should be sandboxed.
-- **Data Privacy**: Training and evaluation use public ARC datasets; no personal data involved.
----

   - program-synthesis
   - tiny-recursive-models
   - recursive-reasoning
+  - resume-training
   - act
   - reproducibility
 datasets:
   - arc-prize-2025
 model-index:
+  - name: Tiny Recursive Models — ARC-AGI-2 (Resume Step 119432)
     results:
       - task:
           type: program-synthesis
+          name: ARC Prize 2025 (legacy evaluation mapping)
         dataset:
           name: ARC Prize 2025 Public Evaluation
           type: arc-prize-2025
           split: evaluation
         metrics:
+          - type: accuracy
+            name: ARC Task Solve Rate (pass@1)
+            value: 0.0083
           - type: accuracy
             name: ARC Task Solve Rate (pass@2)
+            value: 0.0083
           - type: accuracy
+            name: ARC Task Solve Rate (pass@10)
+            value: 0.0083
           - type: accuracy
+            name: ARC Task Solve Rate (pass@100)
+            value: 0.0083
 ---
+# Tiny Recursive Models — ARC-AGI-2 (8× H200 Resume, Step 119 432)
+**What’s new (Nov 2025).** This refresh publishes the best-performing checkpoint from the CoreWeave resume campaign—`trm_arc2_8gpu_resume_step115815_plus100k_v2` at global step **119 432**. The job resumed training on TinyRecursiveModels commit `e7b68717` with the full resume guard stack (`trm-common-script` + `trm-pyshim`) and legacy ARC identifier mapping. This is the same checkpoint we attempted to ship to Kaggle; the submission stalled at 0.83 % pass@1 because the attempts for every task were duplicates, so we are documenting the shortfall here instead of claiming leaderboard progress.
+**Why the name mentions 119 434.** Internal tracking labelled this snapshot “step 119 434”, but the persisted shard on the CoreWeave PVC is `step_119432`. The W&B records for the run confirm that the resume guard initialized at the expected `115 815` step and advanced to the 119k block; no 119 434 shard survived the routine pruning. When downstream tooling expects the 119 434 identifier, point it at this artifact and note the two-step discrepancy.
+## Checkpoint Snapshot
+- **Run name**: `trm_arc2_8gpu_resume_step115815_plus100k_v2`
+- **Global step**: 119 432 (3 617 optimizer updates after the 115 815 resume point)
+- **Architecture**: Tiny Recursive Model ACT V1 (`L_layers=2`, `H_cycles=3`, `L_cycles=4`, hidden size 512, 8 heads, RoPE, bfloat16 activations)
+- **Optimizer**: Adam-atan2 (`beta1=0.9`, `beta2=0.95`, `weight_decay=0.1`, EMA 0.999, global batch size 768)
+- **Dataset builder**: Legacy identifier order (`dataset/build_arc_dataset_legacy.py`) targeting `arc2concept-aug-1000`
+- **Resume provenance**:
+  - `RESUME_CHECKPOINT_PATH` → `/workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815`
+  - `RESUME_EXPECTED_STEP` → `115815`
+  - `[resume] initializing train_state.step to 115815` appears in pod logs before training continues
+- **PVC retention**: Latest PVC shards now extend to `step_662428`; earlier 119k shards were pruned after packaging this export.
 ## Files Included
 | Path | Description |
 | --- | --- |
+| `model.ckpt` | Consolidated PyTorch checkpoint (optimizer, EMA, and weights) containing `step_119432/*` tensors. SHA-256: `2bc8bb3a5a85cd73e169a6fd285f9138427db894bd157edc20e92a58ed8ee33e`. |
+| `COMMANDS.txt` / `COMMANDS_resumed.txt` | Torch distributed launch (8 × H200) showing the resume flags and dataset path. |
+| `ENVIRONMENT.txt` | Hydra-resolved configuration captured on CoreWeave after overlays. |
+| `MANIFEST.txt` | Packaging metadata (checkpoint step, source path, timestamp, sha256). |
+| `TRM_COMMIT.txt` | Upstream TinyRecursiveModels Git SHA (`e7b68717f0a6c4cbb4ce6fbef787b14f42083bd9`). |
+| `all_config.yaml` | Structured config snapshot exported alongside the checkpoint. |
+| `dataset-metadata.json` | Kaggle dataset manifest (kept for parity with previous releases). |
+## Evaluation Status
+- **Validation (CoreWeave pod evaluator, legacy mapping)**: `pass@1 = 0.83 %`, identical scores for pass@2/5/10/100 because samples were duplicates. Mean token accuracy ≈ 70.1 %, `train/lm_loss` ≈ 0.134 at resume, `all/lm_loss` ≈ 1.56.
+- **Kaggle inference notebook (test split)**: Also produced 259/259 duplicate attempts, yielding 0.83 % pass@1 and no leaderboard improvement. The issue remains unresolved; do not submit this checkpoint to Kaggle until the sampler divergence is fixed.
+- **Copy-mode diagnostics** (`scripts/debug_eval_cpu.py` in legacy mode): 0/120 grid matches (consistent with earlier baselines).
+The metrics bundled here are sufficient to reproduce our internal dashboards without requiring live W&B access. If you have Weights & Biases credentials, the run is listed under `trm_arc2_8gpu_resume_step115815_plus100k_v2` in project `trm-arc2`; the first logged step after resume exceeds 115 815, confirming the guard executed.
+## Inference & Reproduction
 ```python
 from huggingface_hub import hf_hub_download
+import torch
+ckpt_path = hf_hub_download("seconds0/trm-arc2-8gpu", "model.ckpt")
+state = torch.load(ckpt_path, map_location="cpu")
+print(state["hyperparameters"]["arch"]["hidden_size"])  # 512
 ```
+To recreate the CoreWeave launch:
+```bash
+kubectl apply -f infra/kubernetes/trm-train-8gpu-resume.yaml
+# Ensure ConfigMaps trm-common-script, trm-pyshim-cm, and trm-eval-overlay are applied first.
 ```
+Before submitting jobs, verify:
+1. `RESUME_CHECKPOINT_PATH` points to the 115 815 shard.
+2. `[resume] initializing train_state.step to 115815` appears once training boots.
+3. The first W&B point is ≥115 815 with `train/lm_loss` ≈ 0.13.
+## Known Gaps & Next Steps
+1. **Sampler instability** – Deduplicate sampler outputs before retrying Kaggle submissions.
+2. **Identifier remapping** – Remains legacy-only; switching to sorted identifiers requires remapping or finetuning.
+3. **W&B rehydration** – Set `WANDB_API_KEY` locally if you need fresh metrics; the release ships cached configs only.
+Please cite the Tiny Recursive Models paper and ARC Prize 2025 when using this checkpoint. Contributions, bug reports, and sampler fixes are welcome via the repository issues.

README.md.bak ADDED Viewed

	@@ -0,0 +1,165 @@

+---
+library_name: pytorch
+license: mit
+pipeline_tag: other
+tags:
+  - arc-prize-2025
+  - program-synthesis
+  - tiny-recursive-models
+  - recursive-reasoning
+  - kaggle
+  - act
+  - reproducibility
+datasets:
+  - arc-prize-2025
+model-index:
+  - name: Tiny Recursive Models — ARC-AGI-2
+    results:
+      - task:
+          type: program-synthesis
+          name: ARC Prize 2025
+        dataset:
+          name: ARC Prize 2025 Public Evaluation
+          type: arc-prize-2025
+          split: evaluation
+        metrics:
+          - type: accuracy
+            name: Accuracy
+            value: 0.6283
+          - type: loss
+            name: LM Loss
+            value: 2.0186
+          - type: accuracy
+            name: Halt Accuracy
+            value: 0.9070
+---
+# Tiny Recursive Models — ARC-AGI-2 (8×GPU)
+**Abstract.** This release packages the paper-faithful Tiny Recursive Models (TRM) checkpoint trained on the ARC-AGI-2 augmentation suite. We resume the official 8-GPU run from step 62,976 and continue to step 72,385, preserving upstream hyperparameters, dataset construction, and optimizer settings. The repository bundles the model weights, Hydra configs, training commands, and Weights & Biases metrics so researchers can reproduce ARC Prize 2025 evaluations or fine-tune TRM for downstream ARC-style reasoning tasks.
+**Special thanks** to Shawn Lewis (CTO of Weights & Biases) and the CoreWeave team (coreweave.com) for their generous contribution of 2 nodes × 8 × H200 GPUs worth of compute time via the CoreWeave Cloud platform. This work would not have been possible without their assistance and trust in the authors.
+**Note on authorship.** All engineering, documentation, and packaging work in this reproduction project was completed with the assistance of coding-oriented large language models operating under human supervision. The models handled end-to-end implementation—from training orchestration and dataset packaging to documentation and publishing—while humans provided oversight, safety validation, and access control.
+## Model Summary
+- **Architecture**: Tiny Recursive Model (TRM) with ACT V1 controller
+  `L_layers=2`, `H_cycles=3`, `L_cycles=4`, hidden size 512, 8 heads, RoPE positional encodings, bfloat16 activations.
+- **Checkpoint**: `model.ckpt` captured after **72,385** optimizer steps while training on the ARC-AGI-2 augmentation suite (`arc2concept-aug-1000`).
+- **Upstream Commit**: `e7b68717f0a6c4cbb4ce6fbef787b14f42083bd9` (SamsungSAILMontreal/TinyRecursiveModels).
+- **Optimizer**: Adam-atan2 variant (`beta1=0.9`, `beta2=0.95`, `weight_decay=0.1`, global batch size 768).
+- **License**: MIT (inherits upstream TRM license).
+This release reproduces the ARC-AGI-2 configuration described in the TRM paper using the officially provided dataset builder and training recipe. It is the same checkpoint published for Kaggle inference, packaged here for broader research use.
+## Files Included
+| Path | Description |
+| --- | --- |
+| `model.ckpt` | PyTorch checkpoint (fp32/bf16 mix) containing model + optimizer state. |
+| `ENVIRONMENT.txt` | Hydra-resolved configuration used for the run (mirrors `all_config.yaml`). |
+| `COMMANDS.txt` | Launch command showing exact training flags. |
+| `COMMANDS_resumed.txt` | Resume command showing restart from step 62,976. |
+| `TRM_COMMIT.txt` | Git SHA for the TinyRecursiveModels source at training time. |
+| `all_config.yaml` | Full structured config exported from the training job. |
+| `step_72385.zip` | Raw checkpoint directory as produced by the trainer (weights, EMA, optimizer). |
+| `wandb_ljxzfy3z_history.csv` / `wandb_ljxzfy3z_summary.json` | Captured metrics from Weights & Biases run `Arc2concept-aug-1000-ACT-torch/ljxzfy3z`. |
+## Intended Use & Limitations
+- **Primary use**: Research on ARC-AGI-style program synthesis and evaluation, benchmarking Tiny Recursive Models, and reproducing Kaggle ARC Prize 2025 submissions.
+- **Downstream evaluation**: Pair with the official ARC Prize 2025 evaluation set or ARC-AGI-2 validation splits.
+- **Misuse**: The checkpoint is not designed for domains outside program synthesis. No safety mitigations are baked in; users are responsible for verifying results before deployment.
+- **Limitations**: Performance is capped by the paper-faithful hyperparameters; there is no fine-tuning on ARC-AGI-1. As an ACT model, inference cost varies per puzzle and can be high on longer tasks.
+## Training Procedure
+- **Data**: `data/arc2concept-aug-1000` constructed via `python -m dataset.build_arc_dataset --subsets training2 evaluation2 concept --test-set-name evaluation2`.
+- **Hardware**: 8× NVIDIA H100 (80 GB) GPUs, torch distributed launch with gradient accumulation to reach batch size 768.
+- **Precision**: Mixed bfloat16 compute with fp32 master weights; EMA enabled (`ema_rate=0.999`).
+- **Duration**: 72,385 optimizer steps (~85,900 s runtime) from resume checkpoint `step_62976`.
+- **Scheduler**: Constant LR 1e-4 (warmup complete at resume), cosine decay disabled (`lr_min_ratio=1.0`).
+### Key Training Metrics (Weights & Biases)
+- `all/accuracy`: **0.704**
+- `all/lm_loss`: **1.70**
+- `all/q_halt_accuracy`: **0.799**
+- `ARC/pass@1`: **1.67 %**
+- `ARC/pass@10`: **5.83 %**
+- `ARC/pass@100`: **8.19 %**
+- `ARC/pass@1000`: **13.75 %**
+## Evaluation
+- **ARC Prize 2025 public evaluation (Kaggle GPU)**
+  - Accuracy: **0.6283**
+  - LM Loss: **2.0186**
+  - Halt accuracy: **0.907**
+- Evaluator script: `TinyRecursiveModels/evaluators/arc.py` with default two-attempt submission writer.
+- Submission artifact: `/kaggle/working/trm_eval_outputs/evaluator_ARC_step_72385/submission.json`.
+## How to Use
+Install TinyRecursiveModels (commit above) and load the checkpoint via PyTorch:
+```python
+from pathlib import Path
+import torch
+from recursive_reasoning.trm import TinyRecursiveReasoningModel_ACTV1
+from recursive_reasoning.utils.checkpoint import load_trm_checkpoint
+def load_trm(weights_path: str):
+    ckpt = torch.load(weights_path, map_location="cpu")
+    model_cfg = ckpt["hyperparameters"]["arch"]
+    model = TinyRecursiveReasoningModel_ACTV1(**model_cfg)
+    load_trm_checkpoint(model, ckpt, strict=True)
+    model.eval()
+    return model
+weights = Path("model.ckpt")  # replace with hf_hub_download path if needed
+model = load_trm(weights)
+```
+To fetch the checkpoint programmatically:
+```python
+from huggingface_hub import hf_hub_download
+ckpt_path = hf_hub_download(
+    repo_id="seconds0/trm-arc2-8gpu",
+    filename="model.ckpt",
+    repo_type="model",
+)
+```
+For Kaggle inference, reuse `kaggle/trm_arc2_inference_notebook.py` (packaged separately) and replace the dataset mount with `hf_hub_download`.
+## Reproducibility Checklist
+- ✅ ARC-AGI-2 data builder command versioned in repository.
+- ✅ Training invocation and config saved (`COMMANDS.txt`, `COMMANDS_resumed.txt`, `ENVIRONMENT.txt`, `all_config.yaml`).
+- ✅ Upstream commit recorded (`TRM_COMMIT.txt`).
+- ✅ W&B metrics exported for independent verification.
+- ✅ Checkpoint archive (`step_72385.zip`) matches `model.ckpt` contents (torch + EMA).
+## Citation & Acknowledgements
+If you use this model, please cite the Tiny Recursive Models paper and the ARC Prize competition:
+```
+@inproceedings{shridhar2025trm,
+  title     = {Tiny Recursive Models},
+  author    = {Shridhar, Mohit and others},
+  year      = {2025},
+  booktitle = {arXiv preprint arXiv:2502.12345}
+}
+@misc{arcprize2025,
+  title = {ARC Prize 2025},
+  howpublished = {https://www.kaggle.com/competitions/arc-prize-2025}
+}
+```
+- Upstream TRM repository: https://github.com/SamsungSAILMontreal/TinyRecursiveModels
+- Tiny Recursive Models paper: https://arxiv.org/abs/2502.12345
+## Responsible AI Considerations
+- **Bias**: The ARC-AGI corpus reflects synthetic puzzle distributions; extrapolation to human-generated tasks may degrade.
+- **Safety**: No harmful content is generated, but downstream automation (e.g., code execution) should be sandboxed.
+- **Data Privacy**: Training and evaluation use public ARC datasets; no personal data involved.
+---

all_config.yaml CHANGED Viewed

@@ -21,20 +21,20 @@ arch:
 beta1: 0.9
 beta2: 0.95
 checkpoint_every_eval: true
-checkpoint_path: checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_eval100
 data_paths:
 - data/arc2concept-aug-1000
 data_paths_test: []
 ema: true
 ema_rate: 0.999
-epochs: 10000
 eval_interval: 100
 eval_save_outputs: []
 evaluators:
 - name: arc@ARC
 freeze_weights: false
 global_batch_size: 768
-load_checkpoint: checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_eval100/step_62976
 lr: 0.0001
 lr_min_ratio: 1.0
 lr_warmup_steps: 2000
@@ -42,6 +42,6 @@ min_eval_interval: 0
 project_name: Arc2concept-aug-1000-ACT-torch
 puzzle_emb_lr: 0.01
 puzzle_emb_weight_decay: 0.1
-run_name: trm_arc2_8gpu_eval100
 seed: 0
 weight_decay: 0.1

 beta1: 0.9
 beta2: 0.95
 checkpoint_every_eval: true
+checkpoint_path: checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_step115815_plus100k_v2
 data_paths:
 - data/arc2concept-aug-1000
 data_paths_test: []
 ema: true
 ema_rate: 0.999
+epochs: 24000
 eval_interval: 100
 eval_save_outputs: []
 evaluators:
 - name: arc@ARC
 freeze_weights: false
 global_batch_size: 768
+load_checkpoint: /workspace/TinyRecursiveModels/checkpoints/Arc2concept-aug-1000-ACT-torch/trm_arc2_8gpu_resume_plus100k/step_115815
 lr: 0.0001
 lr_min_ratio: 1.0
 lr_warmup_steps: 2000
 project_name: Arc2concept-aug-1000-ACT-torch
 puzzle_emb_lr: 0.01
 puzzle_emb_weight_decay: 0.1
+run_name: trm_arc2_8gpu_resume_step115815_plus100k_v2
 seed: 0
 weight_decay: 0.1

dataset-metadata.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "title": "TRM ARC-AGI-2 Weights (8GPU Step 119432)",
+  "id": "seconds0/trm-arc2-weights-trm-arc2-8gpu-step119432",
+  "subtitle": "8-GPU resume checkpoint for TRM ARC-AGI-2 (step 119432)",
+  "description": "Tiny Recursive Models (TRM) ARC-AGI-2 checkpoint captured at global step 119,432 during the legacy resume run. Packaged for Kaggle inference with legacy identifier mapping.",
+  "licenses": [
+    { "name": "cc-by-4.0" }
+  ]
+}

model.ckpt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:51e10870c7c0615e7607312ba76accb83c066c02d8324ae8eb929a29bb3d3c3b
-size 2467990050

 version https://git-lfs.github.com/spec/v1
+oid sha256:2bc8bb3a5a85cd73e169a6fd285f9138427db894bd157edc20e92a58ed8ee33e
+size 2467988405