psidharth567 committed on Apr 10

Commit

dbc69f3

0 Parent(s):

Export neuralese codebase (cache and .env excluded).

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +108 -0
.gitignore +15 -0
AGENTS.md +56 -0
README.md +164 -0
configs/accelerate_ddp_2gpu.yaml +17 -0
configs/accelerate_ddp_4gpu.yaml +17 -0
configs/accelerate_ddp_7gpu.yaml +17 -0
configs/accelerate_ddp_8gpu.yaml +17 -0
configs/grpo_llama32_3b_bf16.yaml +70 -0
docs/repository-map.md +50 -0
out.jsonl +0 -0
requirements.txt +9 -0
scripts/conda_env.sh +40 -0
scripts/eval_gsm8k_think_entropy_4gpu.sh +42 -0
scripts/eval_gsm8k_truncated_thinking.sh +51 -0
scripts/eval_gsm8k_zeroshot_4gpu.sh +36 -0
scripts/eval_gsm8k_zeroshot_train_4gpu.sh +54 -0
scripts/eval_length_penalty_ablation_offline.sh +39 -0
scripts/eval_math_level1_4gpu.sh +55 -0
scripts/eval_math_level1_thinking_zeroshot_4gpu.sh +48 -0
scripts/eval_math_level1_zeroshot_4gpu.sh +49 -0
scripts/eval_math_level2_thinking_zeroshot_4gpu.sh +48 -0
scripts/eval_math_level2_zeroshot_4gpu.sh +49 -0
scripts/eval_permanent_root_acc_cot.sh +53 -0
scripts/eval_sweep_models_offline.sh +34 -0
scripts/eval_twostage_permanent_checkpoints.sh +47 -0
scripts/hf_download_repo.py +72 -0
scripts/hf_upload_repo.py +128 -0
scripts/resume_grpo_8gpu.sh +27 -0
scripts/run_grpo.sh +19 -0
scripts/run_grpo_2gpu.sh +22 -0
scripts/run_grpo_4gpu.sh +22 -0
scripts/run_grpo_8gpu.sh +25 -0
scripts/run_grpo_thinking_kl_masked_resume_4gpu.sh +139 -0
scripts/run_lambda_0p1_existing_gate_token_util.sh +107 -0
scripts/run_reward_variants_and_eval.sh +146 -0
scripts/run_sft_gsm8k_boxed_7gpu.sh +44 -0
scripts/run_twostage_correctness1.sh +79 -0
scripts/run_twostage_correctness5.sh +79 -0
scripts/sweep_length_penalty_lambda.sh +88 -0
src/eval_gsm8k_truncated_thinking.py +321 -0
src/eval_gsm8k_zeroshot.py +76 -0
src/eval_math_level1_thinking_zeroshot.py +312 -0
src/eval_permanent_checkpoints.py +433 -0
src/eval_sweep_models.py +386 -0
src/hackable/__init__.py +6 -0
src/hackable/backends.py +62 -0
src/hackable/config.py +183 -0
src/hackable/data_plugins.py +291 -0
src/hackable/interfaces.py +48 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,108 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.avro filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mds filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
+# Video files - compressed
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.webm filter=lfs diff=lfs merge=lfs -text
+cache/models/models--unsloth--llama-3.2-3b-instruct-unsloth-bnb-4bit/blobs/9151ba0c695e63ba3dea7436a55bb6aa9f2d0a3d7b7eead62086959354bd6c67 filter=lfs diff=lfs merge=lfs -text
+cache/models/models--unsloth--llama-3.2-3b-instruct-unsloth-bnb-4bit/blobs/6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b filter=lfs diff=lfs merge=lfs -text
+cache/models/models--unsloth--llama-3.2-3b-instruct-unsloth-bnb-4bit/snapshots/19846d3f624f3eb96f3bdd275620c6bc7e21e1f8/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+cache/artifacts/runs/llama32-3b-grpo/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+cache/artifacts/runs/llama32-3b-grpo/checkpoint-275/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+cache/cache/artifacts/sweeps/reward_variants_lambda_0p1/run_lambda_0p1_existing_gate_token_util/checkpoint-1350/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+cache/cache/artifacts/sweeps/twostage_correctness_weight/run_twostage_correctness5/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+cache/cache/artifacts/sweeps/twostage_correctness_weight/run_twostage_correctness5/checkpoint-3736/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+cache/cache/artifacts/sweeps/twostage_correctness_weight/run_twostage_correctness5/permanent_checkpoints/checkpoint-stage1-boundary-epoch-1p0-step-1868/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+cache/cache/artifacts/sweeps/twostage_correctness_weight/run_twostage_correctness1/checkpoint-2925/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+cache/cache/artifacts/sweeps/twostage_correctness_weight/run_twostage_correctness1/permanent_checkpoints/checkpoint-stage1-boundary-epoch-1p0-step-1868/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260218_164737-w4awch9k/run-w4awch9k.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260218_172032-b9o54gvf/run-b9o54gvf.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260218_170233-ydg9el7o/run-ydg9el7o.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260218_165455-zj6y61aw/run-zj6y61aw.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260326_035115-p68cj9a2/run-p68cj9a2.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260331_190529-w0p8d22o/run-w0p8d22o.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260326_235622-lo899m3z/run-lo899m3z.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/run-20260218_150224-l50vxprm/run-l50vxprm.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260327_235926-kx2wc1tp/run-kx2wc1tp.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260329_125618-21yt0wn1/run-21yt0wn1.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/run-20260218_145059-onpmdiea/run-onpmdiea.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260218_172557-kst72vbj/run-kst72vbj.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260325_203352-8ncdpjq1/run-8ncdpjq1.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260329_125618-nuzntjga/run-nuzntjga.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260328_025352-z84jpmb8/run-z84jpmb8.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260328_142710-or2vd7o6/run-or2vd7o6.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260329_124408-koirzdhw/run-koirzdhw.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260325_052529-dz6rih26/run-dz6rih26.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260325_015712-7ialnk1w/run-7ialnk1w.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260327_235600-s2pr0n6l/run-s2pr0n6l.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260218_154633-igplmjku/run-igplmjku.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260218_204810-jg5x3bn6/run-jg5x3bn6.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260218_205814-3cvhbicy/run-3cvhbicy.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260326_000646-w5fgyzj3/run-w5fgyzj3.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260326_201451-g14348cv/run-g14348cv.wandb filter=lfs diff=lfs merge=lfs -text
+neuralese_latent/wandb/offline-run-20260219_214352-58nmo2ma/run-58nmo2ma.wandb filter=lfs diff=lfs merge=lfs -text
+cache/models/datasets--openai--gsm8k/blobs/ee7b8da9e381df27b9e3f7758a159ab2bdaa4dbaa910546cbbc47e0cb44e4f59 filter=lfs diff=lfs merge=lfs -text
+cache/models/datasets--openai--gsm8k/blobs/ea82612ea9582142387730c793eb67d3b12849002bc0b7fa6f8efafa7351419d filter=lfs diff=lfs merge=lfs -text
+cache/models/models--unsloth--Llama-3.2-3B-Instruct/blobs/6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b filter=lfs diff=lfs merge=lfs -text
+cache/models/models--unsloth--Llama-3.2-3B-Instruct/blobs/7b770216613ac5c34d7c54bdff1fa616bc4e338a9d0b20af6303e48c295ee23c filter=lfs diff=lfs merge=lfs -text
+cache/models/models--unsloth--Llama-3.2-3B-Instruct/blobs/13cbd6d16e927a0c5bad54102514e6e18b4a47b3a6eb911e39d678d328d19f55 filter=lfs diff=lfs merge=lfs -text
+cache/models/models--unsloth--Llama-3.2-3B-Instruct/snapshots/006f5dcd1393c3add266de40994ba96225e9689d/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260219_155344-i4fzutup/run-i4fzutup.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260328_061445-eyjtftlz/run-eyjtftlz.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260327_234927-gbd6yui5/run-gbd6yui5.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260326_154346-zf18ek2n/run-zf18ek2n.wandb filter=lfs diff=lfs merge=lfs -text
+cache/wandb/wandb/offline-run-20260325_011840-yvcn1q14/run-yvcn1q14.wandb filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,15 @@

+.env
+# Large local caches and artifacts (not for Hub)
+cache/
+**/__pycache__/
+*.py[cod]
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+*.egg-info/
+.eggs/
+.wandb/
+*.swp
+.DS_Store

AGENTS.md ADDED Viewed

	@@ -0,0 +1,56 @@

+# AGENTS.md
+## Purpose
+This repo is a GRPO-style math-reasoning training workspace. Agents should preserve a clean split between tracked source code and untracked runtime artifacts.
+## Canonical Layout
+Tracked code and docs:
+- `src/`: Python entrypoints and the `hackable` library.
+- `configs/`: Accelerate and experiment YAMLs.
+- `scripts/`: launchers and workflow helpers.
+- `docs/`: durable repo documentation.
+- `README.md`: user-facing overview.
+Runtime storage under `cache/`:
+- `cache/datasets/`: Hugging Face dataset cache and prepared dataset assets.
+- `cache/models/`: local model weights, checkpoint loads, tokenizer cache.
+- `cache/hf/`: extra Hugging Face runtime cache.
+- `cache/artifacts/runs/`: single training or SFT runs.
+- `cache/artifacts/sweeps/`: parameter sweeps and multi-run experiments.
+- `cache/artifacts/eval/`: evaluation outputs, JSONL, CSV, plots.
+- `cache/logs/wandb/`: W&B offline or online run logs.
+Within a run directory, use:
+- `checkpoint-*`: rotating trainer checkpoints.
+- `checkpoints/permanent/`: non-rotating full-model snapshots saved by the callback.
+- run-local analysis files only when they belong to that run.
+## Path Rules
+- Treat `storage.cache_dir` as the runtime root. Relative runtime paths such as `artifacts/runs/foo` resolve under that root.
+- Do not introduce new top-level folders for datasets, checkpoints, models, or logs.
+- Prefer `hackable.paths.resolve_storage_path()` and `hackable.paths.storage_layout()` over manual string concatenation.
+- Keep tracked docs and code out of `cache/`. Keep large generated outputs out of `src/`, `scripts/`, and the repo root.
+## Code Map
+- `src/train_grpo.py`: main GRPO training entrypoint.
+- `src/sft_gsm8k_boxed.py`: SFT baseline for boxed-only GSM8K.
+- `src/eval_*.py`: evaluation entrypoints.
+- `src/hackable/config.py`: YAML dataclasses.
+- `src/hackable/paths.py`: canonical storage layout helpers.
+- `src/hackable/data_plugins.py`: dataset providers.
+- `src/hackable/reward_plugins.py`: reward functions.
+- `src/hackable/objectives.py`: reward combination logic.
+- `src/hackable/permanent_checkpoint_callback.py`: permanent checkpoint writer.
+## Working Agreement
+- Before changing structure, inspect existing path assumptions in both `src/` and `scripts/`.
+- When adding a new workflow, document it in `README.md` or `docs/repository-map.md`.
+- Preserve user changes already present in the worktree unless explicitly asked to rewrite them.

README.md ADDED Viewed

	@@ -0,0 +1,164 @@

+# Neuralese / hackable GRPO baseline
+This repository is a **hackable GRPO-style training baseline** for math reasoning with a fixed chain-of-thought format: reasoning inside `<redacted_thinking>...</redacted_thinking>` and a final `\boxed{...}` answer. The design keeps **objectives**, **rewards**, and **data providers** in small pluggable modules so you can experiment without rewriting a monolithic trainer.
+## Repository layout
+| Path | Role |
+|------|------|
+| `src/hackable/` | Core library: config types, registries, objectives, rewards, data providers, model loading. |
+| `src/train_grpo.py` | Training entrypoint expected by `scripts/*.sh` (may be absent in a partial checkout; scripts default to `--config configs/grpo_llama32_3b_bf16.yaml`). |
+| `docs/repository-map.md` | Maintainer-focused map of entrypoints, modules, and storage conventions. |
+| `AGENTS.md` | Agent-facing workflow and folder contract for future repo changes. |
+| `configs/` | YAML experiment configs (referenced by scripts; may need to be added locally). |
+| `scripts/` | Bash launchers for single/multi-GPU training, sweeps, and evaluation. |
+| `cache/` | Runtime storage root: `datasets/`, `models/`, `hf/`, `artifacts/{runs,sweeps,eval}/`, and `logs/wandb/`. |
+| `requirements.txt` | Python dependencies (Torch, TRL, Transformers, Accelerate, etc.). |
+| `note.txt` | Unrelated environment/pip noise from a past install; not project documentation. |
+### Storage conventions
+- `storage.cache_dir` is the canonical runtime root. Relative runtime paths like `artifacts/runs/grpo-llama32-3b` resolve under that directory.
+- Use `cache/datasets` for dataset cache, `cache/models` for model/tokenizer cache, and `cache/logs/wandb` for W&B logs.
+- Training runs live in `cache/artifacts/runs`, sweep outputs in `cache/artifacts/sweeps`, eval outputs in `cache/artifacts/eval`.
+- Permanent checkpoints now live under each run at `checkpoints/permanent/`.
+### `src/hackable/` modules
+- **`config.py`** — Loads YAML into typed dataclasses (`ExperimentConfig`, `ModelConfig`, `TrainerConfig`, `GenerationConfig`, `ObjectiveConfig`, `RewardsConfig`, …). Normalizes optimizer/scheduler aliases and numeric fields.
+- **`registry.py`** — `@register_data_provider`, `@register_reward`, `@register_objective` plus `build_*` factories. Reward kwargs from YAML are partially applied via `build_reward` so each reward can receive static options (e.g. tokenizer name for length penalty).
+- **`interfaces.py`** — `TrainingSample` and protocol shapes for providers/rewards/objectives.
+- **`objectives.py`** — `TokenGRPOObjective` (main token-level GRPO recipe) and `LatentNeuraleseObjective` (stub: format reward only, `extra_reward` no-op for future latent scoring).
+- **`reward_plugins.py`** — Registered rewards: strict format, GSM8K/MATH-style correctness, **length penalty**, optional token-utilisation shaping. See [Length penalty rewards](#length-penalty-rewards) below.
+- **`data_plugins.py`** — GSM8K, Hendrycks MATH by level, interleaved curricula (`gsm8k_math_curriculum`, etc.). Shared prompt prefix matches the strict completion format expected by rewards.
+- **`backends.py`** — Loads causal LMs with Liger Llama patches when applicable, FlashAttention2 when importable, else SDPA. Exposes `generation_kwargs` from config.
+- **`utils.py`** — `import_from_path` for `objective.class_path` style `"module:Class"` loading.
+Importing `hackable` registers default plugins (`hackable/__init__.py` imports `data_plugins`, `objectives`, `reward_plugins`).
+### `src/` evaluation utilities
+- **`eval_sweep_models.py`** — Distributed evaluation of every `run_*` directory under a sweep root: loads checkpoints, runs GSM8K test generations, scores correctness, records CoT word length stats, writes CSV/JSON summaries (used after lambda or reward-variant sweeps).
+- **`eval_permanent_checkpoints.py`** — Walks `checkpoints/permanent` folders, evaluates each checkpoint, can emit simple SVG learning curves.
+- **`eval_math_level1_thinking_zeroshot.py`** — Zero-shot / thinking-format eval on MATH-style data with JSONL output (for downstream rewards or analysis).
+### `scripts/` (high level)
+| Script | Purpose |
+|--------|---------|
+| `run_grpo.sh`, `run_grpo_2gpu.sh`, `run_grpo_4gpu.sh`, `run_grpo_8gpu.sh` | Launch training with Accelerate. |
+| `resume_grpo_8gpu.sh` | Resume from latest or explicit checkpoint. |
+| `sweep_length_penalty_lambda.sh` | Trains multiple runs with different `length_penalty_lambda` (weighted length-penalty mode). |
+| `run_reward_variants_and_eval.sh` | Trains three interaction/gating variants, then runs `eval_sweep_models.py`. |
+| `run_twostage_correctness1.sh`, `run_twostage_correctness5.sh` | Two-stage schedules; YAML is expected to set `correctness_weight` and optionally stage-2 length-penalty fields (see below). |
+| `run_lambda_0p1_existing_gate_token_util.sh` | Example run with low λ and `token_utilisation_reward` enabled. |
+| `eval_sweep_models_offline.sh` | Offline eval driver for a length-penalty λ sweep directory. |
+| `eval_length_penalty_ablation_offline.sh` | Launches `src/eval_length_penalty_ablation.py` (script must exist alongside training code). |
+| `eval_twostage_permanent_checkpoints.sh` | Eval for two-stage permanent checkpoint trees. |
+| `eval_gsm8k_*.sh`, `eval_math_level*_*.sh` | Dataset-specific eval launchers. |
+| `hf_upload_repo.py`, `hf_download_repo.py` | Push/pull Hugging Face dataset repo snapshots. |
+---
+## Length penalty rewards
+Training prefers **shorter thinking traces** *within each GRPO group* (same prompt, multiple sampled completions). The signal is implemented as a **reward** (`length_penalty_reward`) and combined with correctness and format either **additively** or via a **weighted multiplicative** term controlled by `TokenGRPOObjective`.
+### What gets measured
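+1. **Strict format only** — `_think_length_tokens` in `reward_plugins.py` parses the completion with the same regex as `format_tag_reward`: a single block `<redacted_thinking>...</redacted_thinking>` followed by `\boxed{...}`. If the completion does not match, thinking length is treated as **0**; the intent is that malformed outputs gain nothing on length grounds. Note, however, that under the per-group formula in step 3 a length of 0 yields the *maximum* length score whenever the group mean is positive, so that intent appears to rely on the downstream combination (`strict_format_gate`, or the multiplication by correctness/format in weighted mode) — confirm against `reward_plugins.py` before using the length reward ungated.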
+2. **Token count** — If `tokenizer_name` is passed (via `rewards.kwargs.length_penalty_reward` in YAML and `build_reward`), the **thinking substring** is encoded with that tokenizer (`add_special_tokens=False`) and the length is the number of token IDs. If no tokenizer is configured, length falls back to **whitespace-split words** inside the thinking block.
+3. **Per-group normalization** — For each group \(G\) of completions (see grouping below), let \(L_i\) be the thinking length of completion \(i\), and \(\bar{L} = \frac{1}{|G|}\sum_j L_j\). If \(\bar{L} \le 0\), every score in the group is **0**. Otherwise:
+\[
+R^{\mathrm{length}}_i = \max\left(0,\ 1 - \frac{L_i}{\bar{L}}\right).
+\]
+So **shorter-than-average** thinking in the group scores between **0** and **1** (approaching **1** as the thinking length goes to zero), while every completion **at or above** the group average is clamped to exactly **0**. This is a **relative** length preference, not an absolute token budget.
+### How groups are formed
+`length_penalty_reward` assigns the same group normalization to completions that belong to the same GRPO comparison group:
+- If `group_size` is set (again, typically under `rewards.kwargs.length_penalty_reward`), the flat batch is chunked in order: `[0:group_size)`, `[group_size:2*group_size)`, …
+- Else groups are inferred by **contiguous runs of identical `prompt` text** in the parallel lists passed to the reward.
+### How `TokenGRPOObjective` combines length with other rewards
+Configured under `objective.name: token_grpo` with `objective.kwargs` (see `src/hackable/objectives.py`).
+**Registered reward names** (when enabled):
+- `format_tag_reward` — 1.0 if strict thinking + non-empty boxed answer, else 0.0.
+- `gsm8k_correctness_reward` — Parses `\boxed{...}` vs reference (GSM8K `####` answers or MATH boxed solutions), numeric normalization and tolerant float compare.
+- `length_penalty_reward` — if `enable_length_penalty: true`.
+- `token_utilisation_reward` — optional; shapes training vs a frozen zero-shot correctness JSONL (see docstring in `reward_plugins.py`).
+**`reward_mode`**
+1. **`additive`** — Total score is the **sum** of all enabled reward outputs for that sample. If `strict_format_gate: true`, any sample with `format_tag_reward ≤ 0.5` is replaced by `non_strict_penalty` (default **-1.0**) instead of the sum.
+2. **`weighted_length_penalty`** — Correctness and length interact multiplicatively; format is added (and optionally multiplied into the interaction):
+   - Let \(r_c, r_f, r_\ell\) be correctness, format, and length scores in \([0,1]\) (length may be 0 if disabled or malformed).
+   - Base interaction: \(r_c \times r_\ell\).
+   - If `length_penalty_interaction` is `correctness_length_format`, the interaction is \(r_c \times r_\ell \times r_f\).
+   - Total (with the optional token-utilisation term \(r_{\mathrm{util}}\) as the last summand):
+   \[
+   \texttt{correctness\_weight} \cdot r_c + \texttt{length\_penalty\_lambda} \cdot \text{interaction} + r_f + r_{\mathrm{util}}.
+   \]
+   If `strict_format_gate` is true and \(r_f \le 0.5\), the total is **`non_strict_penalty`** and the formula above is skipped.
+**Important knobs**
+| Kwarg | Meaning |
+|-------|---------|
+| `enable_length_penalty` | When false, length reward is not registered; in weighted mode \(r_\ell\) is treated as 0. |
+| `length_penalty_lambda` | Scales the \(r_c \cdot r_\ell\) (or \(r_c \cdot r_\ell \cdot r_f\)) term in weighted mode. Sweeps often try e.g. 0.25, 0.5, 0.75, 1.0. |
+| `correctness_weight` | Scales the standalone correctness term in weighted mode. |
+| `length_penalty_interaction` | `correctness_length` vs `correctness_length_format` (whether format enters the product). |
+| `strict_format_gate` / `non_strict_penalty` | Hard gate on format before crediting other terms. |
+| `stage2_length_penalty_lambda`, `stage2_start_epoch` | Stored on `TokenGRPOObjective` for **two-stage** schedules; **changing λ at epoch boundaries must be implemented in the trainer** (`combine_rewards` itself only reads the current `length_penalty_lambda` on the instance). |
+**YAML wiring example** (illustrative; paths depend on your repo):
+```yaml
+objective:
+  name: token_grpo
+  kwargs:
+    reward_mode: weighted_length_penalty
+    enable_length_penalty: true
+    correctness_weight: 1.0
+    length_penalty_lambda: 0.5
+    length_penalty_interaction: correctness_length
+    strict_format_gate: true
+    non_strict_penalty: -1.0
+rewards:
+  kwargs:
+    length_penalty_reward:
+      tokenizer_name: meta-llama/Llama-3.2-3B-Instruct   # example
+      group_size: 4                                   # often match generation.num_generations
+      cache_dir: cache/models
+```
+---
+## Hackable extension path
+- Implement a new objective class with `reward_names()` and `combine_rewards`-equivalent behavior (your trainer must call into it the same way as the baseline), or add new `@register_reward` functions.
+- Point config at a custom class: `objective.class_path: "my_package.my_module:MyObjective"`.
+- `LatentNeuraleseObjective` is a stub: keep token rewards while you add latent signals via `extra_reward` or a future trainer hook.
+## Quickstart
+1. `pip install -r requirements.txt` (you may need a CUDA-matched Torch and optional FlashAttention build).
+2. Set `HF_TOKEN` / `WANDB_API_KEY` if needed (names configurable under `auth` in YAML).
+3. Single GPU: `bash scripts/run_grpo.sh` (requires `src/train_grpo.py` and `configs/grpo_llama32_3b_bf16.yaml` present).
+4. Many scripts default to offline Hub/datasets/W&B; override `WANDB_MODE`, `HF_HUB_OFFLINE`, etc. if you need network access.
+5. Runtime outputs default to `cache/artifacts/...` through `storage.cache_dir`; agent docs live in `AGENTS.md` and `docs/repository-map.md`.
+## Attention backend
+`backends.py` prefers **FlashAttention2** when `flash_attn` imports cleanly; otherwise **SDPA**. Llama models optionally use **Liger** kernels when installed.

configs/accelerate_ddp_2gpu.yaml ADDED Viewed

	@@ -0,0 +1,17 @@

+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: "no"
+enable_cpu_affinity: false
+gpu_ids: all
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 2
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false

configs/accelerate_ddp_4gpu.yaml ADDED Viewed

	@@ -0,0 +1,17 @@

+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+gpu_ids: all
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 4
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false

configs/accelerate_ddp_7gpu.yaml ADDED Viewed

	@@ -0,0 +1,17 @@

+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+gpu_ids: all
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 7
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false

configs/accelerate_ddp_8gpu.yaml ADDED Viewed

	@@ -0,0 +1,17 @@

+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+gpu_ids: all
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false

configs/grpo_llama32_3b_bf16.yaml ADDED Viewed

	@@ -0,0 +1,70 @@

+# Baseline GRPO experiment (paths are relative to storage.cache_dir unless absolute).
+model:
+  name: meta-llama/Llama-3.2-3B-Instruct
+  trust_remote_code: false
+trainer:
+  output_dir: artifacts/runs/grpo-llama32-3b
+  run_name: grpo-llama32-3b
+  num_train_epochs: 1.0
+  max_steps: -1
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 8
+  learning_rate: 1.0e-6
+  logging_steps: 1
+  save_steps: 200
+  save_total_limit: 3
+  permanent_checkpoint_steps: 300
+  permanent_checkpoint_dir: checkpoints/permanent
+  bf16: true
+  shuffle_dataset: false
+  report_to: wandb
+  warmup_steps: 20
+  gradient_checkpointing: true
+  max_grad_norm: 1.0
+  lr_scheduler_type: cosine
+  lr_scheduler_kwargs: {}
+data:
+  provider: gsm8k_math_curriculum
+  split: train
+  max_samples: null
+generation:
+  max_prompt_length: 512
+  max_completion_length: 256
+  num_generations: 4
+  temperature: 0.9
+  top_p: 0.95
+objective:
+  name: token_grpo
+  kwargs:
+    reward_mode: weighted_length_penalty
+    enable_length_penalty: true
+    length_penalty_lambda: 0.1
+    correctness_weight: 1.0
+    strict_format_gate: true
+rewards:
+  kwargs:
+    length_penalty_reward:
+      group_size: 4
+# Multiply KL on tokens inside <redacted_thinking>...</redacted_thinking> (inner body) by this weight.
+# 1.0 = default TRL behavior; 0.0 = no KL penalty on those tokens (policy loss still applies everywhere).
+thinking_kl:
+  inner_kl_weight: 1.0
+# TRL GRPO-specific flags (see GRPOConfig). When thinking_kl.inner_kl_weight != 1 and beta != 0,
+# train_grpo.py forces use_liger_kernel: false.
+grpo:
+  beta: 0.001
+  loss_type: grpo
+  importance_sampling_level: token
+  use_liger_kernel: false
+storage:
+  cache_dir: cache
+auth: {}

docs/repository-map.md ADDED Viewed

	@@ -0,0 +1,50 @@

+# Repository Map
+## What Lives Where
+- `src/train_grpo.py`: launches GRPO training from a YAML experiment config.
+- `src/sft_gsm8k_boxed.py`: supervised fine-tuning entrypoint for boxed-answer GSM8K.
+- `src/eval_sweep_models.py`: evaluates every `run_*` directory in a sweep root on GSM8K.
+- `src/eval_permanent_checkpoints.py`: evaluates a run's permanent checkpoints and writes summaries and plots.
+- `src/eval_gsm8k_zeroshot.py`: single-checkpoint GSM8K evaluation.
+- `src/eval_gsm8k_truncated_thinking.py`: GSM8K eval with forced truncation of the thinking span.
+- `src/eval_math_level1_thinking_zeroshot.py`: MATH level-1 zero-shot thinking-format evaluation.
+## `src/hackable/`
+- `config.py`: experiment dataclasses and YAML loading.
+- `paths.py`: canonical runtime storage layout and path resolution.
+- `registry.py`: plugin registration and construction for objectives, rewards, and data providers.
+- `interfaces.py`: shared sample and protocol types.
+- `data_plugins.py`: GSM8K, MATH, and curriculum data providers.
+- `reward_plugins.py`: format, correctness, length, and token-utilisation rewards.
+- `objectives.py`: objective classes and reward-combination strategies.
+- `backends.py`: model-loading helpers and backend selection.
+- `thinking_kl_grpo_trainer.py`: custom GRPO trainer with masked thinking-KL handling.
+- `thinking_kl_mask.py`: token masking logic for inner-thinking KL control.
+- `permanent_checkpoint_callback.py`: periodic permanent checkpoint export.
+- `utils.py`: repository-relative import and path utilities.
+## Config And Launch Surface
+- `configs/grpo_llama32_3b_bf16.yaml`: baseline experiment config.
+- `configs/accelerate_ddp_*.yaml`: multi-GPU Accelerate launch configs.
+- `scripts/run_grpo*.sh`: baseline training launchers.
+- `scripts/run_twostage_*.sh`: two-stage experiments.
+- `scripts/sweep_length_penalty_lambda.sh`: lambda sweep generator.
+- `scripts/run_reward_variants_and_eval.sh`: reward variant sweep plus evaluation.
+- `scripts/eval_*.sh`: distributed evaluation wrappers.
+## Runtime Storage
+All generated assets should live under `cache/`:
+- `cache/datasets/`: dataset downloads and prepared data.
+- `cache/models/`: Hugging Face model cache and local model roots.
+- `cache/hf/`: extra Hugging Face cache state.
+- `cache/artifacts/runs/`: one-off runs.
+- `cache/artifacts/sweeps/`: grouped experiments.
+- `cache/artifacts/eval/`: evaluation outputs.
+- `cache/logs/wandb/`: W&B logs.
+Relative paths like `artifacts/runs/foo` are intentionally interpreted relative to `storage.cache_dir`, so they land at `cache/artifacts/runs/foo` by default.

out.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+torch
+flash-attn
+liger-kernel
+transformers
+trl>=1.0.0
+datasets
+accelerate
+pyyaml
+wandb

scripts/conda_env.sh ADDED Viewed

	@@ -0,0 +1,40 @@

+# Source from other scripts: default training env is conda "sidharth".
+# Override: CONDA_ENV=myenv bash scripts/run_grpo_8gpu.sh
+#
+# Conda and nvidia packages (e.g. cuda-nvcc) ship deactivate hooks that use unset variables
+# (CUDAARCHS_BACKUP, etc.). Parent scripts often use `set -u`, which breaks those hooks.
+# We disable nounset for this entire file, then restore the parent's setting at the end.
+#
+# Because this file is sourced, the failure path restores nounset and then `return`s
+# non-zero rather than calling `exit` directly: a caller running under `set -e` still
+# aborts, but an interactive shell that sources this file is not terminated. `exit` is
+# kept only as the fallback for the case where the file is executed instead of sourced.
+_nounset_was_on=0
+case $- in *u*) _nounset_was_on=1 ;; esac
+set +u
+: "${CONDA_ENV:=sidharth}"
+# Make `conda activate` usable in this shell: prefer the hook of a `conda` found on PATH,
+# otherwise source conda.sh from the first known install location that has one.
+# Returns 0 on success, 1 when no conda installation could be initialized.
+_conda_try_init() {
+  if command -v conda >/dev/null 2>&1; then
+    # shellcheck disable=SC2312
+    eval "$(conda shell.bash hook)"
+    return 0
+  fi
+  local d
+  for d in "${CONDA_PREFIX:-}" "$HOME/miniconda3" "$HOME/mambaforge" "$HOME/anaconda3" "/opt/conda"; do
+    if [[ -n "$d" && -f "$d/etc/profile.d/conda.sh" ]]; then
+      # shellcheck disable=SC1090
+      source "$d/etc/profile.d/conda.sh"
+      return 0
+    fi
+  done
+  return 1
+}
+if ! _conda_try_init; then
+  echo "conda_env.sh: could not initialize conda (not on PATH and no known install)." >&2
+  # Leave the caller's shell options as we found them before bailing out.
+  if [[ "$_nounset_was_on" -eq 1 ]]; then
+    set -u
+  fi
+  # `return` only works when sourced; fall back to `exit` when run as a script.
+  return 1 2>/dev/null || exit 1
+fi
+conda activate "${CONDA_ENV}"
+if [[ "$_nounset_was_on" -eq 1 ]]; then
+  set -u
+fi

scripts/eval_gsm8k_think_entropy_4gpu.sh ADDED Viewed

	@@ -0,0 +1,42 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "${ROOT}"
+BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-4}"
+# Required:
+#   MODEL_DIR=/path/to/checkpoint
+MODEL_DIR="${MODEL_DIR:-}"
+OUTPUT_PATH="${OUTPUT_PATH:-artifacts/eval/gsm8k_think_entropy/entropy.json}"
+BATCH_SIZE="${BATCH_SIZE:-4}"
+MAX_SAMPLES="${MAX_SAMPLES:--1}"
+SAVE_PER_SAMPLE="${SAVE_PER_SAMPLE:-0}"
+export PYTHONPATH="${ROOT}/src"
+export BASE_CONFIG
+export MODEL_DIR
+export OUTPUT_PATH
+export BATCH_SIZE
+export MAX_SAMPLES
+export SAVE_PER_SAMPLE
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/eval_gsm8k_think_entropy.py

scripts/eval_gsm8k_truncated_thinking.sh ADDED Viewed

	@@ -0,0 +1,51 @@

+#!/usr/bin/env bash
+# GSM8K evaluation in which </redacted_thinking> is forced early, after N inner thinking
+# tokens (N = THINK_INNER_TOKEN_LIMIT, default 5).
+#
+# Example:
+#   MODEL_PATH=artifacts/sweeps/twostage_correctness_weight/run_twostage_correctness1/checkpoints/permanent/checkpoint-stage1-boundary-epoch-1p0-step-1868 \
+#   bash scripts/eval_gsm8k_truncated_thinking.sh
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${REPO_ROOT}"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+# MODEL_PATH is mandatory; everything else falls back to a default.
+: "${MODEL_PATH:?Set MODEL_PATH to your checkpoint directory}"
+: "${BASE_CONFIG:=configs/grpo_llama32_3b_bf16.yaml}"
+: "${ACCELERATE_CONFIG:=configs/accelerate_ddp_4gpu.yaml}"
+: "${NUM_PROCESSES:=4}"
+: "${THINK_INNER_TOKEN_LIMIT:=5}"
+: "${MAX_SAMPLES:=-1}"
+: "${MAX_NEW_TOKENS:=256}"
+: "${PYTHONUNBUFFERED:=1}"
+: "${HF_DATASETS_OFFLINE:=0}"
+PYTHONPATH="${REPO_ROOT}/src"
+# The Python entry point receives its settings both as environment variables and as flags.
+export PYTHONPATH PYTHONUNBUFFERED HF_DATASETS_OFFLINE
+export BASE_CONFIG MODEL_PATH THINK_INNER_TOKEN_LIMIT MAX_SAMPLES MAX_NEW_TOKENS
+# OUTPUT_PATH is forwarded only when the caller supplied a non-empty value.
+[[ -z "${OUTPUT_PATH:-}" ]] || export OUTPUT_PATH
+LAUNCH_ARGS=(
+  --config_file "${ACCELERATE_CONFIG}"
+  --num_processes "${NUM_PROCESSES}"
+)
+SCRIPT_ARGS=(
+  --config "${BASE_CONFIG}"
+  --model_path "${MODEL_PATH}"
+  --think_inner_token_limit "${THINK_INNER_TOKEN_LIMIT}"
+  --max_samples "${MAX_SAMPLES}"
+  --max_new_tokens "${MAX_NEW_TOKENS}"
+)
+accelerate launch "${LAUNCH_ARGS[@]}" src/eval_gsm8k_truncated_thinking.py "${SCRIPT_ARGS[@]}"
+echo "Done."

scripts/eval_gsm8k_zeroshot_4gpu.sh ADDED Viewed

	@@ -0,0 +1,36 @@

+#!/usr/bin/env bash
+# Zero-shot GSM-8K evaluation launcher (4 processes by default).
+# The system prompt asks for the answer only in \\boxed{}; the chat template comes from the tokenizer.
+#
+# Evaluate a specific checkpoint:
+#   MODEL_PATH=/path/to/checkpoint OUTPUT_PATH=out.jsonl ./scripts/eval_gsm8k_zeroshot_4gpu.sh
+# Evaluate the model from the base YAML (model.name is used if MODEL_PATH is unset):
+#   BASE_CONFIG=configs/grpo_llama32_3b_bf16.yaml OUTPUT_PATH=out.jsonl ./scripts/eval_gsm8k_zeroshot_4gpu.sh
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${ROOT}"
+# Launcher settings.
+: "${ACCELERATE_CONFIG:=configs/accelerate_ddp_4gpu.yaml}"
+: "${NUM_PROCESSES:=4}"
+# Evaluation settings, exported for src/eval_gsm8k_zeroshot.py.
+: "${BASE_CONFIG:=configs/grpo_llama32_3b_bf16.yaml}"
+: "${OUTPUT_PATH:=artifacts/eval/gsm8k_zeroshot_answers.jsonl}"
+: "${BATCH_SIZE:=4}"
+: "${MAX_NEW_TOKENS:=128}"
+: "${MAX_SAMPLES:=-1}"
+PYTHONPATH="${ROOT}/src"
+export PYTHONPATH BASE_CONFIG OUTPUT_PATH BATCH_SIZE MAX_NEW_TOKENS MAX_SAMPLES
+# Optional, taken from the caller's environment when present:
+#   MODEL_PATH, SYSTEM_PROMPT, GSM8K_SPLIT, MAX_PROMPT_LENGTH, LOCAL_FILES_ONLY=1
+LAUNCH_ARGS=(
+  --config_file "${ACCELERATE_CONFIG}"
+  --num_processes "${NUM_PROCESSES}"
+)
+accelerate launch "${LAUNCH_ARGS[@]}" src/eval_gsm8k_zeroshot.py

scripts/eval_gsm8k_zeroshot_train_4gpu.sh ADDED Viewed

	@@ -0,0 +1,54 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "${ROOT}"
+BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-4}"
+MODEL_PATH="${MODEL_PATH:-}"
+if [[ -z "${MODEL_PATH}" ]]; then
+  echo "Missing MODEL_PATH."
+  echo "Example:"
+  echo "  MODEL_PATH=/path/to/checkpoint OUTPUT_PATH=artifacts/eval/gsm8k_train_zeroshot/results.jsonl ./scripts/eval_gsm8k_zeroshot_train_4gpu.sh"
+  exit 1
+fi
+# Train split, entire dataset by default.
+GSM8K_SPLIT="${GSM8K_SPLIT:-train}"
+MAX_SAMPLES="${MAX_SAMPLES:--1}"
+OUTPUT_PATH="${OUTPUT_PATH:-artifacts/eval/gsm8k_train_zeroshot/results.jsonl}"
+BATCH_SIZE="${BATCH_SIZE:-4}"
+MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-128}"
+MAX_PROMPT_LENGTH="${MAX_PROMPT_LENGTH:-1024}"
+export PYTHONPATH="${ROOT}/src"
+export BASE_CONFIG
+export MODEL_PATH
+export GSM8K_SPLIT
+export MAX_SAMPLES
+export OUTPUT_PATH
+export BATCH_SIZE
+export MAX_NEW_TOKENS
+export MAX_PROMPT_LENGTH
+# Optional override for prompt behavior.
+# export SYSTEM_PROMPT="..."
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/eval_gsm8k_zeroshot.py

scripts/eval_length_penalty_ablation_offline.sh ADDED Viewed

	@@ -0,0 +1,39 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${REPO_ROOT}"
+export PYTHONPATH="${REPO_ROOT}/src"
+BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-2}"
+OUT_ROOT="${OUT_ROOT:-artifacts/ablation_length_penalty}"
+# Update these if your model dirs differ.
+MODEL_LEN_ON="${MODEL_LEN_ON:-${OUT_ROOT}/run_len_on}"
+MODEL_LEN_OFF="${MODEL_LEN_OFF:-${OUT_ROOT}/run_len_off}"
+EVAL_MAX_SAMPLES="${EVAL_MAX_SAMPLES:-200}"
+EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE:-4}"
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+BASE_CONFIG="${BASE_CONFIG}" \
+OUT_ROOT="${OUT_ROOT}" \
+MODEL_LEN_ON="${MODEL_LEN_ON}" \
+MODEL_LEN_OFF="${MODEL_LEN_OFF}" \
+EVAL_MAX_SAMPLES="${EVAL_MAX_SAMPLES}" \
+EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE}" \
+accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/eval_length_penalty_ablation.py

scripts/eval_math_level1_4gpu.sh ADDED Viewed

	@@ -0,0 +1,55 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "${ROOT}"
+BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-4}"
+MODEL_DIR="${MODEL_DIR:-}"
+if [[ -z "${MODEL_DIR}" ]]; then
+  echo "Missing MODEL_DIR. Example:"
+  echo "  MODEL_DIR=/path/to/checkpoint OUTPUT_PATH=out.json ./scripts/eval_math_level1_4gpu.sh"
+  exit 1
+fi
+OUTPUT_PATH="${OUTPUT_PATH:-artifacts/eval/math_level1/accuracy.json}"
+MAX_SAMPLES="${MAX_SAMPLES:--1}"
+BATCH_SIZE="${BATCH_SIZE:-4}"
+MATH_SPLIT="${MATH_SPLIT:-test}"
+SAVE_ROLLOUTS="${SAVE_ROLLOUTS:-0}"
+ROLLOUTS_MAX_SAMPLES="${ROLLOUTS_MAX_SAMPLES:-50}"
+ROLLOUTS_OUT_PATH="${ROLLOUTS_OUT_PATH:-artifacts/eval/math_level1/rollouts.jsonl}"
+MAX_PROMPT_LENGTH="${MAX_PROMPT_LENGTH:-}"
+MAX_COMPLETION_LENGTH="${MAX_COMPLETION_LENGTH:-}"
+export PYTHONPATH="${ROOT}/src"
+export BASE_CONFIG
+export MODEL_DIR
+export OUTPUT_PATH
+export MAX_SAMPLES
+export BATCH_SIZE
+export MATH_SPLIT
+export SAVE_ROLLOUTS
+export ROLLOUTS_MAX_SAMPLES
+export ROLLOUTS_OUT_PATH
+if [[ -n "${MAX_PROMPT_LENGTH}" ]]; then export MAX_PROMPT_LENGTH; fi
+if [[ -n "${MAX_COMPLETION_LENGTH}" ]]; then export MAX_COMPLETION_LENGTH; fi
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/eval_math_level1.py

scripts/eval_math_level1_thinking_zeroshot_4gpu.sh ADDED Viewed

	@@ -0,0 +1,48 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "${ROOT}"
+BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-4}"
+MODEL_DIR="${MODEL_DIR:-}"
+if [[ -z "${MODEL_DIR}" ]]; then
+  echo "Missing MODEL_DIR."
+  exit 1
+fi
+OUTPUT_PATH="${OUTPUT_PATH:-artifacts/eval/math_level1_thinking_zeroshot/answers.jsonl}"
+MAX_SAMPLES="${MAX_SAMPLES:--1}"
+BATCH_SIZE="${BATCH_SIZE:-4}"
+MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-128}"
+MAX_PROMPT_LENGTH="${MAX_PROMPT_LENGTH:-512}"
+MATH_SPLIT="${MATH_SPLIT:-test}"
+export PYTHONPATH="${ROOT}/src"
+export BASE_CONFIG
+export MODEL_DIR
+export OUTPUT_PATH
+export MAX_SAMPLES
+export BATCH_SIZE
+export MAX_NEW_TOKENS
+export MAX_PROMPT_LENGTH
+export MATH_SPLIT
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/eval_math_level1_thinking_zeroshot.py

scripts/eval_math_level1_zeroshot_4gpu.sh ADDED Viewed

	@@ -0,0 +1,49 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "${ROOT}"
+BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-4}"
+MODEL_DIR="${MODEL_DIR:-}"
+if [[ -z "${MODEL_DIR}" ]]; then
+  echo "Missing MODEL_DIR."
+  exit 1
+fi
+OUTPUT_PATH="${OUTPUT_PATH:-artifacts/eval/math_level1_zeroshot/answers.jsonl}"
+SYSTEM_PROMPT="${SYSTEM_PROMPT:-}"
+MAX_SAMPLES="${MAX_SAMPLES:--1}"
+BATCH_SIZE="${BATCH_SIZE:-4}"
+MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-128}"
+MAX_PROMPT_LENGTH="${MAX_PROMPT_LENGTH:-512}"
+export PYTHONPATH="${ROOT}/src"
+export BASE_CONFIG
+export MODEL_DIR
+export OUTPUT_PATH
+export MAX_SAMPLES
+export BATCH_SIZE
+export MAX_NEW_TOKENS
+export MAX_PROMPT_LENGTH
+if [[ -n "${SYSTEM_PROMPT}" ]]; then export SYSTEM_PROMPT; fi
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/eval_math_level1_zeroshot.py

scripts/eval_math_level2_thinking_zeroshot_4gpu.sh ADDED Viewed

	@@ -0,0 +1,48 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "${ROOT}"
+BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-4}"
+MODEL_DIR="${MODEL_DIR:-}"
+if [[ -z "${MODEL_DIR}" ]]; then
+  echo "Missing MODEL_DIR."
+  exit 1
+fi
+OUTPUT_PATH="${OUTPUT_PATH:-artifacts/eval/math_level2_thinking_zeroshot/answers.jsonl}"
+MAX_SAMPLES="${MAX_SAMPLES:--1}"
+BATCH_SIZE="${BATCH_SIZE:-4}"
+MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-128}"
+MAX_PROMPT_LENGTH="${MAX_PROMPT_LENGTH:-512}"
+MATH_SPLIT="${MATH_SPLIT:-test}"
+export PYTHONPATH="${ROOT}/src"
+export BASE_CONFIG
+export MODEL_DIR
+export OUTPUT_PATH
+export MAX_SAMPLES
+export BATCH_SIZE
+export MAX_NEW_TOKENS
+export MAX_PROMPT_LENGTH
+export MATH_SPLIT
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/eval_math_level2_thinking_zeroshot.py

scripts/eval_math_level2_zeroshot_4gpu.sh ADDED Viewed

	@@ -0,0 +1,49 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "${ROOT}"
+BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-4}"
+MODEL_DIR="${MODEL_DIR:-}"
+if [[ -z "${MODEL_DIR}" ]]; then
+  echo "Missing MODEL_DIR."
+  exit 1
+fi
+OUTPUT_PATH="${OUTPUT_PATH:-artifacts/eval/math_level2_zeroshot/answers.jsonl}"
+SYSTEM_PROMPT="${SYSTEM_PROMPT:-}"
+MAX_SAMPLES="${MAX_SAMPLES:--1}"
+BATCH_SIZE="${BATCH_SIZE:-4}"
+MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-128}"
+MAX_PROMPT_LENGTH="${MAX_PROMPT_LENGTH:-512}"
+export PYTHONPATH="${ROOT}/src"
+export BASE_CONFIG
+export MODEL_DIR
+export OUTPUT_PATH
+export MAX_SAMPLES
+export BATCH_SIZE
+export MAX_NEW_TOKENS
+export MAX_PROMPT_LENGTH
+if [[ -n "${SYSTEM_PROMPT}" ]]; then export SYSTEM_PROMPT; fi
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/eval_math_level2_zeroshot.py

scripts/eval_permanent_root_acc_cot.sh ADDED Viewed

	@@ -0,0 +1,53 @@

+#!/usr/bin/env bash
+# Evaluate every checkpoint-* found under one checkpoints/permanent directory, then write a
+# JSON/CSV summary, per-checkpoint JSONL, and accuracy_vs_avg_cot_words.svg (scatter + step labels).
+#
+# Usage:
+#   PERMANENT_ROOT=artifacts/runs/.../checkpoints/permanent ./scripts/eval_permanent_root_acc_cot.sh
+#
+# Optional: RUN_LABEL (default permanent), OUT_ROOT, BASE_CONFIG, EVAL_MAX_SAMPLES, NUM_PROCESSES, offline HF vars.
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${REPO_ROOT}"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+: "${PERMANENT_ROOT:?Set PERMANENT_ROOT to your checkpoints/permanent directory}"
+# Launcher settings.
+: "${ACCELERATE_CONFIG:=configs/accelerate_ddp_4gpu.yaml}"
+: "${NUM_PROCESSES:=4}"
+# Evaluation settings.
+: "${BASE_CONFIG:=configs/grpo_llama32_3b_bf16.yaml}"
+: "${RUN_LABEL:=permanent}"
+: "${EVAL_MAX_SAMPLES:=200}"
+: "${EVAL_BATCH_SIZE:=4}"
+: "${ROLLOUT_SAMPLES:=8}"
+# Hub and transformers default to offline; HF_DATASETS_OFFLINE defaults to 0 so the dataset
+# can be downloaded into the cache (set HF_DATASETS_OFFLINE=1 if GSM8K is already mirrored).
+: "${WANDB_MODE:=offline}"
+: "${HF_HUB_OFFLINE:=1}"
+: "${HF_DATASETS_OFFLINE:=0}"
+: "${TRANSFORMERS_OFFLINE:=1}"
+: "${PYTHONUNBUFFERED:=1}"
+export WANDB_MODE HF_HUB_OFFLINE HF_DATASETS_OFFLINE TRANSFORMERS_OFFLINE PYTHONUNBUFFERED
+PYTHONPATH="${REPO_ROOT}/src"
+export PYTHONPATH BASE_CONFIG PERMANENT_ROOT RUN_LABEL EVAL_MAX_SAMPLES EVAL_BATCH_SIZE ROLLOUT_SAMPLES
+# OUT_ROOT is forwarded only when the caller supplied a non-empty value.
+[[ -z "${OUT_ROOT:-}" ]] || export OUT_ROOT
+LAUNCH_ARGS=(
+  --config_file "${ACCELERATE_CONFIG}"
+  --num_processes "${NUM_PROCESSES}"
+)
+accelerate launch "${LAUNCH_ARGS[@]}" src/eval_permanent_checkpoints.py
+echo "Done. Summary and accuracy_vs_avg_cot_words.svg under OUT_ROOT (default: <PERMANENT_ROOT>/eval_permanent)."

scripts/eval_sweep_models_offline.sh ADDED Viewed

	@@ -0,0 +1,34 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${REPO_ROOT}"
+export PYTHONPATH="${REPO_ROOT}/src"
+BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-4}"
+SWEEP_ROOT="${SWEEP_ROOT:-artifacts/sweeps/length_penalty_lambda}"
+OUT_ROOT="${OUT_ROOT:-${SWEEP_ROOT}/eval_results}"
+EVAL_MAX_SAMPLES="${EVAL_MAX_SAMPLES:-200}"
+EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE:-4}"
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+BASE_CONFIG="${BASE_CONFIG}" \
+SWEEP_ROOT="${SWEEP_ROOT}" \
+OUT_ROOT="${OUT_ROOT}" \
+EVAL_MAX_SAMPLES="${EVAL_MAX_SAMPLES}" \
+EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE}" \
+accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/eval_sweep_models.py

scripts/eval_twostage_permanent_checkpoints.sh ADDED Viewed

	@@ -0,0 +1,47 @@

+#!/usr/bin/env bash
+# Evaluate every checkpoint under two run-local checkpoints/permanent dirs (cw=1 and cw=5 runs).
+# Uses 4 GPUs. Writes rollouts, full JSONL outputs, CSV/JSON summary, step line charts,
+# and accuracy_vs_avg_cot_words.svg (scatter: accuracy vs mean CoT length).
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${REPO_ROOT}"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+# Launcher settings.
+: "${ACCELERATE_CONFIG:=configs/accelerate_ddp_4gpu.yaml}"
+: "${NUM_PROCESSES:=4}"
+# Evaluation settings.
+: "${BASE_CONFIG:=configs/grpo_llama32_3b_bf16.yaml}"
+: "${PERMANENT_CW1:=artifacts/sweeps/twostage_correctness_weight/run_twostage_correctness1/checkpoints/permanent}"
+: "${PERMANENT_CW5:=artifacts/sweeps/twostage_correctness_weight/run_twostage_correctness5/checkpoints/permanent}"
+: "${OUT_ROOT:=artifacts/sweeps/twostage_correctness_weight/eval_permanent_checkpoints}"
+: "${EVAL_MAX_SAMPLES:=200}"
+: "${EVAL_BATCH_SIZE:=4}"
+: "${ROLLOUT_SAMPLES:=8}"
+# Hub and transformers default to offline; HF_DATASETS_OFFLINE defaults to 0.
+: "${WANDB_MODE:=offline}"
+: "${HF_HUB_OFFLINE:=1}"
+: "${HF_DATASETS_OFFLINE:=0}"
+: "${TRANSFORMERS_OFFLINE:=1}"
+: "${PYTHONUNBUFFERED:=1}"
+export WANDB_MODE HF_HUB_OFFLINE HF_DATASETS_OFFLINE TRANSFORMERS_OFFLINE PYTHONUNBUFFERED
+# Variables handed to the Python entry point for this one command only.
+EVAL_ENV=(
+  PYTHONPATH="${REPO_ROOT}/src"
+  BASE_CONFIG="${BASE_CONFIG}"
+  PERMANENT_CW1="${PERMANENT_CW1}"
+  PERMANENT_CW5="${PERMANENT_CW5}"
+  OUT_ROOT="${OUT_ROOT}"
+  EVAL_MAX_SAMPLES="${EVAL_MAX_SAMPLES}"
+  EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE}"
+  ROLLOUT_SAMPLES="${ROLLOUT_SAMPLES}"
+)
+env "${EVAL_ENV[@]}" accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/eval_permanent_checkpoints.py
+echo "Done. Results under: ${OUT_ROOT}"

scripts/hf_download_repo.py ADDED Viewed

	@@ -0,0 +1,72 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import os
+from pathlib import Path
+from huggingface_hub import snapshot_download
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options for the snapshot download.
+    Returns:
+        Namespace with ``repo_name``, ``username``, ``local_dir``, ``revision``
+        and ``hf_token``. ``hf_token`` defaults to the ``HF_TOKEN`` environment
+        variable, or to an empty string when that variable is not set.
+    """
+    parser = argparse.ArgumentParser(
+        description="Download a Hugging Face dataset repo snapshot to local disk."
+    )
+    parser.add_argument(
+        "--repo-name",
+        type=str,
+        default="neuralese-move",
+        help="HF dataset repo name under the username.",
+    )
+    parser.add_argument(
+        "--username",
+        type=str,
+        default="psidharth567",
+        help="HF username/org (default: psidharth567).",
+    )
+    parser.add_argument(
+        "--local-dir",
+        type=str,
+        default="./neuralese",
+        help="Where to download the repo snapshot locally.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        help="Optional branch/tag/commit to download.",
+    )
+    # The default is read from the environment once, when the parser is built.
+    parser.add_argument(
+        "--hf-token",
+        type=str,
+        default=os.environ.get("HF_TOKEN", ""),
+        help="HF token (or set HF_TOKEN env var).",
+    )
+    return parser.parse_args()
+def main() -> None:
+    """Download a snapshot of the dataset repo ``<username>/<repo-name>`` to ``--local-dir``.
+    Raises:
+        RuntimeError: If no token was given via ``--hf-token`` or ``HF_TOKEN``.
+    """
+    args = parse_args()
+    token = args.hf_token.strip()
+    if not token:
+        raise RuntimeError("Missing HF token. Set HF_TOKEN or pass --hf-token.")
+    repo_id = f"{args.username}/{args.repo_name}"
+    local_dir = Path(args.local_dir).resolve()
+    # Only the parent is created here; the target directory itself is left to snapshot_download.
+    local_dir.parent.mkdir(parents=True, exist_ok=True)
+    # NOTE(review): huggingface_hub is imported at the top of the file, before this
+    # variable is set. Confirm the library still honors it when set after import.
+    os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
+    # NOTE(review): `local_dir_use_symlinks` may be deprecated in newer huggingface_hub
+    # releases. Verify against the installed version.
+    snapshot_path = snapshot_download(
+        repo_id=repo_id,
+        repo_type="dataset",
+        local_dir=str(local_dir),
+        local_dir_use_symlinks=False,
+        revision=args.revision,
+        token=token,
+    )
+    print(f"Downloaded snapshot to: {snapshot_path}")
+if __name__ == "__main__":
+    main()

scripts/hf_upload_repo.py ADDED Viewed

	@@ -0,0 +1,128 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import inspect
+import os
+from pathlib import Path
+from huggingface_hub import HfApi, create_repo
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options for the folder upload.
+    Returns:
+        Namespace with ``folder_path``, ``repo_name``, ``username``, ``private``,
+        ``path_in_repo``, ``commit_message``, ``ignore`` (list of glob patterns)
+        and ``hf_token``. ``hf_token`` defaults to the ``HF_TOKEN`` environment
+        variable, or to an empty string when that variable is not set.
+    """
+    parser = argparse.ArgumentParser(
+        description="Upload a local folder to a Hugging Face dataset repo."
+    )
+    parser.add_argument(
+        "--folder-path",
+        type=str,
+        default=".",
+        help="Local folder to upload (default: current directory).",
+    )
+    parser.add_argument(
+        "--repo-name",
+        type=str,
+        default="neuralese-move",
+        help="HF dataset repo name under the username.",
+    )
+    parser.add_argument(
+        "--username",
+        type=str,
+        default="psidharth567",
+        help="HF username/org (default: psidharth567).",
+    )
+    parser.add_argument(
+        "--private",
+        action="store_true",
+        help="Create repo as private (default is public).",
+    )
+    parser.add_argument(
+        "--path-in-repo",
+        type=str,
+        default=".",
+        help="Destination path inside HF repo (default: repo root).",
+    )
+    parser.add_argument(
+        "--commit-message",
+        type=str,
+        default="temporary cluster migration upload",
+        help="Commit message for upload.",
+    )
+    # nargs="*" means passing --ignore with no values yields an empty list, which
+    # replaces the default patterns below.
+    parser.add_argument(
+        "--ignore",
+        type=str,
+        nargs="*",
+        default=[
+            "**/.git/**",
+            "**/__pycache__/**",
+            "**/*.pyc",
+            "**/*.pyo",
+            "**/.DS_Store",
+        ],
+        help="Glob patterns to ignore during upload.",
+    )
+    # The default is read from the environment once, when the parser is built.
+    parser.add_argument(
+        "--hf-token",
+        type=str,
+        default=os.environ.get("HF_TOKEN", ""),
+        help="HF token (or set HF_TOKEN env var).",
+    )
+    return parser.parse_args()
+def main() -> None:
+    args = parse_args()
+    token = args.hf_token.strip()
+    if not token:
+        raise RuntimeError("Missing HF token. Set HF_TOKEN or pass --hf-token.")
+    folder = Path(args.folder_path).resolve()
+    if not folder.exists():
+        raise FileNotFoundError(f"Folder does not exist: {folder}")
+    if not folder.is_dir():
+        raise NotADirectoryError(f"Not a directory: {folder}")
+    os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
+    repo_id = f"{args.username}/{args.repo_name}"
+    api = HfApi(token=token)
+    # Create repo if needed; no-op if already exists.
+    create_repo(
+        repo_id=repo_id,
+        repo_type="dataset",
+        private=bool(args.private),
+        exist_ok=True,
+        token=token,
+    )
+    large_sig = inspect.signature(api.upload_large_folder)
+    large_params = set(large_sig.parameters.keys())
+    upload_kwargs = {
+        "repo_id": repo_id,
+        "repo_type": "dataset",
+        "folder_path": str(folder),
+        "ignore_patterns": args.ignore,
+    }
+    if "token" in large_params:
+        upload_kwargs["token"] = token
+    # Some huggingface_hub versions support path_in_repo for large uploads,
+    # others do not. Pass it only when supported.
+    if "path_in_repo" in large_params:
+        upload_kwargs["path_in_repo"] = args.path_in_repo
+    elif args.path_in_repo not in (".", "", "/"):
+        print(
+            "Warning: this huggingface_hub version does not support "
+            "`path_in_repo` for upload_large_folder; uploading to repo root."
+        )
+    api.upload_large_folder(**upload_kwargs)
+    info = api.repo_info(repo_id=repo_id, repo_type="dataset")
+    print(f"Upload complete: https://huggingface.co/datasets/{repo_id}")
+    print(f"Latest commit: {info.sha}")
+if __name__ == "__main__":
+    main()

scripts/resume_grpo_8gpu.sh ADDED Viewed

	@@ -0,0 +1,27 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${REPO_ROOT}"
+export PYTHONPATH="${REPO_ROOT}/src"
+export WANDB_DIR="${WANDB_DIR:-${REPO_ROOT}/cache/logs/wandb}"
+RESUME_FROM="${1:-latest}"
+GPU_COUNT="${2:-${GPU_COUNT:-8}}"
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+accelerate launch \
+  --config_file configs/accelerate_ddp_8gpu.yaml \
+  --num_processes "${GPU_COUNT}" \
+  src/train_grpo.py \
+  --config configs/grpo_llama32_3b_bf16.yaml \
+  --resume_from_checkpoint "${RESUME_FROM}"

scripts/run_grpo.sh ADDED Viewed

	@@ -0,0 +1,19 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${REPO_ROOT}"
+export PYTHONPATH="${REPO_ROOT}/src"
+export WANDB_DIR="${WANDB_DIR:-${REPO_ROOT}/cache/logs/wandb}"
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+python src/train_grpo.py --config configs/grpo_llama32_3b_bf16.yaml

scripts/run_grpo_2gpu.sh ADDED Viewed

	@@ -0,0 +1,22 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${REPO_ROOT}"
+export PYTHONPATH="${REPO_ROOT}/src"
+export WANDB_DIR="${WANDB_DIR:-${REPO_ROOT}/cache/logs/wandb}"
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+accelerate launch \
+  --config_file configs/accelerate_ddp_2gpu.yaml \
+  src/train_grpo.py \
+  --config configs/grpo_llama32_3b_bf16.yaml

scripts/run_grpo_4gpu.sh ADDED Viewed

	@@ -0,0 +1,22 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${REPO_ROOT}"
+export PYTHONPATH="${REPO_ROOT}/src"
+export WANDB_DIR="${WANDB_DIR:-${REPO_ROOT}/cache/logs/wandb}"
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+accelerate launch \
+  --config_file configs/accelerate_ddp_4gpu.yaml \
+  src/train_grpo.py \
+  --config configs/grpo_llama32_3b_bf16.yaml

scripts/run_grpo_8gpu.sh ADDED Viewed

	@@ -0,0 +1,25 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${REPO_ROOT}"
+export PYTHONPATH="${REPO_ROOT}/src"
+export WANDB_DIR="${WANDB_DIR:-${REPO_ROOT}/cache/logs/wandb}"
+GPU_COUNT="${1:-${GPU_COUNT:-8}}"
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+accelerate launch \
+  --config_file configs/accelerate_ddp_8gpu.yaml \
+  --num_processes "${GPU_COUNT}" \
+  src/train_grpo.py \
+  --config configs/grpo_llama32_3b_bf16.yaml

scripts/run_grpo_thinking_kl_masked_resume_4gpu.sh ADDED Viewed

	@@ -0,0 +1,139 @@

+#!/usr/bin/env bash
+# Defer `set -u` until after conda: nvidia/conda hooks use unset vars during activate.
+set -eo pipefail
+# Fine-tune from a stage-1 checkpoint with KL *disabled* on tokens inside
+# <redacted_thinking>...</redacted_thinking> (inner body only). KL still applies
+# to tags, \\boxed{...}, and any text outside the inner thinking span.
+#
+# Conda env: sidharth (override with CONDA_ENV=...).
+#
+# Defaults: online Hugging Face / datasets (fetch if missing). Override with
+#   HF_HUB_OFFLINE=1 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1
+#
+# Effective microbatch for GRPO: 4 GPUs × per_device_batch × grad_accum.
+# Matches configs/grpo_llama32_3b_bf16.yaml baseline: per_device=1, grad_accum=8 -> 32.
+#
+# Overrides (defaults tuned for this launcher):
+#   LENGTH_PENALTY_LAMBDA=0.5
+#   PERMANENT_CHECKPOINT_STEPS=300  (set to 0 to disable extra copies under permanent_checkpoint_dir)
+#   PERMANENT_CHECKPOINT_DIR=checkpoints/permanent
+#
+# Other environment overrides read below: BASE_CONFIG, ACCELERATE_CONFIG, NUM_PROCESSES,
+# PYTHON_BIN, CHECKPOINT, GRAD_ACCUM, OUT_ROOT, RUN_NAME, THINKING_INNER_KL_WEIGHT,
+# WANDB_MODE, PYTHONUNBUFFERED.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+set -u
+# Relative paths below (configs/, src/, OUT_ROOT) are resolved from the repository root.
+cd "${REPO_ROOT}"
+# DeepSpeed is imported by some TRL/Accelerate stacks even when using plain DDP. Its import path
+# calls installed_cuda_version() and requires CUDA_HOME if CUDA ops are probed. PyTorch GPU builds
+# often work without nvcc, but DeepSpeed does not — set CUDA_HOME from nvcc when unset, or run:
+#   pip uninstall deepspeed
+if [[ -z "${CUDA_HOME:-}" ]] && command -v nvcc >/dev/null 2>&1; then
+  # nvcc is expected at <CUDA_HOME>/bin/nvcc, so strip two path components.
+  export CUDA_HOME="$(dirname "$(dirname "$(command -v nvcc)")")"
+fi
+BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-4}"
+PYTHON_BIN="${PYTHON_BIN:-python}"
+# Checkpoint to resume from; it is passed to train_grpo.py as --model_path.
+CHECKPOINT="${CHECKPOINT:-${REPO_ROOT}/cache/artifacts/sweeps/twostage_correctness_weight/run_twostage_correctness1/checkpoints/permanent/checkpoint-stage1-boundary-epoch-1p0-step-1868}"
+# 4 × 1 × 8 = 32 (same as base YAML; override with GRAD_ACCUM=...)
+GRAD_ACCUM="${GRAD_ACCUM:-8}"
+LENGTH_PENALTY_LAMBDA="${LENGTH_PENALTY_LAMBDA:-0.5}"
+PERMANENT_CHECKPOINT_STEPS="${PERMANENT_CHECKPOINT_STEPS:-300}"
+PERMANENT_CHECKPOINT_DIR="${PERMANENT_CHECKPOINT_DIR:-checkpoints/permanent}"
+OUT_ROOT="${OUT_ROOT:-artifacts/runs/grpo_thinking_kl_masked_from_stage1_4gpu}"
+RUN_NAME="${RUN_NAME:-grpo-think-kl0-resume-stage1-4gpu}"
+# Unlike the other launchers in this directory, this one defaults to online W&B / HF access.
+export THINKING_INNER_KL_WEIGHT="${THINKING_INNER_KL_WEIGHT:-0.0}"
+export WANDB_MODE="${WANDB_MODE:-online}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-0}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-0}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-0}"
+export PYTHONUNBUFFERED="${PYTHONUNBUFFERED:-1}"
+export PYTHONPATH="${REPO_ROOT}/src"
+# The generated config is written to a throwaway directory under OUT_ROOT, which the
+# EXIT trap removes when the script finishes (successfully or not).
+mkdir -p "${OUT_ROOT}"
+TMP_DIR="$(mktemp -d "${OUT_ROOT}/tmp_cfgs.XXXXXX")"
+trap 'rm -rf "${TMP_DIR}"' EXIT
+CFG_PATH="${TMP_DIR}/grpo_thinking_kl_4gpu.yaml"
+BASE_CONFIG="${BASE_CONFIG}" \
+OUT_ROOT="${OUT_ROOT}" \
+RUN_NAME="${RUN_NAME}" \
+GRAD_ACCUM="${GRAD_ACCUM}" \
+THINKING_INNER_KL_WEIGHT="${THINKING_INNER_KL_WEIGHT}" \
+LENGTH_PENALTY_LAMBDA="${LENGTH_PENALTY_LAMBDA}" \
+PERMANENT_CHECKPOINT_STEPS="${PERMANENT_CHECKPOINT_STEPS}" \
+PERMANENT_CHECKPOINT_DIR="${PERMANENT_CHECKPOINT_DIR}" \
+CFG_PATH="${CFG_PATH}" \
+"${PYTHON_BIN}" - <<'PY'
+import copy
+import os
+from pathlib import Path
+import yaml
+base = Path(os.environ["BASE_CONFIG"])
+out_root = Path(os.environ["OUT_ROOT"])
+run_name = os.environ["RUN_NAME"]
+grad_accum = int(os.environ["GRAD_ACCUM"])
+cfg_path = Path(os.environ["CFG_PATH"])
+with base.open("r", encoding="utf-8") as f:
+    cfg = yaml.safe_load(f)
+cfg = copy.deepcopy(cfg)
+cfg.setdefault("model", {})
+# Local Unsloth-touched checkpoints often need this when loading with Transformers.
+cfg["model"]["trust_remote_code"] = True
+cfg.setdefault("trainer", {})
+cfg["trainer"]["gradient_accumulation_steps"] = grad_accum
+cfg["trainer"]["run_name"] = run_name
+cfg["trainer"]["output_dir"] = str(out_root)
+cfg["trainer"]["permanent_checkpoint_steps"] = int(os.environ["PERMANENT_CHECKPOINT_STEPS"])
+cfg["trainer"]["permanent_checkpoint_dir"] = os.environ["PERMANENT_CHECKPOINT_DIR"]
+cfg.setdefault("objective", {}).setdefault("kwargs", {})
+cfg["objective"]["kwargs"]["length_penalty_lambda"] = float(os.environ["LENGTH_PENALTY_LAMBDA"])
+cfg.setdefault("thinking_kl", {})
+# 0.0 = no KL on inner <redacted_thinking>...</redacted_thinking> body; override with THINKING_INNER_KL_WEIGHT=1
+cfg["thinking_kl"]["inner_kl_weight"] = float(os.environ.get("THINKING_INNER_KL_WEIGHT", "0.0"))
+cfg.setdefault("grpo", {})
+cfg["grpo"]["use_liger_kernel"] = False
+# Ensure KL term exists so masking is meaningful (raise beta in YAML if you use 0 today)
+if cfg["grpo"].get("beta", 0) == 0:
+    cfg["grpo"]["beta"] = 0.001
+with cfg_path.open("w", encoding="utf-8") as f:
+    yaml.safe_dump(cfg, f, sort_keys=False)
+print("Wrote", cfg_path)
+print("output_dir:", cfg["trainer"]["output_dir"])
+print("thinking_kl.inner_kl_weight:", cfg["thinking_kl"]["inner_kl_weight"])
+print("trainer.gradient_accumulation_steps:", cfg["trainer"]["gradient_accumulation_steps"])
+print("objective.kwargs.length_penalty_lambda:", cfg["objective"]["kwargs"]["length_penalty_lambda"])
+print("trainer.permanent_checkpoint_steps:", cfg["trainer"]["permanent_checkpoint_steps"])
+print("trainer.permanent_checkpoint_dir:", cfg["trainer"]["permanent_checkpoint_dir"])
+PY
+echo "Checkpoint: ${CHECKPOINT}"
+echo "Config: ${CFG_PATH}"
+echo "Launching ${NUM_PROCESSES} processes (online HF/datasets unless overridden)..."
+accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/train_grpo.py \
+  --config "${CFG_PATH}" \
+  --model_path "${CHECKPOINT}"
+echo "Done."

scripts/run_lambda_0p1_existing_gate_token_util.sh ADDED Viewed

	@@ -0,0 +1,107 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${REPO_ROOT}"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+# `set -u` is only enabled after conda_env.sh has been sourced.
+set -u
+# Train one GRPO run with:
+# - lambda = 0.1
+# - strict format gate = true
+# - non-multiplicative interaction (correctness_length)
+# The generated config also enables the token-utilisation reward.
+#
+# Environment overrides: BASE_CONFIG, ACCELERATE_CONFIG, NUM_PROCESSES, PYTHON_BIN,
+# OUT_ROOT, LOG_PATH, plus the WANDB_MODE / *_OFFLINE switches (offline by default).
+BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-4}"
+PYTHON_BIN="${PYTHON_BIN:-python}"
+OUT_ROOT="${OUT_ROOT:-artifacts/sweeps/reward_variants_lambda_0p1}"
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+export PYTHONUNBUFFERED="${PYTHONUNBUFFERED:-1}"
+export ACCELERATE_LOG_LEVEL="${ACCELERATE_LOG_LEVEL:-info}"
+mkdir -p "${OUT_ROOT}"
+# From here on, stdout and stderr are shown live and also appended to LOG_PATH.
+LOG_PATH="${LOG_PATH:-${OUT_ROOT}/run_lambda_0p1_existing_gate_token_util_$(date +%Y%m%d_%H%M%S).log}"
+exec > >(tee -a "${LOG_PATH}") 2>&1
+# The generated config lives in a temporary directory that is removed on exit.
+TMP_DIR="$(mktemp -d "${OUT_ROOT}/tmp_cfgs.XXXXXX")"
+trap 'rm -rf "${TMP_DIR}"' EXIT
+CFG_PATH="${TMP_DIR}/grpo_lambda_0p1_existing_gate_token_util.yaml"
+echo "Preparing config..."
+echo "Base config: ${BASE_CONFIG}"
+echo "Out root: ${OUT_ROOT}"
+echo "Accelerate config: ${ACCELERATE_CONFIG}"
+echo "Num processes (GPUs): ${NUM_PROCESSES}"
+echo "Log path: ${LOG_PATH}"
+# Derive the run config from BASE_CONFIG. The inline Python program reads its inputs
+# from the environment variables set on this command and writes YAML to CFG_PATH.
+BASE_CONFIG="${BASE_CONFIG}" OUT_ROOT="${OUT_ROOT}" CFG_PATH="${CFG_PATH}" REPO_ROOT="${REPO_ROOT}" "${PYTHON_BIN}" - <<'PY'
+import copy
+import os
+from pathlib import Path
+import yaml
+repo_root = Path(os.environ["REPO_ROOT"])
+base_config = Path(os.environ["BASE_CONFIG"])
+out_root = Path(os.environ["OUT_ROOT"])
+cfg_path = Path(os.environ["CFG_PATH"])
+with base_config.open("r", encoding="utf-8") as handle:
+    cfg = yaml.safe_load(handle)
+cfg = copy.deepcopy(cfg)
+# Point the token-utilisation reward at the zero-shot GSM8K train results file.
+cfg.setdefault("rewards", {})
+cfg["rewards"].setdefault("kwargs", {})
+rewards_kwargs = cfg["rewards"]["kwargs"]
+rewards_kwargs.setdefault("token_utilisation_reward", {})
+rewards_kwargs["token_utilisation_reward"]["results_jsonl_path"] = str(
+    repo_root / "artifacts/eval/gsm8k_train_zeroshot/results.jsonl"
+)
+cfg.setdefault("objective", {})
+cfg["objective"].setdefault("kwargs", {})
+obj = cfg["objective"]["kwargs"]
+obj["enable_length_penalty"] = True
+obj["enable_token_utilisation_reward"] = True
+obj["reward_mode"] = "weighted_length_penalty"
+obj["length_penalty_lambda"] = 0.1
+# "existing gate" is the non-multiplicative interaction (correctness * length only), as
+# described in this script's header; "correctness_length_format" would be the
+# multiplicative variant and would not match the run name written below.
+obj["length_penalty_interaction"] = "correctness_length"
+obj["strict_format_gate"] = True
+obj["non_strict_penalty"] = -1.0
+cfg.setdefault("trainer", {})
+base_run_name = cfg["trainer"].get("run_name", "grpo")
+cfg["trainer"]["run_name"] = f"{base_run_name}-lambda-0p1-existing-gate-token-util"
+cfg["trainer"]["output_dir"] = str(out_root / "run_lambda_0p1_existing_gate_token_util")
+with cfg_path.open("w", encoding="utf-8") as handle:
+    yaml.safe_dump(cfg, handle, sort_keys=False)
+print(f"Wrote config: {cfg_path}")
+print(f"Run name: {cfg['trainer']['run_name']}")
+print(f"Output dir: {cfg['trainer']['output_dir']}")
+PY
+echo
+echo "Starting training..."
+export PYTHONPATH="${REPO_ROOT}/src"
+accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/train_grpo.py \
+  --config "${CFG_PATH}"
+echo
+echo "Done."
+echo "Model outputs/checkpoints under: ${OUT_ROOT}/run_lambda_0p1_existing_gate_token_util"
+echo "Full log: ${LOG_PATH}"

scripts/run_reward_variants_and_eval.sh ADDED Viewed

	@@ -0,0 +1,146 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+# `set -u` is only enabled after conda_env.sh has been sourced.
+set -u
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${REPO_ROOT}"
+export PYTHONPATH="${REPO_ROOT}/src"
+# Train 3 reward variants sequentially, then evaluate all and save outputs+plot.
+#
+# Variants (all lambda=0.75):
+# 1) existing_gate:
+#    total = r_correctness + lambda*(r_correctness*r_length) + r_format
+#    strict_format_gate = true
+# 2) mult_nogate:
+#    total = r_correctness + lambda*(r_correctness*r_length*r_format) + r_format
+#    strict_format_gate = false
+# 3) mult_gate:
+#    total = r_correctness + lambda*(r_correctness*r_length*r_format) + r_format
+#    strict_format_gate = true
+#
+# Environment overrides: BASE_CONFIG, ACCELERATE_CONFIG, NUM_PROCESSES, PYTHON_BIN, LAMBDA,
+# SWEEP_ROOT, EVAL_OUT_ROOT, EVAL_MAX_SAMPLES, EVAL_BATCH_SIZE, LOG_PATH.
+# NOTE: the "0p75" in the default SWEEP_ROOT and in the run directory names is fixed text;
+# it does not change when LAMBDA is overridden.
+BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-4}"
+PYTHON_BIN="${PYTHON_BIN:-python}"
+LAMBDA="${LAMBDA:-0.75}"
+SWEEP_ROOT="${SWEEP_ROOT:-artifacts/sweeps/reward_variants_lambda_0p75}"
+EVAL_OUT_ROOT="${EVAL_OUT_ROOT:-${SWEEP_ROOT}/eval_results}"
+EVAL_MAX_SAMPLES="${EVAL_MAX_SAMPLES:-200}"
+EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE:-4}"
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+export PYTHONUNBUFFERED="${PYTHONUNBUFFERED:-1}"
+export ACCELERATE_LOG_LEVEL="${ACCELERATE_LOG_LEVEL:-info}"
+mkdir -p "${SWEEP_ROOT}"
+# From here on, stdout and stderr are shown live and also appended to LOG_PATH.
+LOG_PATH="${LOG_PATH:-${SWEEP_ROOT}/run_reward_variants_and_eval_$(date +%Y%m%d_%H%M%S).log}"
+exec > >(tee -a "${LOG_PATH}") 2>&1
+# Per-variant configs are written to a temporary directory that is removed on exit.
+TMP_DIR="$(mktemp -d "${SWEEP_ROOT}/tmp_cfgs.XXXXXX")"
+trap 'rm -rf "${TMP_DIR}"' EXIT
+VARIANTS=("existing_gate" "mult_nogate" "mult_gate")
+echo "Starting 3-variant reward sweep..."
+echo "Base config: ${BASE_CONFIG}"
+echo "Sweep root: ${SWEEP_ROOT}"
+echo "Lambda: ${LAMBDA}"
+echo "Num processes (GPUs): ${NUM_PROCESSES}"
+echo "Accelerate config: ${ACCELERATE_CONFIG}"
+echo "Live+saved log path: ${LOG_PATH}"
+# For each variant: write a variant-specific config, then train with it. Runs are
+# sequential; with `set -e` a failed step aborts the remaining variants.
+for VARIANT in "${VARIANTS[@]}"; do
+  echo
+  echo "=== Training variant: ${VARIANT} ==="
+  CFG_PATH="${TMP_DIR}/grpo_${VARIANT}.yaml"
+  # The inline Python program reads its inputs from the environment variables set here.
+  BASE_CONFIG="${BASE_CONFIG}" \
+  SWEEP_ROOT="${SWEEP_ROOT}" \
+  CFG_PATH="${CFG_PATH}" \
+  VARIANT="${VARIANT}" \
+  LAMBDA="${LAMBDA}" \
+  "${PYTHON_BIN}" - <<'PY'
+import copy
+import os
+from pathlib import Path
+import yaml
+base_config = Path(os.environ["BASE_CONFIG"])
+sweep_root = Path(os.environ["SWEEP_ROOT"])
+cfg_path = Path(os.environ["CFG_PATH"])
+variant = os.environ["VARIANT"]
+lam = float(os.environ["LAMBDA"])
+with base_config.open("r", encoding="utf-8") as handle:
+    cfg = yaml.safe_load(handle)
+cfg = copy.deepcopy(cfg)
+# Settings shared by all three variants.
+cfg.setdefault("objective", {})
+cfg["objective"].setdefault("kwargs", {})
+obj = cfg["objective"]["kwargs"]
+obj["enable_length_penalty"] = True
+obj["reward_mode"] = "weighted_length_penalty"
+obj["length_penalty_lambda"] = lam
+obj["non_strict_penalty"] = -1.0
+# Variant-specific interaction term and format gate.
+if variant == "existing_gate":
+    obj["length_penalty_interaction"] = "correctness_length"
+    obj["strict_format_gate"] = True
+elif variant == "mult_nogate":
+    obj["length_penalty_interaction"] = "correctness_length_format"
+    obj["strict_format_gate"] = False
+elif variant == "mult_gate":
+    obj["length_penalty_interaction"] = "correctness_length_format"
+    obj["strict_format_gate"] = True
+else:
+    raise ValueError(f"Unknown variant: {variant}")
+cfg.setdefault("trainer", {})
+base_run_name = cfg["trainer"].get("run_name", "grpo")
+cfg["trainer"]["run_name"] = f"{base_run_name}-{variant}"
+# The directory name keeps the literal "0p75" regardless of the LAMBDA value in use.
+cfg["trainer"]["output_dir"] = str(sweep_root / f"run_lambda_0p75_{variant}")
+with cfg_path.open("w", encoding="utf-8") as handle:
+    yaml.safe_dump(cfg, handle, sort_keys=False)
+print(f"Wrote {cfg_path}")
+print(f"Run dir: {cfg['trainer']['output_dir']}")
+PY
+  accelerate launch \
+    --config_file "${ACCELERATE_CONFIG}" \
+    --num_processes "${NUM_PROCESSES}" \
+    src/train_grpo.py \
+    --config "${CFG_PATH}"
+done
+echo
+echo "=== Running sweep evaluation on all trained variants ==="
+BASE_CONFIG="${BASE_CONFIG}" \
+SWEEP_ROOT="${SWEEP_ROOT}" \
+OUT_ROOT="${EVAL_OUT_ROOT}" \
+EVAL_MAX_SAMPLES="${EVAL_MAX_SAMPLES}" \
+EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE}" \
+accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/eval_sweep_models.py
+echo
+echo "All done."
+echo "Train runs: ${SWEEP_ROOT}/run_lambda_0p75_*"
+echo "Eval summary JSON: ${EVAL_OUT_ROOT}/sweep_eval_summary.json"
+echo "Eval summary CSV:  ${EVAL_OUT_ROOT}/sweep_eval_summary.csv"
+echo "Eval outputs:      ${EVAL_OUT_ROOT}/outputs/"
+echo "Accuracy plot:     ${EVAL_OUT_ROOT}/sweep_eval_accuracy.svg"

scripts/run_sft_gsm8k_boxed_7gpu.sh ADDED Viewed

	@@ -0,0 +1,44 @@

+#!/usr/bin/env bash
+# SFT on GSM8K: assistant target is only \boxed{answer} (no thinking). DDP on 7 GPUs, then GSM8K test eval on GPU 0.
+#
+# Usage:
+#   ./scripts/run_sft_gsm8k_boxed_7gpu.sh
+#   MODEL_PATH=/path/to/model HF_DATASETS_OFFLINE=0 ./scripts/run_sft_gsm8k_boxed_7gpu.sh
+#   NUM_TRAIN_EPOCHS=3 ./scripts/run_sft_gsm8k_boxed_7gpu.sh
+#
+# Environment overrides: ACCELERATE_CONFIG, NUM_PROCESSES, MODEL_PATH, HUB_MODEL_ID,
+# OUTPUT_DIR, NUM_TRAIN_EPOCHS, CUDA_HOME, PYTHONUNBUFFERED, HF_DATASETS_OFFLINE.
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+# `set -u` is only enabled after conda_env.sh has been sourced.
+set -u
+cd "${REPO_ROOT}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_7gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-7}"
+MODEL_PATH="${MODEL_PATH:-${REPO_ROOT}/cache/models/models--unsloth--Llama-3.2-3B-Instruct}"
+# Unsloth hub cache snapshots often lack config.json / model_type; SFT pulls config from this id (HF token if gated).
+HUB_MODEL_ID="${HUB_MODEL_ID:-meta-llama/Llama-3.2-3B-Instruct}"
+OUTPUT_DIR="${OUTPUT_DIR:-artifacts/runs/sft_gsm8k_boxed}"
+NUM_TRAIN_EPOCHS="${NUM_TRAIN_EPOCHS:-1}"
+# When CUDA_HOME is unset and nvcc is on PATH, derive it from nvcc's location
+# (two directory levels above the nvcc binary).
+if [[ -z "${CUDA_HOME:-}" ]] && command -v nvcc >/dev/null 2>&1; then
+  export CUDA_HOME="$(dirname "$(dirname "$(command -v nvcc)")")"
+fi
+export PYTHONPATH="${REPO_ROOT}/src"
+export PYTHONUNBUFFERED="${PYTHONUNBUFFERED:-1}"
+# Datasets access defaults to online here; set HF_DATASETS_OFFLINE=1 to forbid downloads.
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-0}"
+# Everything after src/sft_gsm8k_boxed.py is passed to Python, not to accelerate.
+accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/sft_gsm8k_boxed.py \
+  --model_path "${MODEL_PATH}" \
+  --hub_model_id "${HUB_MODEL_ID}" \
+  --output_dir "${OUTPUT_DIR}" \
+  --num_train_epochs "${NUM_TRAIN_EPOCHS}"
+echo "Done. Weights: ${OUTPUT_DIR}/final_model  metrics: ${OUTPUT_DIR}/gsm8k_test_metrics.json"

scripts/run_twostage_correctness1.sh ADDED Viewed

	@@ -0,0 +1,79 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+# `set -u` is only enabled after conda_env.sh has been sourced.
+set -u
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${REPO_ROOT}"
+export PYTHONPATH="${REPO_ROOT}/src"
+# Two-stage run with correctness_weight = 1.0
+# Stage 1/2 schedule is inherited from BASE_CONFIG:
+#   - length_penalty_lambda (stage 1)
+#   - stage2_length_penalty_lambda (stage 2)
+#   - stage2_start_epoch
+#
+# Environment overrides: BASE_CONFIG, ACCELERATE_CONFIG, NUM_PROCESSES, PYTHON_BIN,
+# OUT_ROOT, LOG_PATH, WANDB_DIR, plus the WANDB_MODE / *_OFFLINE switches (offline by default).
+BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-4}"
+PYTHON_BIN="${PYTHON_BIN:-python}"
+OUT_ROOT="${OUT_ROOT:-artifacts/sweeps/twostage_correctness_weight}"
+export WANDB_DIR="${WANDB_DIR:-${REPO_ROOT}/cache/logs/wandb}"
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+export PYTHONUNBUFFERED="${PYTHONUNBUFFERED:-1}"
+export ACCELERATE_LOG_LEVEL="${ACCELERATE_LOG_LEVEL:-info}"
+mkdir -p "${OUT_ROOT}"
+# From here on, stdout and stderr are shown live and also appended to LOG_PATH.
+LOG_PATH="${LOG_PATH:-${OUT_ROOT}/run_twostage_correctness1_$(date +%Y%m%d_%H%M%S).log}"
+exec > >(tee -a "${LOG_PATH}") 2>&1
+# The generated config lives in a temporary directory that is removed on exit.
+TMP_DIR="$(mktemp -d "${OUT_ROOT}/tmp_cfgs.XXXXXX")"
+trap 'rm -rf "${TMP_DIR}"' EXIT
+CFG_PATH="${TMP_DIR}/grpo_twostage_correctness1.yaml"
+# Derive the run config: only correctness_weight, run_name and output_dir are changed.
+BASE_CONFIG="${BASE_CONFIG}" OUT_ROOT="${OUT_ROOT}" CFG_PATH="${CFG_PATH}" "${PYTHON_BIN}" - <<'PY'
+import copy
+import os
+from pathlib import Path
+import yaml
+base_config = Path(os.environ["BASE_CONFIG"])
+out_root = Path(os.environ["OUT_ROOT"])
+cfg_path = Path(os.environ["CFG_PATH"])
+with base_config.open("r", encoding="utf-8") as handle:
+    cfg = yaml.safe_load(handle)
+cfg = copy.deepcopy(cfg)
+cfg.setdefault("objective", {})
+cfg["objective"].setdefault("kwargs", {})
+cfg["objective"]["kwargs"]["correctness_weight"] = 1.0
+cfg.setdefault("trainer", {})
+base_run_name = cfg["trainer"].get("run_name", "grpo")
+cfg["trainer"]["run_name"] = f"{base_run_name}-twostage-cw1"
+cfg["trainer"]["output_dir"] = str(out_root / "run_twostage_correctness1")
+with cfg_path.open("w", encoding="utf-8") as handle:
+    yaml.safe_dump(cfg, handle, sort_keys=False)
+print(f"Wrote config: {cfg_path}")
+print(f"Output dir: {cfg['trainer']['output_dir']}")
+PY
+# Launch training with the generated config. The status line reports the process count
+# actually passed to accelerate, since NUM_PROCESSES can be overridden by the caller.
+echo "Starting ${NUM_PROCESSES}-GPU training (correctness_weight=1.0)"
+accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/train_grpo.py \
+  --config "${CFG_PATH}"
+echo "Done. Log: ${LOG_PATH}"

scripts/run_twostage_correctness5.sh ADDED Viewed

	@@ -0,0 +1,79 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+# `set -u` is only enabled after conda_env.sh has been sourced.
+set -u
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${REPO_ROOT}"
+export PYTHONPATH="${REPO_ROOT}/src"
+# Two-stage run with correctness_weight = 5.0
+# Stage 1/2 schedule is inherited from BASE_CONFIG:
+#   - length_penalty_lambda (stage 1)
+#   - stage2_length_penalty_lambda (stage 2)
+#   - stage2_start_epoch
+#
+# Environment overrides: BASE_CONFIG, ACCELERATE_CONFIG, NUM_PROCESSES, PYTHON_BIN,
+# OUT_ROOT, LOG_PATH, WANDB_DIR, plus the WANDB_MODE / *_OFFLINE switches (offline by default).
+BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-4}"
+PYTHON_BIN="${PYTHON_BIN:-python}"
+OUT_ROOT="${OUT_ROOT:-artifacts/sweeps/twostage_correctness_weight}"
+export WANDB_DIR="${WANDB_DIR:-${REPO_ROOT}/cache/logs/wandb}"
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+export PYTHONUNBUFFERED="${PYTHONUNBUFFERED:-1}"
+export ACCELERATE_LOG_LEVEL="${ACCELERATE_LOG_LEVEL:-info}"
+mkdir -p "${OUT_ROOT}"
+# From here on, stdout and stderr are shown live and also appended to LOG_PATH.
+LOG_PATH="${LOG_PATH:-${OUT_ROOT}/run_twostage_correctness5_$(date +%Y%m%d_%H%M%S).log}"
+exec > >(tee -a "${LOG_PATH}") 2>&1
+# The generated config lives in a temporary directory that is removed on exit.
+TMP_DIR="$(mktemp -d "${OUT_ROOT}/tmp_cfgs.XXXXXX")"
+trap 'rm -rf "${TMP_DIR}"' EXIT
+CFG_PATH="${TMP_DIR}/grpo_twostage_correctness5.yaml"
+# Derive the run config: only correctness_weight, run_name and output_dir are changed.
+BASE_CONFIG="${BASE_CONFIG}" OUT_ROOT="${OUT_ROOT}" CFG_PATH="${CFG_PATH}" "${PYTHON_BIN}" - <<'PY'
+import copy
+import os
+from pathlib import Path
+import yaml
+base_config = Path(os.environ["BASE_CONFIG"])
+out_root = Path(os.environ["OUT_ROOT"])
+cfg_path = Path(os.environ["CFG_PATH"])
+with base_config.open("r", encoding="utf-8") as handle:
+    cfg = yaml.safe_load(handle)
+cfg = copy.deepcopy(cfg)
+cfg.setdefault("objective", {})
+cfg["objective"].setdefault("kwargs", {})
+cfg["objective"]["kwargs"]["correctness_weight"] = 5.0
+cfg.setdefault("trainer", {})
+base_run_name = cfg["trainer"].get("run_name", "grpo")
+cfg["trainer"]["run_name"] = f"{base_run_name}-twostage-cw5"
+cfg["trainer"]["output_dir"] = str(out_root / "run_twostage_correctness5")
+with cfg_path.open("w", encoding="utf-8") as handle:
+    yaml.safe_dump(cfg, handle, sort_keys=False)
+print(f"Wrote config: {cfg_path}")
+print(f"Output dir: {cfg['trainer']['output_dir']}")
+PY
+# Launch training with the generated config. The status line reports the process count
+# actually passed to accelerate, since NUM_PROCESSES can be overridden by the caller.
+echo "Starting ${NUM_PROCESSES}-GPU training (correctness_weight=5.0)"
+accelerate launch \
+  --config_file "${ACCELERATE_CONFIG}" \
+  --num_processes "${NUM_PROCESSES}" \
+  src/train_grpo.py \
+  --config "${CFG_PATH}"
+echo "Done. Log: ${LOG_PATH}"

scripts/sweep_length_penalty_lambda.sh ADDED Viewed

	@@ -0,0 +1,88 @@

+#!/usr/bin/env bash
+set -eo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/conda_env.sh"
+# `set -u` is only enabled after conda_env.sh has been sourced.
+set -u
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${REPO_ROOT}"
+export PYTHONPATH="${REPO_ROOT}/src"
+# Sweep two GRPO runs over length_penalty_lambda:
+#   - 0.5
+#   - 0.25
+#
+# Each run keeps all other settings identical to BASE_CONFIG.
+#
+# Environment overrides: BASE_CONFIG, ACCELERATE_CONFIG, NUM_PROCESSES, PYTHON_BIN,
+# OUT_ROOT, and LAMBDAS (a whitespace-separated list of lambda values).
+BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
+ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
+NUM_PROCESSES="${NUM_PROCESSES:-4}"
+PYTHON_BIN="${PYTHON_BIN:-python}"
+OUT_ROOT="${OUT_ROOT:-artifacts/sweeps/length_penalty_lambda}"
+LAMBDAS="${LAMBDAS:-0.5 0.25}"
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
+export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
+mkdir -p "${OUT_ROOT}"
+# Per-lambda configs are written to a temporary directory that is removed on exit.
+TMP_DIR="$(mktemp -d "${OUT_ROOT}/tmp_cfgs.XXXXXX")"
+trap 'rm -rf "${TMP_DIR}"' EXIT
+echo "Base config: ${BASE_CONFIG}"
+echo "Lambdas: ${LAMBDAS}"
+echo "Output root: ${OUT_ROOT}"
+# LAMBDAS is intentionally unquoted so the shell splits it into one value per iteration.
+for LAMBDA in ${LAMBDAS}; do
+  echo
+  echo "=== Running lambda=${LAMBDA} ==="
+  CFG_PATH="${TMP_DIR}/grpo_lambda_${LAMBDA}.yaml"
+  BASE_CONFIG="${BASE_CONFIG}" OUT_ROOT="${OUT_ROOT}" LAMBDA="${LAMBDA}" CFG_PATH="${CFG_PATH}" "${PYTHON_BIN}" - <<'PY'
+import copy
+import os
+from pathlib import Path
+import yaml
+base_config = Path(os.environ["BASE_CONFIG"])
+out_root = Path(os.environ["OUT_ROOT"])
+cfg_path = Path(os.environ["CFG_PATH"])
+# Kept as a string so it can be used both as a float and in run/directory names.
+lam = os.environ["LAMBDA"]
+with base_config.open("r", encoding="utf-8") as handle:
+    cfg = yaml.safe_load(handle)
+cfg = copy.deepcopy(cfg)
+cfg.setdefault("objective", {})
+cfg["objective"].setdefault("kwargs", {})
+cfg["objective"]["kwargs"]["enable_length_penalty"] = True
+cfg["objective"]["kwargs"]["reward_mode"] = "weighted_length_penalty"
+cfg["objective"]["kwargs"]["length_penalty_lambda"] = float(lam)
+cfg.setdefault("trainer", {})
+base_run_name = cfg["trainer"].get("run_name", "grpo")
+# "0.25" -> "0p25": avoids a dot inside run names and directory names.
+safe_lam = lam.replace(".", "p")
+cfg["trainer"]["run_name"] = f"{base_run_name}-lambda-{safe_lam}"
+cfg["trainer"]["output_dir"] = str(out_root / f"run_lambda_{safe_lam}")
+with cfg_path.open("w", encoding="utf-8") as handle:
+    yaml.safe_dump(cfg, handle, sort_keys=False)
+print(f"Wrote config: {cfg_path}")
+print(f"run_name: {cfg['trainer']['run_name']}")
+print(f"output_dir: {cfg['trainer']['output_dir']}")
+PY
+  accelerate launch \
+    --config_file "${ACCELERATE_CONFIG}" \
+    --num_processes "${NUM_PROCESSES}" \
+    src/train_grpo.py \
+    --config "${CFG_PATH}"
+done
+echo
+echo "Sweep complete. Runs are under: ${OUT_ROOT}"

src/eval_gsm8k_truncated_thinking.py ADDED Viewed

	@@ -0,0 +1,321 @@

+"""
+GSM8K eval with the same short ``<think>...</think>`` tags as ``data_plugins.PROMPT_PREFIX``.
+The inner span is truncated to at most ``--think_inner_token_limit`` tokenizer tokens (a close tag
+is forced if the model would exceed that, including when it closes late). Then greedy generation
+continues for the boxed answer.
+"""
+from __future__ import annotations
+import argparse
+import importlib.util
+import json
+import os
+from pathlib import Path
+import torch
+import torch.distributed as dist
+import yaml
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import hackable  # noqa: F401
+from hackable.data_plugins import GSM8KProvider
+from hackable.paths import resolve_storage_path, storage_layout
+from hackable.reward_plugins import gsm8k_correctness_reward
+from hackable.utils import resolve_repo_path
+THINK_OPEN = "<think>"
+THINK_CLOSE = "</think>"
+def _greedy_generate_rest(
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+    cur: torch.Tensor,
+    plen: int,
+    max_new_tokens: int,
+    device: torch.device,
+) -> str:
+    """Continue greedy from ``cur``; completion length (after prompt) is capped by ``max_new_tokens``."""
+    generated = int(cur.shape[1]) - plen
+    remaining = max_new_tokens - generated
+    if remaining <= 0:
+        return tokenizer.decode(cur[0, plen:], skip_special_tokens=False)
+    attn = torch.ones_like(cur, device=device)
+    rest = model.generate(
+        cur,
+        attention_mask=attn,
+        max_new_tokens=remaining,
+        do_sample=False,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+    )
+    return tokenizer.decode(rest[0, plen:], skip_special_tokens=False)
+def _load_yaml(path: str) -> dict:
+    with open(path, "r", encoding="utf-8") as handle:
+        return yaml.safe_load(handle)
+def _model_dtype(cfg: dict) -> torch.dtype:
+    return torch.bfloat16 if bool(cfg.get("trainer", {}).get("bf16", True)) else torch.float16
+def _get_cache_paths(base_cfg: dict) -> tuple[Path, Path]:
+    layout = storage_layout(base_cfg.get("storage", {}).get("cache_dir", "cache"))
+    return layout.datasets, layout.models
+def _dist_info() -> tuple[int, int, int]:
+    rank = int(os.environ.get("RANK", "0"))
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    return rank, world_size, local_rank
+def _init_distributed() -> tuple[int, int, int]:
+    rank, world_size, local_rank = _dist_info()
+    if world_size > 1 and not dist.is_initialized():
+        backend = "nccl" if torch.cuda.is_available() else "gloo"
+        dist.init_process_group(backend=backend, init_method="env://")
+    return rank, world_size, local_rank
+def _resolve_local_model_dir(base_cfg: dict, model_dir: str) -> Path:
+    candidate = Path(model_dir)
+    if candidate.is_absolute() and candidate.exists():
+        return candidate.resolve()
+    repo_local = resolve_repo_path(model_dir)
+    if repo_local.exists():
+        return repo_local
+    prefixed = resolve_storage_path(model_dir, base_cfg.get("storage", {}).get("cache_dir", "cache"))
+    if prefixed.exists():
+        return prefixed
+    raise FileNotFoundError(f"Model directory not found: {model_dir}")
+def _flash_attn_impl() -> str:
+    if importlib.util.find_spec("flash_attn") is not None:
+        try:
+            __import__("flash_attn")
+            return "flash_attention_2"
+        except Exception:
+            pass
+    return "sdpa"
+@torch.no_grad()
+def generate_truncated_thinking(
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+    prompt: str,
+    device: torch.device,
+    inner_token_limit: int,
+    max_new_tokens: int,
+) -> str:
+    """
+    Inner reasoning is **always** at most ``inner_token_limit`` tokenizer tokens: we never keep a
+    longer span. If the model emits ``</redacted_thinking>`` after more than that many inner tokens,
+    we rewrite the prefix to the first ``inner_token_limit`` inner tokens, then ``</redacted_thinking>``,
+    then greedy-decode the boxed answer. If there is no close tag yet but inner already has that many
+    tokens, we force the close tag and decode.
+    """
+    enc = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)
+    input_ids = enc["input_ids"].to(device)
+    plen = int(input_ids.shape[1])
+    prompt_ids = input_ids.clone()
+    cur = input_ids.clone()
+    generated_count = 0
+    def rewrite_and_continue(gen_text: str, after_open_idx: int, inner_text: str) -> str:
+        inner_ids = tokenizer.encode(inner_text, add_special_tokens=False)
+        inner_cut = inner_ids[:inner_token_limit]
+        inner_str = tokenizer.decode(inner_cut, skip_special_tokens=False)
+        fixed_comp = gen_text[:after_open_idx] + inner_str + THINK_CLOSE + "\n"
+        comp_ids = tokenizer.encode(fixed_comp, add_special_tokens=False)
+        comp_t = torch.tensor([comp_ids], dtype=torch.long, device=device)
+        new_cur = torch.cat([prompt_ids, comp_t], dim=-1)
+        return _greedy_generate_rest(model, tokenizer, new_cur, plen, max_new_tokens, device)
+    while generated_count < max_new_tokens:
+        out = model(cur)
+        next_id = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
+        cur = torch.cat([cur, next_id], dim=-1)
+        generated_count += 1
+        gen_text = tokenizer.decode(cur[0, plen:], skip_special_tokens=False)
+        # If the model never opens thinking, fall back to normal greedy (much faster).
+        if generated_count >= 128 and THINK_OPEN not in gen_text:
+            return _greedy_generate_rest(model, tokenizer, cur, plen, max_new_tokens, device)
+        if THINK_OPEN not in gen_text:
+            continue
+        after_open_idx = gen_text.find(THINK_OPEN) + len(THINK_OPEN)
+        after_open = gen_text[after_open_idx:]
+        if THINK_CLOSE in after_open:
+            inner_part, _rest = after_open.split(THINK_CLOSE, 1)
+            n_inner = len(tokenizer.encode(inner_part, add_special_tokens=False))
+            if n_inner > inner_token_limit:
+                return rewrite_and_continue(gen_text, after_open_idx, inner_part)
+            return _greedy_generate_rest(model, tokenizer, cur, plen, max_new_tokens, device)
+        inner_ids = tokenizer.encode(after_open, add_special_tokens=False)
+        if len(inner_ids) >= inner_token_limit:
+            inner_str = tokenizer.decode(inner_ids[:inner_token_limit], skip_special_tokens=False)
+            fixed_comp = gen_text[:after_open_idx] + inner_str + THINK_CLOSE + "\n"
+            comp_ids = tokenizer.encode(fixed_comp, add_special_tokens=False)
+            comp_t = torch.tensor([comp_ids], dtype=torch.long, device=device)
+            cur = torch.cat([prompt_ids, comp_t], dim=-1)
+            return _greedy_generate_rest(model, tokenizer, cur, plen, max_new_tokens, device)
+    return tokenizer.decode(cur[0, plen:], skip_special_tokens=False)
+def main() -> None:
+    """Evaluate one checkpoint on the GSM8K test split with a capped thinking span.
+    Each rank takes every ``world_size``-th sample, generates a completion through
+    ``generate_truncated_thinking`` (limit set by ``--think_inner_token_limit``) and scores
+    it with ``gsm8k_correctness_reward``. Per-rank records are exchanged with
+    ``all_gather_object``; rank 0 then sorts them, prints a JSON summary and, when the
+    ``OUTPUT_PATH`` environment variable is set, writes the rows as JSONL plus a sibling
+    ``.summary.json`` file. Other ranks return without writing anything.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", type=str, default="configs/grpo_llama32_3b_bf16.yaml")
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        required=True,
+        help="Checkpoint directory (e.g. artifacts/.../checkpoints/permanent/checkpoint-...).",
+    )
+    parser.add_argument("--think_inner_token_limit", type=int, default=5)
+    parser.add_argument("--max_samples", type=int, default=-1, help="GSM8K test examples; -1 = all.")
+    parser.add_argument("--max_new_tokens", type=int, default=256)
+    args = parser.parse_args()
+    rank, world_size, local_rank = _init_distributed()
+    base_cfg = _load_yaml(str(resolve_repo_path(args.config)))
+    model_dir = _resolve_local_model_dir(base_cfg, args.model_path)
+    datasets_cache, models_cache = _get_cache_paths(base_cfg)
+    dtype = _model_dtype(base_cfg)
+    # The "model.name" key is mandatory here (plain indexing); it is only used as the tokenizer fallback.
+    model_name_fallback = str(base_cfg["model"]["name"])
+    trust_remote_code = bool(base_cfg.get("model", {}).get("trust_remote_code", True))
+    max_prompt_len = int(base_cfg.get("generation", {}).get("max_prompt_length", 512))
+    # LOCAL_FILES_ONLY set to "1", "true" or "True" is forwarded to every from_pretrained call below.
+    local_files_only = os.environ.get("LOCAL_FILES_ONLY", "0").strip() in ("1", "true", "True")
+    # Prefer the tokenizer stored with the checkpoint; on any failure fall back to the base model name.
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(
+            str(model_dir),
+            trust_remote_code=trust_remote_code,
+            cache_dir=str(models_cache),
+            local_files_only=local_files_only,
+        )
+    except Exception:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name_fallback,
+            trust_remote_code=trust_remote_code,
+            cache_dir=str(models_cache),
+            local_files_only=local_files_only,
+        )
+    # Reuse EOS as the padding token when the tokenizer defines none.
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    model = AutoModelForCausalLM.from_pretrained(
+        str(model_dir),
+        trust_remote_code=trust_remote_code,
+        cache_dir=str(models_cache),
+        torch_dtype=dtype,
+        local_files_only=local_files_only,
+        attn_implementation=_flash_attn_impl(),
+    )
+    # One process per GPU, selected by local_rank; CPU is used when CUDA is unavailable.
+    if torch.cuda.is_available():
+        torch.cuda.set_device(local_rank)
+        device = torch.device(f"cuda:{local_rank}")
+    else:
+        device = torch.device("cpu")
+    model.to(device)
+    model.eval()
+    provider = GSM8KProvider()
+    # A negative --max_samples means "no limit" and is passed on as None.
+    eval_max = None if args.max_samples < 0 else args.max_samples
+    all_samples = provider.load(
+        split="test",
+        max_samples=eval_max,
+        cache_dir=str(datasets_cache),
+    )
+    # Round-robin sharding: this rank handles samples rank, rank + world_size, ...
+    indices = list(range(rank, len(all_samples), world_size))
+    local_samples = [all_samples[i] for i in indices]
+    records: list[dict] = []
+    for sample in local_samples:
+        prompt = sample.prompt
+        if len(tokenizer.encode(prompt)) > max_prompt_len:
+            # Over-long prompt: truncate to max_prompt_len tokens and decode back to text.
+            # Special tokens are kept in the decoded text (skip_special_tokens=False).
+            # NOTE(review): if the generator re-encodes with special tokens, BOS may be doubled - confirm.
+            enc = tokenizer(prompt, truncation=True, max_length=max_prompt_len, return_tensors="pt")
+            prompt = tokenizer.decode(enc["input_ids"][0], skip_special_tokens=False)
+        completion = generate_truncated_thinking(
+            model,
+            tokenizer,
+            prompt,
+            device,
+            inner_token_limit=args.think_inner_token_limit,
+            max_new_tokens=args.max_new_tokens,
+        )
+        # The reward function takes parallel lists; score this single example and unwrap it.
+        score = gsm8k_correctness_reward(
+            prompts=[prompt],
+            completions=[completion],
+            references=[sample.target],
+            metadata=[sample.metadata],
+        )[0]
+        records.append(
+            {
+                "sample_index": sample.metadata.get("sample_index", -1),
+                "correctness": float(score),
+                "prompt": prompt,
+                "reference": sample.target,
+                "completion": completion,
+            }
+        )
+    # Generation is finished: drop the model reference and release cached CUDA memory.
+    del model
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    if dist.is_initialized():
+        # Every rank contributes its record list; empty or None parts are skipped when merging.
+        gathered: list[list[dict] | None] = [None for _ in range(world_size)]
+        dist.all_gather_object(gathered, records)
+        merged: list[dict] = []
+        for part in gathered:
+            if part:
+                merged.extend(part)
+    else:
+        merged = records
+    # Only rank 0 reports and writes files.
+    if rank != 0:
+        return
+    merged.sort(key=lambda r: int(r.get("sample_index", 0)))
+    # Accuracy is the mean correctness score; 0.0 when nothing was evaluated.
+    acc = sum(r["correctness"] for r in merged) / len(merged) if merged else 0.0
+    summary = {
+        "model_dir": str(model_dir),
+        "think_inner_token_limit": args.think_inner_token_limit,
+        "max_new_tokens": args.max_new_tokens,
+        "num_examples": len(merged),
+        "accuracy": float(acc),
+    }
+    print(json.dumps(summary, indent=2))
+    out = os.environ.get("OUTPUT_PATH")
+    if out:
+        p = resolve_repo_path(out)
+        p.parent.mkdir(parents=True, exist_ok=True)
+        with p.open("w", encoding="utf-8") as handle:
+            for row in merged:
+                handle.write(json.dumps(row, ensure_ascii=True) + "\n")
+        # with_suffix swaps the final suffix, e.g. answers.jsonl -> answers.summary.json.
+        meta_path = p.with_suffix(".summary.json")
+        meta_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
+        print("Wrote", p, "and", meta_path)
+if __name__ == "__main__":
+    main()

src/eval_gsm8k_zeroshot.py ADDED Viewed

	@@ -0,0 +1,76 @@

+"""GSM8K zero-shot eval for a single checkpoint (same pipeline as eval_sweep_models)."""
+from __future__ import annotations
+import copy
+import json
+import os
+from pathlib import Path
+from eval_sweep_models import (
+    _init_distributed,
+    _load_yaml,
+    _resolve_local_model_dir,
+    _summarize,
+    evaluate_one_model,
+)
+from hackable.utils import resolve_repo_path
+def _model_dir_from_env() -> str:
+    for key in ("MODEL_PATH", "MODEL_DIR"):
+        v = os.environ.get(key)
+        if v:
+            return v
+    raise SystemExit(
+        "Set MODEL_PATH or MODEL_DIR to the checkpoint directory "
+        "(e.g. artifacts/sweeps/kl_0/run_foo/checkpoints/permanent/checkpoint-1871)."
+    )
+def _eval_max_samples() -> int:
+    if "EVAL_MAX_SAMPLES" in os.environ:
+        return int(os.environ["EVAL_MAX_SAMPLES"])
+    if "MAX_SAMPLES" in os.environ:
+        return int(os.environ["MAX_SAMPLES"])
+    return -1
+def main() -> None:
+    rank, _, _ = _init_distributed()
+    base_cfg_path = str(resolve_repo_path(os.environ.get("BASE_CONFIG", "configs/grpo_llama32_3b_bf16.yaml")))
+    base_cfg = copy.deepcopy(_load_yaml(base_cfg_path))
+    gen = base_cfg.setdefault("generation", {})
+    if os.environ.get("MAX_NEW_TOKENS"):
+        gen["max_completion_length"] = int(os.environ["MAX_NEW_TOKENS"])
+    if os.environ.get("MAX_PROMPT_LENGTH"):
+        gen["max_prompt_length"] = int(os.environ["MAX_PROMPT_LENGTH"])
+    model_dir = _resolve_local_model_dir(base_cfg, _model_dir_from_env())
+    eval_max = _eval_max_samples()
+    batch_size = int(os.environ.get("BATCH_SIZE", "4"))
+    records = evaluate_one_model(
+        model_dir=model_dir,
+        base_cfg=base_cfg,
+        eval_max_samples=eval_max,
+        batch_size=batch_size,
+    )
+    if rank != 0:
+        return
+    out = resolve_repo_path(os.environ.get("OUTPUT_PATH", "artifacts/eval/gsm8k_zeroshot_answers.jsonl"))
+    out = Path(out)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    with out.open("w", encoding="utf-8") as handle:
+        for row in records:
+            handle.write(json.dumps(row, ensure_ascii=True) + "\n")
+    summary = _summarize(records, str(model_dir))
+    summary["outputs_jsonl"] = str(out.resolve())
+    print(json.dumps(summary, indent=2))
+if __name__ == "__main__":
+    main()

src/eval_math_level1_thinking_zeroshot.py ADDED Viewed

	@@ -0,0 +1,312 @@

+from __future__ import annotations
+import json
+import os
+from pathlib import Path
+import torch
+import torch.distributed as dist
+import yaml
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import hackable  # noqa: F401
+from hackable import reward_plugins as reward_plugins_mod
+from hackable.utils import resolve_repo_path
+# System message placed ahead of every question by main() via _build_chat_prompts.
+# It asks for reasoning inside <think>...</think> followed by a single \boxed{...} answer.
+THINKING_SYSTEM_PROMPT = (
+    "Solve the following math problem.\n"
+    "Think step-by-step inside <think>...</think> tags.\n"
+    "Then output only the final answer in LaTeX boxed format.\n"
+    "Do not include any words or explanations outside the tags/boxed answer.\n"
+    "Output format must be exactly:\n"
+    "<think>your reasoning</think>\n"
+    "\\boxed{your_final_answer}\n"
+)
+def _load_yaml(path: str) -> dict:
+    with open(path, "r", encoding="utf-8") as handle:
+        return yaml.safe_load(handle)
+def _dist_info() -> tuple[int, int, int]:
+    rank = int(os.environ.get("RANK", "0"))
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    return rank, world_size, local_rank
+def _init_distributed() -> tuple[int, int, int]:
+    rank, world_size, local_rank = _dist_info()
+    if world_size > 1 and not dist.is_initialized():
+        backend = "nccl" if torch.cuda.is_available() else "gloo"
+        dist.init_process_group(backend=backend, init_method="env://")
+    return rank, world_size, local_rank
+def _resolve_local_model_dir(base_cfg: dict, model_dir: str) -> Path:
+    candidate = Path(model_dir)
+    if candidate.is_absolute() and candidate.exists():
+        return candidate.resolve()
+    if not candidate.is_absolute() and candidate.exists():
+        return candidate.resolve()
+    repo_local = resolve_repo_path(model_dir)
+    if repo_local.exists():
+        return repo_local
+    cache_root = resolve_repo_path(base_cfg.get("storage", {}).get("cache_dir", "cache"))
+    prefixed = (cache_root / candidate).resolve()
+    if prefixed.exists():
+        return prefixed
+    raise FileNotFoundError(
+        f"Model directory not found locally: '{model_dir}'. "
+        f"Tried '{candidate}', '{repo_local}', and '{prefixed}'."
+    )
+def _build_chat_prompts(
+    tokenizer: AutoTokenizer, questions: list[str], system_prompt: str
+) -> list[str]:
+    if getattr(tokenizer, "chat_template", None) is None:
+        raise RuntimeError("Tokenizer has no chat_template; cannot apply chat formatting.")
+    prompts: list[str] = []
+    for q in questions:
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": q.strip()},
+        ]
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        prompts.append(text)
+    return prompts
+def _load_math_level_rows(
+    level: str,
+    split: str,
+    max_samples: int | None,
+    cache_dir: str | None,
+) -> tuple[list[str], list[str]]:
+    dataset_name = "EleutherAI/hendrycks_math"
+    dataset_configs = (
+        "algebra",
+        "counting_and_probability",
+        "geometry",
+        "intermediate_algebra",
+        "number_theory",
+        "prealgebra",
+        "precalculus",
+    )
+    questions: list[str] = []
+    references: list[str] = []
+    for config_name in dataset_configs:
+        rows = load_dataset(
+            dataset_name,
+            config_name,
+            split=split,
+            cache_dir=cache_dir,
+        )
+        for row in rows:
+            row_level = str(row.get("level", "")).strip()
+            if row_level != level:
+                continue
+            questions.append(str(row.get("problem", "")))
+            references.append(str(row.get("solution", "")))
+            if max_samples is not None and len(questions) >= max_samples:
+                return questions[:max_samples], references[:max_samples]
+    return questions, references
+@torch.no_grad()
+def main() -> None:
+    """Zero-shot evaluation of one checkpoint on MATH "Level 1" problems with a thinking prompt.
+    Settings come from environment variables: ``BASE_CONFIG`` (required; a missing value
+    raises ``KeyError``), ``MODEL_DIR`` or ``MODEL_PATH``, ``MAX_PROMPT_LENGTH``,
+    ``MAX_NEW_TOKENS``, ``MATH_SPLIT``, ``MAX_SAMPLES`` or ``EVAL_MAX_SAMPLES``,
+    ``BATCH_SIZE`` and ``OUTPUT_PATH``. Questions are sharded round-robin across ranks,
+    answered with greedy decoding and scored twice (strict boxed match and lenient numeric
+    match). Records are gathered with ``all_gather_object``; rank 0 writes the JSONL file
+    and prints both accuracies.
+    """
+    rank, world_size, local_rank = _init_distributed()
+    base_cfg = _load_yaml(str(resolve_repo_path(os.environ["BASE_CONFIG"])))
+    # MODEL_DIR wins over MODEL_PATH when both are set.
+    model_dir = os.environ.get("MODEL_DIR") or os.environ.get("MODEL_PATH")
+    if not model_dir:
+        raise ValueError("Set MODEL_DIR or MODEL_PATH for the checkpoint to evaluate.")
+    resolved_model_dir = _resolve_local_model_dir(base_cfg, model_dir)
+    generation = base_cfg.get("generation", {})
+    # Config values act as defaults; the environment variables below override them.
+    max_prompt_length = int(generation.get("max_prompt_length", 512))
+    max_new_tokens = int(generation.get("max_completion_length", 256))
+    max_prompt_length = int(os.environ.get("MAX_PROMPT_LENGTH", str(max_prompt_length)))
+    max_new_tokens = int(os.environ.get("MAX_NEW_TOKENS", str(max_new_tokens)))
+    split = os.environ.get("MATH_SPLIT", "test")
+    # MAX_SAMPLES takes precedence over EVAL_MAX_SAMPLES; a negative value means "all".
+    max_samples_env = os.environ.get("MAX_SAMPLES", os.environ.get("EVAL_MAX_SAMPLES", "-1"))
+    max_samples = None if int(max_samples_env) < 0 else int(max_samples_env)
+    batch_size = int(os.environ.get("BATCH_SIZE", "4"))
+    cache_root = resolve_repo_path(base_cfg.get("storage", {}).get("cache_dir", "cache"))
+    datasets_cache = str(cache_root / "datasets")
+    models_cache = str(cache_root / "models")
+    # Tokenizer and model are loaded strictly from local files (local_files_only=True).
+    tokenizer = AutoTokenizer.from_pretrained(
+        str(resolved_model_dir),
+        trust_remote_code=bool(base_cfg.get("model", {}).get("trust_remote_code", False)),
+        cache_dir=models_cache,
+        local_files_only=True,
+    )
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    # Left padding so that every row's prompt ends right where its generated tokens begin.
+    tokenizer.padding_side = "left"
+    # bf16 unless trainer.bf16 is falsy in the config, in which case fp16 is used.
+    dtype = torch.bfloat16 if bool(base_cfg.get("trainer", {}).get("bf16", True)) else torch.float16
+    model = AutoModelForCausalLM.from_pretrained(
+        str(resolved_model_dir),
+        trust_remote_code=bool(base_cfg.get("model", {}).get("trust_remote_code", False)),
+        cache_dir=models_cache,
+        torch_dtype=dtype,
+        local_files_only=True,
+    )
+    if torch.cuda.is_available():
+        torch.cuda.set_device(local_rank)
+        device = torch.device(f"cuda:{local_rank}")
+    else:
+        device = torch.device("cpu")
+    model.to(device)
+    model.eval()
+    questions, references = _load_math_level_rows(
+        level="Level 1",
+        split=split,
+        max_samples=max_samples,
+        cache_dir=datasets_cache,
+    )
+    # Round-robin sharding: this rank handles questions rank, rank + world_size, ...
+    indices = list(range(rank, len(questions), world_size))
+    local_questions = [questions[i] for i in indices]
+    local_refs = [references[i] for i in indices]
+    chat_prompts = _build_chat_prompts(tokenizer, local_questions, THINKING_SYSTEM_PROMPT)
+    completions: list[str] = []
+    for start in range(0, len(chat_prompts), batch_size):
+        batch_prompts = chat_prompts[start : start + batch_size]
+        # NOTE(review): the prompts are already chat-templated text and are tokenized with the
+        # default special-token handling; if the template emits BOS itself this may duplicate it - confirm.
+        enc = tokenizer(
+            batch_prompts,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=max_prompt_length,
+        )
+        input_ids = enc["input_ids"].to(device)
+        attn = enc["attention_mask"].to(device)
+        # All rows share the padded prompt width, so new tokens start at this column.
+        prompt_seq_len = input_ids.shape[1]
+        # Greedy decoding (do_sample=False).
+        out = model.generate(
+            input_ids=input_ids,
+            attention_mask=attn,
+            max_new_tokens=max_new_tokens,
+            do_sample=False,
+            pad_token_id=tokenizer.pad_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+        )
+        for bi in range(out.size(0)):
+            gen_ids = out[bi, prompt_seq_len:]
+            completions.append(tokenizer.decode(gen_ids, skip_special_tokens=True))
+    # Strict boxed correctness (project metric)
+    # Order of checks: both answers extractable, then normalized-text equality, then numeric closeness.
+    strict_scores = []
+    for completion, reference in zip(completions, local_refs, strict=True):
+        pred_text = reward_plugins_mod._extract_predicted_answer_text(completion)
+        ref_text = reward_plugins_mod._extract_reference_answer_text(reference)
+        if not pred_text or not ref_text:
+            strict_scores.append(0.0)
+            continue
+        pred_norm = reward_plugins_mod._normalize_answer_text(pred_text)
+        ref_norm = reward_plugins_mod._normalize_answer_text(ref_text)
+        if pred_norm and ref_norm and pred_norm == ref_norm:
+            strict_scores.append(1.0)
+            continue
+        pred_value = reward_plugins_mod._parse_numeric(pred_text)
+        ref_value = reward_plugins_mod._parse_numeric(ref_text)
+        if pred_value is not None and ref_value is not None and reward_plugins_mod._is_close(pred_value, ref_value):
+            strict_scores.append(1.0)
+        else:
+            strict_scores.append(0.0)
+    # Lenient numeric correctness fallback
+    # Prediction source: the last boxed content if present (parsed, else its last number),
+    # otherwise the last number found anywhere in the completion.
+    lenient_scores: list[float] = []
+    for completion, reference in zip(completions, local_refs, strict=True):
+        ref_val = reward_plugins_mod._extract_reference_target(reference)
+        boxed = reward_plugins_mod._extract_last_boxed(completion)
+        if boxed:
+            pred_val = reward_plugins_mod._parse_numeric(boxed)
+            if pred_val is None:
+                nums = reward_plugins_mod._extract_numbers(boxed)
+                pred_val = nums[-1] if nums else None
+        else:
+            nums = reward_plugins_mod._extract_numbers(completion)
+            pred_val = nums[-1] if nums else None
+        if ref_val is not None and pred_val is not None and reward_plugins_mod._is_close(pred_val, ref_val):
+            lenient_scores.append(1.0)
+        else:
+            lenient_scores.append(0.0)
+    # "correctness" stores the lenient score; the strict score goes to "correctness_strict_boxed".
+    local_records: list[dict] = []
+    for i, idx in enumerate(indices):
+        local_records.append(
+            {
+                "sample_index": int(idx),
+                "question": local_questions[i],
+                "reference_answer": local_refs[i],
+                "model_answer_raw": completions[i],
+                "correctness": float(lenient_scores[i]),
+                "correctness_strict_boxed": float(strict_scores[i]),
+            }
+        )
+    if dist.is_initialized():
+        # Every rank contributes its record list; empty or None parts are skipped when merging.
+        gathered: list[list[dict] | None] = [None for _ in range(world_size)]
+        dist.all_gather_object(gathered, local_records)
+        merged: list[dict] = []
+        for part in gathered:
+            if part:
+                merged.extend(part)
+    else:
+        merged = local_records
+    # Only rank 0 writes the output file and prints the metrics.
+    if rank != 0:
+        return
+    merged.sort(key=lambda r: r["sample_index"])
+    output_path = resolve_repo_path(
+        os.environ.get(
+            "OUTPUT_PATH",
+            "artifacts/eval/math_level1_thinking_zeroshot/answers.jsonl",
+        )
+    )
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with output_path.open("w", encoding="utf-8") as handle:
+        for row in merged:
+            handle.write(json.dumps(row, ensure_ascii=True) + "\n")
+    # Both accuracies are plain means over the merged rows; 0.0 when there are none.
+    acc = sum(r["correctness"] for r in merged) / len(merged) if merged else 0.0
+    acc_strict = (
+        sum(r["correctness_strict_boxed"] for r in merged) / len(merged)
+        if merged
+        else 0.0
+    )
+    print(f"Wrote {len(merged)} rows to {output_path}")
+    print(f"Accuracy (lenient numeric): {acc:.4f}")
+    print(f"Accuracy (strict boxed): {acc_strict:.4f}")
+if __name__ == "__main__":
+    main()

src/eval_permanent_checkpoints.py ADDED Viewed

	@@ -0,0 +1,433 @@

+from __future__ import annotations
+import csv
+import json
+import os
+import re
+from pathlib import Path
+import torch.distributed as dist
+import hackable  # noqa: F401
+from hackable.utils import resolve_repo_path
+from eval_sweep_models import (
+    _init_distributed,
+    _load_yaml,
+    _resolve_local_model_dir,
+    evaluate_one_model,
+)
+def _parse_checkpoint_step(dirname: str) -> int | None:
+    m = re.match(r"^checkpoint-(\d+)$", dirname)
+    if m:
+        return int(m.group(1))
+    m = re.search(r"-step-(\d+)$", dirname)
+    if m:
+        return int(m.group(1))
+    return None
+def _discover_checkpoint_jobs(
+    base_cfg: dict, permanent_root: Path, run_label: str
+) -> list[tuple[str, int, str, Path, str]]:
+    """(run_label, step, resolved_model_dir_str, resolved_path, dir_name)"""
+    root = permanent_root.resolve()
+    if not root.is_dir():
+        raise FileNotFoundError(f"Not a directory: {root}")
+    jobs: list[tuple[str, int, str, Path, str]] = []
+    for p in sorted(root.iterdir()):
+        if not p.is_dir():
+            continue
+        step = _parse_checkpoint_step(p.name)
+        if step is None:
+            continue
+        resolved = _resolve_local_model_dir(base_cfg, str(p))
+        jobs.append((run_label, step, str(resolved), resolved, p.name))
+    jobs.sort(key=lambda x: (x[1], x[4]))
+    return jobs
+def _line_chart_svg(
+    series: list[tuple[str, list[tuple[int, float]], str]],
+    title: str,
+    y_label: str,
+    y_max: float,
+    path: Path,
+) -> None:
+    width = 900
+    height = 420
+    lm, rm, tm, bm = 70, 40, 50, 55
+    pw = width - lm - rm
+    ph = height - tm - bm
+    yb = tm + ph
+    all_steps: list[int] = []
+    for _, pts, _ in series:
+        all_steps.extend(s for s, _ in pts)
+    if not all_steps:
+        path.write_text(
+            f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}">'
+            f'<text x="40" y="40">{title} (no data)</text></svg>',
+            encoding="utf-8",
+        )
+        return
+    x_min, x_max = min(all_steps), max(all_steps)
+    if x_max == x_min:
+        x_max = x_min + 1
+    def sx(x: int) -> int:
+        return lm + int((x - x_min) / (x_max - x_min) * pw)
+    def sy(y: float) -> int:
+        y = max(0.0, min(y_max, y))
+        return yb - int((y / y_max) * ph) if y_max > 0 else yb
+    parts: list[str] = [
+        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}">',
+        '<rect width="100%" height="100%" fill="#ffffff"/>',
+        f'<text x="{lm}" y="28" font-size="16" font-family="sans-serif">{title}</text>',
+        f'<text x="20" y="{tm + ph // 2}" font-size="12" font-family="sans-serif" '
+        f'transform="rotate(-90 20 {tm + ph // 2})">{y_label}</text>',
+        f'<line x1="{lm}" y1="{yb}" x2="{lm + pw}" y2="{yb}" stroke="#111" stroke-width="2"/>',
+        f'<line x1="{lm}" y1="{tm}" x2="{lm}" y2="{yb}" stroke="#111" stroke-width="2"/>',
+        f'<text x="{lm + pw // 2}" y="{height - 12}" text-anchor="middle" '
+        f'font-size="12" font-family="sans-serif">Training step</text>',
+    ]
+    for i in range(5):
+        val = (i / 4) * y_max
+        yy = sy(val)
+        parts.append(
+            f'<line x1="{lm - 4}" y1="{yy}" x2="{lm}" y2="{yy}" stroke="#999"/>'
+        )
+        parts.append(
+            f'<text x="{lm - 8}" y="{yy + 4}" text-anchor="end" font-size="10" '
+            f'font-family="sans-serif">{val:.2f}</text>'
+        )
+    legend_x = lm + pw - 200
+    legend_y = tm + 8
+    for idx, (name, pts, color) in enumerate(series):
+        if len(pts) < 2:
+            pts_sorted = sorted(pts, key=lambda z: z[0])
+            if not pts_sorted:
+                continue
+            cx, cy = sx(pts_sorted[0][0]), sy(pts_sorted[0][1])
+            parts.append(
+                f'<circle cx="{cx}" cy="{cy}" r="4" fill="{color}" stroke="#111"/>'
+            )
+        else:
+            pts_sorted = sorted(pts, key=lambda z: z[0])
+            d = "M " + " L ".join(f"{sx(s)} {sy(v)}" for s, v in pts_sorted)
+            parts.append(
+                f'<path d="{d}" fill="none" stroke="{color}" stroke-width="2.5"/>'
+            )
+        parts.append(
+            f'<rect x="{legend_x}" y="{legend_y + idx * 18}" width="10" height="10" fill="{color}"/>'
+        )
+        parts.append(
+            f'<text x="{legend_x + 16}" y="{legend_y + idx * 18 + 9}" font-size="11" '
+            f'font-family="sans-serif">{name}</text>'
+        )
+    parts.append("</svg>")
+    path.write_text("\n".join(parts), encoding="utf-8")
+def _scatter_accuracy_vs_cot_svg(rows: list[dict], path: Path, title: str) -> None:
+    """Scatter: x = avg_cot_words, y = accuracy. One color per ``run_label``; optional path by training step."""
+    width = 640
+    height = 520
+    lm, rm, tm, bm = 72, 160, 52, 64
+    pw = width - lm - rm
+    ph = height - tm - bm
+    yb = tm + ph
+    if not rows:
+        path.write_text(
+            f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}">'
+            f'<text x="40" y="40">{title} (no data)</text></svg>',
+            encoding="utf-8",
+        )
+        return
+    labels: list[str] = []
+    seen: set[str] = set()
+    for r in rows:
+        lab = str(r.get("run_label", "run"))
+        if lab not in seen:
+            seen.add(lab)
+            labels.append(lab)
+    colors = ["#2563eb", "#dc2626", "#16a34a", "#9333ea", "#ca8a04", "#0891b2"]
+    color_map = {lab: colors[i % len(colors)] for i, lab in enumerate(labels)}
+    xs = [float(r["avg_cot_words"]) for r in rows]
+    ys = [float(r["accuracy"]) for r in rows]
+    x_min, x_max = min(xs), max(xs)
+    y_min, y_max = 0.0, 1.0
+    if x_max <= x_min:
+        x_max = x_min + 1.0
+    pad = (x_max - x_min) * 0.06 + 1.0
+    x_min = max(0.0, x_min - pad)
+    x_max = x_max + pad
+    def sx(x: float) -> float:
+        return lm + (x - x_min) / (x_max - x_min) * pw
+    def sy(y: float) -> float:
+        y = max(y_min, min(y_max, y))
+        return yb - (y - y_min) / (y_max - y_min) * ph
+    parts: list[str] = [
+        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}">',
+        '<rect width="100%" height="100%" fill="#fafafa"/>',
+        f'<text x="{lm}" y="30" font-size="15" font-family="sans-serif">{title}</text>',
+        f'<text x="{width // 2}" y="{height - 18}" text-anchor="middle" font-size="12" '
+        f'font-family="sans-serif">Avg CoT length (words)</text>',
+        f'<text x="18" y="{tm + ph // 2}" font-size="12" font-family="sans-serif" '
+        f'transform="rotate(-90 18 {tm + ph // 2})">Accuracy</text>',
+        f'<line x1="{lm}" y1="{yb}" x2="{lm + pw}" y2="{yb}" stroke="#111" stroke-width="2"/>',
+        f'<line x1="{lm}" y1="{tm}" x2="{lm}" y2="{yb}" stroke="#111" stroke-width="2"/>',
+    ]
+    for i in range(5):
+        val = y_min + (i / 4) * (y_max - y_min)
+        yy = sy(val)
+        parts.append(f'<line x1="{lm - 4}" y1="{yy}" x2="{lm}" y2="{yy}" stroke="#bbb"/>')
+        parts.append(
+            f'<text x="{lm - 8}" y="{yy + 4}" text-anchor="end" font-size="10" '
+            f'font-family="sans-serif">{val:.2f}</text>'
+        )
+    for i in range(5):
+        frac = i / 4
+        xv = x_min + frac * (x_max - x_min)
+        xx = sx(xv)
+        parts.append(f'<line x1="{xx}" y1="{yb}" x2="{xx}" y2="{yb + 4}" stroke="#bbb"/>')
+        parts.append(
+            f'<text x="{xx}" y="{yb + 18}" text-anchor="middle" font-size="10" '
+            f'font-family="sans-serif">{xv:.0f}</text>'
+        )
+    for lab in labels:
+        sub = [r for r in rows if str(r.get("run_label", "run")) == lab]
+        sub.sort(key=lambda r: int(r["checkpoint_step"]))
+        color = color_map[lab]
+        if len(sub) >= 2:
+            d = "M " + " L ".join(f'{sx(float(r["avg_cot_words"])):.1f} {sy(float(r["accuracy"])):.1f}' for r in sub)
+            parts.append(
+                f'<path d="{d}" fill="none" stroke="{color}" stroke-width="1.5" stroke-opacity="0.35"/>'
+            )
+    for r in rows:
+        lab = str(r.get("run_label", "run"))
+        color = color_map[lab]
+        cx = sx(float(r["avg_cot_words"]))
+        cy = sy(float(r["accuracy"]))
+        step = int(r["checkpoint_step"])
+        name = str(r.get("checkpoint_dir", f"step-{step}"))
+        tip = f"{name}: accuracy={float(r['accuracy']):.4f}, avg_cot_words={float(r['avg_cot_words']):.2f}"
+        parts.append(
+            f'<g><circle cx="{cx:.1f}" cy="{cy:.1f}" r="5" fill="{color}" stroke="#111" stroke-width="1">'
+            f"<title>{tip}</title></circle>"
+            f'<text x="{cx + 8:.1f}" y="{cy - 6:.1f}" font-size="9" font-family="sans-serif" fill="#333">{step}</text></g>'
+        )
+    legend_x = lm + pw + 14
+    legend_y = tm + 4
+    parts.append(
+        f'<text x="{legend_x}" y="{legend_y}" font-size="11" font-family="sans-serif" font-weight="bold">Series</text>'
+    )
+    for idx, lab in enumerate(labels):
+        cy = legend_y + 18 + idx * 20
+        parts.append(
+            f'<rect x="{legend_x}" y="{cy - 8}" width="10" height="10" fill="{color_map[lab]}"/>'
+        )
+        parts.append(
+            f'<text x="{legend_x + 16}" y="{cy}" font-size="11" font-family="sans-serif">{lab}</text>'
+        )
+    parts.append("</svg>")
+    path.write_text("\n".join(parts), encoding="utf-8")
+def _resolve_out_root(default: Path) -> Path:
+    raw = os.environ.get("OUT_ROOT")
+    if raw is None or not str(raw).strip():
+        return resolve_repo_path(str(default))
+    return resolve_repo_path(raw)
+def main() -> None:
+    rank, _, _ = _init_distributed()
+    base_cfg = _load_yaml(str(resolve_repo_path(os.environ["BASE_CONFIG"])))
+    eval_max_samples = int(os.environ.get("EVAL_MAX_SAMPLES", "200"))
+    eval_batch_size = int(os.environ.get("EVAL_BATCH_SIZE", "4"))
+    rollout_n = int(os.environ.get("ROLLOUT_SAMPLES", "8"))
+    permanent_root = os.environ.get("PERMANENT_ROOT", "").strip()
+    if permanent_root:
+        pr = resolve_repo_path(permanent_root)
+        run_label_single = os.environ.get("RUN_LABEL", "permanent")
+        out_default = pr / "eval_permanent"
+        out_root = _resolve_out_root(out_default)
+        jobs_single = _discover_checkpoint_jobs(base_cfg, pr, run_label_single)
+        all_jobs = jobs_single
+        jobs_cw1: list = []
+        jobs_cw5: list = []
+    else:
+        cw1_root = resolve_repo_path(os.environ["PERMANENT_CW1"])
+        cw5_root = resolve_repo_path(os.environ["PERMANENT_CW5"])
+        out_default = cw1_root.parent / "eval_permanent"
+        out_root = _resolve_out_root(out_default)
+        jobs_cw1 = _discover_checkpoint_jobs(base_cfg, cw1_root, "correctness_weight_1")
+        jobs_cw5 = _discover_checkpoint_jobs(base_cfg, cw5_root, "correctness_weight_5")
+        all_jobs = jobs_cw1 + jobs_cw5
+    if rank == 0:
+        out_root.mkdir(parents=True, exist_ok=True)
+        (out_root / "rollouts").mkdir(parents=True, exist_ok=True)
+        (out_root / "full_outputs").mkdir(parents=True, exist_ok=True)
+        if permanent_root:
+            print(f"PERMANENT_ROOT: {resolve_repo_path(permanent_root)} ({len(all_jobs)} checkpoints)")
+            for run_label, step, _, _, name in all_jobs:
+                print(f"  {run_label} step={step} ({name})")
+        else:
+            print(f"Found {len(jobs_cw1)} checkpoints (cw=1), {len(jobs_cw5)} checkpoints (cw=5)")
+            for jl in (jobs_cw1, jobs_cw5):
+                for run_label, step, _, _, name in jl:
+                    print(f"  {run_label} step={step} ({name})")
+    if dist.is_initialized():
+        dist.barrier()
+    rows: list[dict] = []
+    for run_label, step, _resolved_str, resolved_path, dir_name in all_jobs:
+        records = evaluate_one_model(
+            model_dir=resolved_path,
+            base_cfg=base_cfg,
+            eval_max_samples=eval_max_samples,
+            batch_size=eval_batch_size,
+        )
+        if rank == 0:
+            acc = sum(float(r["correctness"]) for r in records) / len(records) if records else 0.0
+            avg_cot = sum(float(r["cot_words"]) for r in records) / len(records) if records else 0.0
+            row = {
+                "run_label": run_label,
+                "checkpoint_step": step,
+                "checkpoint_dir": dir_name,
+                "model_dir": str(resolved_path),
+                "num_examples": len(records),
+                "accuracy": acc,
+                "avg_cot_words": avg_cot,
+            }
+            rows.append(row)
+            rollout_dir = out_root / "rollouts" / run_label
+            rollout_dir.mkdir(parents=True, exist_ok=True)
+            rollout_path = rollout_dir / f"{dir_name}_rollouts.jsonl"
+            with rollout_path.open("w", encoding="utf-8") as handle:
+                for rec in records[:rollout_n]:
+                    handle.write(json.dumps(rec, ensure_ascii=True) + "\n")
+            full_path = out_root / "full_outputs" / run_label / f"{dir_name}_outputs.jsonl"
+            full_path.parent.mkdir(parents=True, exist_ok=True)
+            with full_path.open("w", encoding="utf-8") as handle:
+                for rec in records:
+                    handle.write(json.dumps(rec, ensure_ascii=True) + "\n")
+            print(
+                f"Eval {run_label} {dir_name}: acc={acc:.4f} avg_cot_words={avg_cot:.2f} n={len(records)}"
+            )
+        if dist.is_initialized():
+            dist.barrier()
+    if rank != 0:
+        return
+    rows.sort(key=lambda r: (r["run_label"], r["checkpoint_step"], r["checkpoint_dir"]))
+    summary_json = out_root / "permanent_checkpoints_eval.json"
+    summary_csv = out_root / "permanent_checkpoints_eval.csv"
+    summary_json.write_text(json.dumps(rows, indent=2), encoding="utf-8")
+    with summary_csv.open("w", encoding="utf-8", newline="") as handle:
+        w = csv.DictWriter(
+            handle,
+            fieldnames=[
+                "run_label",
+                "checkpoint_step",
+                "checkpoint_dir",
+                "model_dir",
+                "num_examples",
+                "accuracy",
+                "avg_cot_words",
+            ],
+        )
+        w.writeheader()
+        for row in rows:
+            w.writerow(row)
+    def series_for(label: str, ykey: str) -> list[tuple[int, float]]:
+        return [
+            (int(r["checkpoint_step"]), float(r[ykey]))
+            for r in rows
+            if r["run_label"] == label
+        ]
+    palette = ["#2563eb", "#dc2626", "#16a34a", "#9333ea", "#ca8a04", "#0891b2"]
+    uniq_labels = sorted({str(r["run_label"]) for r in rows})
+    acc_series = [
+        (lab, series_for(lab, "accuracy"), palette[i % len(palette)])
+        for i, lab in enumerate(uniq_labels)
+        if series_for(lab, "accuracy")
+    ]
+    cot_series = [
+        (lab, series_for(lab, "avg_cot_words"), palette[i % len(palette)])
+        for i, lab in enumerate(uniq_labels)
+        if series_for(lab, "avg_cot_words")
+    ]
+    cot_max = 1.0
+    for r in rows:
+        cot_max = max(cot_max, float(r["avg_cot_words"]))
+    if acc_series:
+        _line_chart_svg(
+            acc_series,
+            "GSM8K accuracy vs checkpoint step",
+            "Accuracy",
+            1.0,
+            out_root / "accuracy_vs_step.svg",
+        )
+    if cot_series:
+        _line_chart_svg(
+            cot_series,
+            "Average CoT length (words) vs checkpoint step",
+            "Avg CoT words",
+            cot_max,
+            out_root / "avg_cot_vs_step.svg",
+        )
+    _scatter_accuracy_vs_cot_svg(
+        rows,
+        out_root / "accuracy_vs_avg_cot_words.svg",
+        "GSM8K accuracy vs average CoT length (words)",
+    )
+    print(f"Saved: {summary_json}")
+    print(f"Saved: {summary_csv}")
+    if acc_series:
+        print(f"Saved: {out_root / 'accuracy_vs_step.svg'}")
+    if cot_series:
+        print(f"Saved: {out_root / 'avg_cot_vs_step.svg'}")
+    print(f"Saved: {out_root / 'accuracy_vs_avg_cot_words.svg'}")
+    print(f"Rollouts: {out_root / 'rollouts'}/<run_label>/")
+    print(f"Full outputs: {out_root / 'full_outputs'}/<run_label>/")
+if __name__ == "__main__":
+    main()

src/eval_sweep_models.py ADDED Viewed

	@@ -0,0 +1,386 @@

+from __future__ import annotations
+import csv
+import json
+import os
+import re
+from pathlib import Path
+import torch
+import torch.distributed as dist
+import yaml
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import hackable  # noqa: F401
+from hackable.data_plugins import GSM8KProvider
+from hackable.paths import resolve_storage_path, storage_layout
+from hackable.reward_plugins import gsm8k_correctness_reward
+from hackable.utils import resolve_repo_path
+THINK_RE = re.compile(r"<think>(.*?)</think>", re.DOTALL)
def _load_yaml(path: str) -> dict:
    """Read the UTF-8 YAML file at ``path`` and return ``yaml.safe_load``'s result."""
    with open(path, encoding="utf-8") as stream:
        parsed = yaml.safe_load(stream)
    return parsed
def _cot_word_len(completion: str) -> int:
    """Count whitespace-separated words inside the first ``THINK_RE`` match.

    Returns 0 when ``THINK_RE`` does not match or the captured body is blank.
    """
    found = THINK_RE.search(completion)
    if found is None:
        return 0
    # str.split() with no argument ignores leading/trailing whitespace and
    # yields [] for a blank body, so no explicit strip() is needed.
    return len(found.group(1).split())
+def _model_dtype(cfg: dict):
+    return torch.bfloat16 if bool(cfg.get("trainer", {}).get("bf16", True)) else torch.float16
def _get_cache_paths(base_cfg: dict) -> tuple[Path, Path]:
    """Return the ``(datasets, models)`` attributes of the storage layout.

    The layout is built by ``storage_layout`` from ``storage.cache_dir`` in the
    config, falling back to ``"cache"`` when the key is absent.
    """
    storage_cfg = base_cfg.get("storage", {})
    cache_dir = storage_cfg.get("cache_dir", "cache")
    layout = storage_layout(cache_dir)
    return layout.datasets, layout.models
+def _dist_info() -> tuple[int, int, int]:
+    rank = int(os.environ.get("RANK", "0"))
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    return rank, world_size, local_rank
def _init_distributed() -> tuple[int, int, int]:
    """Initialise ``torch.distributed`` for multi-process runs.

    A process group is created only when ``WORLD_SIZE`` is greater than 1 and no
    group exists yet. NCCL is used when CUDA is available, otherwise gloo.

    Returns:
        ``(rank, world_size, local_rank)`` as reported by ``_dist_info``.
    """
    rank, world_size, local_rank = _dist_info()
    if world_size > 1 and not dist.is_initialized():
        if torch.cuda.is_available():
            # Bind this process to its own GPU *before* the NCCL group exists.
            # Callers issue dist.barrier() before any other CUDA work; without a
            # current device NCCL has to guess which GPU to use for that
            # collective, which PyTorch warns can lead to a hang.
            torch.cuda.set_device(local_rank)
            backend = "nccl"
        else:
            backend = "gloo"
        dist.init_process_group(backend=backend, init_method="env://")
    return rank, world_size, local_rank
+def _resolve_local_model_dir(base_cfg: dict, model_dir: str) -> Path:
+    candidate = Path(model_dir)
+    if candidate.is_absolute() and candidate.exists():
+        return candidate.resolve()
+    if not candidate.is_absolute() and candidate.exists():
+        return candidate.resolve()
+    repo_local = resolve_repo_path(model_dir)
+    if repo_local.exists():
+        return repo_local
+    cache_root = resolve_repo_path(base_cfg.get("storage", {}).get("cache_dir", "cache"))
+    prefixed = (cache_root / candidate).resolve()
+    if prefixed.exists():
+        return prefixed
+    raise FileNotFoundError(
+        f"Model directory not found locally: '{model_dir}'. "
+        f"Tried '{candidate}', '{repo_local}', and '{prefixed}'."
+    )
def _resolve_sweep_root(base_cfg: dict, requested_sweep_root: Path) -> Path:
    """Resolve the sweep root and check that it holds at least one ``run_*`` directory.

    The path is resolved by ``resolve_storage_path`` against ``storage.cache_dir``
    (default ``"cache"``).

    Raises:
        FileNotFoundError: the resolved path is not a directory, or it contains
            no sub-directory whose name starts with ``run_``.
    """
    storage_cfg = base_cfg.get("storage", {})
    candidate = resolve_storage_path(
        requested_sweep_root,
        storage_cfg.get("cache_dir", "cache"),
    )

    has_run_dir = False
    if candidate.is_dir():
        for child in candidate.iterdir():
            if child.is_dir() and child.name.startswith("run_"):
                has_run_dir = True
                break

    if has_run_dir:
        return candidate
    raise FileNotFoundError(
        "Could not resolve SWEEP_ROOT with run directories: "
        f"{candidate}"
    )
+def _discover_model_dirs(sweep_root: Path) -> list[Path]:
+    dirs = [
+        path
+        for path in sweep_root.iterdir()
+        if path.is_dir() and path.name.startswith("run_")
+    ]
+    if not dirs:
+        raise FileNotFoundError(
+            f"No run directories starting with 'run_' found in {sweep_root}"
+        )
+    return sorted(dirs)
@torch.no_grad()
def evaluate_one_model(
    model_dir: Path,
    base_cfg: dict,
    eval_max_samples: int,
    batch_size: int,
) -> list[dict]:
    """Greedy-decode the GSM8K test split with one checkpoint and score it.

    Samples are sharded round-robin across ranks (``rank, rank + world_size, ...``).
    Each rank generates and scores its own shard; when a process group is active
    the per-rank records are all-gathered so every rank returns the full list.

    Args:
        model_dir: Local directory holding the checkpoint to evaluate.
        base_cfg: Parsed experiment YAML. Reads ``generation.max_prompt_length``
            (default 512), ``generation.max_completion_length`` (default 256),
            ``model.name`` (tokenizer fallback), ``model.trust_remote_code``,
            ``trainer.bf16`` and ``storage.cache_dir``.
        eval_max_samples: Number of test samples to load; a negative value is
            passed to the provider as ``None`` (no limit).
        batch_size: Number of prompts per ``generate`` call.

    Returns:
        Records sorted by ``sample_index``, each with the keys ``sample_index``,
        ``prompt``, ``reference``, ``completion``, ``correctness`` and
        ``cot_words``.
    """
    rank, world_size, local_rank = _dist_info()
    generation = base_cfg.get("generation", {})
    max_prompt_len = int(generation.get("max_prompt_length", 512))
    max_completion_len = int(generation.get("max_completion_length", 256))
    model_name_fallback = str(base_cfg["model"]["name"])
    trust_remote_code = bool(base_cfg.get("model", {}).get("trust_remote_code", False))
    dtype = _model_dtype(base_cfg)
    datasets_cache, models_cache = _get_cache_paths(base_cfg)

    provider = GSM8KProvider()
    all_samples = provider.load(
        split="test",
        max_samples=None if eval_max_samples < 0 else eval_max_samples,
        cache_dir=str(datasets_cache),
    )
    # Round-robin shard: this rank owns every world_size-th sample.
    indices = list(range(rank, len(all_samples), world_size))
    local_samples = [all_samples[idx] for idx in indices]
    prompts = [sample.prompt for sample in local_samples]
    refs = [sample.target for sample in local_samples]
    metadata = [sample.metadata for sample in local_samples]

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            str(model_dir),
            trust_remote_code=trust_remote_code,
            cache_dir=str(models_cache),
            local_files_only=True,
        )
    except Exception:
        # Deliberate best effort: if the checkpoint directory's tokenizer cannot
        # be loaded, fall back to the base model's tokenizer from the local cache.
        tokenizer = AutoTokenizer.from_pretrained(
            model_name_fallback,
            trust_remote_code=trust_remote_code,
            cache_dir=str(models_cache),
            local_files_only=True,
        )
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Batched generation with a decoder-only model needs LEFT padding: every
    # prompt then ends in the last input column, so the model continues from
    # real tokens (not pads) and the prompt/completion boundary is the same
    # column for every row of the batch.
    tokenizer.padding_side = "left"

    model = AutoModelForCausalLM.from_pretrained(
        str(model_dir),
        trust_remote_code=trust_remote_code,
        cache_dir=str(models_cache),
        torch_dtype=dtype,
        local_files_only=True,
    )
    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)
        device = torch.device(f"cuda:{local_rank}")
    else:
        device = torch.device("cpu")
    model.to(device)
    model.eval()

    completions: list[str] = []
    for start in range(0, len(prompts), batch_size):
        batch_prompts = prompts[start : start + batch_size]
        enc = tokenizer(
            batch_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_prompt_len,
        )
        input_ids = enc["input_ids"].to(device)
        attn = enc["attention_mask"].to(device)
        out = model.generate(
            input_ids=input_ids,
            attention_mask=attn,
            max_new_tokens=max_completion_len,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
        # generate() returns the whole padded prompt followed by the new tokens,
        # so the completion starts at the padded input width. Slicing at each
        # row's attention-mask sum (the old behaviour) is wrong whenever rows
        # in the batch have different prompt lengths.
        prompt_width = input_ids.size(1)
        for row in out:
            completions.append(
                tokenizer.decode(row[prompt_width:], skip_special_tokens=True)
            )

    scores = gsm8k_correctness_reward(
        prompts=prompts,
        completions=completions,
        references=refs,
        metadata=metadata,
    )
    local_records: list[dict] = []
    for i, (prompt, reference, completion, score) in enumerate(
        zip(prompts, refs, completions, scores, strict=True)
    ):
        local_records.append(
            {
                "sample_index": int(indices[i]),
                "prompt": prompt,
                "reference": reference,
                "completion": completion,
                "correctness": float(score),
                "cot_words": int(_cot_word_len(completion)),
            }
        )

    # Drop the model before the caller loads the next checkpoint.
    del model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    if dist.is_initialized():
        gathered: list[list[dict] | None] = [None for _ in range(world_size)]
        dist.all_gather_object(gathered, local_records)
        merged: list[dict] = []
        for part in gathered:
            if part:
                merged.extend(part)
    else:
        merged = local_records
    merged.sort(key=lambda row: row["sample_index"])
    return merged
+def _summarize(records: list[dict], model_dir: str) -> dict:
+    if not records:
+        return {
+            "name": Path(model_dir).name,
+            "model_dir": model_dir,
+            "num_examples": 0,
+            "accuracy": 0.0,
+            "avg_cot_words": 0.0,
+        }
+    accuracy = sum(float(row["correctness"]) for row in records) / len(records)
+    avg_cot = sum(float(row["cot_words"]) for row in records) / len(records)
+    return {
+        "name": Path(model_dir).name,
+        "model_dir": model_dir,
+        "num_examples": len(records),
+        "accuracy": float(accuracy),
+        "avg_cot_words": float(avg_cot),
+    }
+def _write_accuracy_svg(summaries: list[dict], path: Path) -> None:
+    width = 1000
+    height = 460
+    left_margin = 70
+    right_margin = 30
+    top_margin = 70
+    bottom_margin = 90
+    plot_w = width - left_margin - right_margin
+    plot_h = height - top_margin - bottom_margin
+    y_base = top_margin + plot_h
+    runs = [row["name"] for row in summaries]
+    acc_vals = [float(row["accuracy"]) for row in summaries]
+    vmax = max(1.0, max(acc_vals) if acc_vals else 1.0)
+    bar_count = max(1, len(runs))
+    slot_w = plot_w / bar_count
+    bar_w = min(120, max(30, int(slot_w * 0.55)))
+    palette = ["#2563eb", "#dc2626", "#16a34a", "#ca8a04", "#7c3aed", "#0891b2"]
+    parts: list[str] = []
+    parts.append(f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}">')
+    parts.append('<rect width="100%" height="100%" fill="#ffffff"/>')
+    parts.append(
+        '<text x="40" y="34" font-size="20" font-family="sans-serif">Sweep Evaluation: GSM8K Accuracy</text>'
+    )
+    parts.append(
+        f'<line x1="{left_margin}" y1="{y_base}" x2="{left_margin + plot_w}" y2="{y_base}" stroke="#111" stroke-width="2" />'
+    )
+    parts.append(
+        f'<line x1="{left_margin}" y1="{top_margin}" x2="{left_margin}" y2="{y_base}" stroke="#111" stroke-width="2" />'
+    )
+    # y-axis ticks
+    for tick in [0.0, 0.25, 0.5, 0.75, 1.0]:
+        y = y_base - int((tick / vmax) * plot_h) if vmax > 0 else y_base
+        parts.append(
+            f'<line x1="{left_margin - 6}" y1="{y}" x2="{left_margin}" y2="{y}" stroke="#111" stroke-width="1" />'
+        )
+        parts.append(
+            f'<text x="{left_margin - 10}" y="{y + 4}" text-anchor="end" font-size="11" font-family="sans-serif">{tick:.2f}</text>'
+        )
+    for idx, (run_name, acc) in enumerate(zip(runs, acc_vals, strict=True)):
+        center_x = left_margin + int((idx + 0.5) * slot_w)
+        bar_h = int((acc / vmax) * plot_h) if vmax > 0 else 0
+        x = center_x - bar_w // 2
+        y = y_base - bar_h
+        color = palette[idx % len(palette)]
+        parts.append(f'<rect x="{x}" y="{y}" width="{bar_w}" height="{bar_h}" fill="{color}" />')
+        parts.append(
+            f'<text x="{center_x}" y="{y - 8}" text-anchor="middle" font-size="12" font-family="sans-serif">{acc:.3f}</text>'
+        )
+        parts.append(
+            f'<text x="{center_x}" y="{y_base + 18}" text-anchor="middle" font-size="11" font-family="sans-serif">{run_name}</text>'
+        )
+    parts.append("</svg>")
+    path.write_text("\n".join(parts), encoding="utf-8")
def main() -> None:
    """Evaluate every ``run_*`` model under ``SWEEP_ROOT`` and write the results.

    Environment variables:
        BASE_CONFIG: experiment YAML path (required).
        SWEEP_ROOT: directory containing the ``run_*`` model directories (required).
        OUT_ROOT: output directory; defaults to ``<sweep_root>/eval_results``.
        EVAL_MAX_SAMPLES: number of test samples (default 200).
        EVAL_BATCH_SIZE: generation batch size (default 4).

    Every rank takes part in the evaluation; only rank 0 writes the per-model
    JSONL outputs, the JSON/CSV summaries and the accuracy SVG.
    """
    rank, _, _ = _init_distributed()
    env = os.environ

    base_cfg = _load_yaml(str(resolve_repo_path(env["BASE_CONFIG"])))
    sweep_root = _resolve_sweep_root(base_cfg, Path(env["SWEEP_ROOT"]))
    if "OUT_ROOT" in env:
        out_root = resolve_repo_path(env["OUT_ROOT"])
    else:
        out_root = (sweep_root / "eval_results").resolve()
    eval_max_samples = int(env.get("EVAL_MAX_SAMPLES", "200"))
    eval_batch_size = int(env.get("EVAL_BATCH_SIZE", "4"))

    resolved_model_dirs = [
        _resolve_local_model_dir(base_cfg, str(run_dir))
        for run_dir in _discover_model_dirs(sweep_root)
    ]

    outputs_dir = out_root / "outputs"
    is_main = rank == 0
    if is_main:
        out_root.mkdir(parents=True, exist_ok=True)
        outputs_dir.mkdir(parents=True, exist_ok=True)
    # Other ranks wait here until rank 0 has created the output directories.
    if dist.is_initialized():
        dist.barrier()

    summaries: list[dict] = []
    for model_dir in resolved_model_dirs:
        records = evaluate_one_model(
            model_dir=model_dir,
            base_cfg=base_cfg,
            eval_max_samples=eval_max_samples,
            batch_size=eval_batch_size,
        )
        if is_main:
            output_jsonl = outputs_dir / f"{model_dir.name}_outputs.jsonl"
            with output_jsonl.open("w", encoding="utf-8") as handle:
                handle.writelines(
                    json.dumps(row, ensure_ascii=True) + "\n" for row in records
                )
            summary = _summarize(records, str(model_dir))
            summary["outputs_jsonl"] = str(output_jsonl)
            summaries.append(summary)
        # Keep ranks in step between checkpoints.
        if dist.is_initialized():
            dist.barrier()

    if not is_main:
        return

    json_path = out_root / "sweep_eval_summary.json"
    csv_path = out_root / "sweep_eval_summary.csv"
    svg_path = out_root / "sweep_eval_accuracy.svg"

    json_path.write_text(json.dumps(summaries, indent=2), encoding="utf-8")

    columns = [
        "name",
        "model_dir",
        "num_examples",
        "accuracy",
        "avg_cot_words",
        "outputs_jsonl",
    ]
    with csv_path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(summaries)

    _write_accuracy_svg(summaries, svg_path)

    print(f"Saved summary: {json_path}")
    print(f"Saved summary: {csv_path}")
    print(f"Saved plot: {svg_path}")
    print(f"Saved outputs dir: {out_root / 'outputs'}")
+if __name__ == "__main__":
+    main()

src/hackable/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""Hackable GRPO training primitives."""
+# Ensure default plugins register on import.
+from . import data_plugins as _data_plugins  # noqa: F401
+from . import objectives as _objectives  # noqa: F401
+from . import reward_plugins as _reward_plugins  # noqa: F401

src/hackable/backends.py ADDED Viewed

	@@ -0,0 +1,62 @@

+from __future__ import annotations
+import importlib.util
+from typing import Any
+import torch
def load_model_and_tokenizer(
    model_name: str,
    trust_remote_code: bool = False,
    cache_dir: str | None = None,
    load_in_4bit: bool = False,
    torch_dtype: str = "bfloat16",
):
    """Load a causal LM and its tokenizer through ``transformers``.

    Args:
        model_name: Model id or path passed to ``from_pretrained``. When it
            contains "llama" (case-insensitive) the Liger kernel patch for
            Llama is applied first.
        trust_remote_code: Forwarded to both ``from_pretrained`` calls.
        cache_dir: Forwarded to both ``from_pretrained`` calls.
        load_in_4bit: Accepted for interface compatibility; ignored here.
        torch_dtype: Weight dtype. Accepts a ``torch.dtype`` or one of the names
            ``bfloat16``/``bf16``, ``float16``/``fp16``/``half``,
            ``float32``/``fp32``/``float`` (case-insensitive). Unrecognised
            values keep the historical fallback of ``torch.float16``.

    Returns:
        ``(model, tokenizer, "transformers")``.

    Raises:
        RuntimeError: a Llama model is requested but ``liger_kernel`` cannot be
            imported.
    """
    del load_in_4bit

    if isinstance(torch_dtype, torch.dtype):
        dtype = torch_dtype
    else:
        known_dtypes = {
            "bfloat16": torch.bfloat16,
            "bf16": torch.bfloat16,
            "float16": torch.float16,
            "fp16": torch.float16,
            "half": torch.float16,
            "float32": torch.float32,
            "fp32": torch.float32,
            "float": torch.float32,
        }
        # Previously anything other than the exact string "bfloat16" (including
        # "float32") silently became float16.
        dtype = known_dtypes.get(str(torch_dtype).strip().lower(), torch.float16)

    # Apply Liger kernels before constructing Llama models.
    if "llama" in model_name.lower():
        try:
            from liger_kernel.transformers import apply_liger_kernel_to_llama
        except Exception as exc:
            raise RuntimeError(
                "Failed to import Liger kernel patcher for Llama. "
                "Install liger-kernel in the runtime environment."
            ) from exc
        apply_liger_kernel_to_llama()

    # Prefer FlashAttention 2 when the package is importable; otherwise use SDPA.
    attn_impl = "sdpa"
    if importlib.util.find_spec("flash_attn") is not None:
        try:
            __import__("flash_attn")
            attn_impl = "flash_attention_2"
        except Exception:
            # Installed but not importable: stay on SDPA.
            attn_impl = "sdpa"

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=trust_remote_code,
        cache_dir=cache_dir,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=trust_remote_code,
        cache_dir=cache_dir,
        dtype=dtype,
        attn_implementation=attn_impl,
    )
    return model, tokenizer, "transformers"
def generation_kwargs(cfg: Any) -> dict[str, Any]:
    """Copy the generation settings off ``cfg`` into a plain dict, keyed by attribute name."""
    wanted = (
        "max_prompt_length",
        "max_completion_length",
        "num_generations",
        "temperature",
        "top_p",
    )
    return {attr: getattr(cfg, attr) for attr in wanted}

src/hackable/config.py ADDED Viewed

	@@ -0,0 +1,183 @@

+from __future__ import annotations
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+import yaml
+from .utils import resolve_repo_path
@dataclass
class ModelConfig:
    """Model settings; ``load_config`` builds this from the YAML ``model`` section."""

    # Required: the only field without a default.
    name: str

    trust_remote_code: bool = False
    load_in_4bit: bool = False

    # LoRA-related settings. How they are consumed is not visible in this module.
    use_lora_adapters: bool = False
    lora_r: int = 16
    lora_alpha: int = 16
    lora_dropout: float = 0.0
@dataclass
class TrainerConfig:
    """Trainer settings; ``load_config`` builds this from the YAML ``trainer`` section.

    ``load_config`` normalises aliases and numeric strings before constructing
    this class, so values here are already in their canonical form.
    """

    # Required: the only field without a default.
    output_dir: str
    run_name: str = "grpo-run"

    # Schedule and batch shape.
    max_steps: int = -1
    num_train_epochs: float = 1.0
    per_device_train_batch_size: int = 1
    gradient_accumulation_steps: int = 8
    learning_rate: float = 1.0e-6

    # Logging and rolling checkpoints.
    logging_steps: int = 1
    save_steps: int = 25
    save_total_limit: int = 5

    bf16: bool = True
    seed: int = 42
    report_to: str = "wandb"

    # Optimisation.
    optim: str = "adamw_torch"
    gradient_checkpointing: bool = True
    max_grad_norm: float = 1.0
    shuffle_dataset: bool = False
    lr_scheduler_type: str = "cosine"
    # default_factory gives every instance its own dict.
    lr_scheduler_kwargs: dict[str, Any] = field(default_factory=dict)
    warmup_steps: int = 20

    # Sanity logging.
    sanity_log_examples: int = 8
    sanity_log_max_chars: int = 300

    # Permanent checkpoints.
    permanent_checkpoint_steps: int = 300
    permanent_checkpoint_dir: str = "checkpoints/permanent"
@dataclass
class DataConfig:
    """Dataset selection; ``load_config`` builds this from the optional YAML ``data`` section."""

    # Provider name; presumably a key registered in the data-provider registry.
    provider: str = "gsm8k_math_curriculum"
    split: str = "train"
    # None is the default; the providers in this package treat it as "no limit".
    max_samples: int | None = None
@dataclass
class GenerationConfig:
    """Generation settings; ``load_config`` builds this from the optional YAML ``generation`` section."""

    # Length limits.
    max_prompt_length: int = 512
    max_completion_length: int = 256

    # Sampling settings.
    num_generations: int = 4
    temperature: float = 0.9
    top_p: float = 0.95
@dataclass
class ObjectiveConfig:
    """Objective selection; ``load_config`` builds this from the optional YAML ``objective`` section."""

    name: str = "token_grpo"
    # default_factory gives every instance its own dict.
    kwargs: dict[str, Any] = field(default_factory=dict)
    # Optional; presumably a dotted import path for a custom objective. Confirm with the consumer.
    class_path: str | None = None
@dataclass
class RewardsConfig:
    """Reward settings; ``load_config`` builds this from the optional YAML ``rewards`` section."""

    # Mapping of name -> keyword arguments; default_factory gives every
    # instance its own dict.
    kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
@dataclass
class AuthConfig:
    """Credential settings; ``load_config`` builds this from the optional YAML ``auth`` section."""

    # Keys given directly in the config (None by default).
    hf_api_key: str | None = None
    wandb_api_key: str | None = None

    # Environment-variable names. How they combine with the direct keys above
    # is decided by the consumer, not by this module.
    hf_api_key_env: str = "HF_TOKEN"
    wandb_api_key_env: str = "WANDB_API_KEY"
@dataclass
class StorageConfig:
    """Storage settings; ``load_config`` builds this from the optional YAML ``storage`` section."""

    # Cache directory; defaults to the literal "cache".
    cache_dir: str = "cache"
@dataclass
class ThinkingKLConfig:
    """Scale KL penalty on completion tokens that overlap the *inner* redacted thinking body.

    ``load_config`` builds this from the optional YAML ``thinking_kl`` section.
    """

    # Defaults to 1.0.
    inner_kl_weight: float = 1.0
@dataclass
class ExperimentConfig:
    """Top-level experiment configuration returned by ``load_config``.

    ``model`` and ``trainer`` are required; every other section falls back to
    its own defaults when omitted.
    """

    # Required sections.
    model: ModelConfig
    trainer: TrainerConfig

    # Optional sections; default_factory gives every instance fresh objects.
    data: DataConfig = field(default_factory=DataConfig)
    generation: GenerationConfig = field(default_factory=GenerationConfig)
    objective: ObjectiveConfig = field(default_factory=ObjectiveConfig)
    rewards: RewardsConfig = field(default_factory=RewardsConfig)
    auth: AuthConfig = field(default_factory=AuthConfig)
    storage: StorageConfig = field(default_factory=StorageConfig)
    thinking_kl: ThinkingKLConfig = field(default_factory=ThinkingKLConfig)

    # Free-form mapping copied from the YAML ``grpo`` section.
    grpo: dict[str, Any] = field(default_factory=dict)
def load_config(path: str | Path) -> ExperimentConfig:
    """Load an experiment YAML file into an ``ExperimentConfig``.

    ``path`` is resolved through ``resolve_repo_path``. The ``model`` and
    ``trainer`` sections are required. Every other section is optional, and a
    section that is present but empty (parsed by YAML as ``None``) is treated
    the same as a missing one.

    Trainer normalisation:
        * ``optimizer`` is accepted as an alias for ``optim`` (``optim`` wins
          when both are given) and is always removed;
        * shorthand optimiser and scheduler names are expanded;
        * selected numeric fields are coerced with ``float()`` / ``int()``
          because YAML may deliver them as strings.

    Raises:
        KeyError: the ``model`` or ``trainer`` section is missing.
        TypeError: a section contains a key its dataclass does not define.
        ValueError: a numeric trainer field cannot be converted.
    """
    resolved = resolve_repo_path(path)
    with resolved.open("r", encoding="utf-8") as handle:
        raw = yaml.safe_load(handle)

    def optional_section(name: str) -> dict[str, Any]:
        # "rewards:" with nothing under it parses as None, which would crash on
        # **None; treat it like an absent section.
        return dict(raw.get(name) or {})

    # Work on a copy so the parsed YAML mapping is never mutated.
    trainer_raw = dict(raw["trainer"])

    # Backward-compatible alias: allow "optimizer" in YAML.
    if "optimizer" in trainer_raw:
        legacy_optim = trainer_raw.pop("optimizer")
        trainer_raw.setdefault("optim", legacy_optim)

    # Accept common shorthand names.
    optim_aliases = {
        "adamw": "adamw_torch",
        "adamw_fused": "adamw_torch_fused",
    }
    scheduler_aliases = {
        "cosine_decay": "cosine",
    }
    if "optim" in trainer_raw:
        optim_name = trainer_raw["optim"]
        trainer_raw["optim"] = optim_aliases.get(optim_name, optim_name)
    if "lr_scheduler_type" in trainer_raw:
        scheduler_name = trainer_raw["lr_scheduler_type"]
        trainer_raw["lr_scheduler_type"] = scheduler_aliases.get(
            scheduler_name, scheduler_name
        )

    # Normalize numeric fields that may come from YAML as strings.
    float_fields = {
        "learning_rate",
        "max_grad_norm",
        "num_train_epochs",
    }
    int_fields = {
        "max_steps",
        "per_device_train_batch_size",
        "gradient_accumulation_steps",
        "logging_steps",
        "save_steps",
        "save_total_limit",
        "seed",
        "warmup_steps",
        "sanity_log_examples",
        "sanity_log_max_chars",
        "permanent_checkpoint_steps",
    }
    for key in float_fields:
        if key in trainer_raw:
            trainer_raw[key] = float(trainer_raw[key])
    for key in int_fields:
        if key in trainer_raw:
            trainer_raw[key] = int(trainer_raw[key])

    return ExperimentConfig(
        model=ModelConfig(**raw["model"]),
        trainer=TrainerConfig(**trainer_raw),
        data=DataConfig(**optional_section("data")),
        generation=GenerationConfig(**optional_section("generation")),
        objective=ObjectiveConfig(**optional_section("objective")),
        rewards=RewardsConfig(**optional_section("rewards")),
        auth=AuthConfig(**optional_section("auth")),
        storage=StorageConfig(**optional_section("storage")),
        thinking_kl=ThinkingKLConfig(**optional_section("thinking_kl")),
        grpo=optional_section("grpo"),
    )

src/hackable/data_plugins.py ADDED Viewed

	@@ -0,0 +1,291 @@

+from __future__ import annotations
+from contextlib import contextmanager
+from dataclasses import asdict
+from pathlib import Path
+from .interfaces import TrainingSample
+from .registry import register_data_provider
+@contextmanager
+def _serialized_hf_dataset_download():
+    """
+    Serialize Hugging Face ``datasets`` downloads/prepare across processes.
+    Multi-GPU ``accelerate launch`` otherwise races on the same ``cache_dir`` and can
+    leave a half-written tree (e.g. missing ``dataset_info.json``).
+    """
+    root = Path.home() / ".cache" / "neuralese"
+    root.mkdir(parents=True, exist_ok=True)
+    lock_path = root / "hf_dataset_download.lock"
+    try:
+        from filelock import FileLock
+        with FileLock(str(lock_path), timeout=7200):
+            yield
+    except ImportError:
+        yield
def _load_hf_split(
    path: str,
    split: str,
    cache_dir: str | None,
    config_name: str | None = None,
):
    """Load one split of a Hugging Face dataset under the cross-process download lock.

    The dataset is first loaded with the given ``cache_dir``. If that raises
    ``FileNotFoundError`` and a ``cache_dir`` was supplied, the load is retried
    once with ``cache_dir=None``.

    Raises:
        RuntimeError: the dataset files could not be found in either location.
    """
    from datasets import load_dataset

    # Positional arguments: the dataset path, plus the config name when given.
    positional = (path,) if config_name is None else (path, config_name)

    with _serialized_hf_dataset_download():
        try:
            return load_dataset(*positional, split=split, cache_dir=cache_dir)
        except FileNotFoundError as exc:
            if not cache_dir:
                raise RuntimeError(
                    "Hugging Face dataset files are missing from the default cache. "
                    "Run once with HF_DATASETS_OFFLINE=0 (or download the dataset), "
                    "or set HF_HOME / HF_DATASETS_CACHE to a populated cache."
                ) from exc
            # Second attempt without the experiment cache_dir.
            try:
                return load_dataset(*positional, split=split, cache_dir=None)
            except FileNotFoundError as exc2:
                raise RuntimeError(
                    "Could not load the dataset from the experiment cache_dir or the default HF cache. "
                    "Seed ~/.cache/huggingface/datasets (or your HF_HOME) with HF_DATASETS_OFFLINE=0, "
                    "or point storage.cache_dir at a shared cache that already contains the dataset."
                ) from exc2
+PROMPT_PREFIX = (
+    "Solve the following math problem.\n"
+    "Think step-by-step inside <think>...</think> tags.\n"
+    "Then output only the final answer in LaTeX boxed format.\n"
+    "Do not include any words or explanations outside the tags/boxed answer.\n"
+    "Output format must be exactly:\n"
+    "<think>your reasoning</think>\n"
+    "\\boxed{your_final_answer}\n\n"
+)
+def _build_math_prompt(question: str) -> str:
+    user_content = f"{PROMPT_PREFIX}Question: {question}"
+    # Chat-style prefill so decoding starts after "assistant:".
+    return f"user: {user_content}\nassistant:"
+def _interleave_samples(
+    left: list[TrainingSample], right: list[TrainingSample]
+) -> list[TrainingSample]:
+    output: list[TrainingSample] = []
+    width = max(len(left), len(right))
+    for idx in range(width):
+        if idx < len(left):
+            output.append(left[idx])
+        if idx < len(right):
+            output.append(right[idx])
+    return output
+def _slice_if_needed(
+    samples: list[TrainingSample], max_samples: int | None
+) -> list[TrainingSample]:
+    if max_samples is None:
+        return samples
+    return samples[: max(0, max_samples)]
+class _MathProviderBase:
+    dataset_name = "EleutherAI/hendrycks_math"
+    dataset_configs = (
+        "algebra",
+        "counting_and_probability",
+        "geometry",
+        "intermediate_algebra",
+        "number_theory",
+        "prealgebra",
+        "precalculus",
+    )
+    def __init__(self, levels: tuple[str, ...]):
+        self.levels = levels
+    def load(
+        self,
+        split: str,
+        max_samples: int | None = None,
+        cache_dir: str | None = None,
+    ) -> list[TrainingSample]:
+        try:
+            import datasets  # noqa: F401
+        except Exception as exc:
+            raise RuntimeError(
+                "datasets is required for Hendrycks MATH providers. Install dependencies first."
+            ) from exc
+        level_set = {level.strip() for level in self.levels}
+        output: list[TrainingSample] = []
+        for config_name in self.dataset_configs:
+            rows = _load_hf_split(
+                self.dataset_name,
+                split,
+                cache_dir,
+                config_name=config_name,
+            )
+            for row in rows:
+                level = str(row.get("level", "")).strip()
+                if level not in level_set:
+                    continue
+                question = str(row.get("problem", ""))
+                target = str(row.get("solution", ""))
+                output.append(
+                    TrainingSample(
+                        prompt=_build_math_prompt(question),
+                        target=target,
+                        metadata={
+                            "dataset": "hendrycks_math",
+                            "subject": config_name,
+                            "level": level,
+                        },
+                    )
+                )
+                if max_samples is not None and len(output) >= max_samples:
+                    return output
+        return output
@register_data_provider("gsm8k")
class GSM8KProvider:
    """Data provider registered as ``"gsm8k"``; yields one prompt per GSM8K question."""

    def __init__(self, dataset_name: str = "openai/gsm8k", subset: str = "main"):
        self.dataset_name, self.subset = dataset_name, subset

    def load(
        self,
        split: str,
        max_samples: int | None = None,
        cache_dir: str | None = None,
    ) -> list[TrainingSample]:
        """Load ``split`` and convert the first ``max_samples`` rows to samples.

        Each sample uses the row's ``question`` for the prompt and its ``answer``
        as the target; ``metadata`` records the dataset name, the row's position
        in the (possibly truncated) split and the split name.

        Raises:
            RuntimeError: the ``datasets`` package cannot be imported.
        """
        try:
            import datasets  # noqa: F401
        except Exception as exc:
            raise RuntimeError(
                "datasets is required for GSM8K provider. Install dependencies first."
            ) from exc

        rows = _load_hf_split(
            self.dataset_name,
            split,
            cache_dir,
            config_name=self.subset,
        )
        if max_samples is not None:
            keep = min(max_samples, len(rows))
            rows = rows.select(range(keep))

        return [
            TrainingSample(
                prompt=_build_math_prompt(str(row["question"])),
                target=row["answer"],
                metadata={
                    "dataset": "gsm8k",
                    "sample_index": int(position),
                    "split": str(split),
                },
            )
            for position, row in enumerate(rows)
        ]
@register_data_provider("math_level_1")
class MathLevel1Provider(_MathProviderBase):
    """Hendrycks MATH provider restricted to rows whose level is ``"Level 1"``."""

    def __init__(self):
        # Only the level filter is fixed here; loading lives in the base class.
        only_level = ("Level 1",)
        super().__init__(levels=only_level)
@register_data_provider("math_level_2")
class MathLevel2Provider(_MathProviderBase):
    """Hendrycks MATH provider restricted to rows whose level is ``"Level 2"``."""

    def __init__(self):
        # Only the level filter is fixed here; loading lives in the base class.
        only_level = ("Level 2",)
        super().__init__(levels=only_level)
@register_data_provider("math_level_3")
class MathLevel3Provider(_MathProviderBase):
    """Hendrycks MATH provider restricted to rows whose level is ``"Level 3"``."""

    def __init__(self):
        # Only the level filter is fixed here; loading lives in the base class.
        only_level = ("Level 3",)
        super().__init__(levels=only_level)
@register_data_provider("math_level_4")
class MathLevel4Provider(_MathProviderBase):
    """Hendrycks MATH provider restricted to rows whose level is ``"Level 4"``."""

    def __init__(self):
        # Only the level filter is fixed here; loading lives in the base class.
        only_level = ("Level 4",)
        super().__init__(levels=only_level)
+@register_data_provider("math_level_5")
+class MathLevel5Provider(_MathProviderBase):
+    def __init__(self):
+        super().__init__(levels=("Level 5",))
+@register_data_provider("math_levels_12")
+class MathLevels12Provider(_MathProviderBase):
+    def __init__(self):
+        super().__init__(levels=("Level 1", "Level 2"))
+@register_data_provider("math_levels_345")
+class MathLevels345Provider(_MathProviderBase):
+    def __init__(self):
+        super().__init__(levels=("Level 3", "Level 4", "Level 5"))
+@register_data_provider("gsm8k_math_stage12")
+class GSM8KMathStage12Provider:
+    def load(
+        self,
+        split: str,
+        max_samples: int | None = None,
+        cache_dir: str | None = None,
+    ) -> list[TrainingSample]:
+        gsm = GSM8KProvider().load(split=split, max_samples=None, cache_dir=cache_dir)
+        math12 = MathLevels12Provider().load(
+            split=split, max_samples=None, cache_dir=cache_dir
+        )
+        mixed = _interleave_samples(gsm, math12)
+        return _slice_if_needed(mixed, max_samples)
+@register_data_provider("gsm8k_math_curriculum")
+class GSM8KMathCurriculumProvider:
+    def load(
+        self,
+        split: str,
+        max_samples: int | None = None,
+        cache_dir: str | None = None,
+    ) -> list[TrainingSample]:
+        if max_samples is None:
+            stage12_budget = None
+            stage345_budget = None
+        else:
+            stage12_budget = (max_samples + 1) // 2
+            stage345_budget = max_samples // 2
+        stage12 = GSM8KMathStage12Provider().load(
+            split=split, max_samples=stage12_budget, cache_dir=cache_dir
+        )
+        stage345 = MathLevels345Provider().load(
+            split=split, max_samples=stage345_budget, cache_dir=cache_dir
+        )
+        # Curriculum order: first easier mixed set, then harder levels.
+        return stage12 + stage345
+def to_dataset_rows(samples: list[TrainingSample]) -> list[dict]:
+    return [asdict(sample) for sample in samples]

src/hackable/interfaces.py ADDED Viewed

	@@ -0,0 +1,48 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, Protocol
+@dataclass
+class TrainingSample:
+    prompt: str
+    target: str
+    metadata: dict[str, Any]
+class DataProvider(Protocol):
+    def load(
+        self,
+        split: str,
+        max_samples: int | None = None,
+        cache_dir: str | None = None,
+    ) -> list[TrainingSample]:
+        ...
+class RewardFunction(Protocol):
+    def __call__(
+        self,
+        prompts: list[str],
+        completions: list[str],
+        references: list[str],
+        metadata: list[dict[str, Any]],
+    ) -> list[float]:
+        ...
+class ObjectiveModule(Protocol):
+    name: str
+    def reward_names(self) -> list[str]:
+        ...
+    def extra_reward(
+        self,
+        prompts: list[str],
+        completions: list[str],
+        references: list[str],
+        metadata: list[dict[str, Any]],
+    ) -> list[float]:
+        ...