Upload 34 files
Browse files

- .gitattributes +2 -0
- assets/TRM_fig.png +3 -0
- assets/TRM_pseudocode.png +3 -0
- config/arch/hrm.yaml +24 -0
- config/arch/transformers_baseline.yaml +18 -0
- config/arch/trm.yaml +26 -0
- config/arch/trm_hier6.yaml +26 -0
- config/arch/trm_singlez.yaml +26 -0
- config/cfg_pretrain.yaml +42 -0
- dataset/build_arc_dataset.py +341 -0
- dataset/build_maze_dataset.py +140 -0
- dataset/build_sudoku_dataset.py +167 -0
- dataset/common.py +49 -0
- evaluators/arc.py +177 -0
- kaggle/combined/arc-agi_concept_challenges.json +0 -0
- kaggle/combined/arc-agi_concept_solutions.json +0 -0
- kaggle/combined/arc-agi_evaluation2_challenges.json +0 -0
- kaggle/combined/arc-agi_evaluation2_solutions.json +0 -0
- kaggle/combined/arc-agi_evaluation_challenges.json +0 -0
- kaggle/combined/arc-agi_evaluation_solutions.json +0 -0
- kaggle/combined/arc-agi_training2_challenges.json +0 -0
- kaggle/combined/arc-agi_training2_solutions.json +0 -0
- kaggle/combined/arc-agi_training_challenges.json +0 -0
- kaggle/combined/arc-agi_training_solutions.json +0 -0
- models/common.py +32 -0
- models/ema.py +40 -0
- models/layers.py +169 -0
- models/losses.py +103 -0
- models/recursive_reasoning/hrm.py +294 -0
- models/recursive_reasoning/transformers_baseline.py +342 -0
- models/recursive_reasoning/trm.py +297 -0
- models/recursive_reasoning/trm_hier6.py +323 -0
- models/recursive_reasoning/trm_singlez.py +294 -0
- models/sparse_embedding.py +132 -0
- utils/functions.py +19 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assets/TRM_fig.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
assets/TRM_pseudocode.png filter=lfs diff=lfs merge=lfs -text
|
assets/TRM_fig.png
ADDED
|
Git LFS Details
|
assets/TRM_pseudocode.png
ADDED
|
Git LFS Details
|
config/arch/hrm.yaml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# HRM architecture config (HierarchicalReasoningModel_ACTV1)
name: recursive_reasoning.hrm@HierarchicalReasoningModel_ACTV1
loss:
  name: losses@ACTLossHead
  loss_type: stablemax_cross_entropy

halt_exploration_prob: 0.1
halt_max_steps: 16

H_cycles: 2
L_cycles: 2

H_layers: 4
L_layers: 4

hidden_size: 512
num_heads: 8  # min(2, hidden_size // 64)
expansion: 4

puzzle_emb_ndim: ${.hidden_size}

pos_encodings: rope
forward_dtype: bfloat16

mlp_t: False  # use mlp on L instead of transformer
|
config/arch/transformers_baseline.yaml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Plain transformer baseline config (Model_ACTV2)
name: recursive_reasoning.transformers_baseline@Model_ACTV2
loss:
  name: losses@ACTLossHead
  loss_type: stablemax_cross_entropy

halt_exploration_prob: 0.1
halt_max_steps: 16

H_cycles: 1  # kept for compatibility
H_layers: 8

hidden_size: 512
num_heads: 12
expansion: 4

puzzle_emb_ndim: ${.hidden_size}

pos_encodings: rope
|
config/arch/trm.yaml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# TRM architecture config (TinyRecursiveReasoningModel_ACTV1)
name: recursive_reasoning.trm@TinyRecursiveReasoningModel_ACTV1
loss:
  name: losses@ACTLossHead
  loss_type: stablemax_cross_entropy

halt_exploration_prob: 0.1
halt_max_steps: 16

H_cycles: 3
L_cycles: 6

H_layers: 0
L_layers: 2

hidden_size: 512
num_heads: 8  # min(2, hidden_size // 64)
expansion: 4

puzzle_emb_ndim: ${.hidden_size}

pos_encodings: rope
forward_dtype: bfloat16

mlp_t: False  # use mlp on L instead of transformer
puzzle_emb_len: 16  # if non-zero, its specified to this value
no_ACT_continue: True  # No continue ACT loss, only use the sigmoid of the halt which makes much more sense
|
config/arch/trm_hier6.yaml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# TRM (hierarchical, 6-cycle variant) architecture config
name: recursive_reasoning.trm_hier6@TinyRecursiveReasoningModel_ACTV1
loss:
  name: losses@ACTLossHead
  loss_type: stablemax_cross_entropy

halt_exploration_prob: 0.1
halt_max_steps: 16

H_cycles: 3
L_cycles: 6

H_layers: 0
L_layers: 2

hidden_size: 512
num_heads: 8  # min(2, hidden_size // 64)
expansion: 4

puzzle_emb_ndim: ${.hidden_size}

pos_encodings: rope
forward_dtype: bfloat16

mlp_t: False  # use mlp on L instead of transformer
puzzle_emb_len: 16  # if non-zero, its specified to this value
no_ACT_continue: True  # No continue ACT loss, only use the sigmoid of the halt which makes much more sense
|
config/arch/trm_singlez.yaml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# TRM (single latent z variant) architecture config
name: recursive_reasoning.trm_singlez@TinyRecursiveReasoningModel_ACTV1
loss:
  name: losses@ACTLossHead
  loss_type: stablemax_cross_entropy

halt_exploration_prob: 0.1
halt_max_steps: 16

H_cycles: 3
L_cycles: 6

H_layers: 0
L_layers: 2

hidden_size: 512
num_heads: 8  # min(2, hidden_size // 64)
expansion: 4

puzzle_emb_ndim: ${.hidden_size}

pos_encodings: rope
forward_dtype: bfloat16

mlp_t: False  # use mlp on L instead of transformer
puzzle_emb_len: 16  # if non-zero, its specified to this value
no_ACT_continue: True  # No continue ACT loss, only use the sigmoid of the halt which makes much more sense
|
config/cfg_pretrain.yaml
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ARC training config

defaults:
  - arch: trm
  - _self_

hydra:
  output_subdir: null

# Data path
data_paths: ['data/arc-aug-1000']
data_paths_test: []

evaluators:
  - name: arc@ARC

# Hyperparams - Training
global_batch_size: 768

epochs: 100000
eval_interval: 10000
checkpoint_every_eval: True

lr: 1e-4
lr_min_ratio: 1.0
lr_warmup_steps: 2000

# Standard hyperparameter settings for LM, as used in Llama
beta1: 0.9
beta2: 0.95
weight_decay: 0.1
puzzle_emb_weight_decay: 0.1

# Hyperparams - Puzzle embeddings training
puzzle_emb_lr: 1e-2

seed: 0
min_eval_interval: 0  # when to start the eval

ema: False  # use Exponential-Moving-Average
ema_rate: 0.999  # EMA-rate
freeze_weights: False  # If True, freeze weights and only learn the embeddings
|
dataset/build_arc_dataset.py
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Tuple, Dict
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
import os
|
| 4 |
+
import json
|
| 5 |
+
import hashlib
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
from argdantic import ArgParser
|
| 9 |
+
from pydantic import BaseModel
|
| 10 |
+
|
| 11 |
+
from dataset.common import PuzzleDatasetMetadata, dihedral_transform, inverse_dihedral_transform
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
cli = ArgParser()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class DataProcessConfig(BaseModel):
    """CLI configuration for building the ARC puzzle dataset."""

    input_file_prefix: str   # path prefix of the ARC challenge/solution JSON files
    output_dir: str          # where the converted numpy dataset is written
    subsets: List[str]       # subset names appended to input_file_prefix
    test_set_name: str       # subset routed to the "test" split
    test_set_name2: str = "your_test_set"  # optional second test subset
    seed: int = 42           # RNG seed for shuffling and augmentation
    num_aug: int = 1000      # augmented variants per puzzle
    puzzle_identifiers_start: int = 1  # start > 1 to handle multiple datasets
|
| 26 |
+
|
| 27 |
+
# ARC grids are at most 30x30 cells.
ARCMaxGridSize = 30
# Sample at most (this factor x num_aug) candidate augmentations before giving up.
ARCAugmentRetriesFactor = 5

# Separator used to encode augmentation parameters inside a puzzle id string.
PuzzleIdSeparator = "|||"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@dataclass
class ARCPuzzle:
    """A named puzzle: a list of (input grid, output grid) example pairs."""

    id: str
    examples: List[Tuple[np.ndarray, np.ndarray]]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def arc_grid_to_np(grid: List[List[int]]):
    """Convert a nested-list ARC grid into a validated uint8 array.

    Asserts the grid is 2-D, fits in the 30x30 ARC bound, and contains
    only colors 0..9.
    """
    arr = np.asarray(grid)

    # Shape checks.
    assert arr.ndim == 2
    assert arr.shape[0] <= ARCMaxGridSize and arr.shape[1] <= ARCMaxGridSize
    # Cell values must be valid ARC colors.
    assert np.all((arr >= 0) & (arr <= 9))
    return arr.astype(np.uint8)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def np_grid_to_seq_translational_augment(inp: np.ndarray, out: np.ndarray, do_translation: bool):
    """Flatten an (input, output) grid pair into fixed-length token sequences.

    Token ids: 0 = PAD, 1 = <eos>, 2..11 = colors "0".."9".  When
    `do_translation` is set, one random top-left offset (shared by both
    grids) is sampled so the content still fits inside the 30x30 canvas.
    Returns [input_seq, output_seq], each of length 900.
    """
    if do_translation:
        # Offset bounded by the larger of the two grids so both stay in bounds.
        pad_r = np.random.randint(0, ARCMaxGridSize - max(inp.shape[0], out.shape[0]) + 1)
        pad_c = np.random.randint(0, ARCMaxGridSize - max(inp.shape[1], out.shape[1]) + 1)
    else:
        pad_r = pad_c = 0

    seqs = []
    for grid in (inp, out):
        nrow, ncol = grid.shape
        # Shift colors by 2 into token space, then pad to 30x30 with PAD=0.
        padded = np.pad(
            grid + 2,
            ((pad_r, ARCMaxGridSize - pad_r - nrow), (pad_c, ARCMaxGridSize - pad_c - ncol)),
            constant_values=0,
        )

        # Mark the bottom and right content borders with <eos> tokens.
        eos_row, eos_col = pad_r + nrow, pad_c + ncol
        if eos_row < ARCMaxGridSize:
            padded[eos_row, pad_c:eos_col] = 1
        if eos_col < ARCMaxGridSize:
            padded[pad_r:eos_row, eos_col] = 1

        seqs.append(padded.flatten())

    return seqs
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def grid_hash(grid: np.ndarray):
    """SHA-256 hex digest of a uint8 grid, keyed on both shape and contents.

    The shape is prepended (one byte per dimension) so that e.g. a 2x3 and
    a 3x2 grid of identical bytes hash differently.
    """
    assert grid.ndim == 2
    assert grid.dtype == np.uint8

    payload = b"".join(dim.to_bytes(1, byteorder="big") for dim in grid.shape)
    payload += grid.tobytes()
    return hashlib.sha256(payload).hexdigest()
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def puzzle_hash(puzzle: dict):
    """Order-independent hash of every (input, label) pair across all splits.

    Used to detect equivalent puzzles (e.g. duplicate augmentations): the
    per-pair hashes are sorted before the final digest.
    """
    pair_hashes = []
    for example in puzzle.values():
        pair_hashes.extend(f"{grid_hash(inp)}|{grid_hash(lab)}" for inp, lab in example.examples)

    pair_hashes.sort()
    return hashlib.sha256("|".join(pair_hashes).encode()).hexdigest()
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def aug(name: str):
    """Sample a random puzzle augmentation.

    The augmentation is a random dihedral transform plus a random color
    permutation (black/0 stays fixed).  Returns the augmented puzzle name —
    which encodes the parameters so `inverse_aug` can undo it — and a
    function that applies the transform to a grid.
    """
    trans_id = np.random.randint(0, 8)
    # Permute colors 1..9, keeping "0" (black) in place.
    mapping = np.concatenate([
        np.arange(0, 1, dtype=np.uint8),
        np.random.permutation(np.arange(1, 10, dtype=np.uint8)),
    ])

    aug_name = f"{name}{PuzzleIdSeparator}t{trans_id}{PuzzleIdSeparator}{''.join(str(x) for x in mapping)}"

    def _map_grid(grid: np.ndarray):
        return dihedral_transform(mapping[grid], trans_id)

    return aug_name, _map_grid
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def inverse_aug(name: str):
    """Invert `aug`: recover the base puzzle name and an inverse grid map.

    Names without the separator are treated as un-augmented and returned
    with an identity transform.
    """
    if PuzzleIdSeparator not in name:
        return name, lambda x: x

    trans_repr, perm_repr = name.split(PuzzleIdSeparator)[-2:]
    trans_id = int(trans_repr[1:])  # strip the leading "t"
    # argsort of the encoded permutation yields its inverse
    # (single digit chars sort numerically).
    inv_perm = np.argsort(list(perm_repr)).astype(np.uint8)

    def _map_grid(grid: np.ndarray):
        return inv_perm[inverse_dihedral_transform(grid, trans_id)]

    return name.split(PuzzleIdSeparator)[0], _map_grid
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def convert_single_arc_puzzle(results: dict, name: str, puzzle: dict, aug_count: int, dest_mapping: Dict[str, Tuple[str, str]]):
    """Convert one raw ARC puzzle into `results`, with optional augmentation.

    `dest_mapping` routes each raw example type ("train"/"test") to a
    (split, set) destination; all variants of one puzzle are appended to
    each destination as a single group.
    """
    dests = set(dest_mapping.values())

    # Base (un-augmented) conversion, bucketed by destination.
    converted = {dest: ARCPuzzle(name, []) for dest in dests}
    for example_type, examples in puzzle.items():
        dest = dest_mapping[example_type]
        converted[dest].examples.extend(
            (arc_grid_to_np(example["input"]), arc_grid_to_np(example["output"]))
            for example in examples
        )

    group = [converted]

    if aug_count > 0:
        # Rejection-sample augmentations, de-duplicated by content hash.
        seen = {puzzle_hash(converted)}

        for _trial in range(ARCAugmentRetriesFactor * aug_count):
            aug_name, _map_grid = aug(name)

            augmented = {
                dest: ARCPuzzle(aug_name, [(_map_grid(inp), _map_grid(lab)) for inp, lab in base.examples])
                for dest, base in converted.items()
            }
            h = puzzle_hash(augmented)
            if h not in seen:
                seen.add(h)
                group.append(augmented)

            if len(group) >= aug_count + 1:
                break

        if len(group) < aug_count + 1:
            print(f"[Puzzle {name}] augmentation not full, only {len(group)}")

    # Append the whole group of variants to every destination split/set.
    for dest_split, dest_set in dests:
        results.setdefault(dest_split, {}).setdefault(dest_set, []).append(
            [variant[(dest_split, dest_set)] for variant in group]
        )
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def load_puzzles_arcagi(config: DataProcessConfig):
    """Load every configured ARC subset and route puzzles to train/test.

    Returns (results, test_puzzles) where `results` is the nested
    split -> set -> groups structure filled by `convert_single_arc_puzzle`
    and `test_puzzles` holds the raw puzzles routed to the test split.
    """
    train_examples_dest = ("train", "all")
    # Per-subset routing of the raw "test" examples: (fraction threshold, destination).
    test_examples_map = {
        config.test_set_name: [(1.0, ("test", "all"))],
        config.test_set_name2: [(1.0, ("test", "all"))],
        "_default": [(1.0, ("train", "all"))],
    }

    test_puzzles = {}
    results = {}

    total_puzzles = 0
    for subset_name in config.subsets:
        # Load all puzzles in this subset.
        with open(f"{config.input_file_prefix}_{subset_name}_challenges.json", "r") as f:
            puzzles = json.load(f)

        sols_filename = f"{config.input_file_prefix}_{subset_name}_solutions.json"
        if os.path.isfile(sols_filename):
            # Attach the ground-truth outputs to the test examples.
            with open(sols_filename, "r") as f:
                sols = json.load(f)

            for puzzle_id in puzzles.keys():
                for idx, sol_grid in enumerate(sols[puzzle_id]):
                    puzzles[puzzle_id]["test"][idx]["output"] = sol_grid
        else:
            # No solutions available: fill with a dummy 1x1 grid.
            print(f"{subset_name} solutions not found, filling with dummy")

            for puzzle_id, puzzle in puzzles.items():
                for example in puzzle["test"]:
                    example.setdefault("output", [[0]])

        # Shuffle, then assign each puzzle to a destination by fractional rank.
        puzzles = list(puzzles.items())
        np.random.shuffle(puzzles)

        for idx, (name, puzzle) in enumerate(puzzles):
            fraction = idx / len(puzzles)
            test_examples_dest = None
            for thresh, dest in test_examples_map.get(subset_name, test_examples_map["_default"]):
                if fraction < thresh:
                    test_examples_dest = dest
                    break

            assert test_examples_dest is not None

            if test_examples_dest[0] == "test":
                test_puzzles[name] = puzzle

            convert_single_arc_puzzle(results, name, puzzle, config.num_aug,
                                      {"train": train_examples_dest, "test": test_examples_dest})
            total_puzzles += 1

    print(f"Total puzzles: {total_puzzles}")
    return results, test_puzzles
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def convert_dataset(config: DataProcessConfig):
    """Build the ARC dataset on disk.

    Loads the raw puzzles, assigns a global identifier to every puzzle
    variant, flattens each example to token sequences, and writes per-split
    numpy arrays plus JSON metadata under `config.output_dir`.
    """
    np.random.seed(config.seed)

    data, test_puzzles = load_puzzles_arcagi(config)

    # Assign global puzzle identifiers (0 is reserved for <blank>).
    num_identifiers = config.puzzle_identifiers_start
    identifier_map = {}
    for split in data.values():
        for subset in split.values():
            for group in subset:
                for puzzle in group:
                    if puzzle.id not in identifier_map:
                        identifier_map[puzzle.id] = num_identifiers
                        num_identifiers += 1
    print(f"Total puzzle IDs (including <blank>): {num_identifiers}")

    for split_name, split in data.items():
        os.makedirs(os.path.join(config.output_dir, split_name), exist_ok=True)

        # Random translations are applied to training examples only.
        enable_translational_augment = split_name == "train"

        # Statistics for the metadata file.
        total_examples = 0
        total_puzzles = 0
        total_groups = 0

        for subset_name, subset in split.items():  # "all" is the only subset
            results = {k: [] for k in ["inputs", "labels", "puzzle_identifiers", "puzzle_indices", "group_indices"]}
            results["puzzle_indices"].append(0)
            results["group_indices"].append(0)

            example_id = 0
            puzzle_id = 0

            for group in subset:
                for puzzle in group:
                    # Keep one randomly chosen example per puzzle untranslated.
                    no_aug_id = np.random.randint(0, len(puzzle.examples))
                    for _idx_ex, (inp, out) in enumerate(puzzle.examples):
                        inp, out = np_grid_to_seq_translational_augment(
                            inp, out,
                            do_translation=enable_translational_augment and _idx_ex != no_aug_id)

                        results["inputs"].append(inp)
                        results["labels"].append(out)
                        example_id += 1
                        total_examples += 1

                    results["puzzle_indices"].append(example_id)
                    results["puzzle_identifiers"].append(identifier_map[puzzle.id])
                    puzzle_id += 1
                    total_puzzles += 1

                # Close the group.
                results["group_indices"].append(puzzle_id)
                total_groups += 1

            for k, v in results.items():
                if k in {"inputs", "labels"}:
                    v = np.stack(v, 0)
                else:
                    v = np.array(v, dtype=np.int32)

                np.save(os.path.join(config.output_dir, split_name, f"{subset_name}__{k}.npy"), v)

        # Per-split metadata.
        metadata = PuzzleDatasetMetadata(
            seq_len=ARCMaxGridSize * ARCMaxGridSize,
            vocab_size=10 + 2,  # PAD + EOS + "0" ... "9"
            pad_id=0,
            ignore_label_id=0,
            blank_identifier_id=0,
            num_puzzle_identifiers=num_identifiers,
            total_groups=total_groups,
            mean_puzzle_examples=total_examples / total_puzzles,
            total_puzzles=total_puzzles,
            sets=list(split.keys())
        )

        with open(os.path.join(config.output_dir, split_name, "dataset.json"), "w") as f:
            json.dump(metadata.model_dump(), f)

    # Save the global id -> puzzle-name mapping.
    with open(os.path.join(config.output_dir, "identifiers.json"), "w") as f:
        ids_mapping = {v: k for k, v in identifier_map.items()}
        json.dump([ids_mapping.get(i, "<blank>") for i in range(num_identifiers)], f)

    # Save the raw test puzzles (used by the ARC evaluator).
    with open(os.path.join(config.output_dir, "test_puzzles.json"), "w") as f:
        json.dump(test_puzzles, f)
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
@cli.command(singleton=True)
def main(config: DataProcessConfig):
    """CLI entry point: build the ARC puzzle dataset."""
    convert_dataset(config)


if __name__ == "__main__":
    cli()
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
|
dataset/build_maze_dataset.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional
|
| 2 |
+
import math
|
| 3 |
+
import os
|
| 4 |
+
import csv
|
| 5 |
+
import json
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
from argdantic import ArgParser
|
| 9 |
+
from pydantic import BaseModel
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
from huggingface_hub import hf_hub_download
|
| 12 |
+
|
| 13 |
+
from common import PuzzleDatasetMetadata, dihedral_transform
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# Maze cell alphabet: wall, open, start, goal, path.
CHARSET = "# SGo"


cli = ArgParser()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class DataProcessConfig(BaseModel):
    """CLI configuration for building the maze dataset."""

    source_repo: str = "sapientinc/maze-30x30-hard-1k"  # HF dataset repo with train/test CSVs
    output_dir: str = "data/maze-30x30-hard-1k"

    subsample_size: Optional[int] = None  # optionally subsample the training set
    aug: bool = False                     # enable dihedral augmentation for train
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def convert_subset(set_name: str, config: DataProcessConfig):
    """Convert one maze CSV split ("train"/"test") into the numpy dataset layout."""
    # Read CSV rows into square character grids.
    all_chars = set()
    grid_size = None
    inputs = []
    labels = []

    with open(hf_hub_download(config.source_repo, f"{set_name}.csv", repo_type="dataset"), newline="") as csvfile:  # type: ignore
        reader = csv.reader(csvfile)
        next(reader)  # Skip header
        for source, q, a, rating in reader:
            all_chars.update(q)
            all_chars.update(a)

            # Grid size inferred from the first row (assumed square).
            if grid_size is None:
                n = int(len(q) ** 0.5)
                grid_size = (n, n)

            inputs.append(np.frombuffer(q.encode(), dtype=np.uint8).reshape(grid_size))
            labels.append(np.frombuffer(a.encode(), dtype=np.uint8).reshape(grid_size))

    # Optionally subsample the training set.
    if set_name == "train" and config.subsample_size is not None:
        total_samples = len(inputs)
        if config.subsample_size < total_samples:
            indices = np.random.choice(total_samples, size=config.subsample_size, replace=False)
            inputs = [inputs[i] for i in indices]
            labels = [labels[i] for i in indices]

    # Build the dataset index structure: one example per puzzle, one puzzle per group.
    results = {k: [] for k in ["inputs", "labels", "puzzle_identifiers", "puzzle_indices", "group_indices"]}
    puzzle_id = 0
    example_id = 0

    results["puzzle_indices"].append(0)
    results["group_indices"].append(0)

    for inp, out in zip(tqdm(inputs), labels):
        # Dihedral transformations for augmentation (train only, 8 variants).
        for aug_idx in range(8 if (set_name == "train" and config.aug) else 1):
            results["inputs"].append(dihedral_transform(inp, aug_idx))
            results["labels"].append(dihedral_transform(out, aug_idx))
            example_id += 1
            puzzle_id += 1

            results["puzzle_indices"].append(example_id)
            results["puzzle_identifiers"].append(0)

        # Push group
        results["group_indices"].append(puzzle_id)

    # Character -> id mapping (0 is PAD).
    assert len(all_chars - set(CHARSET)) == 0

    char2id = np.zeros(256, np.uint8)
    char2id[np.array(list(map(ord, CHARSET)))] = np.arange(len(CHARSET)) + 1

    def _seq_to_numpy(seq):
        # Flatten each grid and remap its raw chars to ids.
        return np.vstack([char2id[s.reshape(-1)] for s in seq])

    results = {
        "inputs": _seq_to_numpy(results["inputs"]),
        "labels": _seq_to_numpy(results["labels"]),

        "group_indices": np.array(results["group_indices"], dtype=np.int32),
        "puzzle_indices": np.array(results["puzzle_indices"], dtype=np.int32),
        "puzzle_identifiers": np.array(results["puzzle_identifiers"], dtype=np.int32),
    }

    # Metadata
    metadata = PuzzleDatasetMetadata(
        seq_len=int(math.prod(grid_size)),  # type: ignore
        vocab_size=len(CHARSET) + 1,  # PAD + Charset
        pad_id=0,
        ignore_label_id=0,
        blank_identifier_id=0,
        num_puzzle_identifiers=1,
        total_groups=len(results["group_indices"]) - 1,
        mean_puzzle_examples=1,
        total_puzzles=len(results["group_indices"]) - 1,
        sets=["all"]
    )

    # Save metadata as JSON.
    save_dir = os.path.join(config.output_dir, set_name)
    os.makedirs(save_dir, exist_ok=True)

    with open(os.path.join(save_dir, "dataset.json"), "w") as f:
        json.dump(metadata.model_dump(), f)

    # Save data arrays.
    for k, v in results.items():
        np.save(os.path.join(save_dir, f"all__{k}.npy"), v)

    # Save IDs mapping (for visualization only)
    with open(os.path.join(config.output_dir, "identifiers.json"), "w") as f:
        json.dump(["<blank>"], f)
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
@cli.command(singleton=True)
def preprocess_data(config: DataProcessConfig):
    """CLI entry point: build both the train and test maze splits."""
    convert_subset("train", config)
    convert_subset("test", config)


if __name__ == "__main__":
    cli()
|
dataset/build_sudoku_dataset.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional
|
| 2 |
+
import os
|
| 3 |
+
import csv
|
| 4 |
+
import json
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
from argdantic import ArgParser
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
from huggingface_hub import hf_hub_download
|
| 11 |
+
|
| 12 |
+
from common import PuzzleDatasetMetadata
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
cli = ArgParser()
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class DataProcessConfig(BaseModel):
    """CLI configuration for building the Sudoku dataset."""

    source_repo: str = "sapientinc/sudoku-extreme"  # HF dataset repo with train/test CSVs
    output_dir: str = "data/sudoku-extreme-full"

    subsample_size: Optional[int] = None   # optionally subsample the training set
    min_difficulty: Optional[int] = None   # drop puzzles rated below this
    num_aug: int = 0                       # augmented variants per puzzle
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def shuffle_sudoku(board: np.ndarray, solution: np.ndarray):
    """Apply a random validity-preserving Sudoku transformation to a (board, solution) pair.

    Composes three symmetries of the Sudoku group: a relabeling of digits 1-9
    (blank/0 stays fixed), an optional transpose, and a permutation of rows and
    columns that respects the 3x3 band/stack structure. The identical
    transformation is applied to both grids, so the returned solution still
    solves the returned board.
    """
    # Digit relabeling: random permutation of 1..9, with index 0 (blank) unchanged.
    digit_map = np.pad(np.random.permutation(np.arange(1, 10)), (1, 0))

    # Coin flip for transposition.
    transpose_flag = np.random.rand() < 0.5

    # Row permutation: shuffle the 3 bands, then the 3 rows inside each band.
    bands = np.random.permutation(3)
    row_perm = np.concatenate([b * 3 + np.random.permutation(3) for b in bands])

    # Same construction for columns (stacks).
    stacks = np.random.permutation(3)
    col_perm = np.concatenate([s * 3 + np.random.permutation(3) for s in stacks])

    # Flat 81 -> 81 position map: output cell i (row i//9, col i%9) reads
    # old row row_perm[i//9] and old column col_perm[i%9].
    mapping = np.array([row_perm[i // 9] * 9 + col_perm[i % 9] for i in range(81)])

    def transform(grid: np.ndarray) -> np.ndarray:
        # Optional transpose, then position remap, then digit relabeling.
        source = grid.T if transpose_flag else grid
        permuted = source.flatten()[mapping].reshape(9, 9).copy()
        return digit_map[permuted]

    return transform(board), transform(solution)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def convert_subset(set_name: str, config: DataProcessConfig):
    """Download one CSV split ("train"/"test"), optionally subsample/augment it,
    and save it in the puzzle-dataset format.

    Writes all__{inputs,labels,puzzle_identifiers,puzzle_indices,group_indices}.npy
    and dataset.json under {output_dir}/{set_name}, plus identifiers.json at the
    dataset root.
    """
    # Read CSV of (source, question, answer, difficulty rating) rows.
    inputs = []
    labels = []

    with open(hf_hub_download(config.source_repo, f"{set_name}.csv", repo_type="dataset"), newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # Skip header
        for source, q, a, rating in reader:
            if (config.min_difficulty is None) or (int(rating) >= config.min_difficulty):
                assert len(q) == 81 and len(a) == 81

                # '.' marks a blank cell; map ASCII digits to ints 0-9 via byte arithmetic.
                inputs.append(np.frombuffer(q.replace('.', '0').encode(), dtype=np.uint8).reshape(9, 9) - ord('0'))
                labels.append(np.frombuffer(a.encode(), dtype=np.uint8).reshape(9, 9) - ord('0'))

    # If subsample_size is specified for the training set,
    # randomly sample the desired number of examples.
    if set_name == "train" and config.subsample_size is not None:
        total_samples = len(inputs)
        if config.subsample_size < total_samples:
            indices = np.random.choice(total_samples, size=config.subsample_size, replace=False)
            inputs = [inputs[i] for i in indices]
            labels = [labels[i] for i in indices]

    # Generate dataset; augmentation applies to the training split only.
    num_augments = config.num_aug if set_name == "train" else 0

    results = {k: [] for k in ["inputs", "labels", "puzzle_identifiers", "puzzle_indices", "group_indices"]}
    puzzle_id = 0
    example_id = 0

    # Index arrays are prefix sums and both start at offset 0.
    results["puzzle_indices"].append(0)
    results["group_indices"].append(0)

    for orig_inp, orig_out in zip(tqdm(inputs), labels):
        for aug_idx in range(1 + num_augments):
            # First index is not augmented
            if aug_idx == 0:
                inp, out = orig_inp, orig_out
            else:
                inp, out = shuffle_sudoku(orig_inp, orig_out)

            # Push puzzle (only single example)
            results["inputs"].append(inp)
            results["labels"].append(out)
            example_id += 1
            puzzle_id += 1

            results["puzzle_indices"].append(example_id)
            results["puzzle_identifiers"].append(0)

        # Push group: one group per original puzzle, augmentations included.
        results["group_indices"].append(puzzle_id)

    # To Numpy: stack 9x9 grids into (N, 81) rows and shift by +1 so 0 is free for PAD.
    def _seq_to_numpy(seq):
        arr = np.concatenate(seq).reshape(len(seq), -1)

        assert np.all((arr >= 0) & (arr <= 9))
        return arr + 1

    results = {
        "inputs": _seq_to_numpy(results["inputs"]),
        "labels": _seq_to_numpy(results["labels"]),

        "group_indices": np.array(results["group_indices"], dtype=np.int32),
        "puzzle_indices": np.array(results["puzzle_indices"], dtype=np.int32),
        "puzzle_identifiers": np.array(results["puzzle_identifiers"], dtype=np.int32),
    }

    # Metadata
    metadata = PuzzleDatasetMetadata(
        seq_len=81,
        vocab_size=10 + 1,  # PAD + "0" ... "9"
        pad_id=0,
        ignore_label_id=0,
        blank_identifier_id=0,
        num_puzzle_identifiers=1,
        total_groups=len(results["group_indices"]) - 1,
        mean_puzzle_examples=1,
        total_puzzles=len(results["group_indices"]) - 1,
        sets=["all"]
    )

    # Save metadata as JSON.
    save_dir = os.path.join(config.output_dir, set_name)
    os.makedirs(save_dir, exist_ok=True)

    with open(os.path.join(save_dir, "dataset.json"), "w") as f:
        json.dump(metadata.model_dump(), f)

    # Save data
    for k, v in results.items():
        np.save(os.path.join(save_dir, f"all__{k}.npy"), v)

    # Save IDs mapping (for visualization only)
    with open(os.path.join(config.output_dir, "identifiers.json"), "w") as f:
        json.dump(["<blank>"], f)
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
@cli.command(singleton=True)
def preprocess_data(config: DataProcessConfig):
    """CLI entry point: convert both the train and test splits."""
    convert_subset("train", config)
    convert_subset("test", config)


if __name__ == "__main__":
    cli()
|
dataset/common.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Optional
|
| 2 |
+
|
| 3 |
+
import pydantic
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Global list mapping each dihedral transform id to its inverse.
# Index corresponds to the original tid, and the value is its inverse.
# The 90- and 270-degree rotations (tids 1 and 3) invert each other; every
# other transform (identity, 180-degree rotation, the four reflections) is
# its own inverse.
DIHEDRAL_INVERSE = [0, 3, 2, 1, 4, 5, 6, 7]
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class PuzzleDatasetMetadata(pydantic.BaseModel):
    # Schema describing a converted puzzle dataset (serialized to dataset.json).
    pad_id: int                      # token id used for padding
    ignore_label_id: Optional[int]   # label id excluded from the loss, or None
    blank_identifier_id: int         # puzzle-identifier id reserved for padding entries
    vocab_size: int                  # token vocabulary size (including PAD)
    seq_len: int                     # flattened sequence length per example
    num_puzzle_identifiers: int      # number of distinct puzzle identifiers
    total_groups: int                # number of example groups
    mean_puzzle_examples: float      # average number of examples per puzzle
    total_puzzles: int               # number of puzzles (augmentations included)
    sets: List[str]                  # names of the subsets stored on disk
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def dihedral_transform(arr: np.ndarray, tid: int) -> np.ndarray:
    """8 dihedral symmetries by rotate, flip and mirror.

    tid 0 is the identity; 1-3 rotate counter-clockwise by 90/180/270 degrees;
    4 flips horizontally, 5 vertically; 6 reflects along the main diagonal
    (transpose) and 7 along the anti-diagonal. Any other tid returns the
    input unchanged.
    """
    operations = {
        1: lambda a: np.rot90(a, k=1),
        2: lambda a: np.rot90(a, k=2),
        3: lambda a: np.rot90(a, k=3),
        4: np.fliplr,   # horizontal flip
        5: np.flipud,   # vertical flip
        6: lambda a: a.T,  # transpose (reflection along main diagonal)
        7: lambda a: np.fliplr(np.rot90(a, k=1)),  # anti-diagonal reflection
    }
    op = operations.get(tid)
    return op(arr) if op is not None else arr
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def inverse_dihedral_transform(arr: np.ndarray, tid: int) -> np.ndarray:
    """Undo dihedral transform `tid` by applying its inverse from DIHEDRAL_INVERSE."""
    return dihedral_transform(arr, DIHEDRAL_INVERSE[tid])
|
evaluators/arc.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, Sequence, Optional
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import numpy as np
|
| 7 |
+
from numba import njit
|
| 8 |
+
import torch.distributed as dist
|
| 9 |
+
|
| 10 |
+
from dataset.build_arc_dataset import inverse_aug, grid_hash, arc_grid_to_np
|
| 11 |
+
from dataset.common import PuzzleDatasetMetadata
|
| 12 |
+
|
| 13 |
+
@njit
def _crop(grid: np.ndarray):
    """Find maximum-sized rectangle without any EOS token inside.

    The flat 900-token sequence is viewed as a 30x30 grid. Tokens in [2, 11]
    are colors 0-9 shifted by 2; anything outside that range is treated as
    EOS/padding. Scans rectangles anchored at the top-left corner, shrinking
    the usable width monotonically as rows are added, and keeps the one with
    the largest area. Returns that rectangle with tokens shifted back to raw
    colors 0-9 as uint8.
    """
    grid = grid.reshape(30, 30)

    max_area = 0
    max_size = (0, 0)
    nr, nc = grid.shape

    num_c = nc
    for num_r in range(1, nr + 1):
        # Scan for maximum c; the width can only shrink as more rows are included.
        for c in range(1, num_c + 1):
            x = grid[num_r - 1, c - 1]
            if (x < 2) | (x > 11):
                num_c = c - 1
                break

        area = num_r * num_c
        if area > max_area:
            max_area = area
            max_size = (num_r, num_c)

    return (grid[:max_size[0], :max_size[1]] - 2).astype(np.uint8)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class ARC:
    """ARC evaluator with augmentation-aggregated voting.

    Each rank accumulates (de-augmented) predictions during evaluation;
    `result` gathers them on rank 0, votes across augmentations of each test
    input, computes pass@K metrics and writes a Kaggle-style submission.json.
    """
    # Keys this evaluator consumes from the batch dict / model output dict.
    required_outputs = {"inputs", "puzzle_identifiers", "q_halt_logits", "preds"}

    def __init__(self, data_path: str,
                 eval_metadata: PuzzleDatasetMetadata,
                 submission_K: int = 2,
                 pass_Ks: Sequence[int] = (1, 2, 5, 10, 100, 1000),
                 aggregated_voting: bool = True):
        super().__init__()
        self.pass_Ks = pass_Ks                      # K values for pass@K metrics
        self.submission_K = submission_K            # number of attempts written per test pair
        self.aggregated_voting = aggregated_voting  # if True, votes accumulate across eval runs
        self.blank_identifier_id = eval_metadata.blank_identifier_id

        # Load identifiers and test puzzles
        with open(os.path.join(data_path, "identifiers.json"), "r") as f:
            self.identifier_map = json.load(f)
        with open(os.path.join(data_path, "test_puzzles.json"), "r") as f:
            self.test_puzzles = json.load(f)

        # States: pred-hash -> grid, and puzzle -> input-hash -> [(pred_hash, q)]
        self._local_hmap = {}
        self._local_preds = {}

    def begin_eval(self):
        """Reset per-run state (kept when aggregated_voting is enabled)."""
        if not self.aggregated_voting:
            # Clear previous predictions
            self._local_hmap = {}
            self._local_preds = {}

    def update_batch(self, batch: Dict[str, torch.Tensor], preds: Dict[str, torch.Tensor]):
        """Record one batch of predictions into the local (per-rank) state."""
        # Collect required outputs to CPU
        outputs = {}
        q_values = None

        for collection in (batch, preds):
            for k, v in collection.items():
                if k in self.required_outputs:
                    if k == "q_halt_logits":
                        # Halting confidence as a probability; float64 for stable averaging.
                        q_values = v.to(torch.float64).sigmoid().cpu()
                    else:
                        outputs[k] = v.cpu()

        assert q_values is not None

        # Remove padding from outputs
        mask = outputs["puzzle_identifiers"] != self.blank_identifier_id
        outputs = {k: v[mask] for k, v in outputs.items()}

        # Get predictions
        for identifier, input, pred, q in zip(outputs["puzzle_identifiers"].numpy(), outputs["inputs"].numpy(), outputs["preds"].numpy(), q_values.numpy()):
            # Map the augmented-puzzle name back to the original puzzle and
            # the function that undoes its augmentation.
            name = self.identifier_map[identifier]
            orig_name, _inverse_fn = inverse_aug(name)

            # Hash the de-augmented input so votes from different augmentations
            # of the same test input land in the same bucket.
            input_hash = grid_hash(_inverse_fn(_crop(input)))

            pred = _inverse_fn(_crop(pred))
            assert np.all((pred >= 0) & (pred <= 9)), f"Puzzle {name}'s prediction out of 0-9 range."  # Sanity check

            # Store into local state
            pred_hash = grid_hash(pred)

            self._local_hmap[pred_hash] = pred

            self._local_preds.setdefault(orig_name, {})
            self._local_preds[orig_name].setdefault(input_hash, [])
            self._local_preds[orig_name][input_hash].append((pred_hash, float(q)))

    def result(self, save_path: Optional[str], rank: int, world_size: int, group: Optional[torch.distributed.ProcessGroup] = None) -> Optional[Dict[str, float]]:
        """Gather all ranks' predictions, vote, and return pass@K metrics (rank 0 only)."""
        # Gather predictions to rank 0 for voting
        global_hmap_preds = [None for _ in range(world_size)] if rank == 0 else None
        dist.gather_object((self._local_hmap, self._local_preds), global_hmap_preds, dst=0, group=group)

        # Rank 0 logic; every other rank returns None.
        if rank != 0:
            return

        submission = {}
        correct = [0.0 for _ in range(len(self.pass_Ks))]

        for name, puzzle in self.test_puzzles.items():
            # Process test examples in this puzzle
            submission[name] = []
            num_test_correct = [0 for _ in range(len(self.pass_Ks))]
            for pair in puzzle["test"]:
                input_hash = grid_hash(arc_grid_to_np(pair["input"]))
                label_hash = grid_hash(arc_grid_to_np(pair["output"]))

                # Tally votes per predicted grid hash: [count, summed q].
                p_map = {}
                for hmap, preds in global_hmap_preds:  # type: ignore
                    for h, q in preds.get(name, {}).get(input_hash, {}):
                        p_map.setdefault(h, [0, 0])
                        p_map[h][0] += 1
                        p_map[h][1] += q

                if not len(p_map):
                    print (f"Puzzle {name} has no predictions.")
                    continue

                # Convert summed q to mean q (secondary sort key after vote count).
                for h, stats in p_map.items():
                    stats[1] /= stats[0]

                # Rank candidates by (vote count, mean q), descending.
                p_map = sorted(p_map.items(), key=lambda kv: kv[1], reverse=True)

                # vote for different Ks
                for i, k in enumerate(self.pass_Ks):
                    ok = False
                    for h, stats in p_map[:k]:
                        ok |= h == label_hash

                    num_test_correct[i] += ok

                # Query grids: resolve the top-K hashes back to concrete grids.
                pred_grids = []
                for h, stats in p_map[:self.submission_K]:
                    for hmap, preds in global_hmap_preds:  # type: ignore
                        if h in hmap:
                            pred_grids.append(hmap[h])
                            break

                # Pad to K by repeating the best attempt.
                while len(pred_grids) < self.submission_K:
                    pred_grids.append(pred_grids[0])

                submission[name].append({f"attempt_{i + 1}": grid.tolist() for i, grid in enumerate(pred_grids)})

            # Total correctness: average over this puzzle's test pairs.
            for i in range(len(self.pass_Ks)):
                correct[i] += num_test_correct[i] / len(puzzle["test"])

        # Save submission
        if save_path is not None:
            with open(os.path.join(save_path, "submission.json"), "w") as f:
                json.dump(submission, f)

        # Final result
        all_results = {f"ARC/pass@{k}": correct[i] / len(self.test_puzzles) for i, k in enumerate(self.pass_Ks)}

        return all_results
|
kaggle/combined/arc-agi_concept_challenges.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
kaggle/combined/arc-agi_concept_solutions.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
kaggle/combined/arc-agi_evaluation2_challenges.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
kaggle/combined/arc-agi_evaluation2_solutions.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
kaggle/combined/arc-agi_evaluation_challenges.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
kaggle/combined/arc-agi_evaluation_solutions.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
kaggle/combined/arc-agi_training2_challenges.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
kaggle/combined/arc-agi_training2_solutions.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
kaggle/combined/arc-agi_training_challenges.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
kaggle/combined/arc-agi_training_solutions.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/common.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from torch import nn
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def trunc_normal_init_(tensor: torch.Tensor, std: float = 1.0, lower: float = -2.0, upper: float = 2.0):
    """In-place truncated normal initialization with an exact std dev.

    NOTE: PyTorch nn.init.trunc_normal_ is not mathematically correct, the std dev
    is not actually the std dev of initialized tensor. This function is a PyTorch
    version of jax truncated normal init (default init method in flax):
    https://github.com/jax-ml/jax/blob/main/jax/_src/random.py#L807-L848
    https://github.com/jax-ml/jax/blob/main/jax/_src/nn/initializers.py#L162-L199

    Args:
        tensor: tensor to fill in place.
        std: target standard deviation of the initialized values.
        lower: lower truncation bound in standard-normal units.
        upper: upper truncation bound in standard-normal units.
    Returns:
        The same tensor, for chaining.
    """
    with torch.no_grad():
        if std == 0:
            tensor.zero_()
        else:
            sqrt2 = math.sqrt(2)
            # Standard-normal CDF at the bounds (via erf); z is the probability
            # mass kept inside [lower, upper].
            a = math.erf(lower / sqrt2)
            b = math.erf(upper / sqrt2)
            z = (b - a) / 2

            # Standard normal pdf evaluated at each truncation bound.
            # FIX: these two assignments were previously swapped (pdf at `lower`
            # was stored in pdf_u and vice versa). That is equivalent for the
            # default symmetric bounds but gives a wrong variance compensation
            # for asymmetric ones.
            c = (2 * math.pi) ** -0.5
            pdf_l = c * math.exp(-0.5 * lower ** 2)
            pdf_u = c * math.exp(-0.5 * upper ** 2)
            # Variance of the truncated standard normal:
            #   1 - (upper*pdf_u - lower*pdf_l)/z - ((pdf_u - pdf_l)/z)^2
            # NOTE(review): no mean correction is applied; this is exact only
            # for (near-)symmetric bounds where the truncated mean is ~0.
            comp_std = std / math.sqrt(1 - (upper * pdf_u - lower * pdf_l) / z - ((pdf_u - pdf_l) / z) ** 2)

            # Inverse-CDF sampling: uniform on [erf(l/sqrt2), erf(u/sqrt2)],
            # mapped through erfinv, then rescaled to hit the target std.
            tensor.uniform_(a, b)
            tensor.erfinv_()
            tensor.mul_(sqrt2 * comp_std)
            tensor.clip_(lower * comp_std, upper * comp_std)

    return tensor
|
models/ema.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
|
| 4 |
+
class EMAHelper(object):
    """Exponential moving average of a module's trainable parameters.

    Maintains a shadow copy of every parameter with requires_grad, blended as
    shadow <- mu * shadow + (1 - mu) * param on each `update`.
    """

    def __init__(self, mu=0.999):
        self.mu = mu
        self.shadow = {}

    @staticmethod
    def _unwrap(module):
        # Operate on the inner module when wrapped in DataParallel.
        return module.module if isinstance(module, nn.DataParallel) else module

    def register(self, module):
        """Snapshot the module's trainable parameters as the initial shadow."""
        for name, param in self._unwrap(module).named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def update(self, module):
        """Blend the module's current parameters into the shadow copies."""
        decay = self.mu
        for name, param in self._unwrap(module).named_parameters():
            if param.requires_grad:
                self.shadow[name].data = (1. - decay) * param.data + decay * self.shadow[name].data

    def ema(self, module):
        """Overwrite the module's trainable parameters with the shadow values."""
        for name, param in self._unwrap(module).named_parameters():
            if param.requires_grad:
                param.data.copy_(self.shadow[name].data)

    def ema_copy(self, module):
        """Return a deep copy of the module with EMA weights applied."""
        duplicate = copy.deepcopy(module)
        self.ema(duplicate)
        return duplicate

    def state_dict(self):
        return self.shadow

    def load_state_dict(self, state_dict):
        self.shadow = state_dict
|
| 40 |
+
|
models/layers.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Tuple
|
| 2 |
+
import einops
|
| 3 |
+
import torch
|
| 4 |
+
from torch import nn
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
|
| 7 |
+
#try:
|
| 8 |
+
# from flash_attn_interface import flash_attn_func # type: ignore[import]
|
| 9 |
+
#except ImportError:
|
| 10 |
+
# # Fallback to FlashAttention 2
|
| 11 |
+
# from flash_attn import flash_attn_func # type: ignore[import]
|
| 12 |
+
from torch.nn.functional import scaled_dot_product_attention
|
| 13 |
+
|
| 14 |
+
from models.common import trunc_normal_init_
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
CosSin = Tuple[torch.Tensor, torch.Tensor]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _find_multiple(a, b):
|
| 21 |
+
return (-(a // -b)) * b
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def rotate_half(x: torch.Tensor):
    """Rotates half the hidden dims of the input (RoPE helper): (a, b) -> (-b, a)."""
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return torch.cat((-second, first), dim=-1)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def apply_rotary_pos_emb(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
    """Apply rotary position embeddings (RoPE) to query and key tensors.

    q, k: [bs, seq_len, num_heads, head_dim]; cos, sin: [seq_len, head_dim].
    The rotation is computed in the cos/sin dtype and cast back afterwards.
    """
    input_dtype = q.dtype
    # Broadcast the tables over the heads axis.
    cos_h = cos.unsqueeze(-2)
    sin_h = sin.unsqueeze(-2)

    def _rotate(t: torch.Tensor) -> torch.Tensor:
        t = t.to(cos.dtype)
        return t * cos_h + rotate_half(t) * sin_h

    return _rotate(q).to(input_dtype), _rotate(k).to(input_dtype)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class CastedLinear(nn.Module):
    """Linear layer whose parameters are cast to the input dtype at forward time.

    Parameters stay in their stored precision; the matmul runs in the
    activation dtype.
    """
    def __init__(self,
                 in_features: int,
                 out_features: int,
                 bias: bool):
        super().__init__()
        # Truncated LeCun normal init (std = 1/sqrt(fan_in))
        self.weight = nn.Parameter(
            trunc_normal_init_(torch.empty((out_features, in_features)), std=1.0 / (in_features ** 0.5))
        )
        self.bias = None
        if bias:
            # Zero init bias
            self.bias = nn.Parameter(torch.zeros((out_features, )))

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # Cast weight (and bias, if present) to the activation dtype on the fly.
        return F.linear(input, self.weight.to(input.dtype), bias=self.bias.to(input.dtype) if self.bias is not None else None)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class CastedEmbedding(nn.Module):
    """Embedding table cast to a fixed compute dtype at lookup time."""
    def __init__(self,
                 num_embeddings: int,
                 embedding_dim: int,
                 init_std: float,
                 cast_to: torch.dtype):
        super().__init__()
        self.cast_to = cast_to  # dtype the lookup result is produced in

        # Truncated LeCun normal init
        self.embedding_weight = nn.Parameter(
            trunc_normal_init_(torch.empty((num_embeddings, embedding_dim)), std=init_std)
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # Cast the table before lookup so outputs come out in `cast_to`.
        return F.embedding(input, self.embedding_weight.to(self.cast_to))
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class RotaryEmbedding(nn.Module):
    """Precomputed rotary position embedding (RoPE) tables.

    Caches cos/sin tables of shape [max_position_embeddings, dim] as
    non-persistent buffers; forward simply returns them.
    """
    def __init__(self, dim, max_position_embeddings, base, device=None):
        super().__init__()

        # RoPE inverse frequencies: base^(-2i/dim) for each even channel index i.
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim))
        t = torch.arange(max_position_embeddings, dtype=torch.float32, device=device)
        freqs = torch.outer(t, inv_freq)

        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.cos_cached = nn.Buffer(emb.cos(), persistent=False)
        self.sin_cached = nn.Buffer(emb.sin(), persistent=False)

    def forward(self):
        # Returns (cos, sin), each [max_position_embeddings, dim].
        return self.cos_cached, self.sin_cached
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class Attention(nn.Module):
    """Multi-head attention with optional RoPE, via scaled_dot_product_attention.

    NOTE(review): num_key_value_heads < num_heads is sliced out below but K/V
    are not expanded before SDPA — confirm callers use equal head counts (or
    a broadcastable configuration) before relying on grouped-query attention.
    """
    def __init__(self, hidden_size, head_dim, num_heads, num_key_value_heads, causal=False):
        super().__init__()

        self.hidden_size = hidden_size
        self.head_dim = head_dim
        self.output_size = head_dim * num_heads
        self.num_heads = num_heads
        self.num_key_value_heads = num_key_value_heads
        self.causal = causal

        # Fused projection producing Q, K and V in a single matmul.
        self.qkv_proj = CastedLinear(self.hidden_size, (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim, bias=False)
        self.o_proj = CastedLinear(self.output_size, self.hidden_size, bias=False)

    def forward(self, cos_sin: CosSin, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, _ = hidden_states.shape

        # hidden_states: [bs, seq_len, num_heads, head_dim]
        qkv = self.qkv_proj(hidden_states)

        # Split head: first num_heads slices are Q, then K, then V.
        qkv = qkv.view(batch_size, seq_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)
        query = qkv[:, :, :self.num_heads]
        key = qkv[:, :, self.num_heads: self.num_heads + self.num_key_value_heads]
        value = qkv[:, :, self.num_heads + self.num_key_value_heads:]

        # RoPE (skipped when cos_sin is None)
        if cos_sin is not None:
            cos, sin = cos_sin
            query, key = apply_rotary_pos_emb(query, key, cos, sin)

        # flash attn
        query, key, value = map(lambda t: einops.rearrange(t, 'B S H D -> B H S D'), (query, key, value)) # needed for scaled_dot_product_attention but not flash_attn_func
        attn_output = scaled_dot_product_attention(query=query, key=key, value=value, is_causal=self.causal)
        attn_output = einops.rearrange(attn_output, 'B H S D -> B S H D')
        attn_output = attn_output.view(batch_size, seq_len, self.output_size)  # type: ignore
        return self.o_proj(attn_output)
|
| 136 |
+
|
| 137 |
+
class LinearSwish(nn.Module):
    """Linear map combined with SiLU: silu(W x) if reverse else W silu(x)."""
    def __init__(self, hidden_size: int, reverse=False):
        super().__init__()

        self.linear = CastedLinear(hidden_size, hidden_size, bias=False)
        self.reverse = reverse  # order of the linear map vs. the activation

    def forward(self, x):
        if self.reverse:
            return F.silu(self.linear(x))
        else:
            return self.linear(F.silu(x))
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
class SwiGLU(nn.Module):
    """SwiGLU feed-forward block: down_proj(silu(gate) * up).

    The intermediate width is expansion * hidden * 2/3 rounded up to a
    multiple of 256.
    """
    def __init__(self, hidden_size: int, expansion: float):
        super().__init__()
        inter = _find_multiple(round(expansion * hidden_size * 2 / 3), 256)

        # Fused gate+up projection; split in forward.
        self.gate_up_proj = CastedLinear(hidden_size, inter * 2, bias=False)
        self.down_proj = CastedLinear(inter, hidden_size, bias=False)

    def forward(self, x):
        gate, up = self.gate_up_proj(x).chunk(2, dim=-1)
        return self.down_proj(F.silu(gate) * up)
|
| 162 |
+
|
| 163 |
+
def rms_norm(hidden_states: torch.Tensor, variance_epsilon: float) -> torch.Tensor:
    """RMS-normalize the last dimension (no learned scale).

    Computes x * rsqrt(mean(x^2) + eps) in float32 for numerical stability and
    casts the result back to the input dtype.
    """
    original_dtype = hidden_states.dtype
    x = hidden_states.to(torch.float32)
    mean_square = x.square().mean(dim=-1, keepdim=True)
    normalized = x * torch.rsqrt(mean_square + variance_epsilon)
    return normalized.to(original_dtype)
|
models/losses.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Tuple, Dict, Sequence, Optional
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from torch import nn
|
| 6 |
+
import math
|
| 7 |
+
|
| 8 |
+
IGNORE_LABEL_ID = -100
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def s(x, epsilon=1e-30):
    """Stablemax scaling: x + 1 for x >= 0, 1/(1 - x + eps) for x < 0 (always positive)."""
    negative_branch = 1 / (1 - x + epsilon)
    positive_branch = x + 1
    return torch.where(x < 0, negative_branch, positive_branch)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def log_stablemax(x, dim=-1):
    """Log of the stablemax distribution: log(s(x) / sum(s(x))) along `dim`."""
    scaled = s(x)
    normalizer = scaled.sum(dim=dim, keepdim=True)
    return torch.log(scaled / normalizer)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def stablemax_cross_entropy(logits, labels, ignore_index: int = -100, valid_mask=None):
    """Per-token cross entropy using stablemax instead of softmax.

    Computed in float64. Positions whose label equals `ignore_index` (or where
    `valid_mask` is False, when provided) contribute 0. Returns an unreduced
    loss tensor shaped like `labels`.
    """
    logprobs = log_stablemax(logits.to(torch.float64), dim=-1)

    if valid_mask is None:
        valid_mask = (labels != ignore_index)
    # Replace ignored labels with 0 so the gather stays in-bounds.
    transformed_labels = torch.where(valid_mask, labels, 0)
    prediction_logprobs = torch.gather(logprobs, index=transformed_labels.to(torch.long).unsqueeze(-1), dim=-1).squeeze(-1)

    return -torch.where(valid_mask, prediction_logprobs, 0)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def softmax_cross_entropy(logits, labels, ignore_index: int = -100):
    """Standard softmax cross entropy, unreduced, shaped like `labels`.

    Logits are cast to float32 and flattened for F.cross_entropy; the
    per-token losses are then reshaped back to the label tensor's shape.
    Ignored positions contribute 0.
    """
    flat_logits = logits.to(torch.float32).view(-1, logits.shape[-1])
    flat_labels = labels.to(torch.long).view(-1)
    per_token = F.cross_entropy(flat_logits, flat_labels, ignore_index=ignore_index, reduction="none")
    return per_token.view(labels.shape)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class ACTLossHead(nn.Module):
|
| 42 |
+
    def __init__(self, model: nn.Module, loss_type: str):
        """Wrap `model`, resolving `loss_type` by name to a loss function in this module."""
        super().__init__()
        self.model = model
        # Look up the loss function defined at module level (e.g. "softmax_cross_entropy").
        self.loss_fn = globals()[loss_type]
|
| 46 |
+
|
| 47 |
+
    def initial_carry(self, *args, **kwargs):
        """Delegate carry initialization to the wrapped model."""
        return self.model.initial_carry(*args, **kwargs)  # type: ignore
|
| 49 |
+
|
| 50 |
+
def forward(
|
| 51 |
+
self,
|
| 52 |
+
return_keys: Sequence[str],
|
| 53 |
+
# Model args
|
| 54 |
+
**model_kwargs,
|
| 55 |
+
) -> Tuple[Any, torch.Tensor, Dict[str, torch.Tensor], Optional[Dict[str, torch.Tensor]], torch.Tensor]:
|
| 56 |
+
# Model logits
|
| 57 |
+
# B x SeqLen x D
|
| 58 |
+
new_carry, outputs = self.model(**model_kwargs)
|
| 59 |
+
labels = new_carry.current_data["labels"]
|
| 60 |
+
|
| 61 |
+
with torch.no_grad():
|
| 62 |
+
# Preds
|
| 63 |
+
outputs["preds"] = torch.argmax(outputs["logits"], dim=-1)
|
| 64 |
+
|
| 65 |
+
# Correctness
|
| 66 |
+
mask = (labels != IGNORE_LABEL_ID)
|
| 67 |
+
loss_counts = mask.sum(-1)
|
| 68 |
+
loss_divisor = loss_counts.clamp_min(1).unsqueeze(-1) # Avoid NaNs in division
|
| 69 |
+
|
| 70 |
+
is_correct = mask & (torch.argmax(outputs["logits"], dim=-1) == labels)
|
| 71 |
+
seq_is_correct = is_correct.sum(-1) == loss_counts
|
| 72 |
+
|
| 73 |
+
# Metrics (halted)
|
| 74 |
+
valid_metrics = new_carry.halted & (loss_counts > 0)
|
| 75 |
+
metrics = {
|
| 76 |
+
"count": valid_metrics.sum(),
|
| 77 |
+
|
| 78 |
+
"accuracy": torch.where(valid_metrics, (is_correct.to(torch.float32) / loss_divisor).sum(-1), 0).sum(),
|
| 79 |
+
"exact_accuracy": (valid_metrics & seq_is_correct).sum(),
|
| 80 |
+
|
| 81 |
+
"q_halt_accuracy": (valid_metrics & ((outputs["q_halt_logits"] >= 0) == seq_is_correct)).sum(),
|
| 82 |
+
"steps": torch.where(valid_metrics, new_carry.steps, 0).sum(),
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
# Losses
|
| 86 |
+
|
| 87 |
+
lm_loss = (self.loss_fn(outputs["logits"], labels, ignore_index=IGNORE_LABEL_ID, valid_mask=mask) / loss_divisor).sum()
|
| 88 |
+
q_halt_loss = F.binary_cross_entropy_with_logits(outputs["q_halt_logits"], seq_is_correct.to(outputs["q_halt_logits"].dtype), reduction="sum")
|
| 89 |
+
metrics.update({
|
| 90 |
+
"lm_loss": lm_loss.detach(),
|
| 91 |
+
"q_halt_loss": q_halt_loss.detach(),
|
| 92 |
+
})
|
| 93 |
+
# Q continue (bootstrapping target loss); Alexia: This fits Q-learning, but seems totally unecessary
|
| 94 |
+
q_continue_loss = 0
|
| 95 |
+
if "target_q_continue" in outputs:
|
| 96 |
+
q_continue_loss = F.binary_cross_entropy_with_logits(outputs["q_continue_logits"], outputs["target_q_continue"], reduction="sum")
|
| 97 |
+
|
| 98 |
+
metrics["q_continue_loss"] = q_continue_loss.detach()
|
| 99 |
+
# Filter outputs for return
|
| 100 |
+
detached_outputs = {k: outputs[k].detach() for k in return_keys if k in outputs}
|
| 101 |
+
|
| 102 |
+
return new_carry, lm_loss + 0.5 * (q_halt_loss + q_continue_loss), metrics, detached_outputs, new_carry.halted.all()
|
| 103 |
+
|
models/recursive_reasoning/hrm.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Tuple, List, Dict, Optional
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
import math
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
from torch import nn
|
| 7 |
+
from pydantic import BaseModel
|
| 8 |
+
|
| 9 |
+
from models.common import trunc_normal_init_
|
| 10 |
+
from models.layers import rms_norm, SwiGLU, Attention, RotaryEmbedding, CosSin, CastedEmbedding, CastedLinear
|
| 11 |
+
from models.sparse_embedding import CastedSparseEmbedding
|
| 12 |
+
|
| 13 |
+
@dataclass
class HierarchicalReasoningModel_ACTV1InnerCarry:
    """Latent recurrent state of the inner HRM core, carried across ACT steps."""
    # High-level (slow) state; created as (batch, seq_len + puzzle_emb_len, hidden_size).
    z_H: torch.Tensor
    # Low-level (fast) state; same shape as z_H.
    z_L: torch.Tensor
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@dataclass
class HierarchicalReasoningModel_ACTV1Carry:
    """Full ACT carry: inner latent state plus per-sequence halting bookkeeping."""
    inner_carry: HierarchicalReasoningModel_ACTV1InnerCarry

    # ACT steps taken so far per sequence (int32, shape (batch,)).
    steps: torch.Tensor
    # Per-sequence halt flags (bool, shape (batch,)); halted slots get refilled.
    halted: torch.Tensor

    # The batch tensors currently assigned to each slot (replaced on halt).
    current_data: Dict[str, torch.Tensor]
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class HierarchicalReasoningModel_ACTV1Config(BaseModel):
|
| 30 |
+
batch_size: int
|
| 31 |
+
seq_len: int
|
| 32 |
+
puzzle_emb_ndim: int = 0
|
| 33 |
+
num_puzzle_identifiers: int
|
| 34 |
+
vocab_size: int
|
| 35 |
+
|
| 36 |
+
H_cycles: int
|
| 37 |
+
L_cycles: int
|
| 38 |
+
|
| 39 |
+
H_layers: int
|
| 40 |
+
L_layers: int
|
| 41 |
+
|
| 42 |
+
# Transformer config
|
| 43 |
+
hidden_size: int
|
| 44 |
+
expansion: float
|
| 45 |
+
num_heads: int
|
| 46 |
+
pos_encodings: str
|
| 47 |
+
|
| 48 |
+
rms_norm_eps: float = 1e-5
|
| 49 |
+
rope_theta: float = 10000.0
|
| 50 |
+
|
| 51 |
+
# Halting Q-learning config
|
| 52 |
+
halt_max_steps: int
|
| 53 |
+
halt_exploration_prob: float
|
| 54 |
+
|
| 55 |
+
forward_dtype: str = "bfloat16"
|
| 56 |
+
|
| 57 |
+
# Alexia: added
|
| 58 |
+
mlp_t: bool=False # use mlp on L instead of transformer
|
| 59 |
+
|
| 60 |
+
class HierarchicalReasoningModel_ACTV1Block(nn.Module):
    """One reasoning block.

    Two variants, chosen by ``config.mlp_t``:
    - attention variant: non-causal self-attention followed by a SwiGLU MLP;
    - mlp_t variant: a single SwiGLU applied across the *sequence* axis
      (token mixing), ignoring ``cos_sin``.
    Both use post-norm residual connections with RMSNorm.
    """
    def __init__(self, config: HierarchicalReasoningModel_ACTV1Config) -> None:
        super().__init__()

        self.config = config
        if self.config.mlp_t:
            # ceil(puzzle_emb_ndim / hidden_size): number of prepended puzzle tokens.
            self.puzzle_emb_len = -(self.config.puzzle_emb_ndim // -self.config.hidden_size)
            # MLP over the sequence dimension, so its width is the full sequence length.
            self.mlp_t = SwiGLU(
                hidden_size=self.config.seq_len + self.puzzle_emb_len, # L
                expansion=config.expansion,
            )
        else:
            self.self_attn = Attention(
                hidden_size=config.hidden_size,
                head_dim=config.hidden_size // config.num_heads,
                num_heads=config.num_heads,
                num_key_value_heads=config.num_heads,
                causal=False
            )
            self.mlp = SwiGLU(
                hidden_size=config.hidden_size,
                expansion=config.expansion,
            )
        self.norm_eps = config.rms_norm_eps

    def forward(self, cos_sin: CosSin, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the block; ``cos_sin`` (RoPE tables) is used only by the attention path."""
        # B, L, D = hidden_states.shape
        # Post Norm
        if self.config.mlp_t:
            # Mix across the sequence axis: (B, L, D) -> (B, D, L), MLP over L, then back.
            hidden_states = hidden_states.transpose(1,2)
            out = self.mlp_t(hidden_states)
            hidden_states = rms_norm(hidden_states + out, variance_epsilon=self.norm_eps)
            hidden_states = hidden_states.transpose(1,2)
        else:
            # Self Attention
            hidden_states = rms_norm(hidden_states + self.self_attn(cos_sin=cos_sin, hidden_states=hidden_states), variance_epsilon=self.norm_eps)
            # Fully Connected
            out = self.mlp(hidden_states)
            hidden_states = rms_norm(hidden_states + out, variance_epsilon=self.norm_eps)
        return hidden_states
|
| 100 |
+
|
| 101 |
+
class HierarchicalReasoningModel_ACTV1ReasoningModule(nn.Module):
    """Sequential stack of reasoning blocks with an additive input injection."""

    def __init__(self, layers: List[HierarchicalReasoningModel_ACTV1Block]):
        super().__init__()
        self.layers = torch.nn.ModuleList(layers)

    def forward(self, hidden_states: torch.Tensor, input_injection: torch.Tensor, **kwargs) -> torch.Tensor:
        """Add ``input_injection`` to the state, then run every block in order.

        Extra keyword args (e.g. cos_sin) are forwarded to each block.
        """
        state = hidden_states + input_injection
        for block in self.layers:
            state = block(hidden_states=state, **kwargs)
        return state
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
class HierarchicalReasoningModel_ACTV1_Inner(nn.Module):
    """Inner HRM core: embeddings, the H/L recurrent levels, and output heads.

    One forward call performs H_cycles x L_cycles recurrent updates under
    torch.no_grad(), then one final L update and one final H update with
    gradients enabled (1-step gradient truncation).
    """
    def __init__(self, config: HierarchicalReasoningModel_ACTV1Config) -> None:
        super().__init__()
        self.config = config
        self.forward_dtype = getattr(torch, self.config.forward_dtype)

        # I/O
        # Embeddings are scaled by sqrt(hidden) and initialized with std 1/sqrt(hidden).
        self.embed_scale = math.sqrt(self.config.hidden_size)
        embed_init_std = 1.0 / self.embed_scale

        self.embed_tokens = CastedEmbedding(self.config.vocab_size, self.config.hidden_size, init_std=embed_init_std, cast_to=self.forward_dtype)
        self.lm_head = CastedLinear(self.config.hidden_size, self.config.vocab_size, bias=False)
        # Two Q outputs: index 0 = halt logit, index 1 = continue logit.
        self.q_head = CastedLinear(self.config.hidden_size, 2, bias=True)

        self.puzzle_emb_len = -(self.config.puzzle_emb_ndim // -self.config.hidden_size) # ceil div
        if self.config.puzzle_emb_ndim > 0:
            # Zero init puzzle embeddings
            self.puzzle_emb = CastedSparseEmbedding(self.config.num_puzzle_identifiers, self.config.puzzle_emb_ndim,
                                                    batch_size=self.config.batch_size, init_std=0, cast_to=self.forward_dtype)

        # LM Blocks
        if self.config.pos_encodings == "rope":
            self.rotary_emb = RotaryEmbedding(dim=self.config.hidden_size // self.config.num_heads,
                                              max_position_embeddings=self.config.seq_len + self.puzzle_emb_len,
                                              base=self.config.rope_theta)
        elif self.config.pos_encodings == "learned":
            self.embed_pos = CastedEmbedding(self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, init_std=embed_init_std, cast_to=self.forward_dtype)
        else:
            # No positional encoding (NOTE: the V2 baseline raises NotImplementedError here instead).
            pass

        # Reasoning Layers
        self.H_level = HierarchicalReasoningModel_ACTV1ReasoningModule(layers=[HierarchicalReasoningModel_ACTV1Block(self.config) for _i in range(self.config.H_layers)])
        self.L_level = HierarchicalReasoningModel_ACTV1ReasoningModule(layers=[HierarchicalReasoningModel_ACTV1Block(self.config) for _i in range(self.config.L_layers)])

        # Initial states (persistent buffers, broadcast over batch/sequence on reset).
        self.H_init = nn.Buffer(trunc_normal_init_(torch.empty(self.config.hidden_size, dtype=self.forward_dtype), std=1), persistent=True)
        self.L_init = nn.Buffer(trunc_normal_init_(torch.empty(self.config.hidden_size, dtype=self.forward_dtype), std=1), persistent=True)

        # Q head special init
        # Init Q to (almost) zero for faster learning during bootstrapping
        with torch.no_grad():
            self.q_head.weight.zero_()
            self.q_head.bias.fill_(-5)  # type: ignore

    def _input_embeddings(self, input: torch.Tensor, puzzle_identifiers: torch.Tensor):
        """Build scaled input embeddings: tokens, optional puzzle prefix, optional positions."""
        # Token embedding
        embedding = self.embed_tokens(input.to(torch.int32))

        # Puzzle embeddings
        if self.config.puzzle_emb_ndim > 0:
            puzzle_embedding = self.puzzle_emb(puzzle_identifiers)

            # Pad the puzzle embedding up to a whole number of hidden-size tokens.
            pad_count = self.puzzle_emb_len * self.config.hidden_size - puzzle_embedding.shape[-1]
            if pad_count > 0:
                puzzle_embedding = F.pad(puzzle_embedding, (0, pad_count))

            # Prepend the puzzle tokens to the sequence.
            embedding = torch.cat((puzzle_embedding.view(-1, self.puzzle_emb_len, self.config.hidden_size), embedding), dim=-2)

        # Position embeddings
        if self.config.pos_encodings == "learned":
            # scale by 1/sqrt(2) to maintain forward variance
            embedding = 0.707106781 * (embedding + self.embed_pos.embedding_weight.to(self.forward_dtype))

        # Scale
        return self.embed_scale * embedding

    def empty_carry(self, batch_size: int):
        """Allocate an uninitialized carry; reset_carry fills it before first use."""
        return HierarchicalReasoningModel_ACTV1InnerCarry(
            z_H=torch.empty(batch_size, self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, dtype=self.forward_dtype),
            z_L=torch.empty(batch_size, self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, dtype=self.forward_dtype),
        )

    def reset_carry(self, reset_flag: torch.Tensor, carry: HierarchicalReasoningModel_ACTV1InnerCarry):
        """Replace the state of flagged sequences with the learned initial vectors."""
        return HierarchicalReasoningModel_ACTV1InnerCarry(
            z_H=torch.where(reset_flag.view(-1, 1, 1), self.H_init, carry.z_H),
            z_L=torch.where(reset_flag.view(-1, 1, 1), self.L_init, carry.z_L),
        )

    def forward(self, carry: HierarchicalReasoningModel_ACTV1InnerCarry, batch: Dict[str, torch.Tensor]) -> Tuple[HierarchicalReasoningModel_ACTV1InnerCarry, torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Run one reasoning segment.

        Returns (detached new carry, LM logits without the puzzle prefix,
        (q_halt_logits, q_continue_logits)).
        """
        seq_info = dict(
            cos_sin=self.rotary_emb() if hasattr(self, "rotary_emb") else None,
        )

        # Input encoding
        input_embeddings = self._input_embeddings(batch["inputs"], batch["puzzle_identifiers"])

        # Forward iterations: all but the very last L and H updates are gradient-free.
        with torch.no_grad():
            z_H, z_L = carry.z_H, carry.z_L
            for _H_step in range(self.config.H_cycles):
                for _L_step in range(self.config.L_cycles):
                    # Skip the final L update of the final H cycle; it is redone with grad below.
                    if not ((_H_step == self.config.H_cycles - 1) and (_L_step == self.config.L_cycles - 1)):
                        z_L = self.L_level(z_L, z_H + input_embeddings, **seq_info)
                # Skip the final H update; it is redone with grad below.
                if not (_H_step == self.config.H_cycles - 1):
                    z_H = self.H_level(z_H, z_L, **seq_info)
        assert not z_H.requires_grad and not z_L.requires_grad
        # 1-step grad
        z_L = self.L_level(z_L, z_H + input_embeddings, **seq_info)
        z_H = self.H_level(z_H, z_L, **seq_info)

        # LM Outputs
        new_carry = HierarchicalReasoningModel_ACTV1InnerCarry(z_H=z_H.detach(), z_L=z_L.detach())  # New carry no grad
        # Drop the puzzle-prefix positions from the LM output.
        output = self.lm_head(z_H)[:, self.puzzle_emb_len:]

        # Q head: read Q-logits from the first (puzzle) position.
        q_logits = self.q_head(z_H[:, 0]).to(torch.float32)

        return new_carry, output, (q_logits[..., 0], q_logits[..., 1])
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
class HierarchicalReasoningModel_ACTV1(nn.Module):
    """ACT wrapper.

    Manages per-sequence halting around the inner HRM: halted slots are
    refilled with fresh batch data and reset state, active slots keep
    iterating. During evaluation sequences always run halt_max_steps.
    """

    def __init__(self, config_dict: dict):
        super().__init__()
        self.config = HierarchicalReasoningModel_ACTV1Config(**config_dict)
        self.inner = HierarchicalReasoningModel_ACTV1_Inner(self.config)

    @property
    def puzzle_emb(self):
        # Exposed for the sparse-embedding optimizer; only exists when puzzle_emb_ndim > 0.
        return self.inner.puzzle_emb

    def initial_carry(self, batch: Dict[str, torch.Tensor]):
        """Create the starting carry for a batch; all slots begin halted."""
        batch_size = batch["inputs"].shape[0]

        return HierarchicalReasoningModel_ACTV1Carry(
            inner_carry=self.inner.empty_carry(batch_size),  # Empty is expected, it will be reseted in first pass as all sequences are halted.

            steps=torch.zeros((batch_size, ), dtype=torch.int32),
            halted=torch.ones((batch_size, ), dtype=torch.bool),  # Default to halted

            current_data={k: torch.empty_like(v) for k, v in batch.items()}
        )

    def forward(self, carry: HierarchicalReasoningModel_ACTV1Carry, batch: Dict[str, torch.Tensor]) -> Tuple[HierarchicalReasoningModel_ACTV1Carry, Dict[str, torch.Tensor]]:
        """One ACT step: refill halted slots, run the inner model, update halting.

        Returns the new carry and an outputs dict with "logits",
        "q_halt_logits", "q_continue_logits", and (training only)
        "target_q_continue".
        """
        # Update data, carry (removing halted sequences)
        new_inner_carry = self.inner.reset_carry(carry.halted, carry.inner_carry)

        # Halted slots restart their step counter.
        new_steps = torch.where(carry.halted, 0, carry.steps)

        # Halted slots take the incoming batch data; active slots keep theirs.
        new_current_data = {k: torch.where(carry.halted.view((-1, ) + (1, ) * (batch[k].ndim - 1)), batch[k], v) for k, v in carry.current_data.items()}

        # Forward inner model
        new_inner_carry, logits, (q_halt_logits, q_continue_logits) = self.inner(new_inner_carry, new_current_data)

        outputs = {
            "logits": logits,
            "q_halt_logits": q_halt_logits,
            "q_continue_logits": q_continue_logits
        }

        with torch.no_grad():
            # Step
            new_steps = new_steps + 1
            is_last_step = new_steps >= self.config.halt_max_steps

            halted = is_last_step

            # if training, and ACT is enabled
            if self.training and (self.config.halt_max_steps > 1):
                # Halt signal
                # NOTE: During evaluation, always use max steps, this is to guarantee the same halting steps inside a batch for batching purposes
                halted = halted | (q_halt_logits > q_continue_logits)

                # Exploration: with prob halt_exploration_prob, force at least a
                # random minimum number of steps before halting is allowed.
                min_halt_steps = (torch.rand_like(q_halt_logits) < self.config.halt_exploration_prob) * torch.randint_like(new_steps, low=2, high=self.config.halt_max_steps + 1)

                halted = halted & (new_steps >= min_halt_steps)

                # Compute target Q
                # NOTE: No replay buffer and target networks for computing target Q-value.
                # As batch_size is large, there're many parallel envs.
                # Similar concept as PQN https://arxiv.org/abs/2407.04811
                next_q_halt_logits, next_q_continue_logits = self.inner(new_inner_carry, new_current_data)[-1]

                # Bootstrapped target: at the last step only halting remains possible.
                outputs["target_q_continue"] = torch.sigmoid(torch.where(is_last_step, next_q_halt_logits, torch.maximum(next_q_halt_logits, next_q_continue_logits)))

        return HierarchicalReasoningModel_ACTV1Carry(new_inner_carry, new_steps, halted, new_current_data), outputs
|
models/recursive_reasoning/transformers_baseline.py
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HRM ACT V2: Transformer Baseline for Architecture Ablation
|
| 3 |
+
|
| 4 |
+
This is an architecture ablation of the Hierarchical Reasoning Model (HRM).
|
| 5 |
+
Key changes from V1:
|
| 6 |
+
1. REMOVED hierarchical split (no separate H and L levels)
|
| 7 |
+
2. REMOVED inner cycles (no H_cycles/L_cycles loops within reasoning)
|
| 8 |
+
3. KEPT ACT outer loop structure intact
|
| 9 |
+
4. KEPT all data preprocessing, embeddings, and evaluation infrastructure
|
| 10 |
+
|
| 11 |
+
Architecture: Single-level transformer that processes the full 30x30 grid as a
|
| 12 |
+
900-token sequence, with the same positional encodings and sparse embeddings as V1.
|
| 13 |
+
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from typing import Tuple, List, Dict, Optional
|
| 17 |
+
from dataclasses import dataclass
|
| 18 |
+
import math
|
| 19 |
+
|
| 20 |
+
import torch
|
| 21 |
+
import torch.nn.functional as F
|
| 22 |
+
from torch import nn
|
| 23 |
+
from pydantic import BaseModel
|
| 24 |
+
|
| 25 |
+
from models.common import trunc_normal_init_
|
| 26 |
+
from models.layers import rms_norm, SwiGLU, Attention, RotaryEmbedding, CosSin, CastedEmbedding, CastedLinear
|
| 27 |
+
from models.sparse_embedding import CastedSparseEmbedding
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass
class Model_ACTV2InnerCarry:
    """Latent state of the single-level baseline, carried across ACT steps."""
    # Hidden state; created as (batch, seq_len + puzzle_emb_len, hidden_size).
    z_H: torch.Tensor
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@dataclass
class Model_ACTV2Carry:
    """Full ACT carry: inner latent state plus per-sequence halting bookkeeping."""
    inner_carry: Model_ACTV2InnerCarry

    # ACT steps taken so far per sequence (int32, shape (batch,)).
    steps: torch.Tensor
    # Per-sequence halt flags (bool, shape (batch,)); halted slots get refilled.
    halted: torch.Tensor

    # The batch tensors currently assigned to each slot (replaced on halt).
    current_data: Dict[str, torch.Tensor]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class Model_ACTV2Config(BaseModel):
    """Hyper-parameters for the single-level transformer ACT baseline (pydantic)."""
    batch_size: int
    seq_len: int
    puzzle_emb_ndim: int = 0  # 0 disables the sparse puzzle embedding
    num_puzzle_identifiers: int
    vocab_size: int

    # NOTE(review): H_cycles is not read by the V2 inner forward shown in this
    # file (single pass through H_level) — presumably kept for config parity
    # with V1; confirm before removing.
    H_cycles: int

    H_layers: int  # number of transformer blocks

    # Transformer config
    hidden_size: int
    expansion: float
    num_heads: int
    pos_encodings: str  # "rope" or "learned"; anything else raises NotImplementedError

    rms_norm_eps: float = 1e-5
    rope_theta: float = 10000.0

    # Halting Q-learning config
    halt_max_steps: int
    halt_exploration_prob: float
    act_enabled: bool = True # If False, always run halt_max_steps (no early stopping during training)
    act_inference: bool = False # If True, use adaptive computation during inference

    forward_dtype: str = "bfloat16"  # name of a torch dtype, resolved via getattr(torch, ...)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class Model_ACTV2Block(nn.Module):
    """Post-norm transformer block: non-causal self-attention, then SwiGLU MLP.

    Each sublayer adds a residual and applies RMSNorm afterwards.
    """

    def __init__(self, config: Model_ACTV2Config) -> None:
        super().__init__()

        self.self_attn = Attention(
            hidden_size=config.hidden_size,
            head_dim=config.hidden_size // config.num_heads,
            num_heads=config.num_heads,
            num_key_value_heads=config.num_heads,
            causal=False,
        )
        self.mlp = SwiGLU(
            hidden_size=config.hidden_size,
            expansion=config.expansion,
        )
        self.norm_eps = config.rms_norm_eps

    def forward(self, cos_sin: CosSin, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply both sublayers; cos_sin carries the RoPE tables (may be None)."""
        attn_out = self.self_attn(cos_sin=cos_sin, hidden_states=hidden_states)
        hidden_states = rms_norm(hidden_states + attn_out, variance_epsilon=self.norm_eps)
        mlp_out = self.mlp(hidden_states)
        return rms_norm(hidden_states + mlp_out, variance_epsilon=self.norm_eps)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
class Model_ACTV2ReasoningModule(nn.Module):
    """Sequential stack of transformer blocks with an additive input injection."""

    def __init__(self, layers: List[Model_ACTV2Block]):
        super().__init__()
        self.layers = torch.nn.ModuleList(layers)

    def forward(self, hidden_states: torch.Tensor, input_injection: torch.Tensor, **kwargs) -> torch.Tensor:
        """Add ``input_injection`` to the state, then run every block in order.

        Extra keyword args (e.g. cos_sin) are forwarded to each block.
        """
        state = hidden_states + input_injection
        for block in self.layers:
            state = block(hidden_states=state, **kwargs)
        return state
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
class Model_ACTV2_Inner(nn.Module):
    """Single-level transformer core for the ACT baseline.

    Unlike HRM V1 there is no H/L split and no inner cycles: each call is
    one full-gradient pass through the H_level stack.
    """
    def __init__(self, config: Model_ACTV2Config) -> None:
        super().__init__()
        self.config = config
        self.forward_dtype = getattr(torch, self.config.forward_dtype)

        # I/O
        # Embeddings are scaled by sqrt(hidden) and initialized with std 1/sqrt(hidden).
        self.embed_scale = math.sqrt(self.config.hidden_size)
        embed_init_std = 1.0 / self.embed_scale

        self.embed_tokens = CastedEmbedding(
            self.config.vocab_size,
            self.config.hidden_size,
            init_std=embed_init_std,
            cast_to=self.forward_dtype,
        )
        self.lm_head = CastedLinear(self.config.hidden_size, self.config.vocab_size, bias=False)
        # Two Q outputs: index 0 = halt logit, index 1 = continue logit.
        self.q_head = CastedLinear(self.config.hidden_size, 2, bias=True)

        self.puzzle_emb_len = -(self.config.puzzle_emb_ndim // -self.config.hidden_size)  # ceil div
        if self.config.puzzle_emb_ndim > 0:
            # Zero init puzzle embeddings
            self.puzzle_emb = CastedSparseEmbedding(
                self.config.num_puzzle_identifiers,
                self.config.puzzle_emb_ndim,
                batch_size=self.config.batch_size,
                init_std=0,
                cast_to=self.forward_dtype,
            )

        # LM Blocks
        if self.config.pos_encodings == "rope":
            self.rotary_emb = RotaryEmbedding(
                dim=self.config.hidden_size // self.config.num_heads,
                max_position_embeddings=self.config.seq_len + self.puzzle_emb_len,
                base=self.config.rope_theta,
            )
        elif self.config.pos_encodings == "learned":
            self.embed_pos = CastedEmbedding(
                self.config.seq_len + self.puzzle_emb_len,
                self.config.hidden_size,
                init_std=embed_init_std,
                cast_to=self.forward_dtype,
            )
        else:
            # Unlike V1 (silent pass), unsupported encodings fail loudly here.
            raise NotImplementedError()

        # Reasoning Layers
        self.H_level = Model_ACTV2ReasoningModule(
            layers=[Model_ACTV2Block(self.config) for _i in range(self.config.H_layers)]
        )

        # Initial states (persistent buffer, broadcast over batch/sequence on reset).
        self.H_init = nn.Buffer(
            trunc_normal_init_(torch.empty(self.config.hidden_size, dtype=self.forward_dtype), std=1),
            persistent=True,
        )

        # Q head special init
        # Init Q to (almost) zero for faster learning during bootstrapping
        with torch.no_grad():
            self.q_head.weight.zero_()
            self.q_head.bias.fill_(-5)  # type: ignore

    def _input_embeddings(self, input: torch.Tensor, puzzle_identifiers: torch.Tensor):
        """Build scaled input embeddings: tokens, optional puzzle prefix, optional positions."""
        # Token embedding
        embedding = self.embed_tokens(input.to(torch.int32))

        # Puzzle embeddings
        if self.config.puzzle_emb_ndim > 0:
            puzzle_embedding = self.puzzle_emb(puzzle_identifiers)

            # Pad the puzzle embedding up to a whole number of hidden-size tokens.
            pad_count = self.puzzle_emb_len * self.config.hidden_size - puzzle_embedding.shape[-1]
            if pad_count > 0:
                puzzle_embedding = F.pad(puzzle_embedding, (0, pad_count))

            # Prepend the puzzle tokens to the sequence.
            embedding = torch.cat(
                (puzzle_embedding.view(-1, self.puzzle_emb_len, self.config.hidden_size), embedding), dim=-2
            )

        # Position embeddings
        if self.config.pos_encodings == "learned":
            # scale by 1/sqrt(2) to maintain forward variance
            embedding = 0.707106781 * (embedding + self.embed_pos.embedding_weight.to(self.forward_dtype))

        # Scale
        return self.embed_scale * embedding

    def empty_carry(self, batch_size: int):
        """Allocate an uninitialized carry; reset_carry fills it before first use."""
        return Model_ACTV2InnerCarry(
            z_H=torch.empty(
                batch_size,
                self.config.seq_len + self.puzzle_emb_len,
                self.config.hidden_size,
                dtype=self.forward_dtype,
            ),
        )

    def reset_carry(self, reset_flag: torch.Tensor, carry: Model_ACTV2InnerCarry):
        """Replace the state of flagged sequences with the learned initial vector."""
        return Model_ACTV2InnerCarry(
            z_H=torch.where(reset_flag.view(-1, 1, 1), self.H_init, carry.z_H),
        )

    def forward(
        self, carry: Model_ACTV2InnerCarry, batch: Dict[str, torch.Tensor]
    ) -> Tuple[Model_ACTV2InnerCarry, torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Run one full-gradient pass through the transformer stack.

        Returns (detached new carry, LM logits without the puzzle prefix,
        (q_halt_logits, q_continue_logits)).
        """
        seq_info = dict(
            cos_sin=self.rotary_emb() if hasattr(self, "rotary_emb") else None,
        )

        # Input encoding
        input_embeddings = self._input_embeddings(batch["inputs"], batch["puzzle_identifiers"])

        # 1-step grad
        z_H = self.H_level(carry.z_H, input_embeddings, **seq_info)

        # LM Outputs
        new_carry = Model_ACTV2InnerCarry(
            z_H=z_H.detach(),
        )  # New carry no grad
        # Drop the puzzle-prefix positions from the LM output.
        output = self.lm_head(z_H)[:, self.puzzle_emb_len :]

        # Q head: read Q-logits from the first position.
        q_logits = self.q_head(z_H[:, 0]).to(torch.float32)

        return new_carry, output, (q_logits[..., 0], q_logits[..., 1])
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
class Model_ACTV2(nn.Module):
    """ACT wrapper.

    Wraps the inner model with Adaptive Computation Time bookkeeping: per-sequence
    step counters, halting flags, and (optionally) Q-learning targets for halting.
    """

    def __init__(self, config_dict: dict):
        super().__init__()
        self.config = Model_ACTV2Config(**config_dict)
        self.inner = Model_ACTV2_Inner(self.config)

    @property
    def puzzle_emb(self):
        # Exposed so callers (e.g. a sparse-embedding optimizer) can reach the inner table.
        return self.inner.puzzle_emb

    def initial_carry(self, batch: Dict[str, torch.Tensor]):
        """Build the initial ACT carry for a batch; all sequences start halted so the
        first forward pass resets every slot with fresh data."""
        batch_size = batch["inputs"].shape[0]

        return Model_ACTV2Carry(
            inner_carry=self.inner.empty_carry(
                batch_size
            ),  # Empty is expected, it will be reseted in first pass as all sequences are halted.
            steps=torch.zeros((batch_size,), dtype=torch.int32),
            halted=torch.ones((batch_size,), dtype=torch.bool),  # Default to halted
            current_data={k: torch.empty_like(v) for k, v in batch.items()},
        )

    def forward(
        self,
        carry: Model_ACTV2Carry,
        batch: Dict[str, torch.Tensor],
        compute_target_q: bool = False,
    ) -> Tuple[Model_ACTV2Carry, Dict[str, torch.Tensor]]:
        """One ACT step: refresh halted slots with new batch data, run the inner model,
        and update halting decisions; optionally emit a bootstrapped target Q-value."""
        # Update data, carry (removing halted sequences)
        new_inner_carry = self.inner.reset_carry(carry.halted, carry.inner_carry)

        new_steps = torch.where(carry.halted, 0, carry.steps)

        # Halted slots take the incoming batch; running slots keep their current data.
        new_current_data = {
            k: torch.where(carry.halted.view((-1,) + (1,) * (batch[k].ndim - 1)), batch[k], v)
            for k, v in carry.current_data.items()
        }

        # Forward inner model
        new_inner_carry, logits, (q_halt_logits, q_continue_logits) = self.inner(
            new_inner_carry, new_current_data
        )

        outputs = {"logits": logits, "q_halt_logits": q_halt_logits, "q_continue_logits": q_continue_logits}

        # Halting decisions never carry gradients.
        with torch.no_grad():
            # Step
            new_steps = new_steps + 1
            is_last_step = new_steps >= self.config.halt_max_steps

            halted = is_last_step

            # Check if adaptive computation should be used
            use_adaptive = (self.config.halt_max_steps > 1) and (
                (self.training and self.config.act_enabled)
                or (not self.training and self.config.act_inference)
            )

            if use_adaptive:
                # Halt signal based on Q-values (but always halt at max steps)
                q_halt_signal = q_halt_logits > q_continue_logits
                halted = halted | q_halt_signal

                # Store actual steps used for logging (only during inference)
                if not self.training:
                    outputs["actual_steps"] = new_steps.float()

                # Exploration (only during training)
                if self.training:
                    # Random per-sequence minimum step count forces occasional longer rollouts.
                    min_halt_steps = (
                        torch.rand_like(q_halt_logits) < self.config.halt_exploration_prob
                    ) * torch.randint_like(new_steps, low=2, high=self.config.halt_max_steps + 1)
                    halted = halted & (new_steps >= min_halt_steps)

            # Compute target Q (only during training)
            # NOTE: No replay buffer and target networks for computing target Q-value.
            # As batch_size is large, there're many parallel envs.
            # Similar concept as PQN https://arxiv.org/abs/2407.04811
            # NOTE(review): placed at no_grad level, so it runs whenever training and the
            # flag is set, independent of use_adaptive — confirm intended nesting.
            if self.training and compute_target_q:
                # [-1] picks the (q_halt, q_continue) pair from the inner model's 3-tuple return.
                next_q_halt_logits, next_q_continue_logits = self.inner(
                    new_inner_carry, new_current_data
                )[-1]

                outputs["target_q_continue"] = torch.sigmoid(
                    torch.where(
                        is_last_step,
                        next_q_halt_logits,
                        torch.maximum(next_q_halt_logits, next_q_continue_logits),
                    )
                )

        return Model_ACTV2Carry(
            new_inner_carry, new_steps, halted, new_current_data
        ), outputs
|
models/recursive_reasoning/trm.py
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Tuple, List, Dict, Optional
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
import math
|
| 4 |
+
import torch
|
| 5 |
+
import copy
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
from torch import nn
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
import random
|
| 10 |
+
from models.common import trunc_normal_init_
|
| 11 |
+
from models.layers import rms_norm, LinearSwish, SwiGLU, Attention, RotaryEmbedding, CosSin, CastedEmbedding, CastedLinear
|
| 12 |
+
from models.sparse_embedding import CastedSparseEmbedding
|
| 13 |
+
|
| 14 |
+
IGNORE_LABEL_ID = -100  # sentinel label excluded from the loss (the usual cross-entropy ignore_index value)
|
| 15 |
+
|
| 16 |
+
@dataclass
class TinyRecursiveReasoningModel_ACTV1InnerCarry:
    """Latent recurrent state carried across ACT steps (detached between steps)."""

    # High-level latent state; shape (batch, seq_len + puzzle_emb_len, hidden_size) per empty_carry.
    z_H: torch.Tensor
    # Low-level latent state, same shape as z_H.
    z_L: torch.Tensor
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass
class TinyRecursiveReasoningModel_ACTV1Carry:
    """Full ACT carry: inner latent state plus per-sequence halting bookkeeping."""

    inner_carry: TinyRecursiveReasoningModel_ACTV1InnerCarry  # latent z_H / z_L state

    steps: torch.Tensor   # int32 (batch,): ACT steps taken so far per sequence (see initial_carry)
    halted: torch.Tensor  # bool (batch,): halted slots get fresh data on the next forward

    current_data: Dict[str, torch.Tensor]  # the batch tensors currently occupying each slot
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class TinyRecursiveReasoningModel_ACTV1Config(BaseModel):
    """Pydantic-validated hyperparameters for the TRM ACT model."""

    batch_size: int
    seq_len: int
    puzzle_emb_ndim: int = 0  # total puzzle-embedding width; 0 disables puzzle embeddings
    num_puzzle_identifiers: int
    vocab_size: int

    H_cycles: int  # outer (high-level) recursion cycles
    L_cycles: int  # inner (low-level) cycles per H cycle

    H_layers: int # ignored
    L_layers: int  # number of blocks in the shared L_level stack

    # Transformer config
    hidden_size: int
    expansion: float  # SwiGLU expansion factor
    num_heads: int
    pos_encodings: str  # "rope", "learned", or anything else for none

    rms_norm_eps: float = 1e-5
    rope_theta: float = 10000.0

    # Halting Q-learning config
    halt_max_steps: int
    halt_exploration_prob: float

    forward_dtype: str = "bfloat16"  # attribute name of torch dtype used in forward

    # Alexia: added
    mlp_t: bool = False # use mlp on L instead of transformer
    puzzle_emb_len: int = 16 # if non-zero, its specified to this value
    no_ACT_continue: bool = True # No continue ACT loss, only use the sigmoid of the halt which makes much more sense
|
| 64 |
+
|
| 65 |
+
class TinyRecursiveReasoningModel_ACTV1Block(nn.Module):
    """One reasoning block, post-norm style: token mixing (self-attention, or with
    mlp_t an MLP applied over the sequence dimension) followed by a channel SwiGLU MLP."""

    def __init__(self, config: TinyRecursiveReasoningModel_ACTV1Config) -> None:
        super().__init__()

        self.config = config
        if self.config.mlp_t:
            # Sequence length includes the puzzle-embedding prefix (ceil-div fallback when len == 0).
            self.puzzle_emb_len = -(self.config.puzzle_emb_ndim // -self.config.hidden_size) if self.config.puzzle_emb_len == 0 else self.config.puzzle_emb_len
            self.mlp_t = SwiGLU(
                hidden_size=self.config.seq_len + self.puzzle_emb_len, # L
                expansion=config.expansion,
            )
        else:
            self.self_attn = Attention(
                hidden_size=config.hidden_size,
                head_dim=config.hidden_size // config.num_heads,
                num_heads=config.num_heads,
                num_key_value_heads=config.num_heads,
                causal=False
            )
        self.mlp = SwiGLU(
            hidden_size=config.hidden_size,
            expansion=config.expansion,
        )
        self.norm_eps = config.rms_norm_eps

    def forward(self, cos_sin: CosSin, hidden_states: torch.Tensor) -> torch.Tensor:
        # B, L, D = hidden_states.shape
        # Post Norm
        if self.config.mlp_t:
            # Token mixing: run the MLP across the sequence dimension instead of attention.
            hidden_states = hidden_states.transpose(1,2)
            out = self.mlp_t(hidden_states)
            # NOTE(review): rms_norm here acts on the transposed tensor, i.e. normalizes
            # over the sequence dimension — presumably intentional; confirm.
            hidden_states = rms_norm(hidden_states + out, variance_epsilon=self.norm_eps)
            hidden_states = hidden_states.transpose(1,2)
        else:
            # Self Attention
            hidden_states = rms_norm(hidden_states + self.self_attn(cos_sin=cos_sin, hidden_states=hidden_states), variance_epsilon=self.norm_eps)
        # Fully Connected
        out = self.mlp(hidden_states)
        hidden_states = rms_norm(hidden_states + out, variance_epsilon=self.norm_eps)
        return hidden_states
|
| 105 |
+
|
| 106 |
+
class TinyRecursiveReasoningModel_ACTV1ReasoningModule(nn.Module):
    """A stack of reasoning blocks; the input injection is added once before the stack runs."""

    def __init__(self, layers: List[TinyRecursiveReasoningModel_ACTV1Block]):
        super().__init__()
        self.layers = torch.nn.ModuleList(layers)

    def forward(self, hidden_states: torch.Tensor, input_injection: torch.Tensor, **kwargs) -> torch.Tensor:
        # Inject the conditioning signal, then pass the state through every block in order.
        state = hidden_states + input_injection
        for block in self.layers:
            state = block(hidden_states=state, **kwargs)
        return state
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
class TinyRecursiveReasoningModel_ACTV1_Inner(nn.Module):
    """Inner TRM network.

    Embeds tokens (plus optional puzzle embeddings), runs the recursive z_L / z_H
    reasoning loop with a single shared L_level stack, and decodes token logits
    and (halt, continue) Q-logits.
    """

    def __init__(self, config: TinyRecursiveReasoningModel_ACTV1Config) -> None:
        super().__init__()
        self.config = config
        self.forward_dtype = getattr(torch, self.config.forward_dtype)

        # I/O

        # Embeddings are initialized at std 1/sqrt(hidden) and scaled back up in forward.
        self.embed_scale = math.sqrt(self.config.hidden_size)
        embed_init_std = 1.0 / self.embed_scale

        self.embed_tokens = CastedEmbedding(self.config.vocab_size, self.config.hidden_size, init_std=embed_init_std, cast_to=self.forward_dtype)
        self.lm_head = CastedLinear(self.config.hidden_size, self.config.vocab_size, bias=False)
        self.q_head = CastedLinear(self.config.hidden_size, 2, bias=True)  # outputs [halt, continue] logits

        self.puzzle_emb_len = -(self.config.puzzle_emb_ndim // -self.config.hidden_size) if self.config.puzzle_emb_len == 0 else self.config.puzzle_emb_len  # ceil div
        if self.config.puzzle_emb_ndim > 0:
            # Zero init puzzle embeddings
            self.puzzle_emb = CastedSparseEmbedding(self.config.num_puzzle_identifiers, self.config.puzzle_emb_ndim,
                                                    batch_size=self.config.batch_size, init_std=0, cast_to=self.forward_dtype)

        # LM Blocks
        if self.config.pos_encodings == "rope":
            self.rotary_emb = RotaryEmbedding(dim=self.config.hidden_size // self.config.num_heads,
                                              max_position_embeddings=self.config.seq_len + self.puzzle_emb_len,
                                              base=self.config.rope_theta)
        elif self.config.pos_encodings == "learned":
            self.embed_pos = CastedEmbedding(self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, init_std=embed_init_std, cast_to=self.forward_dtype)
        else:
            pass  # no positional encoding

        # Reasoning Layers: one stack shared by both the z_L and z_H updates (H_layers is ignored).
        self.L_level = TinyRecursiveReasoningModel_ACTV1ReasoningModule(layers=[TinyRecursiveReasoningModel_ACTV1Block(self.config) for _i in range(self.config.L_layers)])

        # Initial states
        self.H_init = nn.Buffer(trunc_normal_init_(torch.empty(self.config.hidden_size, dtype=self.forward_dtype), std=1), persistent=True)
        self.L_init = nn.Buffer(trunc_normal_init_(torch.empty(self.config.hidden_size, dtype=self.forward_dtype), std=1), persistent=True)

        # Q head special init
        # Init Q to (almost) zero for faster learning during bootstrapping
        with torch.no_grad():
            self.q_head.weight.zero_()
            self.q_head.bias.fill_(-5)  # type: ignore

    def _input_embeddings(self, input: torch.Tensor, puzzle_identifiers: torch.Tensor):
        """Return scaled token embeddings with the puzzle-embedding prefix prepended."""
        # Token embedding
        embedding = self.embed_tokens(input.to(torch.int32))

        # Puzzle embeddings
        if self.config.puzzle_emb_ndim > 0:
            puzzle_embedding = self.puzzle_emb(puzzle_identifiers)

            # Pad so the puzzle embedding fills an integral number of sequence positions.
            pad_count = self.puzzle_emb_len * self.config.hidden_size - puzzle_embedding.shape[-1]
            if pad_count > 0:
                puzzle_embedding = F.pad(puzzle_embedding, (0, pad_count))

            embedding = torch.cat((puzzle_embedding.view(-1, self.puzzle_emb_len, self.config.hidden_size), embedding), dim=-2)

        # Position embeddings
        if self.config.pos_encodings == "learned":
            # scale by 1/sqrt(2) to maintain forward variance
            embedding = 0.707106781 * (embedding + self.embed_pos.embedding_weight.to(self.forward_dtype))

        # Scale
        return self.embed_scale * embedding

    def empty_carry(self, batch_size: int):
        """Uninitialized carry; contents are overwritten by reset_carry before first use."""
        return TinyRecursiveReasoningModel_ACTV1InnerCarry(
            z_H=torch.empty(batch_size, self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, dtype=self.forward_dtype),
            z_L=torch.empty(batch_size, self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, dtype=self.forward_dtype),
        )

    def reset_carry(self, reset_flag: torch.Tensor, carry: TinyRecursiveReasoningModel_ACTV1InnerCarry):
        """Re-initialize the latent state of every sequence whose reset_flag is set."""
        return TinyRecursiveReasoningModel_ACTV1InnerCarry(
            z_H=torch.where(reset_flag.view(-1, 1, 1), self.H_init, carry.z_H),
            z_L=torch.where(reset_flag.view(-1, 1, 1), self.L_init, carry.z_L),
        )

    def forward(self, carry: TinyRecursiveReasoningModel_ACTV1InnerCarry, batch: Dict[str, torch.Tensor]) -> Tuple[TinyRecursiveReasoningModel_ACTV1InnerCarry, torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Run one ACT step of the recursive reasoning loop.

        Returns the detached new carry, token logits (puzzle prefix stripped), and
        the (q_halt, q_continue) logits read from the first sequence position.
        """
        seq_info = dict(
            cos_sin=self.rotary_emb() if hasattr(self, "rotary_emb") else None,
        )

        # Input encoding
        input_embeddings = self._input_embeddings(batch["inputs"], batch["puzzle_identifiers"])

        # Forward iterations
        # (removed unused local `it = 0` from the original)
        z_H, z_L = carry.z_H, carry.z_L
        # H_cycles-1 without grad
        with torch.no_grad():
            for _H_step in range(self.config.H_cycles-1):
                for _L_step in range(self.config.L_cycles):
                    z_L = self.L_level(z_L, z_H + input_embeddings, **seq_info)
                z_H = self.L_level(z_H, z_L, **seq_info)
        # 1 with grad
        for _L_step in range(self.config.L_cycles):
            z_L = self.L_level(z_L, z_H + input_embeddings, **seq_info)
        z_H = self.L_level(z_H, z_L, **seq_info)

        # LM Outputs
        new_carry = TinyRecursiveReasoningModel_ACTV1InnerCarry(z_H=z_H.detach(), z_L=z_L.detach())  # New carry no grad
        output = self.lm_head(z_H)[:, self.puzzle_emb_len:]
        q_logits = self.q_head(z_H[:, 0]).to(torch.float32)  # Q-head; uses the first puzzle_emb position
        return new_carry, output, (q_logits[..., 0], q_logits[..., 1])
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
class TinyRecursiveReasoningModel_ACTV1(nn.Module):
    """ACT wrapper.

    Adds Adaptive Computation Time bookkeeping (step counters, halting flags,
    optional Q-learning targets) around the inner recursive reasoning model.
    """

    def __init__(self, config_dict: dict):
        super().__init__()
        self.config = TinyRecursiveReasoningModel_ACTV1Config(**config_dict)
        self.inner = TinyRecursiveReasoningModel_ACTV1_Inner(self.config)

    @property
    def puzzle_emb(self):
        # Exposed so callers (e.g. a sparse-embedding optimizer) can reach the inner table.
        return self.inner.puzzle_emb

    def initial_carry(self, batch: Dict[str, torch.Tensor]):
        """Build the initial carry; all sequences start halted so the first forward
        pass resets every slot with fresh batch data."""
        batch_size = batch["inputs"].shape[0]

        return TinyRecursiveReasoningModel_ACTV1Carry(
            inner_carry=self.inner.empty_carry(batch_size),  # Empty is expected, it will be reseted in first pass as all sequences are halted.

            steps=torch.zeros((batch_size, ), dtype=torch.int32),
            halted=torch.ones((batch_size, ), dtype=torch.bool),  # Default to halted

            current_data={k: torch.empty_like(v) for k, v in batch.items()}
        )

    def forward(self, carry: TinyRecursiveReasoningModel_ACTV1Carry, batch: Dict[str, torch.Tensor]) -> Tuple[TinyRecursiveReasoningModel_ACTV1Carry, Dict[str, torch.Tensor]]:
        """One ACT step: refresh halted slots with new data, run the inner model,
        and decide which sequences halt."""

        # Update data, carry (removing halted sequences)
        new_inner_carry = self.inner.reset_carry(carry.halted, carry.inner_carry)

        new_steps = torch.where(carry.halted, 0, carry.steps)

        # Halted slots take the incoming batch; running slots keep their current data.
        new_current_data = {k: torch.where(carry.halted.view((-1, ) + (1, ) * (batch[k].ndim - 1)), batch[k], v) for k, v in carry.current_data.items()}

        # Forward inner model
        new_inner_carry, logits, (q_halt_logits, q_continue_logits) = self.inner(new_inner_carry, new_current_data)

        outputs = {
            "logits": logits,
            "q_halt_logits": q_halt_logits,
            "q_continue_logits": q_continue_logits
        }

        # Halting decisions never carry gradients.
        with torch.no_grad():
            # Step
            new_steps = new_steps + 1
            is_last_step = new_steps >= self.config.halt_max_steps

            halted = is_last_step

            # if training, and ACT is enabled
            if self.training and (self.config.halt_max_steps > 1):

                # Halt signal
                # NOTE: During evaluation, always use max steps, this is to guarantee the same halting steps inside a batch for batching purposes

                if self.config.no_ACT_continue:
                    halted = halted | (q_halt_logits > 0)
                else:
                    halted = halted | (q_halt_logits > q_continue_logits)

                # Exploration: random per-sequence minimum step counts force occasional longer rollouts.
                min_halt_steps = (torch.rand_like(q_halt_logits) < self.config.halt_exploration_prob) * torch.randint_like(new_steps, low=2, high=self.config.halt_max_steps + 1)
                halted = halted & (new_steps >= min_halt_steps)

                if not self.config.no_ACT_continue:
                    # Compute target Q
                    # NOTE: No replay buffer and target networks for computing target Q-value.
                    # As batch_size is large, there're many parallel envs.
                    # Similar concept as PQN https://arxiv.org/abs/2407.04811
                    # BUGFIX: the inner model returns a 3-tuple (carry, output, (q_halt, q_continue));
                    # the previous 5-element unpacking raised ValueError whenever this branch ran.
                    _, _, (next_q_halt_logits, next_q_continue_logits) = self.inner(new_inner_carry, new_current_data)
                    outputs["target_q_continue"] = torch.sigmoid(torch.where(is_last_step, next_q_halt_logits, torch.maximum(next_q_halt_logits, next_q_continue_logits)))

        return TinyRecursiveReasoningModel_ACTV1Carry(new_inner_carry, new_steps, halted, new_current_data), outputs
|
models/recursive_reasoning/trm_hier6.py
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Tuple, List, Dict, Optional
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
import math
|
| 4 |
+
import torch
|
| 5 |
+
import copy
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
from torch import nn
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
import random
|
| 10 |
+
from models.common import trunc_normal_init_
|
| 11 |
+
from models.layers import rms_norm, LinearSwish, SwiGLU, Attention, RotaryEmbedding, CosSin, CastedEmbedding, CastedLinear
|
| 12 |
+
from models.sparse_embedding import CastedSparseEmbedding
|
| 13 |
+
|
| 14 |
+
IGNORE_LABEL_ID = -100  # sentinel label excluded from the loss (the usual cross-entropy ignore_index value)
|
| 15 |
+
|
| 16 |
+
@dataclass
class TinyRecursiveReasoningModel_ACTV1InnerCarry:
    """Latent recurrent state for the six-level hierarchical variant (detached between steps)."""

    # High-level latent state; shape (batch, seq_len + puzzle_emb_len, hidden_size) per empty_carry.
    z_H: torch.Tensor
    # Six low-level latent states, each the same shape as z_H.
    z_L1: torch.Tensor
    z_L2: torch.Tensor
    z_L3: torch.Tensor
    z_L4: torch.Tensor
    z_L5: torch.Tensor
    z_L6: torch.Tensor
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass
class TinyRecursiveReasoningModel_ACTV1Carry:
    """Full ACT carry: inner latent state plus per-sequence halting bookkeeping."""

    inner_carry: TinyRecursiveReasoningModel_ACTV1InnerCarry  # latent z_H / z_L1..z_L6 state

    steps: torch.Tensor   # int32 (batch,): ACT steps taken so far per sequence
    halted: torch.Tensor  # bool (batch,): halted slots get fresh data on the next forward

    current_data: Dict[str, torch.Tensor]  # the batch tensors currently occupying each slot
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class TinyRecursiveReasoningModel_ACTV1Config(BaseModel):
    """Pydantic-validated hyperparameters for the hierarchical (6-level) TRM ACT model."""

    batch_size: int
    seq_len: int
    puzzle_emb_ndim: int = 0  # total puzzle-embedding width; 0 disables puzzle embeddings
    num_puzzle_identifiers: int
    vocab_size: int

    H_cycles: int  # outer (high-level) recursion cycles
    L_cycles: int  # inner (low-level) cycles per H cycle

    H_layers: int # ignored
    L_layers: int  # number of blocks in the shared L_level stack

    # Transformer config
    hidden_size: int
    expansion: float  # SwiGLU expansion factor
    num_heads: int
    pos_encodings: str  # "rope", "learned", or anything else for none

    rms_norm_eps: float = 1e-5
    rope_theta: float = 10000.0

    # Halting Q-learning config
    halt_max_steps: int
    halt_exploration_prob: float

    forward_dtype: str = "bfloat16"  # attribute name of torch dtype used in forward

    # Alexia: added
    mlp_t: bool = False # use mlp on L instead of transformer
    puzzle_emb_len: int = 16 # if non-zero, its specified to this value
    no_ACT_continue: bool = True # No continue ACT loss, only use the sigmoid of the halt which makes much more sense
|
| 70 |
+
|
| 71 |
+
class TinyRecursiveReasoningModel_ACTV1Block(nn.Module):
    """One reasoning block, post-norm style: token mixing (self-attention, or with
    mlp_t an MLP applied over the sequence dimension) followed by a channel SwiGLU MLP."""

    def __init__(self, config: TinyRecursiveReasoningModel_ACTV1Config) -> None:
        super().__init__()

        self.config = config
        if self.config.mlp_t:
            # Sequence length includes the puzzle-embedding prefix (ceil-div fallback when len == 0).
            self.puzzle_emb_len = -(self.config.puzzle_emb_ndim // -self.config.hidden_size) if self.config.puzzle_emb_len == 0 else self.config.puzzle_emb_len
            self.mlp_t = SwiGLU(
                hidden_size=self.config.seq_len + self.puzzle_emb_len, # L
                expansion=config.expansion,
            )
        else:
            self.self_attn = Attention(
                hidden_size=config.hidden_size,
                head_dim=config.hidden_size // config.num_heads,
                num_heads=config.num_heads,
                num_key_value_heads=config.num_heads,
                causal=False
            )
        self.mlp = SwiGLU(
            hidden_size=config.hidden_size,
            expansion=config.expansion,
        )
        self.norm_eps = config.rms_norm_eps

    def forward(self, cos_sin: CosSin, hidden_states: torch.Tensor) -> torch.Tensor:
        # B, L, D = hidden_states.shape
        # Post Norm
        if self.config.mlp_t:
            # Token mixing: run the MLP across the sequence dimension instead of attention.
            hidden_states = hidden_states.transpose(1,2)
            out = self.mlp_t(hidden_states)
            # NOTE(review): rms_norm here acts on the transposed tensor, i.e. normalizes
            # over the sequence dimension — presumably intentional; confirm.
            hidden_states = rms_norm(hidden_states + out, variance_epsilon=self.norm_eps)
            hidden_states = hidden_states.transpose(1,2)
        else:
            # Self Attention
            hidden_states = rms_norm(hidden_states + self.self_attn(cos_sin=cos_sin, hidden_states=hidden_states), variance_epsilon=self.norm_eps)
        # Fully Connected
        out = self.mlp(hidden_states)
        hidden_states = rms_norm(hidden_states + out, variance_epsilon=self.norm_eps)
        return hidden_states
|
| 111 |
+
|
| 112 |
+
class TinyRecursiveReasoningModel_ACTV1ReasoningModule(nn.Module):
    """A stack of reasoning blocks; the input injection is added once before the stack runs."""

    def __init__(self, layers: List[TinyRecursiveReasoningModel_ACTV1Block]):
        super().__init__()
        self.layers = torch.nn.ModuleList(layers)

    def forward(self, hidden_states: torch.Tensor, input_injection: torch.Tensor, **kwargs) -> torch.Tensor:
        # Inject the conditioning signal, then pass the state through every block in order.
        state = hidden_states + input_injection
        for block in self.layers:
            state = block(hidden_states=state, **kwargs)
        return state
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
class TinyRecursiveReasoningModel_ACTV1_Inner(nn.Module):
|
| 125 |
+
def __init__(self, config: TinyRecursiveReasoningModel_ACTV1Config) -> None:
|
| 126 |
+
super().__init__()
|
| 127 |
+
self.config = config
|
| 128 |
+
self.forward_dtype = getattr(torch, self.config.forward_dtype)
|
| 129 |
+
|
| 130 |
+
# I/O
|
| 131 |
+
|
| 132 |
+
self.embed_scale = math.sqrt(self.config.hidden_size)
|
| 133 |
+
embed_init_std = 1.0 / self.embed_scale
|
| 134 |
+
|
| 135 |
+
self.embed_tokens = CastedEmbedding(self.config.vocab_size, self.config.hidden_size, init_std=embed_init_std, cast_to=self.forward_dtype)
|
| 136 |
+
self.lm_head = CastedLinear(self.config.hidden_size, self.config.vocab_size, bias=False)
|
| 137 |
+
self.q_head = CastedLinear(self.config.hidden_size, 2, bias=True)
|
| 138 |
+
|
| 139 |
+
self.puzzle_emb_len = -(self.config.puzzle_emb_ndim // -self.config.hidden_size) if self.config.puzzle_emb_len == 0 else self.config.puzzle_emb_len # ceil div
|
| 140 |
+
if self.config.puzzle_emb_ndim > 0:
|
| 141 |
+
# Zero init puzzle embeddings
|
| 142 |
+
self.puzzle_emb = CastedSparseEmbedding(self.config.num_puzzle_identifiers, self.config.puzzle_emb_ndim,
|
| 143 |
+
batch_size=self.config.batch_size, init_std=0, cast_to=self.forward_dtype)
|
| 144 |
+
|
| 145 |
+
# LM Blocks
|
| 146 |
+
if self.config.pos_encodings == "rope":
|
| 147 |
+
self.rotary_emb = RotaryEmbedding(dim=self.config.hidden_size // self.config.num_heads,
|
| 148 |
+
max_position_embeddings=self.config.seq_len + self.puzzle_emb_len,
|
| 149 |
+
base=self.config.rope_theta)
|
| 150 |
+
elif self.config.pos_encodings == "learned":
|
| 151 |
+
self.embed_pos = CastedEmbedding(self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, init_std=embed_init_std, cast_to=self.forward_dtype)
|
| 152 |
+
else:
|
| 153 |
+
pass
|
| 154 |
+
|
| 155 |
+
# Reasoning Layers
|
| 156 |
+
self.L_level = TinyRecursiveReasoningModel_ACTV1ReasoningModule(layers=[TinyRecursiveReasoningModel_ACTV1Block(self.config) for _i in range(self.config.L_layers)])
|
| 157 |
+
|
| 158 |
+
# Initial states
|
| 159 |
+
self.H_init = nn.Buffer(trunc_normal_init_(torch.empty(self.config.hidden_size, dtype=self.forward_dtype), std=1), persistent=True)
|
| 160 |
+
self.L1_init = nn.Buffer(trunc_normal_init_(torch.empty(self.config.hidden_size, dtype=self.forward_dtype), std=1), persistent=True)
|
| 161 |
+
self.L2_init = nn.Buffer(trunc_normal_init_(torch.empty(self.config.hidden_size, dtype=self.forward_dtype), std=1), persistent=True)
|
| 162 |
+
self.L3_init = nn.Buffer(trunc_normal_init_(torch.empty(self.config.hidden_size, dtype=self.forward_dtype), std=1), persistent=True)
|
| 163 |
+
self.L4_init = nn.Buffer(trunc_normal_init_(torch.empty(self.config.hidden_size, dtype=self.forward_dtype), std=1), persistent=True)
|
| 164 |
+
self.L5_init = nn.Buffer(trunc_normal_init_(torch.empty(self.config.hidden_size, dtype=self.forward_dtype), std=1), persistent=True)
|
| 165 |
+
self.L6_init = nn.Buffer(trunc_normal_init_(torch.empty(self.config.hidden_size, dtype=self.forward_dtype), std=1), persistent=True)
|
| 166 |
+
|
| 167 |
+
# Q head special init
|
| 168 |
+
# Init Q to (almost) zero for faster learning during bootstrapping
|
| 169 |
+
with torch.no_grad():
|
| 170 |
+
self.q_head.weight.zero_()
|
| 171 |
+
self.q_head.bias.fill_(-5) # type: ignore
|
| 172 |
+
|
| 173 |
+
def _input_embeddings(self, input: torch.Tensor, puzzle_identifiers: torch.Tensor):
|
| 174 |
+
# Token embedding
|
| 175 |
+
embedding = self.embed_tokens(input.to(torch.int32))
|
| 176 |
+
|
| 177 |
+
# Puzzle embeddings
|
| 178 |
+
if self.config.puzzle_emb_ndim > 0:
|
| 179 |
+
puzzle_embedding = self.puzzle_emb(puzzle_identifiers)
|
| 180 |
+
|
| 181 |
+
pad_count = self.puzzle_emb_len * self.config.hidden_size - puzzle_embedding.shape[-1]
|
| 182 |
+
if pad_count > 0:
|
| 183 |
+
puzzle_embedding = F.pad(puzzle_embedding, (0, pad_count))
|
| 184 |
+
|
| 185 |
+
embedding = torch.cat((puzzle_embedding.view(-1, self.puzzle_emb_len, self.config.hidden_size), embedding), dim=-2)
|
| 186 |
+
|
| 187 |
+
# Position embeddings
|
| 188 |
+
if self.config.pos_encodings == "learned":
|
| 189 |
+
# scale by 1/sqrt(2) to maintain forward variance
|
| 190 |
+
embedding = 0.707106781 * (embedding + self.embed_pos.embedding_weight.to(self.forward_dtype))
|
| 191 |
+
|
| 192 |
+
# Scale
|
| 193 |
+
return self.embed_scale * embedding
|
| 194 |
+
|
| 195 |
+
def empty_carry(self, batch_size: int):
|
| 196 |
+
return TinyRecursiveReasoningModel_ACTV1InnerCarry(
|
| 197 |
+
z_H=torch.empty(batch_size, self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, dtype=self.forward_dtype),
|
| 198 |
+
z_L1=torch.empty(batch_size, self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, dtype=self.forward_dtype),
|
| 199 |
+
z_L2=torch.empty(batch_size, self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, dtype=self.forward_dtype),
|
| 200 |
+
z_L3=torch.empty(batch_size, self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, dtype=self.forward_dtype),
|
| 201 |
+
z_L4=torch.empty(batch_size, self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, dtype=self.forward_dtype),
|
| 202 |
+
z_L5=torch.empty(batch_size, self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, dtype=self.forward_dtype),
|
| 203 |
+
z_L6=torch.empty(batch_size, self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, dtype=self.forward_dtype),
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
def reset_carry(self, reset_flag: torch.Tensor, carry: TinyRecursiveReasoningModel_ACTV1InnerCarry):
    """Re-initialize the state of halted sequences.

    Wherever `reset_flag` is True the learned initial latent is substituted;
    all other rows keep their previous carry tensors unchanged.
    """
    mask = reset_flag.view(-1, 1, 1)

    def pick(initial: torch.Tensor, previous: torch.Tensor) -> torch.Tensor:
        return torch.where(mask, initial, previous)

    return TinyRecursiveReasoningModel_ACTV1InnerCarry(
        z_H=pick(self.H_init, carry.z_H),
        z_L1=pick(self.L1_init, carry.z_L1),
        z_L2=pick(self.L2_init, carry.z_L2),
        z_L3=pick(self.L3_init, carry.z_L3),
        z_L4=pick(self.L4_init, carry.z_L4),
        z_L5=pick(self.L5_init, carry.z_L5),
        z_L6=pick(self.L6_init, carry.z_L6),
    )
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def forward(self, carry: TinyRecursiveReasoningModel_ACTV1InnerCarry, batch: Dict[str, torch.Tensor]) -> Tuple[TinyRecursiveReasoningModel_ACTV1InnerCarry, torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """Run one ACT segment of recursive reasoning over six L latents and one H latent.

    Performs H_cycles recursions; only the final cycle tracks gradients
    (truncated backprop through the recursion). Fix vs. original: the
    unused local `it = 0` has been removed.

    Args:
        carry: Previous (detached) latent states.
        batch: Must contain "inputs" and "puzzle_identifiers".

    Returns:
        Tuple of (new detached carry,
                  LM logits with the puzzle-embedding prefix stripped,
                  (q_halt_logits, q_continue_logits)).
    """
    seq_info = dict(
        cos_sin=self.rotary_emb() if hasattr(self, "rotary_emb") else None,
    )

    # Input encoding
    input_embeddings = self._input_embeddings(batch["inputs"], batch["puzzle_identifiers"])

    # Forward iterations
    z_H, z_L = carry.z_H, [carry.z_L1, carry.z_L2, carry.z_L3, carry.z_L4, carry.z_L5, carry.z_L6]

    # H_cycles-1 recursions without grad
    with torch.no_grad():
        for _H_step in range(self.config.H_cycles - 1):
            for _L_step in range(self.config.L_cycles):
                # Each L latent is refreshed from the running sum of all six,
                # conditioned on the H latent plus the input embedding.
                z_L[_L_step] = self.L_level(sum(z_L), z_H + input_embeddings, **seq_info)
            z_H = self.L_level(z_H, sum(z_L), **seq_info)

    # Final recursion with grad
    for _L_step in range(self.config.L_cycles):
        z_L[_L_step] = self.L_level(sum(z_L), z_H + input_embeddings, **seq_info)
    z_H = self.L_level(z_H, sum(z_L), **seq_info)

    # LM Outputs; the carry is detached so the next segment starts grad-free
    new_carry = TinyRecursiveReasoningModel_ACTV1InnerCarry(
        z_H=z_H.detach(),
        z_L1=z_L[0].detach(), z_L2=z_L[1].detach(), z_L3=z_L[2].detach(),
        z_L4=z_L[3].detach(), z_L5=z_L[4].detach(), z_L6=z_L[5].detach())
    output = self.lm_head(z_H)[:, self.puzzle_emb_len:]
    q_logits = self.q_head(z_H[:, 0]).to(torch.float32)  # Q-head; uses the first puzzle_emb position
    return new_carry, output, (q_logits[..., 0], q_logits[..., 1])
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
class TinyRecursiveReasoningModel_ACTV1(nn.Module):
    """ACT wrapper.

    Drives the inner recursive-reasoning model with Adaptive Computation Time:
    each sequence runs for up to `halt_max_steps` segments and, during
    training, may halt earlier based on the learned Q-head.
    """

    def __init__(self, config_dict: dict):
        super().__init__()
        self.config = TinyRecursiveReasoningModel_ACTV1Config(**config_dict)
        self.inner = TinyRecursiveReasoningModel_ACTV1_Inner(self.config)

    @property
    def puzzle_emb(self):
        # Exposed so the trainer/optimizer can reach the sparse puzzle embedding.
        return self.inner.puzzle_emb

    def initial_carry(self, batch: Dict[str, torch.Tensor]):
        """Build a fresh carry; every sequence starts halted, so the first
        forward pass resets its latent state and pulls in real batch data."""
        batch_size = batch["inputs"].shape[0]

        return TinyRecursiveReasoningModel_ACTV1Carry(
            inner_carry=self.inner.empty_carry(batch_size),  # Empty is expected, it will be reseted in first pass as all sequences are halted.

            steps=torch.zeros((batch_size, ), dtype=torch.int32),
            halted=torch.ones((batch_size, ), dtype=torch.bool),  # Default to halted

            current_data={k: torch.empty_like(v) for k, v in batch.items()}
        )

    def forward(self, carry: TinyRecursiveReasoningModel_ACTV1Carry, batch: Dict[str, torch.Tensor]) -> Tuple[TinyRecursiveReasoningModel_ACTV1Carry, Dict[str, torch.Tensor]]:
        """One ACT step: refresh halted sequences, run the inner model once,
        and decide (without grad) which sequences halt.

        Returns:
            (new carry, outputs dict with "logits", "q_halt_logits",
             "q_continue_logits" and, when the continue loss is enabled,
             "target_q_continue").
        """
        # Update data, carry (replacing halted sequences with fresh batch entries)
        new_inner_carry = self.inner.reset_carry(carry.halted, carry.inner_carry)

        new_steps = torch.where(carry.halted, 0, carry.steps)

        new_current_data = {k: torch.where(carry.halted.view((-1, ) + (1, ) * (batch[k].ndim - 1)), batch[k], v) for k, v in carry.current_data.items()}

        # Forward inner model
        new_inner_carry, logits, (q_halt_logits, q_continue_logits) = self.inner(new_inner_carry, new_current_data)

        outputs = {
            "logits": logits,
            "q_halt_logits": q_halt_logits,
            "q_continue_logits": q_continue_logits
        }

        with torch.no_grad():
            # Step
            new_steps = new_steps + 1
            is_last_step = new_steps >= self.config.halt_max_steps

            halted = is_last_step

            # if training, and ACT is enabled
            if self.training and (self.config.halt_max_steps > 1):
                # Halt signal
                # NOTE: During evaluation, always use max steps, this is to guarantee the same halting steps inside a batch for batching purposes
                if self.config.no_ACT_continue:
                    halted = halted | (q_halt_logits > 0)
                else:
                    halted = halted | (q_halt_logits > q_continue_logits)

                # Exploration: with prob halt_exploration_prob, force a random minimum number of steps
                min_halt_steps = (torch.rand_like(q_halt_logits) < self.config.halt_exploration_prob) * torch.randint_like(new_steps, low=2, high=self.config.halt_max_steps + 1)
                halted = halted & (new_steps >= min_halt_steps)

                if not self.config.no_ACT_continue:
                    # Compute target Q
                    # NOTE: No replay buffer and target networks for computing target Q-value.
                    # As batch_size is large, there're many parallel envs.
                    # Similar concept as PQN https://arxiv.org/abs/2407.04811
                    # BUGFIX: the inner model returns a 3-tuple (carry, output, q-pair);
                    # the previous 5-target unpacking raised ValueError on this path.
                    _, _, (next_q_halt_logits, next_q_continue_logits) = self.inner(new_inner_carry, new_current_data)
                    outputs["target_q_continue"] = torch.sigmoid(torch.where(is_last_step, next_q_halt_logits, torch.maximum(next_q_halt_logits, next_q_continue_logits)))

        return TinyRecursiveReasoningModel_ACTV1Carry(new_inner_carry, new_steps, halted, new_current_data), outputs
|
models/recursive_reasoning/trm_singlez.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Tuple, List, Dict, Optional
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
import math
|
| 4 |
+
import torch
|
| 5 |
+
import copy
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
from torch import nn
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
import random
|
| 10 |
+
from models.common import trunc_normal_init_
|
| 11 |
+
from models.layers import rms_norm, LinearSwish, SwiGLU, Attention, RotaryEmbedding, CosSin, CastedEmbedding, CastedLinear
|
| 12 |
+
from models.sparse_embedding import CastedSparseEmbedding
|
| 13 |
+
|
| 14 |
+
IGNORE_LABEL_ID = -100
|
| 15 |
+
|
| 16 |
+
@dataclass
class TinyRecursiveReasoningModel_ACTV1InnerCarry:
    """Recurrent latent state carried between ACT segments (single-z variant)."""
    # Latent tensor; allocated as (batch, seq_len + puzzle_emb_len, hidden_size)
    # by the inner model's empty_carry.
    z_L: torch.Tensor
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass
class TinyRecursiveReasoningModel_ACTV1Carry:
    """Full ACT carry: inner latent state plus per-sequence bookkeeping."""
    # Latent recurrent state of the inner model.
    inner_carry: TinyRecursiveReasoningModel_ACTV1InnerCarry

    # Per-sequence ACT step counter (int32, shape (batch,)).
    steps: torch.Tensor
    # Per-sequence halt flags (bool, shape (batch,)); halted rows are replaced
    # with fresh batch data on the next forward pass.
    halted: torch.Tensor

    # The batch entries each sequence is currently working on.
    current_data: Dict[str, torch.Tensor]
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class TinyRecursiveReasoningModel_ACTV1Config(BaseModel):
    """Hyperparameters for the single-latent TRM ACT model."""
    batch_size: int
    seq_len: int
    puzzle_emb_ndim: int = 0  # 0 disables puzzle embeddings entirely
    num_puzzle_identifiers: int
    vocab_size: int

    H_cycles: int  # outer recursion count; H_cycles-1 of them run without grad
    L_cycles: int  # inner recursion count per H cycle

    H_layers: int  # ignored
    L_layers: int  # number of blocks in the shared L_level stack

    # Transformer config
    hidden_size: int
    expansion: float
    num_heads: int
    pos_encodings: str  # "rope", "learned", or anything else for none

    rms_norm_eps: float = 1e-5
    rope_theta: float = 10000.0

    # Halting Q-learning config
    halt_max_steps: int  # hard cap on ACT segments per sequence
    halt_exploration_prob: float  # prob. of forcing a random minimum step count

    forward_dtype: str = "bfloat16"  # dtype name resolved via getattr(torch, ...)

    # Alexia: added
    mlp_t: bool = False  # use mlp on L instead of transformer
    puzzle_emb_len: int = 16  # if non-zero, its specified to this value
    no_ACT_continue: bool = True  # No continue ACT loss, only use the sigmoid of the halt which makes much more sense
|
| 64 |
+
|
| 65 |
+
class TinyRecursiveReasoningModel_ACTV1Block(nn.Module):
    """One reasoning block, post-norm residual style.

    Two modes: `mlp_t` mixes information along the sequence axis with a
    SwiGLU MLP; otherwise a standard non-causal self-attention + SwiGLU pair
    is used, each followed by an RMS-norm of the residual sum.
    """

    def __init__(self, config: TinyRecursiveReasoningModel_ACTV1Config) -> None:
        super().__init__()

        self.config = config
        if self.config.mlp_t:
            # The sequence-axis MLP's width is the full sequence length,
            # including the puzzle-embedding prefix (auto ceil-div when 0).
            if self.config.puzzle_emb_len == 0:
                self.puzzle_emb_len = -(self.config.puzzle_emb_ndim // -self.config.hidden_size)
            else:
                self.puzzle_emb_len = self.config.puzzle_emb_len
            self.mlp_t = SwiGLU(
                hidden_size=self.config.seq_len + self.puzzle_emb_len,  # L
                expansion=config.expansion,
            )
        else:
            self.self_attn = Attention(
                hidden_size=config.hidden_size,
                head_dim=config.hidden_size // config.num_heads,
                num_heads=config.num_heads,
                num_key_value_heads=config.num_heads,
                causal=False
            )
            self.mlp = SwiGLU(
                hidden_size=config.hidden_size,
                expansion=config.expansion,
            )
        self.norm_eps = config.rms_norm_eps

    def forward(self, cos_sin: CosSin, hidden_states: torch.Tensor) -> torch.Tensor:
        # hidden_states: (B, L, D); post-norm residual updates throughout.
        if self.config.mlp_t:
            # Mix along the sequence axis: (B, L, D) -> (B, D, L), MLP, norm, back.
            mixed = hidden_states.transpose(1, 2)
            mixed = rms_norm(mixed + self.mlp_t(mixed), variance_epsilon=self.norm_eps)
            return mixed.transpose(1, 2)

        # Self attention
        attn_out = self.self_attn(cos_sin=cos_sin, hidden_states=hidden_states)
        hidden_states = rms_norm(hidden_states + attn_out, variance_epsilon=self.norm_eps)
        # Fully connected
        return rms_norm(hidden_states + self.mlp(hidden_states), variance_epsilon=self.norm_eps)
|
| 105 |
+
|
| 106 |
+
class TinyRecursiveReasoningModel_ACTV1ReasoningModule(nn.Module):
    """Applies a stack of reasoning blocks in sequence."""

    def __init__(self, layers: List[TinyRecursiveReasoningModel_ACTV1Block]):
        super().__init__()
        self.layers = torch.nn.ModuleList(layers)

    def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
        out = hidden_states
        for block in self.layers:
            out = block(hidden_states=out, **kwargs)
        return out
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
class TinyRecursiveReasoningModel_ACTV1_Inner(nn.Module):
    """Inner recursive-reasoning network with a single latent state z_L.

    The same block stack (`L_level`) is applied recursively: L_cycles updates
    conditioned on the input embedding, then one unconditioned update, repeated
    H_cycles times; only the final cycle tracks gradients.

    Fix vs. original: removed the dead locals `it = 0` and `z_out = z_L`.
    """

    def __init__(self, config: TinyRecursiveReasoningModel_ACTV1Config) -> None:
        super().__init__()
        self.config = config
        self.forward_dtype = getattr(torch, self.config.forward_dtype)

        # I/O
        self.embed_scale = math.sqrt(self.config.hidden_size)
        embed_init_std = 1.0 / self.embed_scale

        self.embed_tokens = CastedEmbedding(self.config.vocab_size, self.config.hidden_size, init_std=embed_init_std, cast_to=self.forward_dtype)
        self.lm_head = CastedLinear(self.config.hidden_size, self.config.vocab_size, bias=False)
        self.q_head = CastedLinear(self.config.hidden_size, 2, bias=True)  # (halt, continue) logits

        # Sequence positions reserved for the puzzle embedding; ceil-div auto-size when config value is 0
        self.puzzle_emb_len = -(self.config.puzzle_emb_ndim // -self.config.hidden_size) if self.config.puzzle_emb_len == 0 else self.config.puzzle_emb_len  # ceil div
        if self.config.puzzle_emb_ndim > 0:
            # Zero init puzzle embeddings
            self.puzzle_emb = CastedSparseEmbedding(self.config.num_puzzle_identifiers, self.config.puzzle_emb_ndim,
                                                    batch_size=self.config.batch_size, init_std=0, cast_to=self.forward_dtype)

        # Positional encodings
        if self.config.pos_encodings == "rope":
            self.rotary_emb = RotaryEmbedding(dim=self.config.hidden_size // self.config.num_heads,
                                              max_position_embeddings=self.config.seq_len + self.puzzle_emb_len,
                                              base=self.config.rope_theta)
        elif self.config.pos_encodings == "learned":
            self.embed_pos = CastedEmbedding(self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, init_std=embed_init_std, cast_to=self.forward_dtype)
        else:
            pass  # no positional encodings

        # Reasoning Layers
        self.L_level = TinyRecursiveReasoningModel_ACTV1ReasoningModule(layers=[TinyRecursiveReasoningModel_ACTV1Block(self.config) for _i in range(self.config.L_layers)])

        # Learned initial latent state
        self.L_init = nn.Buffer(trunc_normal_init_(torch.empty(self.config.hidden_size, dtype=self.forward_dtype), std=1), persistent=True)

        # Q head special init
        # Init Q to (almost) zero for faster learning during bootstrapping
        with torch.no_grad():
            self.q_head.weight.zero_()
            self.q_head.bias.fill_(-5)  # type: ignore

    def _input_embeddings(self, input: torch.Tensor, puzzle_identifiers: torch.Tensor):
        """Embed tokens, prepend the (padded) puzzle embedding, add positions, scale."""
        # Token embedding
        embedding = self.embed_tokens(input.to(torch.int32))

        # Puzzle embeddings
        if self.config.puzzle_emb_ndim > 0:
            puzzle_embedding = self.puzzle_emb(puzzle_identifiers)

            # Right-pad so the puzzle embedding fills whole hidden_size slots
            pad_count = self.puzzle_emb_len * self.config.hidden_size - puzzle_embedding.shape[-1]
            if pad_count > 0:
                puzzle_embedding = F.pad(puzzle_embedding, (0, pad_count))

            embedding = torch.cat((puzzle_embedding.view(-1, self.puzzle_emb_len, self.config.hidden_size), embedding), dim=-2)

        # Position embeddings
        if self.config.pos_encodings == "learned":
            # scale by 1/sqrt(2) to maintain forward variance
            embedding = 0.707106781 * (embedding + self.embed_pos.embedding_weight.to(self.forward_dtype))

        # Scale
        return self.embed_scale * embedding

    def empty_carry(self, batch_size: int):
        """Allocate an uninitialized carry; reset_carry fills it before first use."""
        return TinyRecursiveReasoningModel_ACTV1InnerCarry(
            z_L=torch.empty(batch_size, self.config.seq_len + self.puzzle_emb_len, self.config.hidden_size, dtype=self.forward_dtype),
        )

    def reset_carry(self, reset_flag: torch.Tensor, carry: TinyRecursiveReasoningModel_ACTV1InnerCarry):
        """Replace the latent of halted sequences with the learned initial state."""
        return TinyRecursiveReasoningModel_ACTV1InnerCarry(
            z_L=torch.where(reset_flag.view(-1, 1, 1), self.L_init, carry.z_L),
        )

    def forward(self, carry: TinyRecursiveReasoningModel_ACTV1InnerCarry, batch: Dict[str, torch.Tensor]) -> Tuple[TinyRecursiveReasoningModel_ACTV1InnerCarry, torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """One ACT segment: H_cycles recursions, gradients only through the last.

        Returns:
            (detached new carry, LM logits with the puzzle prefix stripped,
             (q_halt_logits, q_continue_logits)).
        """
        seq_info = dict(
            cos_sin=self.rotary_emb() if hasattr(self, "rotary_emb") else None,
        )

        # Input encoding
        input_embeddings = self._input_embeddings(batch["inputs"], batch["puzzle_identifiers"])

        # Forward iterations
        z_L = carry.z_L
        # H_cycles-1 without grad
        with torch.no_grad():
            for _H_step in range(self.config.H_cycles - 1):
                for _L_step in range(self.config.L_cycles):
                    z_L = self.L_level(z_L + input_embeddings, **seq_info)
                z_L = self.L_level(z_L, **seq_info)
        # 1 with grad
        for _L_step in range(self.config.L_cycles):
            z_L = self.L_level(z_L + input_embeddings, **seq_info)
        z_L = self.L_level(z_L, **seq_info)

        # LM Outputs
        new_carry = TinyRecursiveReasoningModel_ACTV1InnerCarry(z_L=z_L.detach())  # New carry no grad
        output = self.lm_head(z_L)[:, self.puzzle_emb_len:]
        q_logits = self.q_head(z_L[:, 0]).to(torch.float32)  # Q-head; uses the first puzzle_emb position
        return new_carry, output, (q_logits[..., 0], q_logits[..., 1])
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
class TinyRecursiveReasoningModel_ACTV1(nn.Module):
    """ACT wrapper.

    Drives the inner recursive-reasoning model with Adaptive Computation Time:
    each sequence runs for up to `halt_max_steps` segments and, during
    training, may halt earlier based on the learned Q-head.
    """

    def __init__(self, config_dict: dict):
        super().__init__()
        self.config = TinyRecursiveReasoningModel_ACTV1Config(**config_dict)
        self.inner = TinyRecursiveReasoningModel_ACTV1_Inner(self.config)

    @property
    def puzzle_emb(self):
        # Exposed so the trainer/optimizer can reach the sparse puzzle embedding.
        return self.inner.puzzle_emb

    def initial_carry(self, batch: Dict[str, torch.Tensor]):
        """Build a fresh carry; every sequence starts halted, so the first
        forward pass resets its latent state and pulls in real batch data."""
        batch_size = batch["inputs"].shape[0]

        return TinyRecursiveReasoningModel_ACTV1Carry(
            inner_carry=self.inner.empty_carry(batch_size),  # Empty is expected, it will be reseted in first pass as all sequences are halted.

            steps=torch.zeros((batch_size, ), dtype=torch.int32),
            halted=torch.ones((batch_size, ), dtype=torch.bool),  # Default to halted

            current_data={k: torch.empty_like(v) for k, v in batch.items()}
        )

    def forward(self, carry: TinyRecursiveReasoningModel_ACTV1Carry, batch: Dict[str, torch.Tensor]) -> Tuple[TinyRecursiveReasoningModel_ACTV1Carry, Dict[str, torch.Tensor]]:
        """One ACT step: refresh halted sequences, run the inner model once,
        and decide (without grad) which sequences halt.

        Returns:
            (new carry, outputs dict with "logits", "q_halt_logits",
             "q_continue_logits" and, when the continue loss is enabled,
             "target_q_continue").
        """
        # Update data, carry (replacing halted sequences with fresh batch entries)
        new_inner_carry = self.inner.reset_carry(carry.halted, carry.inner_carry)

        new_steps = torch.where(carry.halted, 0, carry.steps)

        new_current_data = {k: torch.where(carry.halted.view((-1, ) + (1, ) * (batch[k].ndim - 1)), batch[k], v) for k, v in carry.current_data.items()}

        # Forward inner model
        new_inner_carry, logits, (q_halt_logits, q_continue_logits) = self.inner(new_inner_carry, new_current_data)

        outputs = {
            "logits": logits,
            "q_halt_logits": q_halt_logits,
            "q_continue_logits": q_continue_logits
        }

        with torch.no_grad():
            # Step
            new_steps = new_steps + 1
            is_last_step = new_steps >= self.config.halt_max_steps

            halted = is_last_step

            # if training, and ACT is enabled
            if self.training and (self.config.halt_max_steps > 1):
                # Halt signal
                # NOTE: During evaluation, always use max steps, this is to guarantee the same halting steps inside a batch for batching purposes
                if self.config.no_ACT_continue:
                    halted = halted | (q_halt_logits > 0)
                else:
                    halted = halted | (q_halt_logits > q_continue_logits)

                # Exploration: with prob halt_exploration_prob, force a random minimum number of steps
                min_halt_steps = (torch.rand_like(q_halt_logits) < self.config.halt_exploration_prob) * torch.randint_like(new_steps, low=2, high=self.config.halt_max_steps + 1)
                halted = halted & (new_steps >= min_halt_steps)

                if not self.config.no_ACT_continue:
                    # Compute target Q
                    # NOTE: No replay buffer and target networks for computing target Q-value.
                    # As batch_size is large, there're many parallel envs.
                    # Similar concept as PQN https://arxiv.org/abs/2407.04811
                    # BUGFIX: the inner model returns a 3-tuple (carry, output, q-pair);
                    # the previous 5-target unpacking raised ValueError on this path.
                    _, _, (next_q_halt_logits, next_q_continue_logits) = self.inner(new_inner_carry, new_current_data)
                    outputs["target_q_continue"] = torch.sigmoid(torch.where(is_last_step, next_q_halt_logits, torch.maximum(next_q_halt_logits, next_q_continue_logits)))

        return TinyRecursiveReasoningModel_ACTV1Carry(new_inner_carry, new_steps, halted, new_current_data), outputs
|
models/sparse_embedding.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Union
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from torch import nn
|
| 5 |
+
import torch.distributed as dist
|
| 6 |
+
from torch.optim.optimizer import Optimizer, ParamsT
|
| 7 |
+
|
| 8 |
+
from models.common import trunc_normal_init_
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class CastedSparseEmbedding(nn.Module):
    """Embedding table updated sparsely by a companion SignSGD optimizer.

    The full table (`weights`) is a buffer, not a Parameter, so the regular
    optimizer ignores it. During training the rows for the current batch are
    copied into the small `local_weights` buffer (which has requires_grad=True);
    gradients accumulate there and `CastedSparseEmbeddingSignSGD_Distributed`
    scatters the update back into `weights` using `local_ids`.
    """
    def __init__(self, num_embeddings: int, embedding_dim: int, batch_size: int, init_std: float, cast_to: torch.dtype):
        super().__init__()
        self.cast_to = cast_to

        # Real Weights
        # Truncated LeCun normal init
        self.weights = nn.Buffer(
            trunc_normal_init_(torch.empty((num_embeddings, embedding_dim)), std=init_std), persistent=True
        )

        # Local weights and IDs
        # Local embeddings, with gradient, not persistent
        self.local_weights = nn.Buffer(torch.zeros(batch_size, embedding_dim, requires_grad=True), persistent=False)
        # Local embedding IDs, not persistent
        self.local_ids = nn.Buffer(torch.zeros(batch_size, dtype=torch.int32), persistent=False)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Look up embeddings for `inputs` (one id per batch row), cast to `cast_to`."""
        if not self.training:
            # Test mode, no gradient
            return self.weights[inputs].to(self.cast_to)

        # Training mode, fill puzzle embedding from weights
        # (copy under no_grad so only local_weights participates in autograd)
        with torch.no_grad():
            self.local_weights.copy_(self.weights[inputs])
            self.local_ids.copy_(inputs)

        return self.local_weights.to(self.cast_to)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class CastedSparseEmbeddingSignSGD_Distributed(Optimizer):
    """SignSGD optimizer for `CastedSparseEmbedding`, distributed-aware.

    Each param group is expected to hold exactly the three tensors of one
    sparse embedding: the local gradient carrier (requires_grad), the 1-D
    local id buffer, and the 2-D full weight table. Updates are aggregated
    across ranks inside `_sparse_emb_signsgd_dist`.
    """
    def __init__(
        self,
        params: ParamsT,

        world_size: int,
        lr: Union[float, torch.Tensor] = 1e-3,
        weight_decay: float = 1e-2,
    ):
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")

        defaults = dict(
            lr=lr,
            weight_decay=weight_decay,
            world_size=world_size
        )
        super().__init__(params, defaults)

    @torch.no_grad
    def step(self, closure=None):  # type: ignore
        """Apply one SignSGD update to every sparse embedding group."""
        for group in self.param_groups:
            # Find the sparse embedding weights
            # Tensors are identified by role, not position:
            # requires_grad -> local gradient carrier, ndim==1 -> ids, ndim==2 -> table.
            local_weights_grad = None
            local_ids = None
            weights = None

            assert len(group["params"]) == 3
            for p in group["params"]:
                if p.requires_grad:
                    local_weights_grad = p.grad
                elif p.ndim == 1:
                    local_ids = p
                elif p.ndim == 2:
                    weights = p
                else:
                    assert False

            assert local_ids is not None
            assert weights is not None

            # Apply SignSGD
            # Adam ≈ SignSGD if gradient is very sparse
            # (skipped when no backward pass produced a gradient this step)
            if local_weights_grad is not None:
                _sparse_emb_signsgd_dist(
                    local_weights_grad,
                    local_ids,
                    weights,

                    lr=group["lr"],
                    weight_decay=group["weight_decay"],
                    world_size=group["world_size"]
                )
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _sparse_emb_signsgd_dist(
|
| 99 |
+
local_weights_grad: torch.Tensor,
|
| 100 |
+
local_ids: torch.Tensor,
|
| 101 |
+
weights: torch.Tensor,
|
| 102 |
+
|
| 103 |
+
lr: float,
|
| 104 |
+
weight_decay: float,
|
| 105 |
+
world_size: int
|
| 106 |
+
) -> None:
|
| 107 |
+
N, D = local_weights_grad.shape
|
| 108 |
+
|
| 109 |
+
# All-gather
|
| 110 |
+
all_weights_grad = local_weights_grad
|
| 111 |
+
all_ids = local_ids
|
| 112 |
+
|
| 113 |
+
if world_size > 1:
|
| 114 |
+
all_weights_grad = torch.empty((world_size * N, D), dtype=local_weights_grad.dtype, device=local_weights_grad.device)
|
| 115 |
+
all_ids = torch.empty(world_size * N, dtype=local_ids.dtype, device=local_ids.device)
|
| 116 |
+
|
| 117 |
+
dist.all_gather_into_tensor(all_weights_grad, local_weights_grad)
|
| 118 |
+
dist.all_gather_into_tensor(all_ids, local_ids)
|
| 119 |
+
|
| 120 |
+
# Unique
|
| 121 |
+
grad_ids, inv = all_ids.unique(return_inverse=True)
|
| 122 |
+
|
| 123 |
+
grad = torch.zeros((grad_ids.shape[0], D), dtype=all_weights_grad.dtype, device=all_weights_grad.device)
|
| 124 |
+
grad.scatter_add_(0, inv.unsqueeze(-1).expand(-1, D), all_weights_grad)
|
| 125 |
+
|
| 126 |
+
# SignSGD with decoupled weight decay
|
| 127 |
+
p = weights[grad_ids]
|
| 128 |
+
|
| 129 |
+
p.mul_(1.0 - lr * weight_decay).add_(torch.sign(grad), alpha=-lr)
|
| 130 |
+
|
| 131 |
+
# Write updated slices back
|
| 132 |
+
weights[grad_ids] = p
|
utils/functions.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib
|
| 2 |
+
import inspect
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def load_model_class(identifier: str, prefix: str = "models."):
    """Resolve a "module@ClassName" identifier to the class object.

    Args:
        identifier: String of the form "path.to.module@ClassName".
        prefix: Package prefix prepended to the module path.

    Returns:
        The attribute named after the '@' in the imported module.
    """
    module_path, class_name = identifier.split('@')
    module = importlib.import_module(prefix + module_path)
    return getattr(module, class_name)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def get_model_source_path(identifier: str, prefix: str = "models."):
    """Return the source-file path of the module part of a "module@Class" identifier."""
    module_path, _class_name = identifier.split('@')
    return inspect.getsourcefile(importlib.import_module(prefix + module_path))
|