"""Merge ``<base>_chunkNNN.bin`` token chunks into a single ``<base>.bin``.

Example: ``cc2025-05_000000_chunk000.bin``, ``cc2025-05_000000_chunk001.bin``
are concatenated (in numeric chunk order) into ``cc2025-05_000000.bin`` and
the chunk files are deleted once the merged file is in place.
"""
import os
import re
from pathlib import Path

import numpy as np

TOKENIZED_DIR = "/data/tokenized"

# Element type the chunk files are read with.
TOKEN_DTYPE = np.uint16

# "<base>_chunk<digits>.bin": group 1 is the base name, group 2 the index.
_CHUNK_RE = re.compile(r"(.+)_chunk(\d+)\.bin$")


def _group_chunks(tokenized_dir):
    """Map each output file name to its chunk paths in numeric chunk order."""
    indexed = {}
    for path in sorted(tokenized_dir.glob("*.bin")):
        match = _CHUNK_RE.match(path.name)
        if match:
            base = match.group(1) + ".bin"
            indexed.setdefault(base, []).append((int(match.group(2)), path))
    # Sort on the parsed index only: a plain name sort would put chunk10
    # before chunk2. The sort is stable, so equal indices keep name order.
    return {
        base: [path for _, path in sorted(entries, key=lambda e: e[0])]
        for base, entries in indexed.items()
    }


def _merge_group(out_path, chunks):
    """Concatenate *chunks* into *out_path* and return the token count.

    Data is written to ``<out_path>.tmp`` and renamed at the end, so
    *out_path* only ever appears fully written. The temp file is removed
    if anything fails.

    Raises:
        ValueError: if a chunk's size is not a whole number of tokens.
    """
    itemsize = np.dtype(TOKEN_DTYPE).itemsize
    # np.fromfile silently drops trailing bytes that do not fill a whole
    # element; check up front so a damaged chunk is never merged (and then
    # deleted by the caller) with data missing.
    for chunk in chunks:
        size = chunk.stat().st_size
        if size % itemsize:
            raise ValueError(
                f"{chunk} has {size} bytes, not a multiple of {itemsize}"
            )

    tmp_path = out_path.with_name(f"{out_path.name}.tmp")
    total_tokens = 0
    try:
        with open(tmp_path, "wb") as out:
            for chunk in chunks:
                arr = np.fromfile(chunk, dtype=TOKEN_DTYPE)
                arr.tofile(out)
                total_tokens += len(arr)
                del arr  # release the chunk before loading the next one
        tmp_path.rename(out_path)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a partial temp file.
        try:
            tmp_path.unlink()
        except FileNotFoundError:
            pass
        raise
    return total_tokens


def assemble_chunks(tokenized_dir=TOKENIZED_DIR):
    """Merge every chunk group found directly inside *tokenized_dir*.

    A group whose output file already exists is skipped and its chunks are
    left untouched. For every other group the chunks are merged and then
    deleted.

    Args:
        tokenized_dir: directory (``str`` or ``Path``) holding the chunks.

    Returns:
        Dict mapping each newly written file name to its token count.

    Raises:
        ValueError: if a chunk's size is not a whole number of tokens.
    """
    root = Path(tokenized_dir)
    groups = _group_chunks(root)
    print(f"Found {len(groups)} base files to assemble")

    merged = {}
    for base_name, chunks in sorted(groups.items()):
        out_path = root / base_name
        if out_path.exists():
            print(f" ✓ Already exists, skipping: {base_name}")
            continue

        print(f" Merging {len(chunks)} chunks → {base_name}")
        total_tokens = _merge_group(out_path, chunks)
        print(f" ✓ {base_name} | {total_tokens:,} tokens")
        merged[base_name] = total_tokens

        # delete chunks only after the merged file is in place
        for chunk in chunks:
            chunk.unlink()

    print("Done")
    return merged


if __name__ == "__main__":
    assemble_chunks()