import os
import re
from pathlib import Path

import numpy as np


# Directory that holds the ``<stem>_chunk<N>.bin`` pieces and receives the
# merged ``<stem>.bin`` files.
TOKENIZED_DIR = "/data/tokenized"

# ``<stem>_chunk<N>.bin``: group 1 is the stem, group 2 the chunk number.
_CHUNK_RE = re.compile(r"(.+)_chunk(\d+)\.bin$")


def _group_chunks(directory):
    """Map each base file name to its chunk paths, ordered by chunk number.

    Only ``*.bin`` files directly inside *directory* whose name matches
    ``<stem>_chunk<N>.bin`` are considered; the key is ``<stem>.bin``.
    """
    indexed = {}
    for path in sorted(Path(directory).glob("*.bin")):
        m = _CHUNK_RE.match(path.name)
        if m:
            # Parse the chunk number once here instead of re-running the
            # regex inside the sort key later.
            entry = (int(m.group(2)), path)
            indexed.setdefault(m.group(1) + ".bin", []).append(entry)

    # Numeric (not lexicographic) order, so chunk10 sorts after chunk2.
    # The sort is stable, so equal numbers keep their path-sorted order.
    return {
        base: [path for _, path in sorted(entries, key=lambda e: e[0])]
        for base, entries in indexed.items()
    }


def _merge_chunks(chunks, out_path):
    """Concatenate *chunks* into *out_path* and return the token count.

    Each chunk is read as a flat ``uint16`` array and appended to a temporary
    ``<out_path>.tmp`` file, which is renamed onto *out_path* only after every
    chunk has been written and synced. If anything fails, the partial
    temporary file is removed and the exception is re-raised; the chunks
    themselves are never modified here.
    """
    tmp_path = out_path.with_name(out_path.name + ".tmp")
    total_tokens = 0
    try:
        with open(tmp_path, "wb") as out:
            for chunk in chunks:
                arr = np.fromfile(chunk, dtype=np.uint16)
                arr.tofile(out)
                total_tokens += len(arr)
                del arr  # release the chunk before loading the next one
            # The caller deletes the source chunks right after this returns,
            # so make sure the merged data has reached the disk first.
            out.flush()
            os.fsync(out.fileno())
        tmp_path.rename(out_path)
    except BaseException:
        # Do not leave a half-written temp file behind (also on Ctrl-C).
        try:
            tmp_path.unlink()
        except FileNotFoundError:
            pass
        raise
    return total_tokens


def assemble_chunks(tokenized_dir=TOKENIZED_DIR):
    """Merge every ``<stem>_chunk<N>.bin`` group in *tokenized_dir*.

    For each group, the chunks are concatenated in chunk-number order into
    ``<stem>.bin`` and then deleted. Groups whose ``<stem>.bin`` already
    exists are skipped and their chunks are left in place.
    """
    root = Path(tokenized_dir)
    groups = _group_chunks(root)
    print(f"Found {len(groups)} base files to assemble")

    for base_name, chunks in sorted(groups.items()):
        out_path = root / base_name
        if out_path.exists():
            print(f" ✓ Already exists, skipping: {base_name}")
            continue

        print(f" Merging {len(chunks)} chunks → {base_name}")
        total_tokens = _merge_chunks(chunks, out_path)
        print(f" ✓ {base_name} | {total_tokens:,} tokens")

        # Only reached once the merged file is safely in place.
        for chunk in chunks:
            chunk.unlink()

    print("Done")


# Runs on execution/import, exactly as the original flat script did.
assemble_chunks()