Tok5 / app.py
Neon-tech's picture
Update app.py
977352c verified
Raw
History Blame Contribute Delete
1.31 kB
import os
import re
from pathlib import Path
import numpy as np
TOKENIZED_DIR = "/data/tokenized"

# Matches "<base>_chunk<index>.bin": group 1 is the base name, group 2 the
# numeric chunk index, e.g.
#   cc2025-05_000000_chunk000.bin → base "cc2025-05_000000", index 0
_CHUNK_RE = re.compile(r"(.+)_chunk(\d+)\.bin$")


def _chunk_index(path):
    """Return the integer chunk index encoded in a chunk file's name."""
    return int(_CHUNK_RE.match(path.name).group(2))


def find_chunk_groups(tokenized_dir=TOKENIZED_DIR):
    """Group the chunk files found in *tokenized_dir* by merged file name.

    Only files directly inside the directory whose name matches
    ``<base>_chunk<index>.bin`` are considered; any other ``*.bin`` file
    (for example an already merged output) is ignored.

    Returns:
        A dict mapping ``"<base>.bin"`` to the list of chunk paths for that
        base, ordered by numeric chunk index (so ``chunk2`` comes before
        ``chunk10``).
    """
    groups = {}
    for path in sorted(Path(tokenized_dir).glob("*.bin")):
        match = _CHUNK_RE.match(path.name)
        if match:
            groups.setdefault(match.group(1) + ".bin", []).append(path)
    for chunks in groups.values():
        chunks.sort(key=_chunk_index)
    return groups


def merge_chunks(chunks, out_path):
    """Concatenate *chunks* (read as uint16 arrays) into *out_path*.

    The data is first written to ``<out_path>.tmp`` next to the target and
    renamed into place only after every chunk has been written, so
    *out_path* never exists in a half-written state. If anything fails
    before the rename, the temporary file is removed and the error
    propagates; the chunk files themselves are never touched here.

    Returns:
        The total number of uint16 values written.
    """
    out_path = Path(out_path)
    tmp_path = out_path.with_name(out_path.name + ".tmp")
    total_tokens = 0
    merged = False
    try:
        with open(tmp_path, "wb") as out:
            for chunk in chunks:
                arr = np.fromfile(chunk, dtype=np.uint16)
                arr.tofile(out)
                total_tokens += len(arr)
                # release the chunk before the next one is loaded
                del arr
        tmp_path.rename(out_path)
        merged = True
    finally:
        # don't leave a partial temp file behind after a failed merge
        if not merged and tmp_path.exists():
            tmp_path.unlink()
    return total_tokens


def assemble(tokenized_dir=TOKENIZED_DIR):
    """Merge every group of chunk files in *tokenized_dir* into one file.

    A base file that already exists is skipped and its chunks are left in
    place. For every other base the chunks are merged in index order and
    deleted once the merged file has been renamed into place.

    Returns:
        A dict mapping each newly written base file name to the number of
        tokens it contains (skipped bases are not included).
    """
    directory = Path(tokenized_dir)
    groups = find_chunk_groups(directory)
    print(f"Found {len(groups)} base files to assemble")

    written = {}
    for base_name, chunks in sorted(groups.items()):
        out_path = directory / base_name
        if out_path.exists():
            print(f" ✓ Already exists, skipping: {base_name}")
            continue

        print(f" Merging {len(chunks)} chunks → {base_name}")
        total_tokens = merge_chunks(chunks, out_path)
        print(f" ✓ {base_name} | {total_tokens:,} tokens")
        written[base_name] = total_tokens

        # delete chunks only after the merged file is safely in place
        for chunk in chunks:
            chunk.unlink()

    print("Done")
    return written


if __name__ == "__main__":
    assemble()