Tok5

Runtime error

Tok5 / app.py

Update app.py

977352c verified about 2 months ago

1.31 kB

	import os
	import re
	from pathlib import Path
	import numpy as np

	# Directory scanned for "*_chunkNNN.bin" pieces; merged files are written here too.
	TOKENIZED_DIR = "/data/tokenized"

	# "<stem>_chunk<digits>.bin": group 1 is the stem, group 2 the chunk index.
	CHUNK_NAME_RE = re.compile(r"(.+)_chunk(\d+)\.bin$")

	# Chunks are treated as flat arrays of uint16 tokens, so one token is 2 bytes.
	TOKEN_BYTES = np.dtype(np.uint16).itemsize

	# Upper bound on bytes held in memory at once while copying a chunk.
	COPY_BUFFER_BYTES = 16 * 1024 * 1024


	def _group_chunks(tokenized_dir):
	    """Map each merged file name to its chunk paths, ordered by chunk index.

	    Example: ``cc2025-05_000000_chunk000.bin`` is grouped under
	    ``cc2025-05_000000.bin``. Files that do not match the chunk naming
	    pattern are ignored.
	    """
	    indexed = {}
	    for path in sorted(Path(tokenized_dir).glob("*.bin")):
	        found = CHUNK_NAME_RE.match(path.name)
	        if found:
	            base_name = found.group(1) + ".bin"
	            indexed.setdefault(base_name, []).append((int(found.group(2)), path))
	    # Sort numerically by index (stable), so chunk2 precedes chunk10.
	    return {
	        base_name: [path for _, path in sorted(pairs, key=lambda pair: pair[0])]
	        for base_name, pairs in indexed.items()
	    }


	def _merge_chunks(chunks, out_path):
	    """Concatenate ``chunks`` into ``out_path``; return the token count written.

	    Data is streamed through a bounded buffer into ``<out_path>.tmp`` and the
	    temp file is renamed into place only after every chunk has been copied.
	    If anything fails, the partial temp file is removed and the error is
	    re-raised.
	    """
	    tmp_path = out_path.with_name(out_path.name + ".tmp")
	    total_bytes = 0
	    try:
	        with open(tmp_path, "wb") as out:
	            for chunk in chunks:
	                with open(chunk, "rb") as src:
	                    while True:
	                        buf = src.read(COPY_BUFFER_BYTES)
	                        if not buf:
	                            break
	                        out.write(buf)
	                        total_bytes += len(buf)
	        tmp_path.rename(out_path)
	    except BaseException:
	        # Do not leave a half-written temp file behind; then propagate.
	        if tmp_path.exists():
	            tmp_path.unlink()
	        raise
	    return total_bytes // TOKEN_BYTES


	def assemble_chunks(tokenized_dir):
	    """Merge every group of chunk files in ``tokenized_dir`` into one file.

	    For each base name whose merged file does not exist yet, the chunks are
	    concatenated in chunk-index order and then deleted. A group is skipped
	    (chunks left untouched) when the merged file already exists or when any
	    chunk's size is not a whole number of tokens.

	    Returns:
	        dict mapping each newly merged file name to its token count.
	    """
	    groups = _group_chunks(tokenized_dir)
	    print(f"Found {len(groups)} base files to assemble")

	    merged = {}
	    for base_name, chunks in sorted(groups.items()):
	        out_path = Path(tokenized_dir) / base_name
	        if out_path.exists():
	            print(f" ✓ Already exists, skipping: {base_name}")
	            continue

	        # A chunk with a trailing partial token would shift every following
	        # token if concatenated; keep the sources for inspection instead.
	        ragged = [c.name for c in chunks if c.stat().st_size % TOKEN_BYTES]
	        if ragged:
	            print(
	                f" ✗ Skipping {base_name}: size not a multiple of "
	                f"{TOKEN_BYTES} bytes: {', '.join(ragged)}"
	            )
	            continue

	        print(f" Merging {len(chunks)} chunks → {base_name}")
	        total_tokens = _merge_chunks(chunks, out_path)
	        print(f" ✓ {base_name} | {total_tokens:,} tokens")

	        # delete chunks only after the merged file is in place
	        for chunk in chunks:
	            chunk.unlink()
	        merged[base_name] = total_tokens

	    return merged


	assemble_chunks(TOKENIZED_DIR)

	print("Done")