# grandline/tests/test_tokenize.py
# From the initial GrandLine implementation: deterministic shard-first
# dataset preprocessing for LLM pretraining (commit ed59144).
"""Test tokenization block."""
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from grandline.blocks.tokenize import TokenizeBlock
from grandline.types import Document, TokenizedDocument
class MockEncoding:
    """Lightweight stand-in for ``tokenizers.Encoding``.

    Exposes only the ``ids`` and ``tokens`` attributes that the code
    under test reads.
    """

    def __init__(self, ids: list[int]):
        self.ids = ids
        # One synthetic token string per id, mirroring the real API's shape.
        self.tokens = [f"tok_{token_id}" for token_id in ids]
class MockTokenizer:
    """Lightweight stand-in for ``tokenizers.Tokenizer``.

    Tracks whether padding/truncation were disabled and encodes each
    text deterministically: one token per whitespace-separated word,
    with ids running 1..word_count.
    """

    def __init__(self):
        # Flags flipped by the configuration methods below so tests could
        # verify they were invoked.
        self._no_padding = False
        self._no_truncation = False

    def no_padding(self):
        self._no_padding = True

    def no_truncation(self):
        self._no_truncation = True

    def encode_batch(self, texts, add_special_tokens=False):
        # Deterministic mock: token count equals whitespace word count.
        return [
            MockEncoding(list(range(1, len(text.split()) + 1)))
            for text in texts
        ]
def test_tokenize_block_basic():
    """TokenizeBlock converts Documents to TokenizedDocuments."""
    with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
        mock_cls.from_pretrained.return_value = MockTokenizer()
        block = TokenizeBlock(tokenizer_name="test/model", batch_size=2)
        # (text, doc_id, blake3 hash) triples for the input corpus.
        corpus = [
            ("hello world", "0", "abc"),
            ("one two three four", "1", "def"),
            ("single", "2", "ghi"),
        ]
        docs = [
            Document(text=text, doc_id=doc_id, metadata={"blake3": digest})
            for text, doc_id, digest in corpus
        ]
        results = list(block(iter(docs)))
        assert len(results) == 3
        for result in results:
            assert isinstance(result, TokenizedDocument)
        # The mock tokenizer emits one token per word, so token counts
        # must equal the word counts of the inputs.
        for result, expected in zip(results, [2, 4, 1]):
            assert result.num_tokens == expected
        # The blake3 metadata must be carried through as doc_hash.
        assert results[0].doc_hash == "abc"
        assert results[1].doc_hash == "def"
def test_tokenize_block_batch_boundary():
    """Tokenization handles batch boundaries correctly."""
    with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
        mock_cls.from_pretrained.return_value = MockTokenizer()
        block = TokenizeBlock(tokenizer_name="test/model", batch_size=2)
        # Five docs with batch_size=2 forces a trailing partial batch.
        docs = []
        for i in range(5):
            docs.append(Document(text=f"word_{i} extra", doc_id=str(i)))
        results = list(block(iter(docs)))
        assert len(results) == 5
        # Every input is two words, so the mock yields two tokens each.
        assert all(result.num_tokens == 2 for result in results)
def test_tokenize_block_signature():
    """Signature includes tokenizer identity."""
    with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
        mock_cls.from_pretrained.return_value = MockTokenizer()
        sig = TokenizeBlock(tokenizer_name="Qwen/Qwen3-0.6B").signature
        # Both the block kind and the exact tokenizer name must appear.
        for fragment in ("tokenize", "Qwen/Qwen3-0.6B"):
            assert fragment in sig
def test_tokenize_block_preserves_order():
    """Token output order matches input order."""
    with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
        mock_cls.from_pretrained.return_value = MockTokenizer()
        block = TokenizeBlock(tokenizer_name="test/model", batch_size=3)
        # 7 docs with batch_size=3 spans multiple (and a partial) batch.
        doc_ids = [str(i) for i in range(7)]
        docs = [Document(text=f"doc {i}", doc_id=i) for i in doc_ids]
        out_ids = [result.doc_id for result in block(iter(docs))]
        assert out_ids == doc_ids
if __name__ == "__main__":
    # Allow running this file directly without pytest.
    for test_fn in (
        test_tokenize_block_basic,
        test_tokenize_block_batch_boundary,
        test_tokenize_block_signature,
        test_tokenize_block_preserves_order,
    ):
        test_fn()
    print("All tokenize tests passed!")