Initial GrandLine implementation: deterministic shard-first dataset preprocessing for LLM pretraining
"""Test tokenization block."""

import sys
from pathlib import Path
from unittest.mock import patch

# Make the src/ layout importable when this file is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from grandline.blocks.tokenize import TokenizeBlock
from grandline.types import Document, TokenizedDocument


class MockEncoding:
    """Mock for tokenizers.Encoding."""

    def __init__(self, ids: list[int]):
        self.ids = ids
        self.tokens = [f"tok_{i}" for i in ids]
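

# The real tokenizers.Encoding also carries offsets, masks, and special-token
# info; this mock mirrors only the .ids and .tokens attributes that
# TokenizeBlock is assumed to read.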


class MockTokenizer:
    """Mock for tokenizers.Tokenizer."""

    def __init__(self):
        self._no_padding = False
        self._no_truncation = False

    def no_padding(self):
        self._no_padding = True

    def no_truncation(self):
        self._no_truncation = True

    def encode_batch(self, texts, add_special_tokens=False):
        # Deterministic mock: token count equals whitespace word count.
        results = []
        for text in texts:
            words = text.split()
            ids = list(range(1, len(words) + 1))
            results.append(MockEncoding(ids))
        return results
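

# tokenizers.Tokenizer.encode_batch returns one Encoding per input string, in
# input order; the mock keeps that contract but makes token counts
# deterministic so the assertions below can be exact.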


def test_tokenize_block_basic():
    """TokenizeBlock converts Documents to TokenizedDocuments."""
    with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
        mock_cls.from_pretrained.return_value = MockTokenizer()
        block = TokenizeBlock(tokenizer_name="test/model", batch_size=2)
        docs = [
            Document(text="hello world", doc_id="0", metadata={"blake3": "abc"}),
            Document(text="one two three four", doc_id="1", metadata={"blake3": "def"}),
            Document(text="single", doc_id="2", metadata={"blake3": "ghi"}),
        ]
        results = list(block(iter(docs)))

        assert len(results) == 3
        assert all(isinstance(r, TokenizedDocument) for r in results)
        # Token counts match word counts under the mock.
        assert results[0].num_tokens == 2  # "hello world"
        assert results[1].num_tokens == 4  # "one two three four"
        assert results[2].num_tokens == 1  # "single"
        # Document hashes are carried forward from metadata.
        assert results[0].doc_hash == "abc"
        assert results[1].doc_hash == "def"


def test_tokenize_block_batch_boundary():
    """Tokenization handles batch boundaries correctly."""
    with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
        mock_cls.from_pretrained.return_value = MockTokenizer()
        block = TokenizeBlock(tokenizer_name="test/model", batch_size=2)
        # Five docs with batch_size=2 exercises a final partial batch.
        docs = [Document(text=f"word_{i} extra", doc_id=str(i)) for i in range(5)]
        results = list(block(iter(docs)))

        assert len(results) == 5
        # Each "word_i extra" splits into two words, hence two mock tokens.
        for r in results:
            assert r.num_tokens == 2


def test_tokenize_block_signature():
    """Signature includes tokenizer identity."""
    with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
        mock_cls.from_pretrained.return_value = MockTokenizer()
        block = TokenizeBlock(tokenizer_name="Qwen/Qwen3-0.6B")
        sig = block.signature
        assert "tokenize" in sig
        assert "Qwen/Qwen3-0.6B" in sig


def test_tokenize_block_preserves_order():
    """Token output order matches input order."""
    with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
        mock_cls.from_pretrained.return_value = MockTokenizer()
        block = TokenizeBlock(tokenizer_name="test/model", batch_size=3)
        # Seven docs with batch_size=3 leaves a final batch of one.
        docs = [Document(text=f"doc {i}", doc_id=str(i)) for i in range(7)]
        results = list(block(iter(docs)))
        assert [r.doc_id for r in results] == [str(i) for i in range(7)]
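

# For context, a minimal sketch of a block shape that would satisfy these
# tests. It is an assumption reconstructed from the assertions above, not the
# actual grandline implementation: the _ReferenceTokenizeBlock name, the
# token_ids field, and the metadata.get("blake3") lookup are hypothetical.
class _ReferenceTokenizeBlock:
    def __init__(self, tokenizer_name: str, batch_size: int = 1000):
        # The real module presumably imports Tokenizer at module scope (which
        # is why the tests patch grandline.blocks.tokenize.Tokenizer);
        # imported lazily here so this sketch stays inert unless instantiated.
        from tokenizers import Tokenizer

        self.tokenizer_name = tokenizer_name
        self.batch_size = batch_size
        self.tokenizer = Tokenizer.from_pretrained(tokenizer_name)
        # Disable padding/truncation so num_tokens reflects the raw text.
        self.tokenizer.no_padding()
        self.tokenizer.no_truncation()

    @property
    def signature(self) -> str:
        # Must contain "tokenize" and the tokenizer name per the tests.
        return f"tokenize({self.tokenizer_name})"

    def __call__(self, docs):
        # Buffer documents into fixed-size batches, flush the remainder,
        # and preserve input order across batch boundaries.
        batch = []
        for doc in docs:
            batch.append(doc)
            if len(batch) == self.batch_size:
                yield from self._encode(batch)
                batch = []
        if batch:
            yield from self._encode(batch)

    def _encode(self, batch):
        encodings = self.tokenizer.encode_batch(
            [d.text for d in batch], add_special_tokens=False
        )
        for doc, enc in zip(batch, encodings):
            yield TokenizedDocument(
                doc_id=doc.doc_id,
                token_ids=enc.ids,
                num_tokens=len(enc.ids),
                doc_hash=(doc.metadata or {}).get("blake3", ""),
            )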


if __name__ == "__main__":
    test_tokenize_block_basic()
    test_tokenize_block_batch_boundary()
    test_tokenize_block_signature()
    test_tokenize_block_preserves_order()
    print("All tokenize tests passed!")