"""Unit tests for the tokenization block, run against a stubbed tokenizer."""

import sys
from pathlib import Path
from unittest.mock import MagicMock, patch

# The in-repo ``src`` directory must be on sys.path before grandline is imported.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from grandline.blocks.tokenize import TokenizeBlock
from grandline.types import Document, TokenizedDocument


class MockEncoding:
    """Stand-in for tokenizers.Encoding exposing ``ids`` and ``tokens``."""

    def __init__(self, ids: list[int]):
        self.ids = ids
        # One synthetic token string per id, e.g. 3 -> "tok_3".
        self.tokens = [f"tok_{token_id}" for token_id in ids]


class MockTokenizer:
    """Stand-in for tokenizers.Tokenizer with deterministic output."""

    def __init__(self):
        self._no_padding = False
        self._no_truncation = False

    def no_padding(self):
        """Record that padding was switched off."""
        self._no_padding = True

    def no_truncation(self):
        """Record that truncation was switched off."""
        self._no_truncation = True

    def encode_batch(self, texts, add_special_tokens=False):
        """Encode each text as the ids ``1..k`` where ``k`` is its word count.

        Words are whatever ``str.split()`` yields, so the number of tokens
        per text is predictable from the text alone.
        """
        return [
            MockEncoding(list(range(1, len(text.split()) + 1)))
            for text in texts
        ]


def test_tokenize_block_basic():
    """Documents come out as TokenizedDocuments with counts and hashes."""
    with patch("grandline.blocks.tokenize.Tokenizer") as tokenizer_cls:
        tokenizer_cls.from_pretrained.return_value = MockTokenizer()

        block = TokenizeBlock(tokenizer_name="test/model", batch_size=2)
        inputs = [
            Document(text="hello world", doc_id="0", metadata={"blake3": "abc"}),
            Document(
                text="one two three four",
                doc_id="1",
                metadata={"blake3": "def"},
            ),
            Document(text="single", doc_id="2", metadata={"blake3": "ghi"}),
        ]
        outputs = list(block(iter(inputs)))

        assert len(outputs) == 3
        for item in outputs:
            assert isinstance(item, TokenizedDocument)

        # The mock emits one token per whitespace-separated word.
        first, second, third = outputs
        assert first.num_tokens == 2  # "hello world"
        assert second.num_tokens == 4  # "one two three four"
        assert third.num_tokens == 1  # "single"

        # The "blake3" metadata value must surface as doc_hash.
        assert first.doc_hash == "abc"
        assert second.doc_hash == "def"


def test_tokenize_block_batch_boundary():
    """A document count that is not a multiple of batch_size still works."""
    with patch("grandline.blocks.tokenize.Tokenizer") as tokenizer_cls:
        tokenizer_cls.from_pretrained.return_value = MockTokenizer()

        block = TokenizeBlock(tokenizer_name="test/model", batch_size=2)
        # Five documents with batch_size=2 leave one document over.
        inputs = [
            Document(text=f"word_{i} extra", doc_id=str(i)) for i in range(5)
        ]
        outputs = list(block(iter(inputs)))

        assert len(outputs) == 5
        # Every text is "word_<i> extra", i.e. exactly two words.
        assert all(item.num_tokens == 2 for item in outputs)


def test_tokenize_block_signature():
    """The block signature names both the block and its tokenizer."""
    with patch("grandline.blocks.tokenize.Tokenizer") as tokenizer_cls:
        tokenizer_cls.from_pretrained.return_value = MockTokenizer()

        block = TokenizeBlock(tokenizer_name="Qwen/Qwen3-0.6B")
        signature = block.signature

        assert "tokenize" in signature
        assert "Qwen/Qwen3-0.6B" in signature


def test_tokenize_block_preserves_order():
    """Outputs are yielded in the same order as the input documents."""
    with patch("grandline.blocks.tokenize.Tokenizer") as tokenizer_cls:
        tokenizer_cls.from_pretrained.return_value = MockTokenizer()

        block = TokenizeBlock(tokenizer_name="test/model", batch_size=3)
        inputs = [Document(text=f"doc {i}", doc_id=str(i)) for i in range(7)]
        outputs = list(block(iter(inputs)))

        expected_ids = [str(i) for i in range(7)]
        observed_ids = [item.doc_id for item in outputs]
        assert observed_ids == expected_ids


if __name__ == "__main__":
    # Allow running this file directly, without a test runner.
    for test_case in (
        test_tokenize_block_basic,
        test_tokenize_block_batch_boundary,
        test_tokenize_block_signature,
        test_tokenize_block_preserves_order,
    ):
        test_case()
    print("All tokenize tests passed!")