"""Test tokenization block."""

import sys
from pathlib import Path
from unittest.mock import MagicMock, patch
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from grandline.blocks.tokenize import TokenizeBlock
from grandline.types import Document, TokenizedDocument
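

# These tests patch grandline's reference to the HuggingFace `tokenizers` package:
# Tokenizer.from_pretrained is replaced with a stub so no model files are downloaded.
# The mocks below cover only the subset of the `tokenizers` API exercised here
# (encode_batch, no_padding, no_truncation) and emit one token per whitespace-separated
# word, which keeps the expected token counts deterministic.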
class MockEncoding:
"""Mock for tokenizers.Encoding."""
def __init__(self, ids: list[int]):
self.ids = ids
self.tokens = [f"tok_{i}" for i in ids]
class MockTokenizer:
"""Mock for tokenizers.Tokenizer."""
def __init__(self):
self._no_padding = False
self._no_truncation = False
def no_padding(self):
self._no_padding = True
def no_truncation(self):
self._no_truncation = True
def encode_batch(self, texts, add_special_tokens=False):
# Deterministic mock: token count = word count
results = []
for text in texts:
words = text.split()
ids = list(range(1, len(words) + 1))
results.append(MockEncoding(ids))
return results
def test_tokenize_block_basic():
"""TokenizeBlock converts Documents to TokenizedDocuments."""
with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
mock_cls.from_pretrained.return_value = MockTokenizer()
block = TokenizeBlock(tokenizer_name="test/model", batch_size=2)
docs = [
Document(text="hello world", doc_id="0", metadata={"blake3": "abc"}),
Document(text="one two three four", doc_id="1", metadata={"blake3": "def"}),
Document(text="single", doc_id="2", metadata={"blake3": "ghi"}),
]
results = list(block(iter(docs)))
assert len(results) == 3
assert all(isinstance(r, TokenizedDocument) for r in results)
# Check token counts match word counts
assert results[0].num_tokens == 2 # "hello world"
assert results[1].num_tokens == 4 # "one two three four"
assert results[2].num_tokens == 1 # "single"
# Check metadata carried forward
assert results[0].doc_hash == "abc"
assert results[1].doc_hash == "def"
def test_tokenize_block_batch_boundary():
"""Tokenization handles batch boundaries correctly."""
with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
mock_cls.from_pretrained.return_value = MockTokenizer()
block = TokenizeBlock(tokenizer_name="test/model", batch_size=2)
docs = [
Document(text=f"word_{i} extra", doc_id=str(i))
for i in range(5)
]
results = list(block(iter(docs)))
assert len(results) == 5
# All should have 2 tokens ("word_i extra" → 2 words)
for r in results:
assert r.num_tokens == 2
def test_tokenize_block_signature():
"""Signature includes tokenizer identity."""
with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
mock_cls.from_pretrained.return_value = MockTokenizer()
block = TokenizeBlock(tokenizer_name="Qwen/Qwen3-0.6B")
sig = block.signature
assert "tokenize" in sig
assert "Qwen/Qwen3-0.6B" in sig
def test_tokenize_block_preserves_order():
"""Token output order matches input order."""
with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
mock_cls.from_pretrained.return_value = MockTokenizer()
block = TokenizeBlock(tokenizer_name="test/model", batch_size=3)
docs = [Document(text=f"doc {i}", doc_id=str(i)) for i in range(7)]
results = list(block(iter(docs)))
assert [r.doc_id for r in results] == [str(i) for i in range(7)]
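# Extra sketch beyond the original suite: assumes TokenizeBlock simply drains its
# input iterator, so an empty iterator should produce no output.
def test_tokenize_block_empty_input():
    """Empty input yields no TokenizedDocuments (hypothetical edge case)."""
    with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
        mock_cls.from_pretrained.return_value = MockTokenizer()
        block = TokenizeBlock(tokenizer_name="test/model", batch_size=2)
        assert list(block(iter([]))) == []

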
if __name__ == "__main__":
    test_tokenize_block_basic()
    test_tokenize_block_batch_boundary()
    test_tokenize_block_signature()
    test_tokenize_block_preserves_order()
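    test_tokenize_block_empty_input()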
print("All tokenize tests passed!")