"""Test tokenization block."""

import sys
from pathlib import Path
from unittest.mock import patch

# Make the package under src/ importable when the tests are run directly.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from grandline.blocks.tokenize import TokenizeBlock
from grandline.types import Document, TokenizedDocument


class MockEncoding:
    """Mock for tokenizers.Encoding."""

    def __init__(self, ids: list[int]):
        self.ids = ids
        self.tokens = [f"tok_{i}" for i in ids]


class MockTokenizer:
    """Mock for tokenizers.Tokenizer."""

    def __init__(self):
        self._no_padding = False
        self._no_truncation = False

    def no_padding(self):
        self._no_padding = True

    def no_truncation(self):
        self._no_truncation = True

    def encode_batch(self, texts, add_special_tokens=False):
        # Deterministic mock: token count = word count
        results = []
        for text in texts:
            words = text.split()
            ids = list(range(1, len(words) + 1))
            results.append(MockEncoding(ids))
        return results


def test_tokenize_block_basic():
    """TokenizeBlock converts Documents to TokenizedDocuments."""
    with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
        mock_cls.from_pretrained.return_value = MockTokenizer()

        block = TokenizeBlock(tokenizer_name="test/model", batch_size=2)

        docs = [
            Document(text="hello world", doc_id="0", metadata={"blake3": "abc"}),
            Document(text="one two three four", doc_id="1", metadata={"blake3": "def"}),
            Document(text="single", doc_id="2", metadata={"blake3": "ghi"}),
        ]

        results = list(block(iter(docs)))

        assert len(results) == 3
        assert all(isinstance(r, TokenizedDocument) for r in results)

        # Check token counts match word counts
        assert results[0].num_tokens == 2  # "hello world"
        assert results[1].num_tokens == 4  # "one two three four"
        assert results[2].num_tokens == 1  # "single"

        # Check metadata carried forward
        assert results[0].doc_hash == "abc"
        assert results[1].doc_hash == "def"


def test_tokenize_block_batch_boundary():
    """Tokenization handles batch boundaries correctly."""
    with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
        mock_cls.from_pretrained.return_value = MockTokenizer()

        block = TokenizeBlock(tokenizer_name="test/model", batch_size=2)

        docs = [
            Document(text=f"word_{i} extra", doc_id=str(i))
            for i in range(5)
        ]

        results = list(block(iter(docs)))
        assert len(results) == 5

        # All should have 2 tokens ("word_i extra" → 2 words)
        for r in results:
            assert r.num_tokens == 2


def test_tokenize_block_signature():
    """Signature includes tokenizer identity."""
    with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
        mock_cls.from_pretrained.return_value = MockTokenizer()

        block = TokenizeBlock(tokenizer_name="Qwen/Qwen3-0.6B")

        sig = block.signature
        assert "tokenize" in sig
        assert "Qwen/Qwen3-0.6B" in sig


def test_tokenize_block_preserves_order():
    """Token output order matches input order."""
    with patch("grandline.blocks.tokenize.Tokenizer") as mock_cls:
        mock_cls.from_pretrained.return_value = MockTokenizer()

        block = TokenizeBlock(tokenizer_name="test/model", batch_size=3)

        docs = [Document(text=f"doc {i}", doc_id=str(i)) for i in range(7)]
        results = list(block(iter(docs)))

        assert [r.doc_id for r in results] == [str(i) for i in range(7)]


if __name__ == "__main__":
    test_tokenize_block_basic()
    test_tokenize_block_batch_boundary()
    test_tokenize_block_signature()
    test_tokenize_block_preserves_order()
    print("All tokenize tests passed!")