# File size: 12,566 Bytes
# 4f0238f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
"""

Tests for Music Tokenizer Extension.

"""

import pytest
from unittest.mock import MagicMock, patch

from TouchGrass.tokenizer.music_token_extension import MusicTokenizerExtension


class TestMusicTokenizerExtension:
    """Test suite for MusicTokenizerExtension."""

    def setup_method(self):
        """Set up test fixtures."""
        self.special_tokens = {
            "[GUITAR]": 32000,
            "[PIANO]": 32001,
            "[DRUMS]": 32002,
            "[VOCALS]": 32003,
            "[THEORY]": 32004,
            "[PRODUCTION]": 32005,
            "[FRUSTRATED]": 32006,
            "[CONFUSED]": 32007,
            "[EXCITED]": 32008,
            "[CONFIDENT]": 32009,
            "[EASY]": 32010,
            "[MEDIUM]": 32011,
            "[HARD]": 32012,
            "[TAB]": 32013,
            "[CHORD]": 32014,
            "[SCALE]": 32015,
            "[INTERVAL]": 32016,
            "[PROGRESSION]": 32017,
            "[SIMPLIFY]": 32018,
            "[ENCOURAGE]": 32019,
        }
        self.music_vocab_extensions = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

    def test_tokenizer_initialization(self):
        """Test that tokenizer initializes correctly with special tokens."""
        with patch('TouchGrass.tokenizer.music_token_extension.AutoTokenizer') as mock_tokenizer_class:
            mock_tokenizer = MagicMock()
            mock_tokenizer.vocab_size = 32000
            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

            ext = MusicTokenizerExtension(
                "Qwen/Qwen3.5-3B-Instruct",
                special_tokens=self.special_tokens,
                music_vocab_extensions=self.music_vocab_extensions
            )

            assert ext.base_tokenizer == mock_tokenizer
            mock_tokenizer_class.from_pretrained.assert_called_once_with("Qwen/Qwen3.5-3B-Instruct")

    def test_special_tokens_added(self):
        """Test that special tokens are added to tokenizer."""
        with patch('TouchGrass.tokenizer.music_token_extension.AutoTokenizer') as mock_tokenizer_class:
            mock_tokenizer = MagicMock()
            mock_tokenizer.vocab_size = 32000
            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

            ext = MusicTokenizerExtension(
                "Qwen/Qwen3.5-3B-Instruct",
                special_tokens=self.special_tokens,
                music_vocab_extensions=[]
            )

            expected_tokens = list(self.special_tokens.keys())
            mock_tokenizer.add_special_tokens.assert_called_once_with(
                {"additional_special_tokens": expected_tokens}
            )

    def test_music_vocab_extensions_added(self):
        """Test that music vocabulary extensions are added."""
        with patch('TouchGrass.tokenizer.music_token_extension.AutoTokenizer') as mock_tokenizer_class:
            mock_tokenizer = MagicMock()
            mock_tokenizer.vocab_size = 32000
            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

            ext = MusicTokenizerExtension(
                "Qwen/Qwen3.5-3B-Instruct",
                special_tokens={},
                music_vocab_extensions=self.music_vocab_extensions
            )

            # Check that add_tokens was called with music vocab extensions
            assert mock_tokenizer.add_tokens.called
            added_tokens = mock_tokenizer.add_tokens.call_args[0][0]
            assert set(added_tokens) == set(self.music_vocab_extensions)

    def test_tokenizer_vocab_size_increased(self):
        """Test that vocab size is correctly increased after adding tokens."""
        with patch('TouchGrass.tokenizer.music_token_extension.AutoTokenizer') as mock_tokenizer_class:
            mock_tokenizer = MagicMock()
            mock_tokenizer.vocab_size = 32000
            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

            num_special = len(self.special_tokens)
            num_music = len(self.music_vocab_extensions)
            expected_new_vocab_size = 32000 + num_special + num_music

            ext = MusicTokenizerExtension(
                "Qwen/Qwen3.5-3B-Instruct",
                special_tokens=self.special_tokens,
                music_vocab_extensions=self.music_vocab_extensions
            )

            assert ext.base_tokenizer.vocab_size == expected_new_vocab_size

    def test_encode_with_music_tokens(self):
        """Test encoding text with music tokens."""
        with patch('TouchGrass.tokenizer.music_token_extension.AutoTokenizer') as mock_tokenizer_class:
            mock_tokenizer = MagicMock()
            mock_tokenizer.vocab_size = 32021
            mock_tokenizer.encode.return_value = [1, 2, 32000, 3, 4]
            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

            ext = MusicTokenizerExtension(
                "Qwen/Qwen3.5-3B-Instruct",
                special_tokens=self.special_tokens,
                music_vocab_extensions=[]
            )

            result = ext.encode("Play a [GUITAR] chord")
            assert result == [1, 2, 32000, 3, 4]
            mock_tokenizer.encode.assert_called_once_with("Play a [GUITAR] chord")

    def test_decode_with_music_tokens(self):
        """Test decoding token IDs with music tokens."""
        with patch('TouchGrass.tokenizer.music_token_extension.AutoTokenizer') as mock_tokenizer_class:
            mock_tokenizer = MagicMock()
            mock_tokenizer.vocab_size = 32021
            mock_tokenizer.decode.return_value = "Play a [GUITAR] chord"
            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

            ext = MusicTokenizerExtension(
                "Qwen/Qwen3.5-3B-Instruct",
                special_tokens=self.special_tokens,
                music_vocab_extensions=[]
            )

            result = ext.decode([1, 2, 32000, 3, 4])
            assert result == "Play a [GUITAR] chord"
            mock_tokenizer.decode.assert_called_once_with([1, 2, 32000, 3, 4])

    def test_get_music_token_id(self):
        """Test retrieving token ID for a music token."""
        with patch('TouchGrass.tokenizer.music_token_extension.AutoTokenizer') as mock_tokenizer_class:
            mock_tokenizer = MagicMock()
            mock_tokenizer.vocab_size = 32021
            mock_tokenizer.convert_tokens_to_ids.return_value = 32000
            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

            ext = MusicTokenizerExtension(
                "Qwen/Qwen3.5-3B-Instruct",
                special_tokens=self.special_tokens,
                music_vocab_extensions=[]
            )

            token_id = ext.get_music_token_id("[GUITAR]")
            assert token_id == 32000
            mock_tokenizer.convert_tokens_to_ids.assert_called_with("[GUITAR]")

    def test_has_music_token(self):
        """Test checking if a token is a music token."""
        with patch('TouchGrass.tokenizer.music_token_extension.AutoTokenizer') as mock_tokenizer_class:
            mock_tokenizer = MagicMock()
            mock_tokenizer.vocab_size = 32021
            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

            ext = MusicTokenizerExtension(
                "Qwen/Qwen3.5-3B-Instruct",
                special_tokens=self.special_tokens,
                music_vocab_extensions=[]
            )

            assert ext.has_music_token("[GUITAR]") is True
            assert ext.has_music_token("[UNKNOWN]") is False

    def test_get_music_domain_tokens(self):
        """Test retrieving all domain tokens."""
        with patch('TouchGrass.tokenizer.music_token_extension.AutoTokenizer') as mock_tokenizer_class:
            mock_tokenizer = MagicMock()
            mock_tokenizer.vocab_size = 32021
            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

            ext = MusicTokenizerExtension(
                "Qwen/Qwen3.5-3B-Instruct",
                special_tokens=self.special_tokens,
                music_vocab_extensions=[]
            )

            domain_tokens = ext.get_music_domain_tokens()
            expected = ["[GUITAR]", "[PIANO]", "[DRUMS]", "[VOCALS]", "[THEORY]", "[PRODUCTION]"]
            assert domain_tokens == expected

    def test_get_emotion_tokens(self):
        """Test retrieving emotion tokens."""
        with patch('TouchGrass.tokenizer.music_token_extension.AutoTokenizer') as mock_tokenizer_class:
            mock_tokenizer = MagicMock()
            mock_tokenizer.vocab_size = 32021
            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

            ext = MusicTokenizerExtension(
                "Qwen/Qwen3.5-3B-Instruct",
                special_tokens=self.special_tokens,
                music_vocab_extensions=[]
            )

            emotion_tokens = ext.get_emotion_tokens()
            expected = ["[FRUSTRATED]", "[CONFUSED]", "[EXCITED]", "[CONFIDENT]"]
            assert emotion_tokens == expected

    def test_get_difficulty_tokens(self):
        """Test retrieving difficulty tokens."""
        with patch('TouchGrass.tokenizer.music_token_extension.AutoTokenizer') as mock_tokenizer_class:
            mock_tokenizer = MagicMock()
            mock_tokenizer.vocab_size = 32021
            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

            ext = MusicTokenizerExtension(
                "Qwen/Qwen3.5-3B-Instruct",
                special_tokens=self.special_tokens,
                music_vocab_extensions=[]
            )

            difficulty_tokens = ext.get_difficulty_tokens()
            expected = ["[EASY]", "[MEDIUM]", "[HARD]"]
            assert difficulty_tokens == expected

    def test_get_music_function_tokens(self):
        """Test retrieving music function tokens."""
        with patch('TouchGrass.tokenizer.music_token_extension.AutoTokenizer') as mock_tokenizer_class:
            mock_tokenizer = MagicMock()
            mock_tokenizer.vocab_size = 32021
            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

            ext = MusicTokenizerExtension(
                "Qwen/Qwen3.5-3B-Instruct",
                special_tokens=self.special_tokens,
                music_vocab_extensions=[]
            )

            function_tokens = ext.get_music_function_tokens()
            expected = ["[TAB]", "[CHORD]", "[SCALE]", "[INTERVAL]", "[PROGRESSION]"]
            assert function_tokens == expected

    def test_get_eq_tokens(self):
        """Test retrieving EQ (emotional intelligence) tokens."""
        with patch('TouchGrass.tokenizer.music_token_extension.AutoTokenizer') as mock_tokenizer_class:
            mock_tokenizer = MagicMock()
            mock_tokenizer.vocab_size = 32021
            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

            ext = MusicTokenizerExtension(
                "Qwen/Qwen3.5-3B-Instruct",
                special_tokens=self.special_tokens,
                music_vocab_extensions=[]
            )

            eq_tokens = ext.get_eq_tokens()
            expected = ["[FRUSTRATED]", "[CONFUSED]", "[EXCITED]", "[CONFIDENT]", "[SIMPLIFY]", "[ENCOURAGE]"]
            assert eq_tokens == expected

    def test_token_count_with_music_tokens(self):
        """Test that token count increases after adding music tokens."""
        with patch('TouchGrass.tokenizer.music_token_extension.AutoTokenizer') as mock_tokenizer_class:
            mock_tokenizer = MagicMock()
            mock_tokenizer.vocab_size = 32000
            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

            num_special = len(self.special_tokens)
            num_music = len(self.music_vocab_extensions)

            ext = MusicTokenizerExtension(
                "Qwen/Qwen3.5-3B-Instruct",
                special_tokens=self.special_tokens,
                music_vocab_extensions=self.music_vocab_extensions
            )

            expected_vocab_size = 32000 + num_special + num_music
            assert ext.base_tokenizer.vocab_size == expected_vocab_size
            assert ext.base_tokenizer.vocab_size > 32000


if __name__ == "__main__":
    # Running this file directly hands it to pytest in verbose mode.
    cli_args = [__file__, "-v"]
    pytest.main(cli_args)