|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
import shutil
import tempfile
import unittest
from pathlib import Path

import torch
from transformers import AutoTokenizer

from .tokenization_sapnous import SapnousTokenizer
|
|
|
class TestSapnousTokenizer(unittest.TestCase):
    """Unit tests for SapnousTokenizer.

    Covers vocabulary loading, round-trip encode/decode, special-token
    insertion, vision/image/video prompt wrappers, batched tensor encoding,
    save/load persistence, and AutoTokenizer registration.

    A single tokenizer instance is shared across tests via setUpClass; the
    on-disk vocab/merges fixture lives in a unique temporary directory.
    """

    @classmethod
    def setUpClass(cls):
        """Write a tiny vocab/merges fixture to disk and build the tokenizer."""
        # A unique temp dir (instead of a fixed name) avoids collisions when
        # tests run in parallel or a previous run was interrupted mid-teardown.
        cls.temp_dir = Path(tempfile.mkdtemp(prefix='test_tokenizer_files_'))

        # Minimal vocabulary: the special tokens the tokenizer expects plus
        # three plain words used by the round-trip tests below.
        cls.vocab_file = cls.temp_dir / 'vocab.json'
        cls.vocab = {
            '<|endoftext|>': 0,
            '<|startoftext|>': 1,
            '<|pad|>': 2,
            '<|vision_start|>': 3,
            '<|vision_end|>': 4,
            '<|image|>': 5,
            '<|video|>': 6,
            'hello': 7,
            'world': 8,
            'test': 9,
        }
        with cls.vocab_file.open('w', encoding='utf-8') as f:
            json.dump(cls.vocab, f)

        # Minimal BPE merge rules — just enough to tokenize the fixture words.
        cls.merges_file = cls.temp_dir / 'merges.txt'
        merges_content = "#version: 0.2\nh e\ne l\nl l\no w\nw o\no r\nr l\nl d"
        cls.merges_file.write_text(merges_content, encoding='utf-8')

        cls.tokenizer = SapnousTokenizer(
            str(cls.vocab_file),
            str(cls.merges_file),
        )

    @classmethod
    def tearDownClass(cls):
        """Remove the fixture directory.

        ignore_errors keeps a missing/locked directory from turning a test
        failure into a teardown error.
        """
        shutil.rmtree(cls.temp_dir, ignore_errors=True)

    def test_tokenizer_initialization(self):
        """The tokenizer exposes the fixture vocab and the expected specials."""
        self.assertEqual(self.tokenizer.vocab_size, len(self.vocab))
        self.assertEqual(self.tokenizer.get_vocab(), self.vocab)

        # Special-token wiring: unk and eos share '<|endoftext|>'.
        self.assertEqual(self.tokenizer.unk_token, '<|endoftext|>')
        self.assertEqual(self.tokenizer.bos_token, '<|startoftext|>')
        self.assertEqual(self.tokenizer.eos_token, '<|endoftext|>')
        self.assertEqual(self.tokenizer.pad_token, '<|pad|>')

    def test_tokenization(self):
        """tokenize/encode/decode round-trips a string of in-vocab words."""
        text = "hello world test"
        tokens = self.tokenizer.tokenize(text)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(token, str) for token in tokens))

        # Each fixture word is a single vocab entry, so three ids come back.
        input_ids = self.tokenizer.encode(text, add_special_tokens=False)
        self.assertIsInstance(input_ids, list)
        self.assertEqual(len(input_ids), 3)

        decoded_text = self.tokenizer.decode(input_ids)
        self.assertEqual(decoded_text.strip(), text)

    def test_special_tokens_handling(self):
        """add_special_tokens controls bos/eos framing of the id sequence."""
        text = "hello world"

        # assertEqual (not assertTrue(a == b)) gives a useful diff on failure.
        tokens_with_special = self.tokenizer.encode(text, add_special_tokens=True)
        self.assertEqual(tokens_with_special[0], self.tokenizer.bos_token_id)
        self.assertEqual(tokens_with_special[-1], self.tokenizer.eos_token_id)

        tokens_without_special = self.tokenizer.encode(text, add_special_tokens=False)
        self.assertNotEqual(tokens_without_special[0], self.tokenizer.bos_token_id)
        self.assertNotEqual(tokens_without_special[-1], self.tokenizer.eos_token_id)

    def test_vision_tokens(self):
        """The prepare_for_* helpers wrap text with the matching markers."""
        text = "This is an image description"
        vision_text = self.tokenizer.prepare_for_vision(text)
        self.assertTrue(vision_text.startswith('<|vision_start|>'))
        self.assertTrue(vision_text.endswith('<|vision_end|>'))

        image_text = self.tokenizer.prepare_for_image(text)
        self.assertTrue(image_text.startswith('<|image|>'))

        video_text = self.tokenizer.prepare_for_video(text)
        self.assertTrue(video_text.startswith('<|video|>'))

    def test_batch_encoding(self):
        """Calling the tokenizer on a list yields padded torch tensors."""
        texts = ["hello world", "test hello"]
        batch_encoding = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

        self.assertIsInstance(batch_encoding["input_ids"], torch.Tensor)
        self.assertIsInstance(batch_encoding["attention_mask"], torch.Tensor)
        self.assertEqual(batch_encoding["input_ids"].shape[0], len(texts))
        self.assertEqual(batch_encoding["attention_mask"].shape[0], len(texts))

    def test_save_and_load(self):
        """save_vocabulary output can reconstruct an equivalent tokenizer."""
        # Unique temp dir for the same isolation reasons as setUpClass.
        save_dir = Path(tempfile.mkdtemp(prefix='test_save_tokenizer_'))

        try:
            vocab_files = self.tokenizer.save_vocabulary(str(save_dir))
            self.assertTrue(all(Path(f).exists() for f in vocab_files))

            loaded_tokenizer = SapnousTokenizer(*vocab_files)
            self.assertEqual(loaded_tokenizer.get_vocab(), self.tokenizer.get_vocab())

            # The reloaded tokenizer must encode identically to the original.
            text = "hello world test"
            original_encoding = self.tokenizer.encode(text)
            loaded_encoding = loaded_tokenizer.encode(text)
            self.assertEqual(original_encoding, loaded_encoding)
        finally:
            shutil.rmtree(save_dir, ignore_errors=True)

    def test_auto_tokenizer_registration(self):
        """AutoTokenizer resolves model_type 'sapnous' to SapnousTokenizer."""
        config = {
            "model_type": "sapnous",
            "vocab_file": str(self.vocab_file),
            "merges_file": str(self.merges_file)
        }

        tokenizer = AutoTokenizer.from_pretrained(str(self.temp_dir), **config)
        self.assertIsInstance(tokenizer, SapnousTokenizer)
|
|
|
# Allow the suite to be executed directly (e.g. `python test_tokenization.py`).
if __name__ == "__main__":
    unittest.main()