MoTIF / utils / core / tests / llama3_tokenizer_test.py

Upload folder using huggingface_hub

3cf4fff verified 3 months ago

3.09 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.

	import os
	from unittest import TestCase

	from tokenizer import ChatFormat, Llama3Tokenizer

	# From current directory:
	# TOKENIZER_PATH=<path> python -m unittest llama3_tokenizer_test.py


	class TokenizerTests(TestCase):
	    """Check Llama3Tokenizer and ChatFormat against known Llama 3 token IDs.

	    The tokenizer is built from the path stored in the ``TOKENIZER_PATH``
	    environment variable; when that variable is unset or empty every test
	    in this class is skipped rather than erroring in ``setUp``.
	    """

	    def setUp(self):
	        """Build the tokenizer and chat formatter shared by every test."""
	        tokenizer_path = os.environ.get("TOKENIZER_PATH")
	        if not tokenizer_path:
	            # A bare KeyError from os.environ says nothing about how to run
	            # the suite; skip with an actionable message instead.
	            self.skipTest(
	                "TOKENIZER_PATH is not set; run as "
	                "TOKENIZER_PATH=<path> python -m unittest llama3_tokenizer_test.py"
	            )
	        self.tokenizer = Llama3Tokenizer(tokenizer_path)
	        self.format = ChatFormat(self.tokenizer)

	    def test_special_tokens(self):
	        """The begin-of-text special token maps to ID 128000."""
	        self.assertEqual(
	            self.tokenizer.special_tokens["<|begin_of_text|>"],
	            128000,
	        )

	    def test_encode(self):
	        """Encoding with BOS and EOS wraps the text IDs in 128000 / 128001."""
	        self.assertEqual(
	            self.tokenizer.encode(
	                "This is a test sentence.", add_bos=True, add_eos=True
	            ),
	            [128000, 2028, 374, 264, 1296, 11914, 13, 128001],
	        )

	    def test_decode(self):
	        """Decoding renders the special tokens as their literal text."""
	        self.assertEqual(
	            self.tokenizer.decode(
	                [128000, 2028, 374, 264, 1296, 11914, 13, 128001],
	            ),
	            "<|begin_of_text|>This is a test sentence.<|end_of_text|>",
	        )

	    def test_encode_message(self):
	        """A single message is encoded as header, blank line, content, eot."""
	        message = {
	            "role": "user",
	            "content": "This is a test sentence.",
	        }
	        self.assertEqual(
	            self.format.encode_message(message),
	            [
	                128006,  # <|start_header_id|>
	                882,  # "user"
	                128007,  # <|end_header_id|>
	                271,  # "\n\n"
	                2028,
	                374,
	                264,
	                1296,
	                11914,
	                13,  # "This is a test sentence."
	                128009,  # <|eot_id|>
	            ],
	        )

	    def test_encode_dialog(self):
	        """A dialog prompt is BOS, each message, then an open assistant header."""
	        dialog = [
	            {
	                "role": "system",
	                "content": "This is a test sentence.",
	            },
	            {
	                "role": "user",
	                "content": "This is a response.",
	            },
	        ]
	        self.assertEqual(
	            self.format.encode_dialog_prompt(dialog),
	            [
	                128000,  # <|begin_of_text|>
	                128006,  # <|start_header_id|>
	                9125,  # "system"
	                128007,  # <|end_header_id|>
	                271,  # "\n\n"
	                2028,
	                374,
	                264,
	                1296,
	                11914,
	                13,  # "This is a test sentence."
	                128009,  # <|eot_id|>
	                128006,  # <|start_header_id|>
	                882,  # "user"
	                128007,  # <|end_header_id|>
	                271,  # "\n\n"
	                2028,
	                374,
	                264,
	                2077,
	                13,  # "This is a response."
	                128009,  # <|eot_id|>
	                128006,  # <|start_header_id|>
	                78191,  # "assistant"
	                128007,  # <|end_header_id|>
	                271,  # "\n\n"
	            ],
	        )