Spaces:

FINAL-Bench
/

LiteRT-LM

Running

App Files Files Community

LiteRT-LM / runtime /components /tokenizer_test.cc

SeaWolf-AI

Upload full LiteRT-LM codebase

5f923cd verified 5 days ago

raw

history blame contribute delete

6.09 kB

	// Copyright 2025 The ODML Authors.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "runtime/components/tokenizer.h"

	#include <fcntl.h>

	#include <memory>
	#include <string>
	#include <utility>
	#include <vector>

	#include <gmock/gmock.h>
	#include <gtest/gtest.h>
	#include "absl/status/status.h" // from @com_google_absl
	#include "absl/status/statusor.h" // from @com_google_absl
	#include "absl/strings/string_view.h" // from @com_google_absl
	#include "litert/cc/litert_layout.h" // from @litert
	#include "litert/cc/litert_tensor_buffer.h" // from @litert
	#include "litert/test/matchers.h" // from @litert
	#include "runtime/util/convert_tensor_buffer.h"

	namespace litert::lm {
	namespace {

	// gMock-based Tokenizer subclass used by the tests below. Every method
	// declared here overrides a Tokenizer virtual; any other Tokenizer member the
	// tests call is inherited from the base class unchanged.
	class MockTokenizer : public Tokenizer {
	 public:
	  // Const accessors.
	  MOCK_METHOD(TokenizerType, GetTokenizerType, (), (const, override));
	  MOCK_METHOD(std::vector<std::string>, GetTokens, (), (const, override));

	  // Text / token -> id(s).
	  MOCK_METHOD(absl::StatusOr<int>, TokenToId, (absl::string_view token),
	              (override));
	  MOCK_METHOD(absl::StatusOr<std::vector<int>>, TextToTokenIds,
	              (absl::string_view text), (override));

	  // Ids -> text.
	  MOCK_METHOD(absl::StatusOr<std::string>, TokenIdsToText,
	              (const std::vector<int>& token_ids), (override));
	};

	// Feeds the ids returned by the mocked TextToTokenIds() into
	// TokenIdsToTensorBuffer() and checks that the resulting tensor buffer has
	// dimensions {1, 7} and holds the same ids.
	TEST(TokenizerTest, TextToTensorBuffer) {
	  auto tokenizer = std::make_unique<MockTokenizer>();
	  EXPECT_CALL(*tokenizer, TextToTokenIds("Hello World!"))
	      .WillOnce(
	          testing::Return(std::vector<int>{90, 547, 58, 735, 210, 466, 2294}));

	  absl::string_view text = "Hello World!";
	  auto ids_or = tokenizer->TextToTokenIds(text);
	  // Fatal assertion: the value is dereferenced on the next statement, so a
	  // non-OK result must stop the test instead of continuing into value().
	  ASSERT_TRUE(ids_or.ok()) << ids_or.status();

	  auto tensor_or = tokenizer->TokenIdsToTensorBuffer(ids_or.value());
	  ASSERT_TRUE(tensor_or.ok()) << tensor_or.status();
	  auto tensor = std::move(tensor_or.value());
	  LITERT_ASSERT_OK_AND_ASSIGN(auto tensor_type, tensor.TensorType());
	  EXPECT_EQ(tensor_type.Layout().Dimensions(), ::litert::Dimensions({1, 7}));

	  auto copied_data = CopyFromTensorBuffer2D<int>(tensor);
	  // Fatal for the same reason: `*copied_data` is read immediately below.
	  ASSERT_TRUE(copied_data.HasValue());
	  EXPECT_THAT((*copied_data)[0],
	              ::testing::ElementsAre(90, 547, 58, 735, 210, 466, 2294));
	}

	// Copies 14 ids into a {2, 7} tensor buffer and checks that the static
	// Tokenizer::TensorBufferToTokenIds() returns them as two rows of seven ids.
	TEST(TokenizerTest, TensorBufferToTokenIds) {
	  const std::vector<int> ids = {90,  547, 58, 735, 210, 466, 2294,
	                                224, 24,  8,  66,  246, 18,  2295};
	  LITERT_ASSERT_OK_AND_ASSIGN(TensorBuffer tensor_buffer,
	                              CopyToTensorBuffer<int>(ids, {2, 7}));
	  LITERT_ASSERT_OK_AND_ASSIGN(auto tensor_buffer_type,
	                              tensor_buffer.TensorType());
	  EXPECT_EQ(tensor_buffer_type.Layout().Dimensions(),
	            ::litert::Dimensions({2, 7}));

	  auto token_ids = Tokenizer::TensorBufferToTokenIds(tensor_buffer);
	  // Fatal assertions: the rows are indexed below, so a failed status or a
	  // short result must abort the test rather than read past the data.
	  ASSERT_TRUE(token_ids.ok()) << token_ids.status();
	  ASSERT_EQ(token_ids->size(), 2u);
	  EXPECT_EQ((*token_ids)[0],
	            std::vector<int>({90, 547, 58, 735, 210, 466, 2294}));
	  EXPECT_EQ((*token_ids)[1],
	            std::vector<int>({224, 24, 8, 66, 246, 18, 2295}));
	}

	// Calls TokenIdsToTexts() with a batch of two id sequences while the mocked
	// TokenIdsToText() returns one string per call, and checks that both strings
	// come back in call order.
	TEST(TokenizerTest, TokenIdsToTexts) {
	  auto tokenizer = std::make_unique<MockTokenizer>();
	  EXPECT_CALL(*tokenizer, TokenIdsToText(::testing::_))
	      .WillOnce(testing::Return("▁Hello▁World!"))
	      .WillOnce(testing::Return("▁How's▁it▁going?"));

	  const std::vector<std::vector<int>> ids = {{90, 547, 58, 735, 210, 466, 2294},
	                                             {224, 24, 8, 66, 246, 18, 2295}};

	  auto texts = tokenizer->TokenIdsToTexts(/*batch_size=*/2, ids);
	  // Fatal assertions: entries are indexed and unwrapped below.
	  ASSERT_TRUE(texts.ok()) << texts.status();
	  ASSERT_EQ(texts->size(), 2u);
	  ASSERT_TRUE((*texts)[0].ok()) << (*texts)[0].status();
	  ASSERT_TRUE((*texts)[1].ok()) << (*texts)[1].status();
	  EXPECT_EQ((*texts)[0].value(), "▁Hello▁World!");
	  EXPECT_EQ((*texts)[1].value(), "▁How's▁it▁going?");
	}

	// Same batch as TokenIdsToTexts, but the first mocked TokenIdsToText() call
	// returns a DataLoss error. Checks that the overall call still succeeds, that
	// entry 0 carries the kDataLoss status, and that entry 1 holds its text.
	TEST(TokenizerTest, TokenIdsToTextsWithIncompleteBPESequence) {
	  auto tokenizer = std::make_unique<MockTokenizer>();
	  EXPECT_CALL(*tokenizer, TokenIdsToText(::testing::_))
	      .WillOnce(testing::Return(absl::DataLossError("Incomplete BPE sequence")))
	      .WillOnce(testing::Return("▁How's▁it▁going?"));

	  const std::vector<std::vector<int>> ids = {{90, 547, 58, 735, 210, 466, 2294},
	                                             {224, 24, 8, 66, 246, 18, 2295}};

	  auto texts = tokenizer->TokenIdsToTexts(/*batch_size=*/2, ids);
	  // Fatal assertions: entries are indexed and unwrapped below.
	  ASSERT_TRUE(texts.ok()) << texts.status();
	  ASSERT_EQ(texts->size(), 2u);
	  EXPECT_EQ((*texts)[0].status().code(), absl::StatusCode::kDataLoss);
	  ASSERT_TRUE((*texts)[1].ok()) << (*texts)[1].status();
	  EXPECT_EQ((*texts)[1].value(), "▁How's▁it▁going?");
	}

	// Checks that calling TokenToId("X") on the mock yields the id the mock was
	// programmed to return.
	TEST(TokenizerTest, TokenToId) {
	  constexpr int kExpectedId = 123;

	  auto tokenizer = std::make_unique<MockTokenizer>();
	  EXPECT_CALL(*tokenizer, TokenToId("X"))
	      .WillOnce(testing::Return(kExpectedId));

	  const auto id_or = tokenizer->TokenToId("X");
	  EXPECT_EQ(id_or.value(), kExpectedId);
	}

	// Checks that the static Tokenizer::MergeTokenIds() concatenates, row by row,
	// each row of `current_ids` onto the matching row of `previous_ids`.
	TEST(TokenizerTest, MergeTokenIds) {
	  const std::vector<std::vector<int>> previous_ids = {{90, 547, 58, 735},
	                                                      {224, 24}};
	  const std::vector<std::vector<int>> current_ids = {{210, 466, 2294},
	                                                     {8, 66, 246, 18, 2295}};
	  auto merged = Tokenizer::MergeTokenIds(previous_ids, current_ids);
	  // Fatal assertions: `merged` is dereferenced and indexed below, so a failed
	  // status or a short result must stop the test here.
	  ASSERT_TRUE(merged.ok()) << merged.status();
	  ASSERT_EQ(merged->size(), 2u);
	  EXPECT_EQ((*merged)[0], std::vector<int>({90, 547, 58, 735, 210, 466, 2294}));
	  EXPECT_EQ((*merged)[1], std::vector<int>({224, 24, 8, 66, 246, 18, 2295}));
	}

	// Exercises the static Tokenizer::HasBpeSuffix(). The bytes "\xef\xbf\xbd" are
	// the UTF-8 encoding of U+FFFD; the check is expected to be true only when
	// those bytes end the string.
	TEST(TokenizerTest, HasBpeSuffix) {
	  // Marker bytes at the end of the string.
	  EXPECT_TRUE(Tokenizer::HasBpeSuffix("test\xef\xbf\xbd"));

	  // No marker, empty input, and marker at the front only.
	  for (const char* without_suffix : {"test", "", "\xef\xbf\xbdtest"}) {
	    SCOPED_TRACE(without_suffix);
	    EXPECT_FALSE(Tokenizer::HasBpeSuffix(without_suffix));
	  }
	}

	} // namespace
	} // namespace litert::lm