Spaces:

FINAL-Bench
/

LiteRT-LM

Running

File size: 6,086 Bytes

5f923cd

// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "runtime/components/tokenizer.h"

#include <fcntl.h>

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/status/status.h"  // from @com_google_absl
#include "absl/status/statusor.h"  // from @com_google_absl
#include "absl/strings/string_view.h"  // from @com_google_absl
#include "litert/cc/litert_layout.h"  // from @litert
#include "litert/cc/litert_tensor_buffer.h"  // from @litert
#include "litert/test/matchers.h"  // from @litert
#include "runtime/util/convert_tensor_buffer.h"

namespace litert::lm {
namespace {

class MockTokenizer : public Tokenizer {
 public:
  MOCK_METHOD(absl::StatusOr<std::vector<int>>, TextToTokenIds,
              (absl::string_view text), (override));
  MOCK_METHOD(absl::StatusOr<int>, TokenToId, (absl::string_view token),
              (override));
  MOCK_METHOD(absl::StatusOr<std::string>, TokenIdsToText,
              (const std::vector<int>& token_ids), (override));
  MOCK_METHOD(TokenizerType, GetTokenizerType, (), (const, override));
  MOCK_METHOD(std::vector<std::string>, GetTokens, (), (const, override));
};

TEST(TokenizerTest, TextToTensorBuffer) {
  auto tokenizer = std::make_unique<MockTokenizer>();
  EXPECT_CALL(*tokenizer, TextToTokenIds("Hello World!"))
      .WillOnce(
          testing::Return(std::vector<int>{90, 547, 58, 735, 210, 466, 2294}));

  absl::string_view text = "Hello World!";
  auto ids_or = tokenizer->TextToTokenIds(text);
  EXPECT_TRUE(ids_or.ok());

  auto tensor_or = tokenizer->TokenIdsToTensorBuffer(ids_or.value());
  auto tensor = std::move(tensor_or.value());
  LITERT_ASSERT_OK_AND_ASSIGN(auto tensor_type, tensor.TensorType());
  EXPECT_EQ(tensor_type.Layout().Dimensions(), ::litert::Dimensions({1, 7}));

  auto copied_data = CopyFromTensorBuffer2D<int>(tensor);
  EXPECT_TRUE(copied_data.HasValue());
  EXPECT_THAT((*copied_data)[0],
              ::testing::ElementsAre(90, 547, 58, 735, 210, 466, 2294));
}

TEST(TokenizerTest, TensorBufferToTokenIds) {
  auto tokenizer = std::make_unique<MockTokenizer>();

  const std::vector<int> ids = {90,  547, 58, 735, 210, 466, 2294,
                                224, 24,  8,  66,  246, 18,  2295};
  LITERT_ASSERT_OK_AND_ASSIGN(TensorBuffer tensor_buffer,
                              CopyToTensorBuffer<int>(ids, {2, 7}));
  LITERT_ASSERT_OK_AND_ASSIGN(auto tensor_buffer_type,
                              tensor_buffer.TensorType());
  EXPECT_EQ(tensor_buffer_type.Layout().Dimensions(),
            ::litert::Dimensions({2, 7}));

  auto token_ids = Tokenizer::TensorBufferToTokenIds(tensor_buffer);
  EXPECT_TRUE(token_ids.ok());
  EXPECT_EQ(token_ids.value().size(), 2);
  EXPECT_EQ(token_ids.value()[0],
            std::vector<int>({90, 547, 58, 735, 210, 466, 2294}));
  EXPECT_EQ(token_ids.value()[1],
            std::vector<int>({224, 24, 8, 66, 246, 18, 2295}));
}

TEST(TokenizerTest, TokenIdsToTexts) {
  auto tokenizer = std::make_unique<MockTokenizer>();
  EXPECT_CALL(*tokenizer, TokenIdsToText(::testing::_))
      .WillOnce(testing::Return("▁Hello▁World!"))
      .WillOnce(testing::Return("▁How's▁it▁going?"));

  const std::vector<std::vector<int>> ids = {{90, 547, 58, 735, 210, 466, 2294},
                                             {224, 24, 8, 66, 246, 18, 2295}};

  auto texts = tokenizer->TokenIdsToTexts(/*batch_size=*/2, ids);
  EXPECT_TRUE(texts.ok());
  EXPECT_EQ(texts.value().size(), 2);
  EXPECT_EQ(texts.value()[0].value(), "▁Hello▁World!");
  EXPECT_EQ(texts.value()[1].value(), "▁How's▁it▁going?");
}

TEST(TokenizerTest, TokenIdsToTextsWithIncompleteBPESequence) {
  auto tokenizer = std::make_unique<MockTokenizer>();
  EXPECT_CALL(*tokenizer, TokenIdsToText(::testing::_))
      .WillOnce(testing::Return(absl::DataLossError("Incomplete BPE sequence")))
      .WillOnce(testing::Return("▁How's▁it▁going?"));

  const std::vector<std::vector<int>> ids = {{90, 547, 58, 735, 210, 466, 2294},
                                             {224, 24, 8, 66, 246, 18, 2295}};

  auto texts = tokenizer->TokenIdsToTexts(/*batch_size=*/2, ids);
  EXPECT_TRUE(texts.ok());
  EXPECT_EQ(texts.value().size(), 2);
  EXPECT_EQ(texts.value()[0].status().code(), absl::StatusCode::kDataLoss);
  EXPECT_EQ(texts.value()[1].value(), "▁How's▁it▁going?");
}

TEST(TokenizerTest, TokenToId) {
  auto tokenizer = std::make_unique<MockTokenizer>();
  EXPECT_CALL(*tokenizer, TokenToId("X")).WillOnce(testing::Return(123));
  EXPECT_EQ(tokenizer->TokenToId("X").value(), 123);
}

TEST(TokenizerTest, MergeTokenIds) {
  const std::vector<std::vector<int>> previous_ids = {{90, 547, 58, 735},
                                                      {224, 24}};
  const std::vector<std::vector<int>> current_ids = {{210, 466, 2294},
                                                     {8, 66, 246, 18, 2295}};
  auto merged = Tokenizer::MergeTokenIds(previous_ids, current_ids);
  EXPECT_TRUE(merged.ok());
  EXPECT_EQ(merged->size(), 2);
  EXPECT_EQ((*merged)[0], std::vector<int>({90, 547, 58, 735, 210, 466, 2294}));
  EXPECT_EQ((*merged)[1], std::vector<int>({224, 24, 8, 66, 246, 18, 2295}));
}

TEST(TokenizerTest, HasBpeSuffix) {
  EXPECT_TRUE(Tokenizer::HasBpeSuffix("test\xef\xbf\xbd"));
  EXPECT_FALSE(Tokenizer::HasBpeSuffix("test"));
  EXPECT_FALSE(Tokenizer::HasBpeSuffix(""));
  EXPECT_FALSE(Tokenizer::HasBpeSuffix("\xef\xbf\xbdtest"));
}

}  // namespace
}  // namespace litert::lm