// Copyright 2025 The ODML Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "runtime/core/tasks.h" #include #include #include #include #include #include #include #include #include #include #include "absl/base/nullability.h" // from @com_google_absl #include "absl/functional/any_invocable.h" // from @com_google_absl #include "absl/log/absl_log.h" // from @com_google_absl #include "absl/status/status.h" // from @com_google_absl #include "absl/status/statusor.h" // from @com_google_absl #include "absl/strings/str_cat.h" // from @com_google_absl #include "absl/strings/str_replace.h" // from @com_google_absl #include "absl/strings/string_view.h" // from @com_google_absl #include "absl/types/span.h" // from @com_google_absl #include "litert/cc/litert_element_type.h" // from @litert #include "litert/cc/litert_macros.h" // from @litert #include "litert/cc/litert_tensor_buffer.h" // from @litert #include "runtime/components/constrained_decoding/constrained_decoder.h" #include "runtime/components/constrained_decoding/constraint.h" #include "runtime/components/sampler.h" #include "runtime/components/scoring_cpu_util.h" #include "runtime/components/stop_token_detector.h" #include "runtime/components/tokenizer.h" #include "runtime/engine/io_types.h" #include "runtime/executor/llm_executor.h" #include "runtime/executor/llm_executor_io_types.h" #include "runtime/executor/llm_executor_settings.h" #include "runtime/executor/llm_litert_compiled_model_executor.h" 
#include "runtime/proto/sampler_params.pb.h"
#include "runtime/util/convert_tensor_buffer.h"
#include "runtime/util/status_macros.h"  //NOLINT
#include "tflite/types/half.h"  // from @litert

// NOTE(review): Template/generic arguments appear to have been stripped from
// this file in transit (e.g. `absl::Span fp16_values`, `std::vector>`,
// `static_cast(...)`, and the empty `#include` lines near the top of the
// file). The tokens below are preserved exactly as received; restore the
// angle-bracket contents from version control before attempting to compile.

namespace litert::lm::Tasks {
namespace {

// Converts a span of fp16 values to a vector of fp32 values.
// `out` is resized to match `fp16_values` and overwritten element-wise.
// TODO: b/499304966 - move this to a common util file and add tests.
void ConvertFp16ToFp32(absl::Span fp16_values, std::vector& out) {
  out.resize(fp16_values.size());
  // NOTE(review): signed `int i` compared against the unsigned size() —
  // fine for realistic logits sizes, but a size_t index would be cleaner.
  for (int i = 0; i < fp16_values.size(); ++i) {
    out[i] = static_cast(fp16_values[i]);
  }
}

// TODO(b/423364170): all LLM Executors should respect the max number of tokens
// returned by the model. We should remove this default value once all Executors
// are compliant with the max number of tokens.
constexpr int kDefaultMaxNumTokens = 4096;

// Returns the executor's configured maximum number of tokens, or
// kDefaultMaxNumTokens when the executor settings are unavailable.
int TryGetMaxNumTokens(const LlmExecutor& executor) {
  auto settings = executor.GetExecutorSettings();
  if (!settings.ok()) {
    // If the executor settings are not available, we will use the default
    // value.
    ABSL_LOG(WARNING) << "Failed to get executor settings: "
                      << settings.status();
    return kDefaultMaxNumTokens;
  }
  return settings->GetMaxNumTokens();
}

// Check whether the decoding loop should stop.
//
// hit_stop_tokens: true when every candidate has produced its stop token.
// benchmark_decode_token_count: when > 0, decoding runs exactly this many
//   steps regardless of stop tokens (benchmark mode).
// num_decoded_steps: decode steps taken so far in this turn.
// current_step: the executor's absolute step (kv-cache position).
// max_num_tokens: kv-cache capacity; max_output_tokens: per-turn output cap.
bool ShouldStop(bool hit_stop_tokens, int benchmark_decode_token_count,
                int num_decoded_steps, int current_step, int max_num_tokens,
                int max_output_tokens) {
  // Stopping conditions.
  if (hit_stop_tokens && benchmark_decode_token_count == 0) {
    // Only early stop if no decode step
    // is requested by benchmark.
    return true;
  } else if (benchmark_decode_token_count > 0 &&
             num_decoded_steps >= benchmark_decode_token_count) {
    // Stop when the number of decode steps is equal to the
    // benchmark_decode_token_count (when specified).
    return true;
  } else if (current_step >= max_num_tokens) {
    // Reaching maximum number of kv-cache size.
    return true;
  } else if (num_decoded_steps >= max_output_tokens) {
    // Reaching maximum number of output tokens.
    return true;
  }
  return false;
}

// A wrapper class to run one step of the decode process, handling both internal
// and external sampling.
//
// "External sampling" means a sampler object is supplied and this class drives
// DecodeLogits + SampleToIdAndScoreBuffer itself; "internal sampling" means
// the executor's own Decode() performs sampling. The class also accumulates
// per-candidate text, tracks partial BPE sequences, and buffers tokens that
// may be the prefix of a stop token.
class DecodeOneStep {
 public:
  // Does not take ownership of any pointer argument. `constraint` may be
  // null (no constrained decoding). `benchmark_info` and `sampler` are
  // optional; a present sampler selects the external-sampling path.
  DecodeOneStep(LlmExecutor* absl_nonnull executor,
                Tokenizer* absl_nonnull tokenizer, int num_output_candidates,
                const StopTokenDetector& stop_token_detector,
                std::optional& benchmark_info, std::optional sampler,
                Constraint* constraint)
      : executor_(*executor),
        tokenizer_(*tokenizer),
        num_output_candidates_(num_output_candidates),
        sampler_(sampler),
        benchmark_info_(benchmark_info),
        stop_token_detector_(stop_token_detector) {
    if (constraint != nullptr) {
      constrained_decoder_ =
          std::make_unique(constraint, num_output_candidates_);
    }
    if (sampler_.has_value()) {
      // External sampling setup: one score slot per output candidate.
      auto scores_tensor = CreateTensorBuffer({num_output_candidates_});
      scores_tensor_ = std::move(*scores_tensor);
    }
    result_text_ = std::vector(num_output_candidates_, "");
    bpe_partial_token_ids_ = std::vector>(num_output_candidates_);
    pending_stop_tokens_ = std::vector>(num_output_candidates_);
  }

  // Runs one step of the decode process and returns if all stops for all
  // candidates have been found.
  // For external sampling, `decoded_ids` must be provided and will be updated.
  // For internal sampling, `decoded_ids` is ignored.
  //
  // Side effects: clears and refills result_text_ with this step's newly
  // decoded text per candidate, updates bpe_partial_token_ids_ /
  // pending_stop_tokens_, and (external sampling) refreshes scores_span_.
  absl::StatusOr Run(std::optional decoded_ids = std::nullopt) {
    ASSIGN_OR_RETURN(auto token_ids, DecodeAndSample(std::move(decoded_ids)));
    // All candidates must have produced the same number of tokens this step.
    size_t sequence_length = token_ids[0].size();
    for (size_t i = 1; i < token_ids.size(); ++i) {
      RET_CHECK_EQ(token_ids[i].size(), sequence_length)
          << "The current implementation of ProcessTokens() requires that "
             "latest_tokens must contain sequences of the same length.";
    }
    // result_text_ only holds the text produced by THIS call.
    for (int i = 0; i < num_output_candidates_; ++i) {
      result_text_[i].clear();
    }
    for (size_t step = 0; step < sequence_length; ++step) {
      // Re-slice the batch: one single-token sequence per candidate.
      std::vector> step_tokens;
      step_tokens.reserve(num_output_candidates_);
      for (int batch = 0; batch < num_output_candidates_; ++batch) {
        step_tokens.push_back({token_ids[batch][step]});
      }
      // Regardless of BPE, we always process the next tokens to detect stop
      // tokens.
      RETURN_IF_ERROR(stop_token_detector_.ProcessTokens(step_tokens));
      // Merge BPE partial token ids with the next token ids if any.
      ASSIGN_OR_RETURN(step_tokens, tokenizer_.MergeTokenIds(
                                        bpe_partial_token_ids_, step_tokens));
      auto decoded_result =
          tokenizer_.TokenIdsToTexts(num_output_candidates_, step_tokens);
      for (int i = 0; i < num_output_candidates_; ++i) {
        if (Tokenizer::IsIncompleteBpeSequence(decoded_result.value()[i])) {
          // Token(s) do not yet form a complete UTF-8/BPE unit — carry them
          // over so the next step can merge and re-decode them.
          bpe_partial_token_ids_[i] = step_tokens[i];
        } else if (!stop_token_detector_.GetStopTokensFound()[i]) {
          bpe_partial_token_ids_[i].clear();
          // Handle partial stop tokens.
          int max_length = stop_token_detector_.MaxPartialStopTokenLength(i);
          if (max_length > 0) {
            // This token might be the prefix of a multi-token stop sequence;
            // hold it back instead of emitting it.
            pending_stop_tokens_[i].push(decoded_result.value()[i].value());
          }
          // We only need the latest max_length tokens for partial stop tokens.
          // Add the extra ones to the result text and we could keep only the
          // latest max_length stop tokens in the queue.
          while (pending_stop_tokens_[i].size() > max_length) {
            result_text_[i] += pending_stop_tokens_[i].front();
            pending_stop_tokens_[i].pop();
          }
          // No partial stop token is found - add the current token to the
          // result text directly - this is the most common case.
          if (max_length == 0) {
            result_text_[i] += decoded_result.value()[i].value();
          }
        }
      }
      if (sampler_.has_value()) {
        // Refresh the view over the sampler-written score buffer.
        LITERT_ASSIGN_OR_RETURN(scores_span_,
                                ReferTensorBufferAsSpan(scores_tensor_));
      }
      is_first_step_ = false;
      ASSIGN_OR_RETURN(bool all_done, stop_token_detector_.AllDone());
      if (all_done) {
        if (step != sequence_length - 1) {
          // we are done before all the tokens are processed, so we need to
          // rollback the processed tokens in executor.
          // NOTE(review): diff = sequence_length - step also counts the
          // current (stop) step itself, i.e. one more than the strictly
          // unprocessed tokens — confirm this off-by-one is intentional.
          int diff = sequence_length - step;
          ASSIGN_OR_RETURN(int current_step, executor_.GetCurrentStep());
          RETURN_IF_ERROR(executor_.SetCurrentStep(current_step - diff));
        }
        return true;
      }
    }
    return false;
  }

  // Scores written by the external sampler for the last Run() step. Only
  // meaningful on the external-sampling path; the span aliases scores_tensor_.
  absl::Span GetScores() { return scores_span_; }

  // Per-candidate text produced by the most recent Run() call (not
  // cumulative across calls — Run() clears it on entry).
  const std::vector& GetResultText() const { return result_text_; }

  // This function is only supported for external sampling.
  // It computes the log likelihoods for the sampled ids corresponding to the
  // ids of a batch and returns it as a vector of floats.
  // step_input_ids: The ids corresponding to the input text for the batch.
  // decoded_ids: The decoded id tensor buffer into which `step_input_ids` are
  //   written (after the logits are computed) so that the next step decodes
  //   the reference text rather than a sampled token (teacher forcing).
  // Returns: A vector of log likelihoods for the sampled ids.
  // TODO: b/499304966 - Add tests for the float16 path.
  absl::StatusOr> RunScoreStep(
      const float temperature, const std::vector& step_input_ids,
      litert::TensorBuffer decoded_ids) {
    LITERT_ASSIGN_OR_RETURN(auto duplicate_decoded_ids,
                            decoded_ids.Duplicate());
    const ExecutorInputs inputs(
        ExecutorTextData(std::move(duplicate_decoded_ids)),
        /*vision_data=*/std::nullopt, /*audio_data=*/std::nullopt);
    // Decoding section.
    if (benchmark_info_.has_value()) {
      RETURN_IF_ERROR(benchmark_info_->TimeMarkDelta("executor_decode"));
    }
    ASSIGN_OR_RETURN(auto output_logits, executor_.DecodeLogits(inputs));
    if (benchmark_info_.has_value()) {
      RETURN_IF_ERROR(benchmark_info_->TimeMarkDelta("executor_decode"));
    }
    // Overwrite the id buffer with the reference ids AFTER computing logits,
    // so the following step consumes the ground-truth token.
    decoded_ids.Write(step_input_ids);
    LITERT_ASSIGN_OR_RETURN(auto logits_tensor_type,
                            output_logits.TensorType());
    auto logits_dims = logits_tensor_type.Layout().Dimensions();
    // Logits dims are {batch, seq, vocab}. For scoring, we expect batch size to
    // be the same as the input batch size, sequence length to be 1, and vocab
    // size to be the same as the tokenizer size.
    RET_CHECK_EQ(logits_dims.size(), 3)
        << "Output logits must have shape [batch, seq, vocab].";
    const int batch_size = step_input_ids.size();
    RET_CHECK_EQ(logits_dims[0], batch_size)
        << "Logits batch size does not match the input batch size.";
    RET_CHECK_EQ(logits_dims[1], 1) << "Scoring expects a single decode step.";
    // View over the fp32 logits; `logits_data_buffer` owns storage only when
    // a copy/conversion was required.
    absl::Span logits_data;
    std::vector logits_data_buffer;
    if (logits_tensor_type.ElementType() == litert::ElementType::Float32) {
      // Prefer a zero-copy view; fall back to copying when the buffer cannot
      // be referenced directly (e.g. non-host memory).
      auto logits_data_or = ReferTensorBufferAsSpan(output_logits);
      if (!logits_data_or) {
        LITERT_ASSIGN_OR_RETURN(logits_data_buffer,
                                CopyFromTensorBuffer(output_logits));
        logits_data = absl::MakeSpan(logits_data_buffer);
      } else {
        logits_data = *logits_data_or;
      }
    } else if (logits_tensor_type.ElementType() ==
               litert::ElementType::Float16) {
      // fp16 logits: copy out, then widen to fp32 for the likelihood math.
      LITERT_ASSIGN_OR_RETURN(auto logits_data_f16,
                              CopyFromTensorBuffer(output_logits));
      ConvertFp16ToFp32(absl::MakeConstSpan(logits_data_f16),
                        logits_data_buffer);
      logits_data = absl::MakeSpan(logits_data_buffer);
    } else {
      return absl::InvalidArgumentError(
          absl::StrCat("Unsupported logits element type for scoring: ",
                       logits_tensor_type.ElementType()));
    }
    RET_CHECK_EQ(logits_data.size(), batch_size * logits_dims[2])
        << "Logits buffer size does not match logits tensor shape.";
    return ComputeLogLikelihood(logits_data, step_input_ids, temperature);
  }

 private:
  // Runs the core decoding and sampling step, for either internal or external
  // sampling. Returns the next token IDs, one sequence per output candidate.
  // (NOTE(review): an earlier version of this comment said "a pointer to the
  // tensor buffer" — the code below returns decoded token ids, not a buffer.)
  absl::StatusOr>> DecodeAndSample(
      std::optional decoded_ids) {
    if (sampler_) {
      // External sampling path
      if (!decoded_ids) {
        return absl::InternalError(
            "decoded_ids must be provided for external sampling.");
      }
      LITERT_ASSIGN_OR_RETURN(auto duplicate_decoded_ids,
                              decoded_ids->Duplicate());
      ExecutorInputs inputs(ExecutorTextData(std::move(duplicate_decoded_ids)),
                            std::nullopt, std::nullopt);
      // Update constraint state only with decode ids.
      // If this is the first step, last_token_ids comes from prefill, therefore
      // should be ignored.
      if (!is_first_step_ && constrained_decoder_) {
        LITERT_ASSIGN_OR_RETURN(auto last_token_ids, decoded_ids->Duplicate());
        RETURN_IF_ERROR(
            constrained_decoder_->UpdateConstraintState(last_token_ids));
      }
      // Decoding section.
      if (benchmark_info_.has_value()) {
        RETURN_IF_ERROR(benchmark_info_->TimeMarkDelta("executor_decode"));
      }
      ASSIGN_OR_RETURN(auto output_logits, executor_.DecodeLogits(inputs));
      if (benchmark_info_.has_value()) {
        RETURN_IF_ERROR(benchmark_info_->TimeMarkDelta("executor_decode"));
      }
      // If constrained decoding is enabled, masks the logits based on the
      // constraint state.
      if (constrained_decoder_) {
        RETURN_IF_ERROR(constrained_decoder_->MaskLogits(output_logits));
      }
      // Samping section.
      if (benchmark_info_.has_value()) {
        RETURN_IF_ERROR(benchmark_info_->TimeMarkDelta("sampling"));
      }
      // Writes the sampled ids into `decoded_ids` (consumed next step) and
      // the per-candidate scores into scores_tensor_.
      RETURN_IF_ERROR(sampler_.value()->SampleToIdAndScoreBuffer(
          output_logits, decoded_ids.value(), &scores_tensor_));
      if (benchmark_info_.has_value()) {
        RETURN_IF_ERROR(benchmark_info_->TimeMarkDelta("sampling"));
      }
      ASSIGN_OR_RETURN(auto token_ids,
                       tokenizer_.TensorBufferToTokenIds(decoded_ids.value()));
      return token_ids;
    } else {
      // Internal sampling path
      // Benchmark executor_decode_and_sample section.
      if (benchmark_info_.has_value()) {
        RETURN_IF_ERROR(
            benchmark_info_->TimeMarkDelta("executor_decode_and_sample"));
      }
      std::vector> output_tokens;
      if (constrained_decoder_) {
        // Executor applies the constraint itself during sampling.
        auto decode_params = ExecutorDecodeParams();
        decode_params.SetConstraintDecoder(constrained_decoder_.get());
        ASSIGN_OR_RETURN(output_tokens, executor_.Decode(decode_params));
      } else {
        ASSIGN_OR_RETURN(output_tokens, executor_.Decode());
      }
      if (benchmark_info_.has_value()) {
        RETURN_IF_ERROR(
            benchmark_info_->TimeMarkDelta("executor_decode_and_sample"));
      }
      return output_tokens;
    }
  }

  LlmExecutor& executor_;    // Not owned; outlives this object.
  Tokenizer& tokenizer_;     // Not owned; outlives this object.
  const int num_output_candidates_;
  std::optional sampler_;    // Present => external sampling path.
  std::unique_ptr constrained_decoder_;  // Null when no constraint given.
  std::optional benchmark_info_;
  StopTokenDetector stop_token_detector_;
  // For external sampling.
  // Holds the scores for the output candidates. Dim: {num_output_candidates}
  litert::TensorBuffer scores_tensor_;
  absl::Span scores_span_;  // View into scores_tensor_; refreshed by Run().
  // Common state
  // Per-candidate token ids of an incomplete BPE/UTF-8 sequence, carried
  // across steps until decodable.
  std::vector> bpe_partial_token_ids_;
  // Per-candidate queue of decoded pieces withheld because they may be the
  // prefix of a stop sequence.
  std::vector> pending_stop_tokens_;
  // Per-candidate text emitted by the most recent Run() call.
  std::vector result_text_;
  bool is_first_step_ = true;
};

}  // namespace

// Prefills the executor's kv-cache with `inputs`. Validates that text token
// ids are present, non-empty, and within the token budget. Completion is
// always awaited in benchmark mode (to make timing meaningful); otherwise it
// follows `wait_for_completion`. Returns Responses(kDone) on success.
absl::StatusOr Prefill(
    LlmExecutor& executor, ExecutorInputs& inputs, bool wait_for_completion,
    std::optional& benchmark_info) {
  const int max_num_tokens = TryGetMaxNumTokens(executor);
  ASSIGN_OR_RETURN(auto text_data, inputs.GetTextDataPtr());
  RET_CHECK(text_data != nullptr) << "text_data must not be null.";
  LITERT_ASSIGN_OR_RETURN(auto token_id_tensor_type,
                          text_data->GetTokenIds().TensorType());
  // Number of tokens is the innermost dimension of the id tensor.
  auto num_tokens = token_id_tensor_type.Layout().Dimensions().back();
  if (num_tokens >= max_num_tokens) {
    return absl::InvalidArgumentError(absl::StrCat(
        "Input token ids are too long. Exceeding the maximum number of tokens "
        "allowed: ",
        num_tokens, " >= ", max_num_tokens));
  }
  LITERT_ASSIGN_OR_RETURN(
      auto ids_buffer_span,
      ReferTensorBufferAsSpan(text_data->GetTokenIds()));
  if (ids_buffer_span.empty()) {
    return absl::InternalError("Input token ids are empty.");
  }
  ExecutorPrefillParams params;
  // Wait for prefill to complete if benchmark mode is enabled.
  // NOTE(review): bitwise `|` on bools — behaviorally equivalent to `||`
  // here, but logical-or was presumably intended.
  params.SetWaitForCompletion(wait_for_completion | benchmark_info.has_value());
  if (benchmark_info.has_value()) {
    RETURN_IF_ERROR(benchmark_info->TimePrefillTurnStart());
  }
  RETURN_IF_ERROR(executor.Prefill(inputs, params));
  if (benchmark_info.has_value()) {
    RETURN_IF_ERROR(benchmark_info->TimePrefillTurnEnd(ids_buffer_span.size()));
  }
  return Responses(TaskState::kDone);
}

// Runs the decode loop until every candidate hits a stop token, the token
// budget is exhausted, the benchmark step count is reached, or `cancelled`
// is set.
//
// Streaming mode (callback != nullptr): per-step text/scores are delivered
// through `callback` and the returned Responses carries only a TaskState.
// Non-streaming mode: text is accumulated and (for external sampling) the
// returned per-candidate score is the mean per-token score.
// `decoded_ids` is required only for external sampling (sampler provided);
// it is duplicated each step and handed to DecodeOneStep.
absl::StatusOr Decode(
    LlmExecutor& executor, Tokenizer& tokenizer,
    const StopTokenDetector& stop_token_detector, int num_output_candidates,
    std::optional& benchmark_info, std::optional sampler,
    Constraint* constraint, std::optional decoded_ids,
    absl::AnyInvocable)>& callback,
    std::atomic* cancelled, int max_output_tokens) {
  const bool is_streaming = callback != nullptr;
  const bool is_custom_sampling = sampler.has_value();
  int benchmark_decode_token_count = 0;
  if (benchmark_info.has_value()) {
    // Initialize sampler early if the executor supports it.
    auto* compiled_model_executor = dynamic_cast(&executor);
    if (compiled_model_executor != nullptr) {
      // Best effort: failure to pre-initialize only affects timing, not
      // correctness.
      compiled_model_executor->InitializeSampler().IgnoreError();
    }
    benchmark_decode_token_count =
        benchmark_info->GetBenchmarkParams().num_decode_tokens();
    RETURN_IF_ERROR(benchmark_info->TimeDecodeTurnStart());
  }
  // The final decoded texts for each candidate.
  std::vector final_texts(num_output_candidates);
  // The final scores for each candidate.
  std::vector final_scores(num_output_candidates);
  // The accumulated scores for each candidate (for custom sampling).
  std::vector accumulated_scores(num_output_candidates);
  // The number of decoded tokens for each candidate (for custom sampling).
  std::vector num_decoded_tokens(num_output_candidates);
  ASSIGN_OR_RETURN(int executor_step_before_decode, executor.GetCurrentStep());
  const int max_num_tokens = TryGetMaxNumTokens(executor);
  DecodeOneStep run_one_step(&executor, &tokenizer, num_output_candidates,
                             stop_token_detector, benchmark_info, sampler,
                             constraint);
  while (true) {
    if (cancelled != nullptr && cancelled->load()) {
      if (benchmark_info.has_value()) {
        ASSIGN_OR_RETURN(int current_step, executor.GetCurrentStep());
        int num_decode_steps = current_step - executor_step_before_decode;
        // If the process is cancelled, we need to end this benchmark phase.
        RETURN_IF_ERROR(benchmark_info->TimeDecodeTurnEnd(
            num_decode_steps * num_output_candidates));
      }
      if (is_custom_sampling) {
        // For external sampling, the sampled tokens are provided by the
        // sampler. We must run one prefill to add the last token as pending
        // token in the LLM Executor when cancellation happens.
        LITERT_ASSIGN_OR_RETURN(auto duplicated_decoded_ids,
                                decoded_ids->Duplicate());
        ExecutorInputs inputs;
        inputs.SetTextData(ExecutorTextData(std::move(duplicated_decoded_ids)));
        std::optional unused_benchmark_info;
        // Rewind one step so the re-prefilled token lands on the position it
        // was originally decoded at.
        ASSIGN_OR_RETURN(auto current_step, executor.GetCurrentStep());
        RETURN_IF_ERROR(executor.SetCurrentStep(current_step - 1));
        auto status = Prefill(executor, inputs, /*wait_for_completion=*/true,
                              unused_benchmark_info);
        if (!status.ok()) {
          return status.status();
        }
      }
      return absl::CancelledError("Process cancelled.");
    }
    // Each step gets its own duplicate of the id buffer (external sampling).
    std::optional decoded_ids_to_use = std::nullopt;
    if (decoded_ids.has_value()) {
      LITERT_ASSIGN_OR_RETURN(decoded_ids_to_use, decoded_ids->Duplicate());
    }
    absl::StatusOr all_done =
        run_one_step.Run(std::move(decoded_ids_to_use));
    if (!all_done.ok()) {
      return all_done.status();
    }
    std::vector step_texts;
    std::vector step_scores;
    if (is_streaming) {
      step_texts.resize(num_output_candidates);
      step_scores.resize(num_output_candidates);
    }
    bool any_updates = false;
    for (int j = 0; j < num_output_candidates; ++j) {
      std::string output_text = run_one_step.GetResultText()[j];
      if (output_text.empty()) {
        // No output text for this candidate - could be due to
        // 1. early stopping.
        // 2. partial BPE sequence.
        // 3. matching partial stop tokens.
        continue;
      }
      any_updates = true;
      // The tokenizer may return a token with a special character "▁" that
      // should be replaced with a space.
      std::string result_text = absl::StrReplaceAll(output_text, {{"▁", " "}});
      if (is_streaming) {
        step_texts[j] = result_text;
        if (is_custom_sampling) {
          step_scores[j] = run_one_step.GetScores()[j];
        }
      } else {
        final_texts[j] += result_text;
        if (is_custom_sampling) {
          accumulated_scores[j] += run_one_step.GetScores()[j];
          num_decoded_tokens[j]++;
        }
      }
    }
    // Only invoke the callback when at least one candidate produced text.
    if (is_streaming && any_updates) {
      callback(Responses(TaskState::kProcessing, std::move(step_texts),
                         std::move(step_scores)));
    }
    ASSIGN_OR_RETURN(int current_step, executor.GetCurrentStep());
    int num_decode_steps = current_step - executor_step_before_decode;
    if (ShouldStop(*all_done, benchmark_decode_token_count, num_decode_steps,
                   current_step, max_num_tokens, max_output_tokens)) {
      break;
    }
  }
  int num_decode_steps =
      executor.GetCurrentStep().value() - executor_step_before_decode;
  if (benchmark_info.has_value()) {
    RETURN_IF_ERROR(benchmark_info->TimeDecodeTurnEnd(num_decode_steps *
                                                      num_output_candidates));
  }
  if (is_custom_sampling) {
    // For external sampling, the sampled tokens are provided by the sampler. We
    // must run one prefill to add the stop token as pending token in the LLM
    // Executor when stop condition is met.
    LITERT_ASSIGN_OR_RETURN(auto duplicated_decoded_ids,
                            decoded_ids->Duplicate());
    ExecutorInputs inputs;
    inputs.SetTextData(ExecutorTextData(std::move(duplicated_decoded_ids)));
    std::optional unused_benchmark_info;
    ASSIGN_OR_RETURN(auto current_step, executor.GetCurrentStep());
    RETURN_IF_ERROR(executor.SetCurrentStep(current_step - 1));
    auto status = Prefill(executor, inputs, /*wait_for_completion=*/true,
                          unused_benchmark_info);
    if (!status.ok()) {
      return status.status();
    }
  }
  if (is_streaming) {
    // Streaming already delivered all text via the callback; only report the
    // terminal state here.
    if (executor.GetCurrentStep().value() >= max_num_tokens) {
      return Responses(TaskState::kMaxNumTokensReached);
    }
    return Responses(TaskState::kDone);
  }
  // Finalize scores for non-streaming custom sampling.
  if (is_custom_sampling) {
    for (int j = 0; j < num_output_candidates; ++j) {
      if (num_decoded_tokens[j] > 0) {
        // Mean per-token score for this candidate.
        final_scores[j] = accumulated_scores[j] / num_decoded_tokens[j];
      } else {
        // Candidate produced no tokens: worst possible score.
        final_scores[j] = -std::numeric_limits::infinity();
      }
    }
  }
  TaskState task_state = executor.GetCurrentStep().value() >= max_num_tokens
                             ? TaskState::kMaxNumTokensReached
                             : TaskState::kDone;
  return Responses(std::move(task_state), std::move(final_texts),
                   std::move(final_scores));
}

// Scores `target_texts` against the model by teacher forcing: each target is
// tokenized, the batch is stepped through the model one token position at a
// time (shorter targets are padded with the null token 0, whose steps are
// excluded from the score), and per-token log likelihoods from
// DecodeOneStep::RunScoreStep are summed into one score per target.
// When `store_token_lengths` is true, the returned Responses also carries the
// token length of each target. Per-token scores are always attached.
absl::StatusOr Score(
    LlmExecutor& executor, Tokenizer& tokenizer,
    const std::vector& target_texts, const float temperature,
    litert::TensorBuffer decoded_ids, bool store_token_lengths) {
  const int num_output_candidates = target_texts.size();
  const int max_num_tokens = TryGetMaxNumTokens(executor);
  std::optional benchmark_info;
  // Create a dummy StopTokenDetector as it's not used in ScoreCustomSampling.
  StopTokenDetector dummy_stop_token_detector(num_output_candidates);
  DecodeOneStep run_one_step(&executor, &tokenizer,
                             /*num_output_candidates=*/num_output_candidates,
                             dummy_stop_token_detector, benchmark_info,
                             /*sampler=*/std::nullopt, /*constraint=*/nullptr);
  std::vector> ids_for_each_target_in_batch;
  ids_for_each_target_in_batch.reserve(target_texts.size());
  // Longest target in tokens — determines how many score steps we run.
  int max_num_tokens_of_target_texts = 0;
  for (const auto& target : target_texts) {
    ASSIGN_OR_RETURN(std::vector ids, tokenizer.TextToTokenIds(target));
    max_num_tokens_of_target_texts = std::max(max_num_tokens_of_target_texts,
                                              static_cast(ids.size()));
    ids_for_each_target_in_batch.push_back(std::move(ids));
  }
  if (max_num_tokens_of_target_texts >= max_num_tokens) {
    return absl::InvalidArgumentError(
        absl::StrCat("Input token ids are too long. "
                     "Exceeding the maximum number of tokens allowed: ",
                     max_num_tokens_of_target_texts, " >= ", max_num_tokens));
  }
  // The scores for each candidate. The scores are accumulated over the course
  // of the decoding process.
  std::vector scores(num_output_candidates);
  std::vector> token_scores(num_output_candidates);
  // We support multiple targets by padding the targets with a null token which
  // does not exist in the vocabulary and thus does not contribute to the
  // perplexity.
  std::vector decoded_ids_for_each_target_in_batch(num_output_candidates,
                                                   0);
  for (int i = 0; i < max_num_tokens_of_target_texts; ++i) {
    // Assemble the i-th token of every target (null token 0 past its end).
    for (int j = 0; j < num_output_candidates; ++j) {
      const int size_of_jth_target = ids_for_each_target_in_batch[j].size();
      if (i < size_of_jth_target) {
        decoded_ids_for_each_target_in_batch[j] =
            ids_for_each_target_in_batch[j][i];
      } else {
        // Pad the target with a null token. Ignore the result at this step.
        decoded_ids_for_each_target_in_batch[j] = 0;
      }
    }
    LITERT_ASSIGN_OR_RETURN(auto decoded_ids_copy, decoded_ids.Duplicate());
    ASSIGN_OR_RETURN(std::vector step_log_likelihoods,
                     run_one_step.RunScoreStep(
                         temperature, decoded_ids_for_each_target_in_batch,
                         std::move(decoded_ids_copy)));
    for (int j = 0; j < num_output_candidates; ++j) {
      const int size_of_jth_target = ids_for_each_target_in_batch[j].size();
      // Only add the log likelihood of the non-padded tokens to the score.
      if (i < size_of_jth_target) {
        scores[j] += step_log_likelihoods[j];
        token_scores[j].push_back(step_log_likelihoods[j]);
      }
    }
  }
  std::vector token_lengths;
  if (store_token_lengths) {
    // Store the token lengths of the target texts for each candidate into
    // `Responses`. This is optional.
    token_lengths.reserve(num_output_candidates);
    for (int j = 0; j < num_output_candidates; ++j) {
      token_lengths.push_back(ids_for_each_target_in_batch[j].size());
    }
  }
  auto responses = Responses(TaskState::kDone, /*response_texts=*/{},
                             std::move(scores), std::move(token_lengths));
  responses.GetMutableTokenScores() = std::move(token_scores);
  return responses;
}

}  // namespace litert::lm::Tasks