// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef THIRD_PARTY_ODML_LITERT_LM_RUNTIME_ENGINE_LITERT_LM_LIB_H_
#define THIRD_PARTY_ODML_LITERT_LM_RUNTIME_ENGINE_LITERT_LM_LIB_H_

#include <filesystem>
#include <fstream>
#include <ios>
#include <optional>
#include <set>
#include <string>
#include <vector>

#include "absl/base/log_severity.h"  // from @com_google_absl
#include "absl/log/log_entry.h"  // from @com_google_absl
#include "absl/log/log_sink.h"  // from @com_google_absl
#include "absl/status/status.h"  // from @com_google_absl
#include "absl/synchronization/mutex.h"  // from @com_google_absl
#include "runtime/engine/io_types.h"

namespace litert {
namespace lm {

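// An absl::LogSink that appends every log entry to the given file, creating
// parent directories as needed. A minimal registration sketch (assuming the
// sink registry from "absl/log/log_sink_registry.h" and an illustrative log
// file path):
//
//   auto sink = std::make_unique<FileLogSink>("/tmp/litert_lm/run.log");
//   absl::AddLogSink(sink.get());
//   // ... run inference; log output is appended to the file ...
//   absl::RemoveLogSink(sink.get());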
class FileLogSink : public absl::LogSink {
 public:
  explicit FileLogSink(const std::string& filename) {
    std::filesystem::path path(filename);
    if (path.has_parent_path()) {
      std::filesystem::create_directories(path.parent_path());
    }
    file_.open(filename, std::ios_base::app);
  }

  void Send(const absl::LogEntry& entry) override {
    absl::MutexLock lock(&mutex_);
    file_ << entry.text_message_with_prefix_and_newline();
  }

 private:
  absl::Mutex mutex_;
  std::ofstream file_;
};

// Input data type for GPU Convolution and Fully Connected operations.
enum class ConvType {
  kAuto,   // Either float32/16 or int8 depending on the model.
  kFloat,  // Either float32 or float16 depending on the activation data type.
  kInt8,   // int8 quantized. Better latency, with a risk of reduced accuracy.
};

struct LiteRtLmSettings {
  std::string backend = "gpu";
  std::optional<std::string> vision_backend = std::nullopt;
  std::optional<std::string> audio_backend = std::nullopt;
  std::string sampler_backend = "";
  std::string model_path;
  bool load_model_from_descriptor = false;
  std::string input_prompt = "What is the tallest building in the world?";
  std::optional<std::string> expected_output = std::nullopt;
  std::optional<std::string> log_sink_file = std::nullopt;
  int max_num_tokens = 0;
  int max_output_tokens = -1;
  int max_num_images = 0;
  absl::LogSeverity min_log_level = absl::LogSeverity::kInfo;
  std::set<int> prefill_batch_sizes;
  int num_output_candidates = 1;
  bool benchmark = false;
  int benchmark_prefill_tokens = 0;
  int benchmark_decode_tokens = 0;
  bool async = true;
  bool report_peak_memory_footprint = false;
  bool force_f32 = false;
  bool multi_turns = false;
  int num_cpu_threads = 0;
  // External tensor mode is disabled by default: decode runs slightly faster
  // when the layout changes optimized for GPU inference are performed by the
  // GPU rather than the CPU.
  bool gpu_external_tensor_mode = false;
  bool configure_magic_numbers = true;
  bool verify_magic_numbers = false;
  bool clear_kv_cache_before_prefill = true;
  int num_logits_to_print_after_decode = 0;
  std::optional<std::string> score_target_text = std::nullopt;
  bool gpu_madvise_original_shared_tensors = true;
  bool disable_cache = false;
  std::string cache_dir = "";
  int prefill_chunk_size = -1;
  std::string preferred_device_substr = "";
  int num_threads_to_upload = -1;
  int num_threads_to_compile = -1;
  bool convert_weights_on_gpu = true;
  bool wait_for_weights_conversion_complete_in_benchmark = true;
  bool optimize_shader_compilation = true;
  bool share_constant_tensors = true;
  // If true, use a Session instead of a Conversation to run the inference.
  // Note that the Session does not add the necessary prompt templates.
  bool use_session = false;
  int num_iterations = 1;
  std::string litert_dispatch_lib_dir = "";
  bool sampler_handles_input = true;
  ConvType conv_type = ConvType::kAuto;
  bool cache_compiled_shaders_only = false;
  std::string constraint_regex = "";
  bool use_submodel = false;
  bool enable_speculative_decoding = false;
};

struct LitertLmMetrics {
  std::optional<BenchmarkInfo> benchmark_info;
  float peak_mem_mb = 0.0f;
  float peak_private_mb = 0.0f;
};

// Runs the LLM inference with the given settings.
// If `metrics` is not null, it is populated with the metrics collected from
// the inference; the result of each iteration is saved in the vector.
absl::Status RunLiteRtLm(const LiteRtLmSettings& settings,
                         std::vector<LitertLmMetrics>* metrics = nullptr);
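
// A minimal usage sketch (the model path and the "cpu" backend value are
// illustrative assumptions, not values taken from this header):
//
//   LiteRtLmSettings settings;
//   settings.model_path = "/path/to/model.litertlm";
//   settings.backend = "cpu";
//   std::vector<LitertLmMetrics> metrics;
//   absl::Status status = RunLiteRtLm(settings, &metrics);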

}  // namespace lm
}  // namespace litert

#endif  // THIRD_PARTY_ODML_LITERT_LM_RUNTIME_ENGINE_LITERT_LM_LIB_H_