waltgrace committed
Commit 6ce9fdd · verified · Parent(s): 6123ae4

Add common/common.h

Files changed (1)
  1. common/common.h +999 -0
common/common.h ADDED
@@ -0,0 +1,999 @@
1
+ // Various helper functions and utilities
2
+
3
+ #pragma once
4
+
5
+ #include "ggml-opt.h"
6
+ #include "ggml.h"
7
+ #include "llama-cpp.h"
8
+
9
+ #include <set>
10
+ #include <sstream>
11
+ #include <string>
12
+ #include <string_view>
13
+ #include <variant>
14
+ #include <vector>
15
+ #include <map>
16
+
17
+ #if defined(_WIN32) && !defined(_WIN32_WINNT)
18
+ #define _WIN32_WINNT 0x0A00
19
+ #endif
20
+
21
+ #ifdef _WIN32
22
+ #define DIRECTORY_SEPARATOR '\\'
23
+ #else
24
+ #define DIRECTORY_SEPARATOR '/'
25
+ #endif // _WIN32
26
+
27
+ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
28
+ #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
29
+
30
+ #define print_build_info() do { \
31
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
32
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
33
+ } while(0)
34
+
35
+ struct common_time_meas {
36
+ common_time_meas(int64_t & t_acc, bool disable = false);
37
+ ~common_time_meas();
38
+
39
+ const int64_t t_start_us;
40
+
41
+ int64_t & t_acc;
42
+ };
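
common_time_meas is an RAII-style scope timer: construction records a start timestamp and destruction adds the elapsed time (microseconds, per the _us suffix) to the referenced accumulator unless disabled. A minimal usage sketch, with a hypothetical workload in place of real code:

    #include "common.h"   // assumed include path
    #include <cstdio>

    static void timed_section() {
        static int64_t t_work_us = 0;          // accumulated time across calls, in microseconds
        {
            common_time_meas tm(t_work_us);    // timer starts here
            // ... do the work being measured ...
        }                                      // timer stops and accumulates when tm goes out of scope
        fprintf(stderr, "total work time: %.2f ms\n", t_work_us / 1000.0);
    }
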
43
+
44
+ struct common_adapter_lora_info {
45
+ std::string path;
46
+ float scale;
47
+
48
+ std::string task_name;
49
+ std::string prompt_prefix;
50
+
51
+ struct llama_adapter_lora * ptr;
52
+ };
53
+
54
+ using llama_tokens = std::vector<llama_token>;
55
+
56
+ // build info
57
+ extern int LLAMA_BUILD_NUMBER;
58
+ extern const char * LLAMA_COMMIT;
59
+ extern const char * LLAMA_COMPILER;
60
+ extern const char * LLAMA_BUILD_TARGET;
61
+
62
+ const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
63
+
64
+ struct common_control_vector_load_info;
65
+
66
+ //
67
+ // CPU utils
68
+ //
69
+
70
+ struct cpu_params {
71
+ int n_threads = -1;
72
+ bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
73
+ bool mask_valid = false; // Default: any CPU
74
+ enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
75
+ bool strict_cpu = false; // Use strict CPU placement
76
+ uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
77
+ };
78
+
79
+ int32_t cpu_get_num_physical_cores();
80
+ int32_t cpu_get_num_math();
81
+
82
+ //
83
+ // Common params
84
+ //
85
+
86
+ enum llama_example {
87
+ LLAMA_EXAMPLE_BATCHED,
88
+ LLAMA_EXAMPLE_DEBUG,
89
+ LLAMA_EXAMPLE_COMMON,
90
+ LLAMA_EXAMPLE_SPECULATIVE,
91
+ LLAMA_EXAMPLE_COMPLETION,
92
+ LLAMA_EXAMPLE_CLI,
93
+ LLAMA_EXAMPLE_EMBEDDING,
94
+ LLAMA_EXAMPLE_PERPLEXITY,
95
+ LLAMA_EXAMPLE_RETRIEVAL,
96
+ LLAMA_EXAMPLE_PASSKEY,
97
+ LLAMA_EXAMPLE_IMATRIX,
98
+ LLAMA_EXAMPLE_BENCH,
99
+ LLAMA_EXAMPLE_SERVER,
100
+ LLAMA_EXAMPLE_CVECTOR_GENERATOR,
101
+ LLAMA_EXAMPLE_EXPORT_LORA,
102
+ LLAMA_EXAMPLE_MTMD,
103
+ LLAMA_EXAMPLE_LOOKUP,
104
+ LLAMA_EXAMPLE_PARALLEL,
105
+ LLAMA_EXAMPLE_TTS,
106
+ LLAMA_EXAMPLE_DIFFUSION,
107
+ LLAMA_EXAMPLE_FINETUNE,
108
+ LLAMA_EXAMPLE_FIT_PARAMS,
109
+ LLAMA_EXAMPLE_RESULTS,
110
+ LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
111
+
112
+ LLAMA_EXAMPLE_COUNT,
113
+ };
114
+
115
+ enum common_sampler_type {
116
+ COMMON_SAMPLER_TYPE_NONE = 0,
117
+ COMMON_SAMPLER_TYPE_DRY = 1,
118
+ COMMON_SAMPLER_TYPE_TOP_K = 2,
119
+ COMMON_SAMPLER_TYPE_TOP_P = 3,
120
+ COMMON_SAMPLER_TYPE_MIN_P = 4,
121
+ //COMMON_SAMPLER_TYPE_TFS_Z = 5,
122
+ COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
123
+ COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
124
+ COMMON_SAMPLER_TYPE_XTC = 8,
125
+ COMMON_SAMPLER_TYPE_INFILL = 9,
126
+ COMMON_SAMPLER_TYPE_PENALTIES = 10,
127
+ COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
128
+ COMMON_SAMPLER_TYPE_ADAPTIVE_P = 12,
129
+ };
130
+
131
+ // dimensionality reduction methods, used by cvector-generator
132
+ enum dimre_method {
133
+ DIMRE_METHOD_PCA,
134
+ DIMRE_METHOD_MEAN,
135
+ };
136
+
137
+ enum common_conversation_mode {
138
+ COMMON_CONVERSATION_MODE_DISABLED = 0,
139
+ COMMON_CONVERSATION_MODE_ENABLED = 1,
140
+ COMMON_CONVERSATION_MODE_AUTO = 2,
141
+ };
142
+
143
+ enum common_grammar_trigger_type {
144
+ COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
145
+ COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
146
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
147
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
148
+ };
149
+
150
+ struct common_grammar_trigger {
151
+ common_grammar_trigger_type type;
152
+ std::string value;
153
+ llama_token token = LLAMA_TOKEN_NULL;
154
+ };
155
+
156
+ enum common_params_sampling_config : uint64_t {
157
+ COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS = 1 << 0,
158
+ COMMON_PARAMS_SAMPLING_CONFIG_TOP_K = 1 << 1,
159
+ COMMON_PARAMS_SAMPLING_CONFIG_TOP_P = 1 << 2,
160
+ COMMON_PARAMS_SAMPLING_CONFIG_MIN_P = 1 << 3,
161
+ COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
162
+ COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD = 1 << 5,
163
+ COMMON_PARAMS_SAMPLING_CONFIG_TEMP = 1 << 6,
164
+ COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N = 1 << 7,
165
+ COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT = 1 << 8,
166
+ COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT = 1 << 9,
167
+ COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU = 1 << 10,
168
+ COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA = 1 << 11,
169
+ };
170
+
171
+ enum common_speculative_type {
172
+ COMMON_SPECULATIVE_TYPE_NONE, // no speculative decoding
173
+ COMMON_SPECULATIVE_TYPE_DRAFT, // draft model
174
+ COMMON_SPECULATIVE_TYPE_EAGLE3, // eagle draft model
175
+ COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
176
+ COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
177
+ COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
178
+ COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
179
+ COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache
180
+ COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
181
+ };
182
+
183
+ // Grammar type enumeration
184
+ enum common_grammar_type {
185
+ COMMON_GRAMMAR_TYPE_NONE, // no grammar set
186
+ COMMON_GRAMMAR_TYPE_USER, // user-provided GBNF (--grammar / "grammar" API field)
187
+ COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, // auto-generated from JSON schema (--json-schema / "json_schema" API field)
188
+ COMMON_GRAMMAR_TYPE_TOOL_CALLS, // auto-generated by chat template parser for function calling
189
+ };
190
+
191
+ // Grammar variant struct with type and grammar string
192
+ struct common_grammar {
193
+ common_grammar_type type = COMMON_GRAMMAR_TYPE_NONE;
194
+ std::string grammar;
195
+
196
+ // Default constructor - no grammar
197
+ common_grammar() = default;
198
+
199
+ // Constructor with type and grammar string
200
+ common_grammar(common_grammar_type t, std::string g) : type(t), grammar(std::move(g)) {
201
+ GGML_ASSERT(type != COMMON_GRAMMAR_TYPE_NONE || !grammar.empty());
202
+ }
203
+
204
+ // Check if a grammar is set
205
+ bool empty() const { return type == COMMON_GRAMMAR_TYPE_NONE || grammar.empty(); }
206
+ };
207
+
208
+ // Returns the raw grammar string, or empty string if no grammar is set.
209
+ inline const std::string & common_grammar_value(const common_grammar & g) {
210
+ return g.grammar;
211
+ }
212
+
213
+ // Returns true when the generation_prompt should be prefilled into the grammar sampler.
214
+ // Only output-format and tool-call grammars need prefill; user-supplied grammars must not be prefilled.
215
+ inline bool common_grammar_needs_prefill(const common_grammar & g) {
216
+ return g.type == COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT
217
+ || g.type == COMMON_GRAMMAR_TYPE_TOOL_CALLS;
218
+ }
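
To illustrate how the grammar type drives these helpers, a small sketch with a toy GBNF string (hypothetical content):

    #include "common.h"   // assumed include path

    static void grammar_example() {
        // an auto-generated output-format grammar (e.g. derived from a JSON schema)
        common_grammar g(COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, "root ::= \"{\" \"}\"");

        if (!g.empty()) {
            const std::string & gbnf = common_grammar_value(g);    // raw GBNF text
            const bool prefill = common_grammar_needs_prefill(g);  // true: output-format grammars get the
                                                                   // generation_prompt prefilled
            (void) gbnf; (void) prefill;
        }
    }
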
219
+
220
+ // sampling parameters
221
+ struct common_params_sampling {
222
+ uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
223
+
224
+ int32_t n_prev = 64; // number of previous tokens to remember
225
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
226
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
227
+ int32_t top_k = 40; // <= 0 to use vocab size
228
+ float top_p = 0.95f; // 1.0 = disabled
229
+ float min_p = 0.05f; // 0.0 = disabled
230
+ float xtc_probability = 0.00f; // 0.0 = disabled
231
+ float xtc_threshold = 0.10f; // > 0.5 disables XTC
232
+ float typ_p = 1.00f; // typical_p, 1.0 = disabled
233
+ float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
234
+ float dynatemp_range = 0.00f; // 0.0 = disabled
235
+ float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
236
+ int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
237
+ float penalty_repeat = 1.00f; // 1.0 = disabled
238
+ float penalty_freq = 0.00f; // 0.0 = disabled
239
+ float penalty_present = 0.00f; // 0.0 = disabled
240
+ float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
241
+ float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
242
+ int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
243
+ int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
244
+ float adaptive_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
245
+ float adaptive_decay = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
246
+ int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
247
+ float top_n_sigma = -1.00f; // -1.0 = disabled
248
+ float mirostat_tau = 5.00f; // target entropy
249
+ float mirostat_eta = 0.10f; // learning rate
250
+ bool ignore_eos = false;
251
+ bool no_perf = false; // disable performance metrics
252
+ bool timing_per_token = false;
253
+
254
+ uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
255
+
256
+ std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
257
+
258
+ std::vector<enum common_sampler_type> samplers = {
259
+ COMMON_SAMPLER_TYPE_PENALTIES,
260
+ COMMON_SAMPLER_TYPE_DRY,
261
+ COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
262
+ COMMON_SAMPLER_TYPE_TOP_K,
263
+ COMMON_SAMPLER_TYPE_TYPICAL_P,
264
+ COMMON_SAMPLER_TYPE_TOP_P,
265
+ COMMON_SAMPLER_TYPE_MIN_P,
266
+ COMMON_SAMPLER_TYPE_XTC,
267
+ COMMON_SAMPLER_TYPE_TEMPERATURE,
268
+ };
269
+
270
+ common_grammar grammar; // optional grammar constraint (user / output-format / tool-calls)
271
+ bool grammar_lazy = false;
272
+ std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
273
+ std::set<llama_token> preserved_tokens;
274
+
275
+ std::vector<llama_logit_bias> logit_bias; // logit biases to apply
276
+ std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
277
+
278
+ // The assistant generation prompt already prefilled into the prompt.
279
+ // Fed to the grammar sampler (to advance past pre-existing tokens) and used
280
+ // to determine the reasoning budget sampler's initial state.
281
+ // Only applied when the grammar is of output-format or tool-calls type.
282
+ std::string generation_prompt;
283
+
284
+ // reasoning budget sampler parameters
285
+ // these are populated by the server/CLI based on chat template params
286
+ int32_t reasoning_budget_tokens = -1; // -1 = disabled, >= 0 = token budget
287
+ std::vector<llama_token> reasoning_budget_start; // start tag token sequence
288
+ std::vector<llama_token> reasoning_budget_end; // end tag token sequence
289
+ std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
290
+
291
+ bool backend_sampling = false;
292
+
293
+ bool has_logit_bias() const {
294
+ return !logit_bias.empty();
295
+ }
296
+
297
+ // print the parameters into a string
298
+ std::string print() const;
299
+ };
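
A sketch of overriding a few of the defaults above and recording which fields the user actually set via the config bitfield (the bitfield usage is an assumption based on the comment on user_sampling_config):

    #include "common.h"   // assumed include path
    #include <cstdio>

    static void sampling_example() {
        common_params_sampling sp;

        sp.temp  = 0.2f;
        sp.top_k = 20;
        sp.user_sampling_config |= COMMON_PARAMS_SAMPLING_CONFIG_TEMP |
                                   COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;   // mark as user-specified

        sp.grammar = common_grammar(COMMON_GRAMMAR_TYPE_USER, "root ::= \"yes\" | \"no\"");

        fprintf(stderr, "%s\n", sp.print().c_str());                      // dump the effective parameters
    }
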
300
+
301
+ struct common_params_model {
302
+ std::string path = ""; // model local path // NOLINT
303
+ std::string url = ""; // model url to download // NOLINT
304
+ std::string hf_repo = ""; // HF repo // NOLINT
305
+ std::string hf_file = ""; // HF file // NOLINT
306
+ std::string docker_repo = ""; // Docker repo // NOLINT
307
+ std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
308
+ };
309
+
310
+ struct common_ngram_mod;
311
+
312
+ struct common_params_speculative {
313
+ common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
314
+
315
+ // general-purpose speculative decoding parameters
316
+
317
+ int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
318
+ int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
319
+ float p_split = 0.1f; // speculative decoding split probability
320
+ float p_min = 0.75f; // minimum speculative decoding probability (greedy)
321
+
322
+ // ngram-based speculative decoding
323
+
324
+ uint16_t ngram_size_n = 12; // ngram size for lookup
325
+ uint16_t ngram_size_m = 48; // mgram size for speculative tokens
326
+ uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
327
+
328
+ std::shared_ptr<common_ngram_mod> ngram_mod;
329
+
330
+ std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
331
+ std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
332
+
333
+ // draft-model speculative decoding
334
+
335
+ struct common_params_model mparams_dft;
336
+
337
+ llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts
338
+
339
+ llama_context_params cparams_dft; // these are the parameters for the draft llama_context
340
+
341
+ int32_t n_ctx = 0; // draft context size
342
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
343
+
344
+ ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
345
+ ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
346
+
347
+ struct cpu_params cpuparams;
348
+ struct cpu_params cpuparams_batch;
349
+
350
+ std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
351
+
352
+ std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
353
+ std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
354
+
355
+ bool has_dft() const {
356
+ return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
357
+ }
358
+ };
359
+
360
+ struct common_params_vocoder {
361
+ struct common_params_model model;
362
+
363
+ std::string speaker_file = ""; // speaker file path // NOLINT
364
+
365
+ bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
366
+ };
367
+
368
+ struct common_params_diffusion {
369
+ int32_t steps = 128;
370
+ bool visual_mode = false;
371
+
372
+ float eps = 0; // epsilon for timesteps
373
+ int32_t block_length = 0; // block length for generation
374
+
375
+ int32_t algorithm = 4; // default algorithm: low-confidence
376
+ float alg_temp = 0.0f; // algorithm temperature
377
+
378
+ float cfg_scale = 0; // classifier-free guidance scale
379
+ bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
380
+ };
381
+
382
+ // reasoning API response format (not to be confused with the chat template's reasoning format)
383
+ // only used by server
384
+ enum common_reasoning_format {
385
+ COMMON_REASONING_FORMAT_NONE,
386
+ COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
387
+ COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
388
+ COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
389
+ // do not extend this enum unless you absolutely have to
390
+ // in most cases, use COMMON_REASONING_FORMAT_AUTO
391
+ // see: https://github.com/ggml-org/llama.cpp/pull/15408
392
+ };
393
+
394
+
395
+ struct lr_opt {
396
+ float lr0 = 1e-5; // learning rate at first epoch
397
+ float lr_min = -1;
398
+ float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
399
+ float scale_epoch = 0;
400
+ float wd = 0;
401
+ unsigned epochs = 2;
402
+
403
+ unsigned epoch; // set by optimizer outer (epochs) loop
404
+ // learning rate decay - constant LR per epoch only for now
405
+ float get_lr(float e) const;
406
+ float get_lr() const { return get_lr(epoch); }
407
+ // must call after arg parse, before get_lr
408
+ void init();
409
+ };
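
Usage follows the comments: initialize the struct after argument parsing, then query the learning rate from the outer epoch loop (sketch; the concrete decay schedule lives in the implementation, not in this header):

    #include "common.h"   // assumed include path

    static void lr_schedule_example() {
        lr_opt lr;
        lr.lr0          = 1e-4f;
        lr.lr_min       = 1e-5f;
        lr.decay_epochs = 10;
        lr.epochs       = 20;
        lr.init();                       // must be called after arg parsing, before get_lr()

        for (unsigned e = 0; e < lr.epochs; ++e) {
            lr.epoch = e;                // normally set by the optimizer's outer (epochs) loop
            const float cur_lr = lr.get_lr();
            // ... configure the optimizer step with cur_lr ...
            (void) cur_lr;
        }
    }
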
410
+
411
+ struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
412
+
413
+ struct common_params {
414
+ int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit
415
+ int32_t n_ctx = 0; // context size, 0 == context the model was trained with
416
+ int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
417
+ int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
418
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
419
+ int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
420
+ int32_t n_parallel = 1; // number of parallel sequences to decode
421
+ int32_t n_sequences = 1; // number of sequences to decode
422
+ int32_t grp_attn_n = 1; // group-attention factor
423
+ int32_t grp_attn_w = 512; // group-attention width
424
+ int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
425
+ float rope_freq_base = 0.0f; // RoPE base frequency
426
+ float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
427
+ float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
428
+ float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
429
+ float yarn_beta_fast = -1.0f; // YaRN low correction dim
430
+ float yarn_beta_slow = -1.0f; // YaRN high correction dim
431
+ int32_t yarn_orig_ctx = 0; // YaRN original context length
432
+
433
+ // offload params
434
+ std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
435
+
436
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
437
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
438
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
439
+ bool fit_params = true; // whether to fit unset model/context parameters to free device memory
440
+ int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
441
+
442
+ // margin per device in bytes for fitting parameters to free memory:
443
+ std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
444
+
445
+ enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
446
+
447
+ struct cpu_params cpuparams;
448
+ struct cpu_params cpuparams_batch;
449
+
450
+ ggml_backend_sched_eval_callback cb_eval = nullptr;
451
+ void * cb_eval_user_data = nullptr;
452
+
453
+ ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
454
+
455
+ enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
456
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
457
+ enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
458
+ enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
459
+
460
+ struct common_params_sampling sampling;
461
+ struct common_params_speculative speculative;
462
+ struct common_params_vocoder vocoder;
463
+ struct common_params_diffusion diffusion;
464
+
465
+ struct common_params_model model;
466
+
467
+ std::set<std::string> model_alias; // model aliases // NOLINT
468
+ std::set<std::string> model_tags; // model tags (informational, not used for routing) // NOLINT
469
+ std::string hf_token = ""; // HF token // NOLINT
470
+ std::string prompt = ""; // NOLINT
471
+ std::string system_prompt = ""; // NOLINT
472
+ std::string prompt_file = ""; // store the external prompt file name // NOLINT
473
+ std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
474
+ std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
475
+ std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
476
+ std::string logits_file = ""; // file for saving *all* logits // NOLINT
477
+
478
+ // llama-debug specific options
479
+ std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
480
+ bool save_logits = false; // whether to save logits to files // NOLINT
481
+ std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT
482
+
483
+ std::vector<std::string> in_files; // all input files
484
+ std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
485
+ std::vector<llama_model_kv_override> kv_overrides;
486
+ std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
487
+
488
+ bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
489
+ std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
490
+
491
+ std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
492
+
493
+ int32_t verbosity = 3; // LOG_LEVEL_INFO
494
+ int32_t control_vector_layer_start = -1; // layer range for control vector
495
+ int32_t control_vector_layer_end = -1; // layer range for control vector
496
+ bool offline = false;
497
+
498
+ int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
499
+ int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
500
+ // (which is more convenient to use for plotting)
501
+ //
502
+ bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
503
+ size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
504
+
505
+ bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
506
+ size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
507
+
508
+ bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
509
+ size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
510
+
511
+ bool kl_divergence = false; // compute KL divergence
512
+
513
+ bool check = false; // check rather than generate results for llama-results
514
+
515
+ bool usage = false; // print usage
516
+ bool completion = false; // print source-able completion script
517
+ bool use_color = false; // use color to distinguish generations and inputs
518
+ bool special = false; // enable special token output
519
+ bool interactive = false; // interactive mode
520
+ bool interactive_first = false; // wait for user input immediately
521
+ bool prompt_cache_all = false; // save user input and generations to prompt cache
522
+ bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
523
+
524
+ bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
525
+ bool multiline_input = false; // reverse the usage of `\`
526
+ bool simple_io = false; // improves compatibility with subprocesses and limited consoles
527
+ bool cont_batching = true; // insert new sequences for decoding on-the-fly
528
+ bool no_perf = false; // disable performance metrics
529
+ bool show_timings = true; // show timing information on CLI
530
+ bool ctx_shift = false; // context shift on infinite text generation
531
+ bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
532
+ bool kv_unified = false; // enable unified KV cache
533
+
534
+ bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
535
+ bool use_mmap = true; // enable mmap to use filesystem cache
536
+ bool use_direct_io = false; // read from disk without buffering
537
+ size_t expert_cache_size = 0; // expert LRU cache size in bytes for MoE models (0 = disabled)
538
+ bool use_mlock = false; // use mlock to keep model in memory
539
+ bool verbose_prompt = false; // print prompt tokens before generation
540
+ bool display_prompt = true; // print prompt before generation
541
+ bool no_kv_offload = false; // disable KV offloading
542
+ bool warmup = true; // warmup run
543
+ bool check_tensors = false; // validate tensor data
544
+ bool no_op_offload = false; // globally disable offload host tensor operations to device
545
+ bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
546
+ bool no_host = false; // bypass host buffer allowing extra buffers to be used
547
+
548
+ bool single_turn = false; // single turn chat conversation
549
+
550
+ ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
551
+ ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
552
+
553
+ common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
554
+
555
+ // multimodal models (see tools/mtmd)
556
+ struct common_params_model mmproj;
557
+ bool mmproj_use_gpu = true; // use GPU for multimodal model
558
+ bool no_mmproj = false; // explicitly disable multimodal model
559
+ std::vector<std::string> image; // path to image file(s)
560
+ int image_min_tokens = -1;
561
+ int image_max_tokens = -1;
562
+
563
+ // finetune
564
+ struct lr_opt lr;
565
+ enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
566
+ float val_split = 0.05f; // fraction of the data used for the validation set
567
+
568
+ // embedding
569
+ bool embedding = false; // get only sentence embedding
570
+ int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
571
+ std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
572
+ std::string embd_sep = "\n"; // separator of embeddings
573
+ std::string cls_sep = "\t"; // separator of classification sequences
574
+
575
+ // server params
576
+ int32_t port = 8080; // server listens on this network port
577
+ bool reuse_port = false; // allow multiple sockets to bind to the same port
578
+ int32_t timeout_read = 600; // http read timeout in seconds
579
+ int32_t timeout_write = timeout_read; // http write timeout in seconds
580
+ int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
581
+ int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
582
+ bool cache_prompt = true; // whether to enable prompt caching
583
+ int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
584
+ int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
585
+ int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
586
+
587
+ std::string hostname = "127.0.0.1";
588
+ std::string public_path = ""; // NOLINT
589
+ std::string api_prefix = ""; // NOLINT
590
+ std::string chat_template = ""; // NOLINT
591
+ bool use_jinja = true; // NOLINT
592
+ bool enable_chat_template = true;
593
+ bool force_pure_content_parser = false;
594
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
595
+ int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
596
+ int reasoning_budget = -1;
597
+ std::string reasoning_budget_message; // message injected before end tag when budget exhausted
598
+ bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
599
+ int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
600
+
601
+ std::vector<std::string> api_keys;
602
+
603
+ std::string ssl_file_key = ""; // NOLINT
604
+ std::string ssl_file_cert = ""; // NOLINT
605
+
606
+ std::map<std::string, std::string> default_template_kwargs;
607
+
608
+ // webui configs
609
+ bool webui = true;
610
+ bool webui_mcp_proxy = false;
611
+ std::string webui_config_json;
612
+
613
+ // "advanced" endpoints are disabled by default for better security
614
+ bool endpoint_slots = true;
615
+ bool endpoint_props = false; // only control POST requests, not GET
616
+ bool endpoint_metrics = false;
617
+
618
+ // enable built-in tools
619
+ std::vector<std::string> server_tools;
620
+
621
+ // router server configs
622
+ std::string models_dir = ""; // directory containing models for the router server
623
+ std::string models_preset = ""; // directory containing model presets for the router server
624
+ int models_max = 4; // maximum number of models to load simultaneously
625
+ bool models_autoload = true; // automatically load models when requested via the router server
626
+
627
+ bool log_json = false;
628
+
629
+ std::string slot_save_path;
630
+ std::string media_path; // path to directory for loading media files
631
+
632
+ float slot_prompt_similarity = 0.1f;
633
+
634
+ // batched-bench params
635
+ bool is_pp_shared = false;
636
+ bool is_tg_separate = false;
637
+
638
+ std::vector<int32_t> n_pp;
639
+ std::vector<int32_t> n_tg;
640
+ std::vector<int32_t> n_pl;
641
+
642
+ // retrieval params
643
+ std::vector<std::string> context_files; // context files to embed
644
+
645
+ int32_t chunk_size = 64; // chunk size for context embedding
646
+
647
+ std::string chunk_separator = "\n"; // chunk separator for context embedding
648
+
649
+ // passkey params
650
+ int32_t n_junk = 250; // number of times to repeat the junk text
651
+ int32_t i_pos = -1; // position of the passkey in the junk text
652
+
653
+ // imatrix params
654
+ int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
655
+ int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
656
+ int32_t i_chunk = 0; // start processing from this chunk
657
+ int8_t imat_dat = 0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)
658
+
659
+ bool process_output = false; // collect data for the output tensor
660
+ bool compute_ppl = true; // whether to compute perplexity
661
+ bool show_statistics = false; // show imatrix statistics per tensor
662
+ bool parse_special = false; // whether to parse special tokens during imatrix tokenization
663
+
664
+ // cvector-generator params
665
+ int n_pca_batch = 100;
666
+ int n_pca_iterations = 1000;
667
+ dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
668
+ std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
669
+ std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
670
+
671
+ bool spm_infill = false; // suffix/prefix/middle pattern for infill
672
+
673
+ // batched-bench params
674
+ bool batched_bench_output_jsonl = false;
675
+
676
+ // common params
677
+ std::string out_file; // output filename for all example programs
678
+ // optional callback for model loading progress and cancellation:
679
+ // called with a progress value between 0.0 and 1.0.
680
+ // return false from callback to abort model loading or true to continue
681
+ llama_progress_callback load_progress_callback = NULL;
682
+ void * load_progress_callback_user_data = NULL;
683
+ };
684
+
685
+ // call once at the start of a program if it uses libcommon
686
+ // initializes the logging system and prints info about the build
687
+ void common_init();
688
+
689
+ std::string common_params_get_system_info(const common_params & params);
690
+
691
+ bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
692
+ bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
693
+ void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
694
+ bool set_process_priority(enum ggml_sched_priority prio);
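
A sketch of building a CPU affinity mask from a core range and applying the resulting parameters (the "0-3" range syntax is an assumption based on the function name):

    #include "common.h"   // assumed include path

    static void cpu_affinity_example() {
        cpu_params cp;
        if (parse_cpu_range("0-3", cp.cpumask)) {   // pin to cores 0..3 (range format assumed)
            cp.mask_valid = true;
        }
        postprocess_cpu_params(cp);                 // resolve defaults such as n_threads = -1
        set_process_priority(cp.priority);
    }
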
695
+
696
+ //
697
+ // String utils
698
+ //
699
+
700
+ #ifdef __GNUC__
701
+ # if defined(__MINGW32__) && !defined(__clang__)
702
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
703
+ # else
704
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
705
+ # endif
706
+ #else
707
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
708
+ #endif
709
+
710
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
711
+ std::string string_format(const char * fmt, ...);
712
+
713
+ std::string string_strip(const std::string & str);
714
+ std::string string_get_sortable_timestamp();
715
+
716
+ std::string string_join(const std::vector<std::string> & values, const std::string & separator);
717
+ std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
718
+ std::string string_repeat(const std::string & str, size_t n);
719
+
720
+ void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
721
+
722
+ std::string regex_escape(const std::string & s);
723
+
724
+ template<class T>
725
+ static std::vector<T> string_split(const std::string & str, char delim) {
726
+ static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
727
+ std::vector<T> values;
728
+ std::istringstream str_stream(str);
729
+ std::string token;
730
+ while (std::getline(str_stream, token, delim)) {
731
+ T value;
732
+ std::istringstream token_stream(token);
733
+ token_stream >> value;
734
+ values.push_back(value);
735
+ }
736
+ return values;
737
+ }
738
+
739
+ template<>
740
+ inline std::vector<std::string> string_split<std::string>(const std::string & str, char delim)
741
+ {
742
+ std::vector<std::string> parts;
743
+ size_t begin_pos = 0;
744
+ size_t delim_pos = str.find(delim);
745
+ while (delim_pos != std::string::npos) {
746
+ std::string part = str.substr(begin_pos, delim_pos - begin_pos);
747
+ parts.emplace_back(part);
748
+ begin_pos = delim_pos + 1;
749
+ delim_pos = str.find(delim, begin_pos);
750
+ }
751
+ parts.emplace_back(str.substr(begin_pos));
752
+ return parts;
753
+ }
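
For example, the generic template parses typed fields through an istringstream, while the std::string specialization keeps raw substrings and preserves empty fields:

    #include "common.h"   // assumed include path

    static void split_example() {
        const std::vector<int>         ids   = string_split<int>("1,2,3", ',');          // {1, 2, 3}
        const std::vector<std::string> parts = string_split<std::string>("a,,b", ',');   // {"a", "", "b"}
        (void) ids; (void) parts;
    }
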
754
+
755
+ // remove when moving to c++20
756
+ inline bool string_starts_with(std::string_view str, std::string_view prefix) {
757
+ return str.size() >= prefix.size() &&
758
+ str.compare(0, prefix.size(), prefix) == 0;
759
+ }
760
+
761
+ // remove when moving to c++20
762
+ inline bool string_ends_with(std::string_view str, std::string_view suffix) {
763
+ return str.size() >= suffix.size() &&
764
+ str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
765
+ }
766
+
767
+ inline bool string_remove_suffix(std::string & str, std::string_view suffix) {
768
+ if (string_ends_with(str, suffix)) {
769
+ str.resize(str.size() - suffix.size());
770
+ return true;
771
+ }
772
+ return false;
773
+ }
774
+
775
+ inline size_t string_find_partial_stop(std::string_view str, std::string_view stop) {
776
+ if (!str.empty() && !stop.empty()) {
777
+ const size_t max_len = std::min(str.size(), stop.size());
778
+ const char last_char = str.back();
779
+ for (size_t len = max_len; len > 0; --len) {
780
+ if (stop[len - 1] == last_char) {
781
+ if (string_ends_with(str, stop.substr(0, len))) {
782
+ return str.size() - len;
783
+ }
784
+ }
785
+ }
786
+ }
787
+ return std::string::npos;
788
+ }
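
string_find_partial_stop is handy when streaming: it reports whether the tail of the text generated so far could still grow into a stop string, so those characters can be withheld from the client. A small sketch:

    #include "common.h"   // assumed include path

    static void partial_stop_example() {
        const std::string chunk = "Hello <|";
        const size_t pos = string_find_partial_stop(chunk, "<|eot|>");
        if (pos != std::string::npos) {
            // chunk.substr(pos) == "<|" could be the start of the stop string:
            // emit chunk.substr(0, pos) now and keep the tail buffered until more tokens arrive
        }
    }
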
789
+
790
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
791
+ void string_process_escapes(std::string & input);
792
+
793
+ std::string string_from(bool value);
794
+ std::string string_from(const std::vector<int> & values);
795
+ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
796
+ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
797
+
798
+ bool glob_match(const std::string & pattern, const std::string & str);
799
+
800
+ //
801
+ // Filesystem utils
802
+ //
803
+
804
+ bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
805
+ bool fs_create_directory_with_parents(const std::string & path);
806
+ bool fs_is_directory(const std::string & path);
807
+
808
+ std::string fs_get_cache_directory();
809
+ std::string fs_get_cache_file(const std::string & filename);
810
+
811
+ struct common_file_info {
812
+ std::string path;
813
+ std::string name;
814
+ size_t size = 0; // in bytes
815
+ bool is_dir = false;
816
+ };
817
+ std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
818
+
819
+ //
820
+ // TTY utils
821
+ //
822
+
823
+ // Auto-detect if colors can be enabled based on terminal and environment
824
+ bool tty_can_use_colors();
825
+
826
+ //
827
+ // Model utils
828
+ //
829
+
830
+ struct common_sampler;
831
+
832
+ // note: defines the lifetimes of the model, context, samplers, etc.
833
+ struct common_init_result {
834
+ common_init_result(common_params & params);
835
+ ~common_init_result();
836
+
837
+ llama_model * model();
838
+ llama_context * context();
839
+
840
+ common_sampler * sampler(llama_seq_id seq_id);
841
+ void reset_samplers();
842
+
843
+ std::vector<llama_adapter_lora_ptr> & lora();
844
+
845
+ private:
846
+ struct impl;
847
+ std::unique_ptr<impl> pimpl;
848
+ };
849
+
850
+ using common_init_result_ptr = std::unique_ptr<common_init_result>;
851
+
852
+ common_init_result_ptr common_init_from_params(common_params & params);
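
Typical initialization then looks roughly like this (sketch; error handling is omitted and the nullptr-on-failure behaviour is an assumption):

    #include "common.h"   // assumed include path

    static void init_example(common_params & params) {
        params.model.path = "model.gguf";                      // hypothetical local path

        common_init_result_ptr init = common_init_from_params(params);
        if (!init) {
            return;
        }

        llama_model    * model = init->model();
        llama_context  * lctx  = init->context();
        common_sampler * smpl  = init->sampler(0);             // sampler for sequence 0
        // ... run inference; model/context/samplers are released when `init` is destroyed ...
        (void) model; (void) lctx; (void) smpl;
    }
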
853
+
854
+ struct llama_model_params common_model_params_to_llama ( common_params & params);
855
+ struct llama_context_params common_context_params_to_llama(const common_params & params);
856
+ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
857
+
858
+ // clear LoRA adapters from context, then apply new list of adapters
859
+ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
860
+
861
+ std::string get_model_endpoint();
862
+
863
+ //
864
+ // Batch utils
865
+ //
866
+
867
+ void common_batch_clear(struct llama_batch & batch);
868
+
869
+ void common_batch_add(
870
+ struct llama_batch & batch,
871
+ llama_token id,
872
+ llama_pos pos,
873
+ const std::vector<llama_seq_id> & seq_ids,
874
+ bool logits);
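
The common pattern is to clear the batch and append the prompt tokens, requesting logits only for the last one (sketch; the batch itself comes from llama_batch_init elsewhere):

    #include "common.h"   // assumed include path

    static void fill_batch(llama_batch & batch, const llama_tokens & prompt) {
        common_batch_clear(batch);
        for (size_t i = 0; i < prompt.size(); ++i) {
            const bool need_logits = (i == prompt.size() - 1);          // logits only for the last token
            common_batch_add(batch, prompt[i], (llama_pos) i, { 0 }, need_logits);
        }
        // llama_decode(ctx, batch) would then evaluate the filled batch
    }
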
875
+
876
+ // decodes a single batch of tokens for a prompt and manages session tokens
877
+ //
878
+ // Note: We save state before the last token so that we can replay it to ensure
879
+ // compatibility with all memory types. Recurrent/hybrid models cannot remove
880
+ // tokens from memory, so this approach works across all model architectures.
881
+ bool common_prompt_batch_decode(
882
+ struct llama_context * ctx,
883
+ const std::vector<llama_token> & embd,
884
+ int & n_past,
885
+ int n_batch,
886
+ std::string_view state_path,
887
+ bool save_state);
888
+
889
+ // replays the last token after loading state to regenerate logits
890
+ // used after loading session state to ensure the sampling context has valid logits
891
+ bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos);
892
+
893
+ //
894
+ // Vocab utils
895
+ //
896
+
897
+ // tokenizes a string into a vector of tokens
898
+ // should work similar to Python's `tokenizer.encode`
899
+ std::vector<llama_token> common_tokenize(
900
+ const struct llama_context * ctx,
901
+ const std::string & text,
902
+ bool add_special,
903
+ bool parse_special = false);
904
+
905
+ std::vector<llama_token> common_tokenize(
906
+ const struct llama_vocab * vocab,
907
+ const std::string & text,
908
+ bool add_special,
909
+ bool parse_special = false);
910
+
911
+ // converts a token into a piece, optionally rendering special/control tokens
912
+ // should work similar to Python's `tokenizer.id_to_piece`
913
+ std::string common_token_to_piece(
914
+ const struct llama_context * ctx,
915
+ llama_token token,
916
+ bool special = true);
917
+
918
+ std::string common_token_to_piece(
919
+ const struct llama_vocab * vocab,
920
+ llama_token token,
921
+ bool special = true);
922
+
923
+ // detokenizes a vector of tokens into a string
924
+ // should work similar to Python's `tokenizer.decode`
925
+ // optionally renders special/control tokens
926
+ std::string common_detokenize(
927
+ const struct llama_context * ctx,
928
+ const std::vector<llama_token> & tokens,
929
+ bool special = true);
930
+
931
+ std::string common_detokenize(
932
+ const struct llama_vocab * vocab,
933
+ const std::vector<llama_token> & tokens,
934
+ bool special = true);
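
A round trip through these helpers (sketch; ctx is an already-initialized llama_context):

    #include "common.h"   // assumed include path
    #include <cstdio>

    static void tokenize_roundtrip(const llama_context * ctx) {
        const std::vector<llama_token> toks = common_tokenize(ctx, "Hello world", /*add_special=*/true);

        for (const llama_token t : toks) {
            fputs(common_token_to_piece(ctx, t).c_str(), stdout);   // print piece by piece
        }

        const std::string text = common_detokenize(ctx, toks);      // reproduces the input (plus special tokens)
        (void) text;
    }
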
935
+
936
+ //
937
+ // Embedding utils
938
+ //
939
+
940
+ // TODO: replace embd_norm with an enum
941
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
942
+
943
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
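
For example, normalizing two raw embeddings (euclidean norm) before comparing them (sketch; the raw vectors and n_embd come from the embedding model):

    #include "common.h"   // assumed include path

    static float embd_cosine(const float * a_raw, const float * b_raw, int n_embd) {
        std::vector<float> a(n_embd), b(n_embd);

        common_embd_normalize(a_raw, a.data(), n_embd, /*embd_norm=*/2);   // 2 = euclidean
        common_embd_normalize(b_raw, b.data(), n_embd, /*embd_norm=*/2);

        return common_embd_similarity_cos(a.data(), b.data(), n_embd);
    }
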
944
+
945
+ //
946
+ // Control vector utils
947
+ //
948
+
949
+ struct common_control_vector_data {
950
+ int n_embd;
951
+
952
+ // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
953
+ std::vector<float> data;
954
+ };
955
+
956
+ struct common_control_vector_load_info {
957
+ float strength;
958
+
959
+ std::string fname;
960
+ };
961
+
962
+ // Load control vectors, scale each by strength, and add them together.
963
+ // On error, returns {-1, empty}
964
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
965
+
966
+ //
967
+ // Split utils
968
+ //
969
+
970
+ namespace {
971
+
972
+ const char * const LLM_KV_SPLIT_NO = "split.no";
973
+ const char * const LLM_KV_SPLIT_COUNT = "split.count";
974
+ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
975
+
976
+ }
977
+
978
+ //
979
+ // MoE utils
980
+ //
981
+
982
+ const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate|gate_up)_(ch|)exps";
983
+
984
+ inline std::string llm_ffn_exps_block_regex(int idx) {
985
+ return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
986
+ }
987
+
988
+ inline llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
989
+ return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
990
+ }
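
These helpers build regex patterns and buffer-type overrides for keeping MoE expert tensors in host (CPU) memory, a common way to fit large MoE models alongside a single GPU. A sketch:

    #include "common.h"   // assumed include path

    static void moe_cpu_offload_example(std::vector<llama_model_tensor_buft_override> & overrides) {
        // keep every layer's expert FFN tensors on the CPU buffer type
        overrides.push_back(llm_ffn_exps_cpu_override());

        // equivalent pattern restricted to a single block, e.g. block 0
        const std::string blk0 = llm_ffn_exps_block_regex(0);
        // note: any pattern string passed in an override must outlive the override itself
        (void) blk0;
    }
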
991
+
992
+ //
993
+ // training utils
994
+ //
995
+
996
+ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
997
+
998
+ // "adamw" or "sgd" (case insensitive)
999
+ enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);