#pragma once

#include "llama.h"
#include "llama-cparams.h"
#include "llama-graph.h"
#include "llama-adapter.h"

#include "ggml-cpp.h"
#include "ggml-opt.h"

#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <utility>
#include <vector>

// Forward declarations. This header only uses these types through references,
// pointers and std::unique_ptr members, so their full definitions are not
// needed here.
struct llama_model;
class llama_batch_allocr;

class llama_io_read_i;
class llama_io_write_i;

struct llama_memory_i;
struct llama_memory_context_i;

struct llama_context { |
|
|
|
|
|
llama_context( |
|
|
const llama_model & model, |
|
|
llama_context_params params); |
|
|
|
|
|
~llama_context(); |
|
|
|
|
|
void synchronize(); |
|
|
|
|
|
const llama_model & get_model() const; |
|
|
const llama_cparams & get_cparams() const; |
|
|
|
|
|
ggml_backend_sched_t get_sched() const; |
|
|
|
|
|
uint32_t n_ctx() const; |
|
|
uint32_t n_ctx_per_seq() const; |
|
|
uint32_t n_batch() const; |
|
|
uint32_t n_ubatch() const; |
|
|
uint32_t n_seq_max() const; |
|
|
|
|
|
uint32_t n_threads() const; |
|
|
uint32_t n_threads_batch() const; |
|
|
|
|
|
llama_memory_t get_memory() const; |
|
|
|
|
|
|
|
|
|
|
|
bool kv_self_update(bool optimize); |
|
|
void kv_self_defrag_sched(); |
|
|
|
|
|
enum llama_pooling_type pooling_type() const; |
|
|
|
|
|
float * get_logits(); |
|
|
float * get_logits_ith(int32_t i); |
|
|
|
|
|
float * get_embeddings(); |
|
|
float * get_embeddings_ith(int32_t i); |
|
|
float * get_embeddings_seq(llama_seq_id seq_id); |
|
|
|
|
|
void attach_threadpool( |
|
|
ggml_threadpool_t threadpool, |
|
|
ggml_threadpool_t threadpool_batch); |
|
|
|
|
|
void detach_threadpool(); |
|
|
|
|
|
void set_n_threads(int32_t n_threads, int32_t n_threads_batch); |
|
|
|
|
|
void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data); |
|
|
|
|
|
void set_embeddings (bool value); |
|
|
void set_causal_attn(bool value); |
|
|
void set_warmup(bool value); |
|
|
|
|
|
void set_adapter_lora( |
|
|
llama_adapter_lora * adapter, |
|
|
float scale); |
|
|
|
|
|
bool rm_adapter_lora( |
|
|
llama_adapter_lora * adapter); |
|
|
|
|
|
void clear_adapter_lora(); |
|
|
|
|
|
bool apply_adapter_cvec( |
|
|
const float * data, |
|
|
size_t len, |
|
|
int32_t n_embd, |
|
|
int32_t il_start, |
|
|
int32_t il_end); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llm_graph_result * process_ubatch( |
|
|
const llama_ubatch & ubatch, |
|
|
llm_graph_type gtype, |
|
|
llama_memory_context_i * mctx, |
|
|
ggml_status & ret); |
|
|
|
|
|
int encode(const llama_batch & batch_inp); |
|
|
int decode(const llama_batch & batch_inp); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
size_t state_get_size(); |
|
|
size_t state_get_data( uint8_t * dst, size_t size); |
|
|
size_t state_set_data(const uint8_t * src, size_t size); |
|
|
|
|
|
size_t state_seq_get_size(llama_seq_id seq_id); |
|
|
size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size); |
|
|
size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size); |
|
|
|
|
|
bool state_load_file( |
|
|
const char * filepath, |
|
|
llama_token * tokens_out, |
|
|
size_t n_token_capacity, |
|
|
size_t * n_token_count_out); |
|
|
|
|
|
bool state_save_file( |
|
|
const char * filepath, |
|
|
const llama_token * tokens, |
|
|
size_t n_token_count); |
|
|
|
|
|
size_t state_seq_load_file( |
|
|
llama_seq_id seq_id, |
|
|
const char * filepath, |
|
|
llama_token * tokens_out, |
|
|
size_t n_token_capacity, |
|
|
size_t * n_token_count_out); |
|
|
|
|
|
size_t state_seq_save_file( |
|
|
llama_seq_id seq_id, |
|
|
const char * filepath, |
|
|
const llama_token * tokens, |
|
|
size_t n_token_count); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llama_perf_context_data perf_get_data() const; |
|
|
void perf_reset(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void opt_init(struct llama_model * model, struct llama_opt_params lopt_params); |
|
|
|
|
|
void opt_epoch( |
|
|
ggml_opt_dataset_t dataset, |
|
|
ggml_opt_result_t result_train, |
|
|
ggml_opt_result_t result_eval, |
|
|
int64_t idata_split, |
|
|
ggml_opt_epoch_callback callback_train, |
|
|
ggml_opt_epoch_callback callback_eval); |
|
|
|
|
|
void opt_epoch_iter( |
|
|
ggml_opt_dataset_t dataset, |
|
|
ggml_opt_result_t result, |
|
|
const std::vector<llama_token> & tokens, |
|
|
const std::vector<llama_token> & labels_sparse, |
|
|
llama_batch & batch, |
|
|
ggml_opt_epoch_callback callback, |
|
|
bool train, |
|
|
int64_t idata_in_loop, |
|
|
int64_t ndata_in_loop, |
|
|
int64_t t_loop_start); |
|
|
|
|
|
private: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uint32_t output_reserve(int32_t n_outputs); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public: |
|
|
uint32_t graph_max_nodes() const; |
|
|
|
|
|
|
|
|
llm_graph_result * get_gf_res_reserve() const; |
|
|
|
|
|
|
|
|
ggml_status graph_compute(ggml_cgraph * gf, bool batched); |
|
|
|
|
|
|
|
|
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx); |
|
|
|
|
|
private: |
|
|
llm_graph_params graph_params( |
|
|
llm_graph_result * res, |
|
|
const llama_ubatch & ubatch, |
|
|
const llama_memory_context_i * mctx, |
|
|
llm_graph_type gtype) const; |
|
|
|
|
|
llm_graph_cb graph_get_cb() const; |
|
|
|
|
|
|
|
|
size_t state_write_data(llama_io_write_i & io); |
|
|
size_t state_read_data (llama_io_read_i & io); |
|
|
|
|
|
size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id); |
|
|
size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const llama_model & model; |
|
|
|
|
|
llama_cparams cparams; |
|
|
llama_adapter_cvec cvec; |
|
|
llama_adapter_loras loras; |
|
|
|
|
|
llama_cross cross; |
|
|
|
|
|
std::unique_ptr<llama_memory_i> memory; |
|
|
|
|
|
|
|
|
bool memory_force_optimize = false; |
|
|
|
|
|
|
|
|
size_t logits_size = 0; |
|
|
float * logits = nullptr; |
|
|
|
|
|
|
|
|
|
|
|
size_t embd_size = 0; |
|
|
float * embd = nullptr; |
|
|
|
|
|
|
|
|
|
|
|
std::map<llama_seq_id, std::vector<float>> embd_seq; |
|
|
|
|
|
|
|
|
std::unique_ptr<llama_batch_allocr> balloc; |
|
|
|
|
|
uint32_t n_outputs = 0; |
|
|
|
|
|
std::vector<int32_t> output_ids; |
|
|
|
|
|
ggml_backend_sched_ptr sched; |
|
|
|
|
|
ggml_backend_t backend_cpu = nullptr; |
|
|
std::vector<ggml_backend_ptr> backends; |
|
|
|
|
|
|
|
|
ggml_opt_context_t opt_ctx = nullptr; |
|
|
|
|
|
ggml_threadpool_t threadpool = nullptr; |
|
|
ggml_threadpool_t threadpool_batch = nullptr; |
|
|
|
|
|
ggml_abort_callback abort_callback = nullptr; |
|
|
void * abort_callback_data = nullptr; |
|
|
|
|
|
std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns; |
|
|
|
|
|
|
|
|
std::vector<ggml_backend_t> backend_ptrs; |
|
|
std::vector<ggml_backend_buffer_type_t> backend_buft; |
|
|
|
|
|
llm_graph_result_ptr gf_res_prev; |
|
|
llm_graph_result_ptr gf_res_reserve; |
|
|
|
|
|
|
|
|
ggml_backend_buffer_ptr buf_output; |
|
|
|
|
|
bool has_evaluated_once = false; |
|
|
|
|
|
|
|
|
mutable int64_t t_start_us = 0; |
|
|
mutable int64_t t_load_us = 0; |
|
|
mutable int64_t t_p_eval_us = 0; |
|
|
mutable int64_t t_eval_us = 0; |
|
|
|
|
|
mutable int64_t t_compute_start_us = 0; |
|
|
mutable int64_t n_queued_tokens = 0; |
|
|
|
|
|
mutable int32_t n_p_eval = 0; |
|
|
mutable int32_t n_eval = 0; |
|
|
|
|
|
mutable int32_t n_reused = 0; |
|
|
}; |
|
|
|