| | #pragma once
|
| |
|
#include "llama-arch.h"
#include "llama-batch.h"
#include "llama-hparams.h"
#include "llama-adapter.h"

#include <cstdint>
#include <cstdlib>
#include <functional>
#include <map>
#include <memory>
#include <set>
#include <vector>
|
| |
|
// Forward declarations. In this header these types are mostly referenced through
// pointers or references, so their definitions live elsewhere.

// ggml types
struct ggml_tensor;
struct ggml_context;
struct ggml_cgraph;

// llama parameter / memory interfaces
struct llama_cparams;
struct llama_memory_context_i;

// concrete memory contexts referenced by the graph inputs below
class llama_kv_cache_context;
class llama_kv_cache_iswa_context;
class llama_memory_recurrent_context;
class llama_memory_hybrid_context;
class llama_memory_hybrid_iswa_context;
|
| |
|
| |
|
// Kind of graph being built. Values are spelled out explicitly; they match the
// implicit numbering (0, 1, 2).
enum llm_graph_type {
    LLM_GRAPH_TYPE_DEFAULT = 0,
    LLM_GRAPH_TYPE_ENCODER = 1,
    LLM_GRAPH_TYPE_DECODER = 2,
};
|
| |
|
// Selector passed as `type_op` to build_ffn() / build_moe_ffn().
// Values are spelled out explicitly; they match the implicit numbering (0..7).
enum llm_ffn_op_type {
    LLM_FFN_SILU           = 0,
    LLM_FFN_GELU           = 1,
    LLM_FFN_RELU           = 2,
    LLM_FFN_RELU_SQR       = 3,
    LLM_FFN_SWIGLU         = 4,
    LLM_FFN_GEGLU          = 5,
    LLM_FFN_REGLU          = 6,
    LLM_FFN_SWIGLU_OAI_MOE = 7,
};
|
| |
|
// Selector passed as `type_gate` to build_ffn().
// NOTE(review): SEQ/PAR presumably mean the gate is applied sequentially vs. in
// parallel with the up projection - confirm against the build_ffn() implementation.
enum llm_ffn_gate_type {
    LLM_FFN_SEQ = 0,
    LLM_FFN_PAR = 1,
};
|
| |
|
// Selector passed as `type` to build_norm().
// Values are spelled out explicitly; they match the implicit numbering (0, 1, 2).
enum llm_norm_type {
    LLM_NORM       = 0,
    LLM_NORM_RMS   = 1,
    LLM_NORM_GROUP = 2,
};
|
| |
|
| |
|
| | struct llama_cross {
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | int64_t n_embd = 0;
|
| | int64_t n_enc = 0;
|
| |
|
| |
|
| | std::vector<float> v_embd;
|
| |
|
| |
|
| | std::vector<std::set<llama_seq_id>> seq_ids_enc;
|
| | };
|
| |
|
| | struct llm_graph_params;
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | class llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_i() {
|
| | const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG");
|
| | debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0;
|
| | }
|
| |
|
| | virtual ~llm_graph_input_i() = default;
|
| |
|
| | virtual void set_input(const llama_ubatch * ubatch) = 0;
|
| |
|
| |
|
| |
|
| | virtual bool can_reuse(const llm_graph_params & params) {
|
| |
|
| |
|
| | GGML_UNUSED(params);
|
| | return false;
|
| | }
|
| | protected:
|
| |
|
| | int debug = 0;
|
| | };
|
| |
|
| | using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
|
| |
|
| | class llm_graph_input_embd : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {}
|
| | virtual ~llm_graph_input_embd() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | bool can_reuse(const llm_graph_params & params) override;
|
| |
|
| | ggml_tensor * tokens = nullptr;
|
| | ggml_tensor * embd = nullptr;
|
| |
|
| | const int64_t n_embd = 0;
|
| | };
|
| |
|
| | class llm_graph_input_pos : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
|
| | virtual ~llm_graph_input_pos() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | bool can_reuse(const llm_graph_params & params) override;
|
| |
|
| | ggml_tensor * pos = nullptr;
|
| |
|
| | const uint32_t n_pos_per_embd = 1;
|
| | };
|
| |
|
| |
|
| | class llm_graph_input_attn_temp : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
|
| | : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
|
| | virtual ~llm_graph_input_attn_temp() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | ggml_tensor * attn_scale = nullptr;
|
| |
|
| | const uint32_t n_attn_temp_floor_scale;
|
| | const float f_attn_temp_scale;
|
| | const float f_attn_temp_offset;
|
| | };
|
| |
|
| | class llm_graph_input_pos_bucket : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {}
|
| | virtual ~llm_graph_input_pos_bucket() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | ggml_tensor * pos_bucket = nullptr;
|
| |
|
| | const llama_hparams hparams;
|
| | };
|
| |
|
| | class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_pos_bucket_kv(
|
| | const llama_hparams & hparams,
|
| | const llama_kv_cache_context * mctx) : hparams(hparams), mctx(mctx) {}
|
| | virtual ~llm_graph_input_pos_bucket_kv() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | ggml_tensor * pos_bucket = nullptr;
|
| |
|
| | const llama_hparams hparams;
|
| |
|
| | const llama_kv_cache_context * mctx;
|
| | };
|
| |
|
| | class llm_graph_input_out_ids : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_out_ids(
|
| | const llama_hparams & hparams,
|
| | const llama_cparams & cparams,
|
| | uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
|
| | virtual ~llm_graph_input_out_ids() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | bool can_reuse(const llm_graph_params & params) override;
|
| |
|
| | ggml_tensor * out_ids;
|
| |
|
| | const llama_hparams hparams;
|
| | const llama_cparams cparams;
|
| |
|
| | const uint32_t n_outputs;
|
| | };
|
| |
|
| | class llm_graph_input_mean : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {}
|
| | virtual ~llm_graph_input_mean() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | ggml_tensor * mean;
|
| |
|
| | const llama_cparams cparams;
|
| | };
|
| |
|
| | class llm_graph_input_cls : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
|
| | virtual ~llm_graph_input_cls() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | ggml_tensor * cls;
|
| |
|
| | const llama_cparams cparams;
|
| | const llm_arch arch;
|
| | };
|
| |
|
| | class llm_graph_input_rs : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_rs(const llama_memory_recurrent_context * mctx) : mctx(mctx) {}
|
| | virtual ~llm_graph_input_rs() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | bool can_reuse(const llm_graph_params & params) override;
|
| |
|
| | ggml_tensor * s_copy;
|
| |
|
| |
|
| |
|
| | ggml_tensor * s_copy_main;
|
| | ggml_tensor * s_copy_extra;
|
| |
|
| | const llama_memory_recurrent_context * mctx;
|
| |
|
| |
|
| | uint32_t head;
|
| | int32_t rs_z;
|
| | };
|
| |
|
| | class llm_graph_input_cross_embd : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_cross_embd(
|
| | const llama_cross * cross) : cross(cross) {}
|
| | virtual ~llm_graph_input_cross_embd() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | ggml_tensor * cross_embd;
|
| |
|
| | const llama_cross * cross;
|
| | };
|
| |
|
| | class llm_graph_input_attn_no_cache : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_attn_no_cache(const llama_hparams & hparams, const llama_cparams & cparams) :
|
| | hparams(hparams),
|
| | cparams(cparams) {
|
| | }
|
| | ~llm_graph_input_attn_no_cache() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
|
| | ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
|
| |
|
| |
|
| | ggml_tensor * self_kq_mask = nullptr;
|
| | ggml_tensor * self_kq_mask_cnv = nullptr;
|
| | ggml_tensor * self_kq_mask_swa = nullptr;
|
| | ggml_tensor * self_kq_mask_swa_cnv = nullptr;
|
| |
|
| | const llama_hparams hparams;
|
| | const llama_cparams cparams;
|
| | };
|
| |
|
| | class llm_graph_input_attn_kv : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_attn_kv(
|
| | const llama_hparams & hparams,
|
| | const llama_cparams & cparams,
|
| | const llama_kv_cache_context * mctx) :
|
| | hparams(hparams),
|
| | cparams(cparams),
|
| | mctx(mctx) {
|
| | }
|
| | ~llm_graph_input_attn_kv() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | bool can_reuse(const llm_graph_params & params) override;
|
| |
|
| | ggml_tensor * get_k_idxs() const { return self_k_idxs; }
|
| | ggml_tensor * get_v_idxs() const { return self_v_idxs; }
|
| |
|
| | ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
|
| |
|
| | ggml_tensor * self_k_idxs = nullptr;
|
| | ggml_tensor * self_v_idxs = nullptr;
|
| |
|
| | ggml_tensor * self_kq_mask = nullptr;
|
| | ggml_tensor * self_kq_mask_cnv = nullptr;
|
| |
|
| |
|
| |
|
| |
|
| | const llama_hparams hparams;
|
| | const llama_cparams cparams;
|
| |
|
| | const llama_kv_cache_context * mctx;
|
| | };
|
| |
|
| |
|
| |
|
| | class llm_graph_input_attn_k : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_attn_k(
|
| | const llama_hparams & hparams,
|
| | const llama_cparams & cparams,
|
| | const llama_kv_cache_context * mctx) :
|
| | hparams(hparams),
|
| | cparams(cparams),
|
| | mctx(mctx) {
|
| | }
|
| | ~llm_graph_input_attn_k() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | bool can_reuse(const llm_graph_params & params) override;
|
| |
|
| | ggml_tensor * get_k_idxs() const { return self_k_idxs; }
|
| |
|
| | ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
|
| |
|
| | ggml_tensor * self_k_idxs = nullptr;
|
| |
|
| | ggml_tensor * self_kq_mask = nullptr;
|
| | ggml_tensor * self_kq_mask_cnv = nullptr;
|
| |
|
| | const llama_hparams hparams;
|
| | const llama_cparams cparams;
|
| |
|
| | const llama_kv_cache_context * mctx;
|
| | };
|
| |
|
| | class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_attn_kv_iswa(
|
| | const llama_hparams & hparams,
|
| | const llama_cparams & cparams,
|
| | const llama_kv_cache_iswa_context * mctx) :
|
| | hparams(hparams),
|
| | cparams(cparams),
|
| | mctx(mctx) {
|
| | }
|
| | ~llm_graph_input_attn_kv_iswa() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | bool can_reuse(const llm_graph_params & params) override;
|
| |
|
| | ggml_tensor * get_k_idxs() const { return self_k_idxs; }
|
| | ggml_tensor * get_v_idxs() const { return self_v_idxs; }
|
| | ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
|
| | ggml_tensor * get_v_idxs_swa() const { return self_v_idxs_swa; }
|
| |
|
| | ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
|
| | ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
|
| |
|
| | ggml_tensor * self_k_idxs = nullptr;
|
| | ggml_tensor * self_v_idxs = nullptr;
|
| | ggml_tensor * self_k_idxs_swa = nullptr;
|
| | ggml_tensor * self_v_idxs_swa = nullptr;
|
| |
|
| | ggml_tensor * self_kq_mask = nullptr;
|
| | ggml_tensor * self_kq_mask_cnv = nullptr;
|
| | ggml_tensor * self_kq_mask_swa = nullptr;
|
| | ggml_tensor * self_kq_mask_swa_cnv = nullptr;
|
| |
|
| | const llama_hparams hparams;
|
| | const llama_cparams cparams;
|
| |
|
| | const llama_kv_cache_iswa_context * mctx;
|
| | };
|
| |
|
| | class llm_graph_input_attn_cross : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_attn_cross(const llama_cross * cross) : cross(cross) {}
|
| | ~llm_graph_input_attn_cross() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; }
|
| |
|
| | ggml_tensor * cross_kq_mask = nullptr;
|
| | ggml_tensor * cross_kq_mask_cnv = nullptr;
|
| |
|
| | const llama_cross * cross = nullptr;
|
| | };
|
| |
|
| | class llm_graph_input_mem_hybrid : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_mem_hybrid(
|
| | const llama_cparams & cparams,
|
| | std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
|
| | std::unique_ptr<llm_graph_input_rs> inp_rs,
|
| | const llama_memory_hybrid_context * mctx) :
|
| | inp_attn(std::move(inp_attn)),
|
| | inp_rs(std::move(inp_rs)),
|
| | cparams(cparams),
|
| | mctx(mctx) { }
|
| | virtual ~llm_graph_input_mem_hybrid() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | bool can_reuse(const llm_graph_params & params) override;
|
| |
|
| | std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
|
| | std::unique_ptr<llm_graph_input_rs> inp_rs;
|
| |
|
| | llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
|
| | llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
|
| |
|
| | const llama_cparams cparams;
|
| |
|
| | const llama_memory_hybrid_context * mctx;
|
| | };
|
| |
|
| | class llm_graph_input_mem_hybrid_k : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_mem_hybrid_k(
|
| | const llama_cparams & cparams,
|
| | std::unique_ptr<llm_graph_input_attn_k> inp_attn,
|
| | std::unique_ptr<llm_graph_input_rs> inp_rs,
|
| | const llama_memory_hybrid_context * mctx) :
|
| | inp_attn(std::move(inp_attn)),
|
| | inp_rs(std::move(inp_rs)),
|
| | cparams(cparams),
|
| | mctx(mctx) { }
|
| | virtual ~llm_graph_input_mem_hybrid_k() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | bool can_reuse(const llm_graph_params & params) override;
|
| |
|
| | std::unique_ptr<llm_graph_input_attn_k> inp_attn;
|
| | std::unique_ptr<llm_graph_input_rs> inp_rs;
|
| |
|
| | llm_graph_input_attn_k * get_attn() const { return inp_attn.get(); }
|
| | llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
|
| |
|
| | const llama_cparams cparams;
|
| |
|
| | const llama_memory_hybrid_context * mctx;
|
| | };
|
| |
|
| | class llm_graph_input_mem_hybrid_iswa : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_mem_hybrid_iswa(
|
| | const llama_cparams & cparams,
|
| | std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn,
|
| | std::unique_ptr<llm_graph_input_rs> inp_rs,
|
| | const llama_memory_hybrid_iswa_context * mctx) :
|
| | inp_attn(std::move(inp_attn)),
|
| | inp_rs(std::move(inp_rs)),
|
| | cparams(cparams),
|
| | mctx(mctx) { }
|
| | virtual ~llm_graph_input_mem_hybrid_iswa() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| |
|
| | bool can_reuse(const llm_graph_params & params) override;
|
| |
|
| | std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn;
|
| | std::unique_ptr<llm_graph_input_rs> inp_rs;
|
| |
|
| | llm_graph_input_attn_kv_iswa * get_attn() const { return inp_attn.get(); }
|
| | llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
|
| |
|
| | const llama_cparams cparams;
|
| |
|
| | const llama_memory_hybrid_iswa_context * mctx;
|
| | };
|
| |
|
| | class llm_graph_input_sampling : public llm_graph_input_i {
|
| | public:
|
| | llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
|
| | samplers(std::move(samplers)) { }
|
| | virtual ~llm_graph_input_sampling() = default;
|
| |
|
| | void set_input(const llama_ubatch * ubatch) override;
|
| | bool can_reuse(const llm_graph_params & params) override;
|
| |
|
| | std::map<llama_seq_id, llama_sampler *> samplers;
|
| | };
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
|
| |
|
| | class llm_graph_result;
|
| |
|
| | struct llm_graph_params {
|
| | llm_arch arch = LLM_ARCH_UNKNOWN;
|
| |
|
| | llama_hparams hparams;
|
| | llama_cparams cparams;
|
| |
|
| | llama_ubatch ubatch;
|
| |
|
| | llm_graph_type gtype;
|
| |
|
| | ggml_backend_sched_t sched;
|
| | ggml_backend_t backend_cpu;
|
| |
|
| | const llama_adapter_cvec * cvec;
|
| | const llama_adapter_loras * loras;
|
| | const llama_memory_context_i * mctx;
|
| | const llama_cross * cross;
|
| |
|
| | std::map<llama_seq_id, llama_sampler *> samplers;
|
| |
|
| | static bool samplers_equal(
|
| | const std::map<llama_seq_id, llama_sampler *> & lhs,
|
| | const std::map<llama_seq_id, llama_sampler *> & rhs) {
|
| | if (lhs.size() != rhs.size()) {
|
| | return false;
|
| | }
|
| | for (const auto & [seq_id, sampler] : lhs) {
|
| | auto it = rhs.find(seq_id);
|
| | if (it == rhs.end() || it->second != sampler) {
|
| | return false;
|
| | }
|
| | }
|
| | return true;
|
| | }
|
| |
|
| | uint32_t n_outputs;
|
| |
|
| | llm_graph_cb cb;
|
| |
|
| | llm_graph_result * res;
|
| |
|
| |
|
| |
|
| | bool allow_reuse(const llm_graph_params & other) const {
|
| |
|
| | bool can_reuse_ubatch =
|
| | ubatch.equal_seqs() == other.ubatch.equal_seqs() &&
|
| | ubatch.n_tokens == other.ubatch.n_tokens &&
|
| | ubatch.n_seq_tokens == other.ubatch.n_seq_tokens &&
|
| | ubatch.n_seqs == other.ubatch.n_seqs &&
|
| | ubatch.n_seqs_unq == other.ubatch.n_seqs_unq &&
|
| | (
|
| | (!ubatch.token && !other.ubatch.token) ||
|
| | (!ubatch.embd && !other.ubatch.embd)
|
| | );
|
| |
|
| |
|
| |
|
| | if (can_reuse_ubatch && ubatch.equal_seqs()) {
|
| | if (!ubatch.data) {
|
| |
|
| |
|
| | can_reuse_ubatch = false;
|
| | } else {
|
| | for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
|
| | can_reuse_ubatch &= ubatch.seq_id_unq[s] == other.ubatch.seq_id_unq[s];
|
| | }
|
| | }
|
| | }
|
| |
|
| | if (!can_reuse_ubatch) {
|
| | return false;
|
| | }
|
| |
|
| | if (n_outputs != other.n_outputs) {
|
| | return false;
|
| | }
|
| |
|
| | if (!samplers_equal(samplers, other.samplers)) {
|
| | return false;
|
| | }
|
| |
|
| | if (samplers.size() > 0) {
|
| | if (!ubatch.data || !other.ubatch.data) {
|
| | return false;
|
| | }
|
| |
|
| |
|
| | for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
|
| | if (ubatch.output[i] != other.ubatch.output[i] ||
|
| | ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) {
|
| | return false;
|
| | }
|
| | }
|
| | }
|
| |
|
| | return
|
| | cparams.embeddings == other.cparams.embeddings &&
|
| | cparams.causal_attn == other.cparams.causal_attn &&
|
| | arch == other.arch &&
|
| | gtype == other.gtype &&
|
| | cvec == other.cvec &&
|
| | loras == other.loras &&
|
| | cross == other.cross;
|
| | }
|
| | };
|
| |
|
| | class llm_graph_result {
|
| | public:
|
| | llm_graph_result(int64_t max_nodes);
|
| |
|
| | virtual ~llm_graph_result() = default;
|
| |
|
| | ggml_tensor * get_inp_tokens() const { return t_inp_tokens; }
|
| | ggml_tensor * get_logits() const { return t_logits; }
|
| | ggml_tensor * get_embd() const { return t_embd; }
|
| | ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
|
| |
|
| | ggml_cgraph * get_gf() const { return gf; }
|
| | ggml_context * get_ctx() const { return ctx_compute.get(); }
|
| |
|
| | int64_t get_max_nodes() const;
|
| |
|
| | void reset();
|
| |
|
| | void set_inputs(const llama_ubatch * ubatch);
|
| | void set_outputs();
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | bool can_reuse(const llm_graph_params & params);
|
| |
|
| | llm_graph_input_i * add_input(llm_graph_input_ptr input);
|
| |
|
| | void set_params(const llm_graph_params & params);
|
| |
|
| |
|
| | ggml_tensor * t_inp_tokens = nullptr;
|
| | ggml_tensor * t_inp_embd = nullptr;
|
| | ggml_tensor * t_logits = nullptr;
|
| | ggml_tensor * t_embd = nullptr;
|
| | ggml_tensor * t_embd_pooled = nullptr;
|
| |
|
| | std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
|
| | std::map<llama_seq_id, ggml_tensor*> t_candidates;
|
| | std::map<llama_seq_id, ggml_tensor*> t_sampled;
|
| | std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
|
| |
|
| | std::vector<llm_graph_input_ptr> inputs;
|
| |
|
| | ggml_context_ptr ctx_compute;
|
| |
|
| |
|
| | std::vector<uint8_t> buf_compute_meta;
|
| |
|
| | ggml_cgraph * gf;
|
| |
|
| | int64_t max_nodes;
|
| |
|
| | private:
|
| |
|
| |
|
| |
|
| | llm_graph_params params;
|
| |
|
| |
|
| | int debug = 0;
|
| | };
|
| |
|
| | using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
|
| |
|
| | struct llm_graph_context {
|
| | const llm_arch arch;
|
| |
|
| | const llama_hparams & hparams;
|
| | const llama_cparams & cparams;
|
| | const llama_ubatch & ubatch;
|
| |
|
| | const int64_t n_embd;
|
| | const int64_t n_layer;
|
| | const int64_t n_rot;
|
| | const int64_t n_ctx;
|
| | const int64_t n_head;
|
| | const int64_t n_head_kv;
|
| | const int64_t n_embd_head_k;
|
| | const int64_t n_embd_k_gqa;
|
| | const int64_t n_embd_head_v;
|
| | const int64_t n_embd_v_gqa;
|
| | const int64_t n_expert;
|
| | const int64_t n_expert_used;
|
| |
|
| | const float freq_base;
|
| | const float freq_scale;
|
| | const float ext_factor;
|
| | const float attn_factor;
|
| | const float beta_fast;
|
| | const float beta_slow;
|
| | const float norm_eps;
|
| | const float norm_rms_eps;
|
| |
|
| | const int64_t n_tokens;
|
| | const int64_t n_outputs;
|
| | const int32_t n_ctx_orig;
|
| |
|
| | const enum llama_pooling_type pooling_type;
|
| | const enum llama_rope_type rope_type;
|
| |
|
| | ggml_backend_sched_t sched;
|
| |
|
| | ggml_backend_t backend_cpu;
|
| |
|
| | const llama_adapter_cvec * cvec;
|
| | const llama_adapter_loras * loras;
|
| | const llama_memory_context_i * mctx;
|
| | const llama_cross * cross;
|
| |
|
| | std::map<llama_seq_id, llama_sampler *> samplers;
|
| |
|
| | const llm_graph_cb & cb_func;
|
| |
|
| | llm_graph_result * res;
|
| |
|
| | ggml_context * ctx0 = nullptr;
|
| | ggml_cgraph * gf = nullptr;
|
| |
|
| | llm_graph_context(const llm_graph_params & params);
|
| | virtual ~llm_graph_context() = default;
|
| |
|
| | void cb(ggml_tensor * cur, const char * name, int il) const;
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | ggml_tensor * build_cvec(
|
| | ggml_tensor * cur,
|
| | int il) const;
|
| |
|
| |
|
| | ggml_tensor * build_lora_mm(
|
| | ggml_tensor * w,
|
| | ggml_tensor * cur) const;
|
| |
|
| |
|
| | ggml_tensor * build_lora_mm_id(
|
| | ggml_tensor * w,
|
| | ggml_tensor * cur,
|
| | ggml_tensor * ids) const;
|
| |
|
| | ggml_tensor * build_norm(
|
| | ggml_tensor * cur,
|
| | ggml_tensor * mw,
|
| | ggml_tensor * mb,
|
| | llm_norm_type type,
|
| | int il) const;
|
| |
|
| | ggml_tensor * build_ffn(
|
| | ggml_tensor * cur,
|
| | ggml_tensor * up,
|
| | ggml_tensor * up_b,
|
| | ggml_tensor * up_s,
|
| | ggml_tensor * gate,
|
| | ggml_tensor * gate_b,
|
| | ggml_tensor * gate_s,
|
| | ggml_tensor * down,
|
| | ggml_tensor * down_b,
|
| | ggml_tensor * down_s,
|
| | ggml_tensor * act_scales,
|
| | llm_ffn_op_type type_op,
|
| | llm_ffn_gate_type type_gate,
|
| | int il) const;
|
| |
|
| |
|
| | ggml_tensor * build_moe_ffn(
|
| | ggml_tensor * cur,
|
| | ggml_tensor * gate_inp,
|
| | ggml_tensor * up_exps,
|
| | ggml_tensor * gate_exps,
|
| | ggml_tensor * down_exps,
|
| | ggml_tensor * exp_probs_b,
|
| | int64_t n_expert,
|
| | int64_t n_expert_used,
|
| | llm_ffn_op_type type_op,
|
| | bool norm_w,
|
| | bool scale_w,
|
| | float w_scale,
|
| | llama_expert_gating_func_type gating_op,
|
| | int il,
|
| | ggml_tensor * probs_in = nullptr,
|
| | ggml_tensor * gate_up_exps = nullptr) const;
|
| |
|
| | ggml_tensor * build_moe_ffn(
|
| | ggml_tensor * cur,
|
| | ggml_tensor * gate_inp,
|
| | ggml_tensor * gate_inp_b,
|
| | ggml_tensor * up_exps,
|
| | ggml_tensor * up_exps_b,
|
| | ggml_tensor * gate_exps,
|
| | ggml_tensor * gate_exps_b,
|
| | ggml_tensor * down_exps,
|
| | ggml_tensor * down_exps_b,
|
| | ggml_tensor * exp_probs_b,
|
| | int64_t n_expert,
|
| | int64_t n_expert_used,
|
| | llm_ffn_op_type type_op,
|
| | bool norm_w,
|
| | bool scale_w,
|
| | float w_scale,
|
| | llama_expert_gating_func_type gating_op,
|
| | int il,
|
| | ggml_tensor * probs_in = nullptr,
|
| | ggml_tensor * gate_up_exps = nullptr,
|
| | ggml_tensor * gate_up_exps_b = nullptr) const;
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
|
| | ggml_tensor * build_inp_pos() const;
|
| | ggml_tensor * build_inp_attn_scale() const;
|
| | ggml_tensor * build_inp_out_ids() const;
|
| | ggml_tensor * build_inp_mean() const;
|
| | ggml_tensor * build_inp_cls() const;
|
| |
|
| | ggml_tensor * build_inp_cross_embd() const;
|
| | ggml_tensor * build_inp_pos_bucket_enc() const;
|
| | ggml_tensor * build_inp_pos_bucket_dec() const;
|
| | ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | ggml_tensor * build_attn_mha(
|
| | ggml_tensor * q,
|
| | ggml_tensor * k,
|
| | ggml_tensor * v,
|
| | ggml_tensor * kq_b,
|
| | ggml_tensor * kq_mask,
|
| | ggml_tensor * sinks,
|
| | ggml_tensor * v_mla,
|
| | float kq_scale,
|
| | int il) const;
|
| |
|
| | llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
|
| |
|
| | ggml_tensor * build_attn(
|
| | llm_graph_input_attn_no_cache * inp,
|
| | ggml_tensor * wo,
|
| | ggml_tensor * wo_b,
|
| | ggml_tensor * q_cur,
|
| | ggml_tensor * k_cur,
|
| | ggml_tensor * v_cur,
|
| | ggml_tensor * kq_b,
|
| | ggml_tensor * sinks,
|
| | ggml_tensor * v_mla,
|
| | float kq_scale,
|
| | int il) const;
|
| |
|
| | llm_graph_input_attn_kv * build_attn_inp_kv() const;
|
| |
|
| | ggml_tensor * build_attn(
|
| | llm_graph_input_attn_kv * inp,
|
| | ggml_tensor * wo,
|
| | ggml_tensor * wo_b,
|
| | ggml_tensor * q_cur,
|
| | ggml_tensor * k_cur,
|
| | ggml_tensor * v_cur,
|
| | ggml_tensor * kq_b,
|
| | ggml_tensor * sinks,
|
| | ggml_tensor * v_mla,
|
| | float kq_scale,
|
| | int il) const;
|
| |
|
| | llm_graph_input_attn_k * build_attn_inp_k() const;
|
| |
|
| | ggml_tensor * build_attn(
|
| | llm_graph_input_attn_k * inp,
|
| | ggml_tensor * wo,
|
| | ggml_tensor * wo_b,
|
| | ggml_tensor * q_cur,
|
| | ggml_tensor * k_cur,
|
| | ggml_tensor * v_cur,
|
| | ggml_tensor * kq_b,
|
| | ggml_tensor * sinks,
|
| | ggml_tensor * v_mla,
|
| | float kq_scale,
|
| | int il) const;
|
| |
|
| | llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const;
|
| |
|
| |
|
| | ggml_tensor * build_attn(
|
| | llm_graph_input_attn_kv_iswa * inp,
|
| | ggml_tensor * wo,
|
| | ggml_tensor * wo_b,
|
| | ggml_tensor * q_cur,
|
| | ggml_tensor * k_cur,
|
| | ggml_tensor * v_cur,
|
| | ggml_tensor * kq_b,
|
| | ggml_tensor * sinks,
|
| | ggml_tensor * v_mla,
|
| | float kq_scale,
|
| | int il) const;
|
| |
|
| | llm_graph_input_attn_cross * build_attn_inp_cross() const;
|
| |
|
| | ggml_tensor * build_attn(
|
| | llm_graph_input_attn_cross * inp,
|
| | ggml_tensor * wo,
|
| | ggml_tensor * wo_b,
|
| | ggml_tensor * q_cur,
|
| | ggml_tensor * k_cur,
|
| | ggml_tensor * v_cur,
|
| | ggml_tensor * kq_b,
|
| | ggml_tensor * sinks,
|
| | ggml_tensor * v_mla,
|
| | float kq_scale,
|
| | int il) const;
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | ggml_tensor * build_rs(
|
| | ggml_tensor * s,
|
| | ggml_tensor * state_copy_main,
|
| | ggml_tensor * state_copy_extra,
|
| | int32_t state_size,
|
| | int32_t n_seqs,
|
| | uint32_t n_rs,
|
| | uint32_t rs_head,
|
| | uint32_t rs_size,
|
| | int32_t rs_zero,
|
| | const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
|
| |
|
| | llm_graph_input_rs * build_rs_inp() const;
|
| |
|
| | ggml_tensor * build_rs(
|
| | llm_graph_input_rs * inp,
|
| | ggml_tensor * s,
|
| | int32_t state_size,
|
| | int32_t n_seqs,
|
| | const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
|
| |
|
| | ggml_tensor * build_rwkv_token_shift_load(
|
| | llm_graph_input_rs * inp,
|
| | const llama_ubatch & ubatch,
|
| | int il) const;
|
| |
|
| | ggml_tensor * build_rwkv_token_shift_store(
|
| | ggml_tensor * token_shift,
|
| | const llama_ubatch & ubatch,
|
| | int il) const;
|
| |
|
| |
|
| |
|
| |
|
| | llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
|
| | llm_graph_input_mem_hybrid_k * build_inp_mem_hybrid_k() const;
|
| |
|
| | llm_graph_input_mem_hybrid_iswa * build_inp_mem_hybrid_iswa() const;
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | void build_pooling(
|
| | ggml_tensor * cls,
|
| | ggml_tensor * cls_b,
|
| | ggml_tensor * cls_out,
|
| | ggml_tensor * cls_out_b,
|
| | ggml_tensor * cls_norm) const;
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | void build_sampling() const;
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | void build_dense_out(
|
| | ggml_tensor * dense_2,
|
| | ggml_tensor * dense_2_b,
|
| | ggml_tensor * dense_3) const;
|
| | };
|
| |
|
| |
|
| | int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);
|
| |
|