|
|
#pragma once |
|
|
|
|
|
#include "llama-arch.h" |
|
|
#include "llama-batch.h" |
|
|
#include "llama-hparams.h" |
|
|
#include "llama-adapter.h" |
|
|
|
|
|
#include <cstdint> |
|
|
#include <vector> |
|
|
#include <memory> |
|
|
#include <set> |
|
|
#include <functional> |
|
|
|
|
|
struct ggml_cgraph; |
|
|
struct ggml_context; |
|
|
struct ggml_tensor; |
|
|
|
|
|
struct llama_cparams; |
|
|
|
|
|
struct llama_memory_context_i; |
|
|
|
|
|
class llama_kv_cache_unified_context; |
|
|
class llama_kv_cache_unified_iswa_context; |
|
|
class llama_memory_recurrent_context; |
|
|
class llama_memory_hybrid_context; |
|
|
|
|
|
|
|
|
// which kind of compute graph to build for the model
enum llm_graph_type {
    LLM_GRAPH_TYPE_DEFAULT, // the model's natural graph (as determined by its architecture)
    LLM_GRAPH_TYPE_ENCODER, // encoder pass only
    LLM_GRAPH_TYPE_DECODER, // decoder pass only
};
|
|
|
|
|
// activation applied inside a feed-forward network block (see build_ffn)
enum llm_ffn_op_type {
    LLM_FFN_SILU,
    LLM_FFN_GELU,
    LLM_FFN_RELU,
    LLM_FFN_RELU_SQR, // relu followed by squaring
    // *GLU variants: gated forms of the corresponding activation
    LLM_FFN_SWIGLU,
    LLM_FFN_GEGLU,
    LLM_FFN_REGLU,
};
|
|
|
|
|
// how the FFN gate branch is combined with the up projection (see build_ffn)
enum llm_ffn_gate_type {
    LLM_FFN_SEQ, // gate applied sequentially (after the up projection)
    LLM_FFN_PAR, // gate computed in parallel to the up projection
};
|
|
|
|
|
// normalization variant used by build_norm
enum llm_norm_type {
    LLM_NORM,       // standard layer norm
    LLM_NORM_RMS,   // RMS norm
    LLM_NORM_GROUP, // group norm
};
|
|
|
|
|
|
|
|
// holds the output of an encoder pass so that a subsequent decoder pass
// can attend to it via cross-attention
struct llama_cross {
    // dimensions of the encoder output
    int64_t n_embd = 0; // embedding size
    int64_t n_enc  = 0; // number of encoded positions

    // flattened encoder output embeddings
    // NOTE(review): presumably n_embd * n_enc floats - confirm against the encode path
    std::vector<float> v_embd;

    // per encoder position, the set of sequence ids it belongs to
    std::vector<std::set<llama_seq_id>> seq_ids_enc;
};
|
|
|
|
|
struct llm_graph_params; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// base class for all graph inputs
// a graph input owns the ggml tensors that are populated from a llama_ubatch
// right before the graph is evaluated
class llm_graph_input_i {
public:
    virtual ~llm_graph_input_i() = default;

    // copy the relevant data from the ubatch into the input tensors
    virtual void set_input(const llama_ubatch * ubatch) = 0;

    // return true if this input would be identical when built with the provided
    // params, which allows reusing the previously built graph
    // conservative default: inputs that do not implement the check are never reused
    virtual bool can_reuse(const llm_graph_params & params) {
        GGML_UNUSED(params);
        return false;
    }
};
|
|
|
|
|
// owning handle for a graph input (stored in llm_graph_result::inputs)
using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
|
|
|
|
|
// input carrying either token ids or raw input embeddings
class llm_graph_input_embd : public llm_graph_input_i {
public:
    llm_graph_input_embd() = default;
    virtual ~llm_graph_input_embd() = default;

    void set_input(const llama_ubatch * ubatch) override;

    bool can_reuse(const llm_graph_params & params) override;

    ggml_tensor * tokens = nullptr; // token ids (used when the ubatch provides tokens)
    ggml_tensor * embd = nullptr;   // input embeddings (used when the ubatch provides embeddings)
};
|
|
|
|
|
// input carrying the token positions
class llm_graph_input_pos : public llm_graph_input_i {
public:
    llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
    virtual ~llm_graph_input_pos() = default;

    void set_input(const llama_ubatch * ubatch) override;

    bool can_reuse(const llm_graph_params & params) override;

    ggml_tensor * pos = nullptr;

    // number of position values per embedding (more than 1 for multi-dimensional
    // position encodings - e.g. multi-axis RoPE)
    const uint32_t n_pos_per_embd = 1;
};
|
|
|
|
|
|
|
|
// input for per-position attention temperature scaling
// NOTE(review): appears to implement the floor-based temperature scheme used by
// some long-context models - confirm in the set_input implementation
class llm_graph_input_attn_temp : public llm_graph_input_i {
public:
    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
    virtual ~llm_graph_input_attn_temp() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * attn_scale = nullptr; // per-position scale applied to the attention

    const uint32_t n_attn_temp_floor_scale;
    const float    f_attn_temp_scale;
};
|
|
|
|
|
// input with relative position buckets (no KV cache, e.g. encoder pass)
// consumed by build_pos_bias / llama_relative_position_bucket
class llm_graph_input_pos_bucket : public llm_graph_input_i {
public:
    llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {}
    virtual ~llm_graph_input_pos_bucket() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * pos_bucket = nullptr;

    const llama_hparams & hparams;
};
|
|
|
|
|
// input with relative position buckets against the unified KV cache (decoder pass)
class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
public:
    llm_graph_input_pos_bucket_kv(
            const llama_hparams & hparams,
            const llama_kv_cache_unified_context * mctx) : hparams(hparams), mctx(mctx) {}
    virtual ~llm_graph_input_pos_bucket_kv() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * pos_bucket = nullptr;

    const llama_hparams & hparams;

    // non-owning - the memory context outlives this input
    const llama_kv_cache_unified_context * mctx;
};
|
|
|
|
|
class llm_graph_input_out_ids : public llm_graph_input_i { |
|
|
public: |
|
|
llm_graph_input_out_ids( |
|
|
const llama_hparams & hparams, |
|
|
const llama_cparams & cparams, |
|
|
uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} |
|
|
virtual ~llm_graph_input_out_ids() = default; |
|
|
|
|
|
void set_input(const llama_ubatch * ubatch) override; |
|
|
|
|
|
bool can_reuse(const llm_graph_params & params) override; |
|
|
|
|
|
ggml_tensor * out_ids; |
|
|
|
|
|
const llama_hparams & hparams; |
|
|
const llama_cparams & cparams; |
|
|
|
|
|
const uint32_t n_outputs; |
|
|
}; |
|
|
|
|
|
class llm_graph_input_mean : public llm_graph_input_i { |
|
|
public: |
|
|
llm_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {} |
|
|
virtual ~llm_graph_input_mean() = default; |
|
|
|
|
|
void set_input(const llama_ubatch * ubatch) override; |
|
|
|
|
|
ggml_tensor * mean; |
|
|
|
|
|
const llama_cparams & cparams; |
|
|
}; |
|
|
|
|
|
class llm_graph_input_cls : public llm_graph_input_i { |
|
|
public: |
|
|
llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {} |
|
|
virtual ~llm_graph_input_cls() = default; |
|
|
|
|
|
void set_input(const llama_ubatch * ubatch) override; |
|
|
|
|
|
ggml_tensor * cls; |
|
|
|
|
|
const llama_cparams & cparams; |
|
|
}; |
|
|
|
|
|
class llm_graph_input_rs : public llm_graph_input_i { |
|
|
public: |
|
|
llm_graph_input_rs(const llama_memory_recurrent_context * mctx) : mctx(mctx) {} |
|
|
virtual ~llm_graph_input_rs() = default; |
|
|
|
|
|
void set_input(const llama_ubatch * ubatch) override; |
|
|
|
|
|
ggml_tensor * s_copy; |
|
|
|
|
|
const llama_memory_recurrent_context * mctx; |
|
|
}; |
|
|
|
|
|
class llm_graph_input_cross_embd : public llm_graph_input_i { |
|
|
public: |
|
|
llm_graph_input_cross_embd( |
|
|
const llama_cross * cross) : cross(cross) {} |
|
|
virtual ~llm_graph_input_cross_embd() = default; |
|
|
|
|
|
void set_input(const llama_ubatch * ubatch) override; |
|
|
|
|
|
ggml_tensor * cross_embd; |
|
|
|
|
|
const llama_cross * cross; |
|
|
}; |
|
|
|
|
|
// attention input for models that run without a KV cache (e.g. encoders)
class llm_graph_input_attn_no_cache : public llm_graph_input_i {
public:
    llm_graph_input_attn_no_cache(const llama_hparams & hparams, const llama_cparams & cparams) :
        hparams(hparams),
        cparams(cparams) {
    }
    ~llm_graph_input_attn_no_cache() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * get_kq_mask() const { return kq_mask_cnv; }

    ggml_tensor * kq_mask = nullptr;
    // NOTE(review): "_cnv" appears to be the mask converted to the compute type - confirm in the .cpp
    ggml_tensor * kq_mask_cnv = nullptr;

    const llama_hparams & hparams;
    const llama_cparams & cparams;
};
|
|
|
|
|
// attention input backed by the unified KV cache
class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
public:
    llm_graph_input_attn_kv_unified(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
            const llama_kv_cache_unified_context * mctx) :
        hparams(hparams),
        cparams(cparams),
        mctx(mctx) {
    }
    ~llm_graph_input_attn_kv_unified() = default;

    void set_input(const llama_ubatch * ubatch) override;

    bool can_reuse(const llm_graph_params & params) override;

    // indices of the KV cache cells written/read by this ubatch
    ggml_tensor * get_k_idxs() const { return self_k_idxs; }
    ggml_tensor * get_v_idxs() const { return self_v_idxs; }

    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }

    ggml_tensor * self_k_idxs = nullptr;
    ggml_tensor * self_v_idxs = nullptr;

    ggml_tensor * self_kq_mask = nullptr;
    // NOTE(review): "_cnv" appears to be the mask converted to the compute type - confirm in the .cpp
    ggml_tensor * self_kq_mask_cnv = nullptr;

    const llama_hparams & hparams;
    const llama_cparams & cparams;

    // non-owning - the memory context outlives this input
    const llama_kv_cache_unified_context * mctx;
};
|
|
|
|
|
// attention input backed by the unified KV cache with interleaved
// sliding-window attention (separate SWA and non-SWA tensors)
class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
public:
    llm_graph_input_attn_kv_unified_iswa(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
            const llama_kv_cache_unified_iswa_context * mctx) :
        hparams(hparams),
        cparams(cparams),
        mctx(mctx) {
    }
    ~llm_graph_input_attn_kv_unified_iswa() = default;

    void set_input(const llama_ubatch * ubatch) override;

    bool can_reuse(const llm_graph_params & params) override;

    // cache cell indices for the regular (non-SWA) and the SWA layers
    ggml_tensor * get_k_idxs() const { return self_k_idxs; }
    ggml_tensor * get_v_idxs() const { return self_v_idxs; }
    ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
    ggml_tensor * get_v_idxs_swa() const { return self_v_idxs_swa; }

    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }

    ggml_tensor * self_k_idxs = nullptr;
    ggml_tensor * self_v_idxs = nullptr;
    ggml_tensor * self_k_idxs_swa = nullptr;
    ggml_tensor * self_v_idxs_swa = nullptr;

    // NOTE(review): "_cnv" appears to be the mask converted to the compute type - confirm in the .cpp
    ggml_tensor * self_kq_mask = nullptr;
    ggml_tensor * self_kq_mask_cnv = nullptr;
    ggml_tensor * self_kq_mask_swa = nullptr;
    ggml_tensor * self_kq_mask_swa_cnv = nullptr;

    const llama_hparams & hparams;
    const llama_cparams & cparams;

    // non-owning - the memory context outlives this input
    const llama_kv_cache_unified_iswa_context * mctx;
};
|
|
|
|
|
// cross-attention input: mask over the encoder output positions
class llm_graph_input_attn_cross : public llm_graph_input_i {
public:
    llm_graph_input_attn_cross(const llama_cross * cross) : cross(cross) {}
    ~llm_graph_input_attn_cross() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; }

    ggml_tensor * cross_kq_mask = nullptr;
    // NOTE(review): "_cnv" appears to be the mask converted to the compute type - confirm in the .cpp
    ggml_tensor * cross_kq_mask_cnv = nullptr;

    // non-owning - set up by the caller, outlives this input
    const llama_cross * cross = nullptr;
};
|
|
|
|
|
// combined input for hybrid memory models (attention KV cache + recurrent state)
// owns one input of each kind and forwards set_input to both
class llm_graph_input_mem_hybrid : public llm_graph_input_i {
public:
    llm_graph_input_mem_hybrid(
            std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn,
            std::unique_ptr<llm_graph_input_rs> inp_rs,
            const llama_memory_hybrid_context * mctx) :
        inp_attn(std::move(inp_attn)),
        inp_rs(std::move(inp_rs)),
        mctx(mctx) { }
    virtual ~llm_graph_input_mem_hybrid() = default;

    void set_input(const llama_ubatch * ubatch) override;

    std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn;
    std::unique_ptr<llm_graph_input_rs> inp_rs;

    // non-owning accessors to the two wrapped inputs
    llm_graph_input_attn_kv_unified * get_attn() const { return inp_attn.get(); }
    llm_graph_input_rs * get_recr() const { return inp_rs.get(); }

    // non-owning - the memory context outlives this input
    const llama_memory_hybrid_context * mctx;
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>; |
|
|
|
|
|
class llm_graph_result; |
|
|
|
|
|
struct llm_graph_params { |
|
|
llm_arch arch = LLM_ARCH_UNKNOWN; |
|
|
|
|
|
llama_hparams hparams; |
|
|
llama_cparams cparams; |
|
|
|
|
|
llama_ubatch ubatch; |
|
|
|
|
|
llm_graph_type gtype; |
|
|
|
|
|
ggml_backend_sched_t sched; |
|
|
ggml_backend_t backend_cpu; |
|
|
|
|
|
const llama_adapter_cvec * cvec; |
|
|
const llama_adapter_loras * loras; |
|
|
const llama_memory_context_i * mctx; |
|
|
const llama_cross * cross; |
|
|
|
|
|
uint32_t n_outputs; |
|
|
|
|
|
llm_graph_cb cb; |
|
|
|
|
|
llm_graph_result * res; |
|
|
|
|
|
|
|
|
|
|
|
bool allow_reuse(const llm_graph_params & other) const { |
|
|
|
|
|
bool can_reuse_ubatch = |
|
|
ubatch.equal_seqs() == other.ubatch.equal_seqs() && |
|
|
ubatch.n_tokens == other.ubatch.n_tokens && |
|
|
ubatch.n_seq_tokens == other.ubatch.n_seq_tokens && |
|
|
ubatch.n_seqs == other.ubatch.n_seqs && |
|
|
ubatch.n_seqs_unq == other.ubatch.n_seqs_unq && |
|
|
( |
|
|
(!ubatch.token && !other.ubatch.token) || |
|
|
(!ubatch.embd && !other.ubatch.embd) |
|
|
); |
|
|
|
|
|
if (can_reuse_ubatch && !ubatch.equal_seqs()) { |
|
|
if (!ubatch.data) { |
|
|
|
|
|
|
|
|
can_reuse_ubatch = false; |
|
|
} else { |
|
|
for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) { |
|
|
can_reuse_ubatch &= ubatch.seq_id_unq[s] == other.ubatch.seq_id_unq[s]; |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
if (!can_reuse_ubatch) { |
|
|
return false; |
|
|
} |
|
|
|
|
|
return |
|
|
cparams.embeddings == other.cparams.embeddings && |
|
|
cparams.causal_attn == other.cparams.causal_attn && |
|
|
arch == other.arch && |
|
|
gtype == other.gtype && |
|
|
cvec == other.cvec && |
|
|
loras == other.loras && |
|
|
cross == other.cross && |
|
|
n_outputs == other.n_outputs; |
|
|
} |
|
|
}; |
|
|
|
|
|
// the result of building a model compute graph:
// owns the graph, its compute context/buffers, the graph inputs, and
// references to the important output tensors
class llm_graph_result {
public:
    llm_graph_result(int64_t max_nodes);

    virtual ~llm_graph_result() = default;

    // important graph nodes, kept for convenient access after the build
    ggml_tensor * get_tokens() const { return t_tokens; }
    ggml_tensor * get_logits() const { return t_logits; }
    ggml_tensor * get_embd() const { return t_embd; }
    ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }

    ggml_cgraph * get_gf() const { return gf; }
    ggml_context * get_ctx() const { return ctx_compute.get(); }

    int64_t get_max_nodes() const;

    // prepare for building a new graph (clears previous state)
    void reset();

    // forward the ubatch to all registered inputs (llm_graph_input_i::set_input)
    void set_inputs(const llama_ubatch * ubatch);

    // true if the graph built with `params` would be identical to this one,
    // in which case evaluation can reuse this graph
    bool can_reuse(const llm_graph_params & params);

    // take ownership of a new input; returns a non-owning pointer to it
    llm_graph_input_i * add_input(llm_graph_input_ptr input);

    void set_params(const llm_graph_params & params);

    ggml_tensor * t_tokens = nullptr;
    ggml_tensor * t_logits = nullptr;
    ggml_tensor * t_embd = nullptr;
    ggml_tensor * t_embd_pooled = nullptr;

    std::vector<llm_graph_input_ptr> inputs;

    ggml_context_ptr ctx_compute;

    // memory backing ctx_compute
    std::vector<uint8_t> buf_compute_meta;

    ggml_cgraph * gf;

    int64_t max_nodes;

private:
    // copy of the params used to build this graph - compared against in can_reuse()
    llm_graph_params params;

    // NOTE(review): non-zero presumably enables extra debug output - confirm in the .cpp
    int debug = 0;
};
|
|
|
|
|
// owning handle for a graph result
using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// callback for customizing how recurrent state rows are gathered (default: ggml_get_rows)
using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
|
|
|
|
|
// helper used while building a model compute graph
// caches the commonly used hyper/compute parameters and provides the shared
// building blocks (norm, ffn, attention, recurrent state, pooling, ...)
// the member function bodies live in the corresponding .cpp file
struct llm_graph_context {
    const llm_arch arch;

    const llama_hparams & hparams;
    const llama_cparams & cparams;
    const llama_ubatch & ubatch;

    // frequently used model dimensions (cached from hparams/cparams)
    const int64_t n_embd;
    const int64_t n_layer;
    const int64_t n_rot;
    const int64_t n_ctx;
    const int64_t n_head;
    const int64_t n_head_kv;
    const int64_t n_embd_head_k;
    const int64_t n_embd_k_gqa;
    const int64_t n_embd_head_v;
    const int64_t n_embd_v_gqa;
    const int64_t n_expert;
    const int64_t n_expert_used;

    // RoPE / normalization parameters
    const float freq_base;
    const float freq_scale;
    const float ext_factor;
    const float attn_factor;
    const float beta_fast;
    const float beta_slow;
    const float norm_eps;
    const float norm_rms_eps;

    const int64_t n_tokens;
    const int64_t n_outputs;
    const int32_t n_ctx_orig; // yarn

    const enum llama_pooling_type pooling_type;
    const enum llama_rope_type rope_type;

    ggml_backend_sched_t sched;

    ggml_backend_t backend_cpu;

    // non-owning pointers supplied via llm_graph_params
    const llama_adapter_cvec * cvec;
    const llama_adapter_loras * loras;
    const llama_memory_context_i * mctx;
    const llama_cross * cross;

    // user callback, invoked for each built tensor via cb() below
    const llm_graph_cb & cb_func;

    llm_graph_result * res;

    // compute context and graph being built (owned by res)
    ggml_context * ctx0 = nullptr;
    ggml_cgraph * gf = nullptr;

    llm_graph_context(const llm_graph_params & params);
    virtual ~llm_graph_context() = default;

    // invoke the user callback on the tensor (naming/offloading hook)
    void cb(ggml_tensor * cur, const char * name, int il) const;

    // apply the control vector (if any) for layer il
    ggml_tensor * build_cvec(
             ggml_tensor * cur,
                     int   il) const;

    // matrix multiplication with LoRA adapters applied (if any)
    ggml_tensor * build_lora_mm(
              ggml_tensor * w,
              ggml_tensor * cur) const;

    // indirect (expert-selected) matrix multiplication with LoRA adapters
    ggml_tensor * build_lora_mm_id(
              ggml_tensor * w,
              ggml_tensor * cur,
              ggml_tensor * ids) const;

    // normalization: mw = weight, mb = optional bias
    ggml_tensor * build_norm(
             ggml_tensor * cur,
             ggml_tensor * mw,
             ggml_tensor * mb,
           llm_norm_type   type,
                     int   il) const;

    // feed-forward block: up/gate/down projections with optional biases (_b)
    // and scales (_s); type_op selects the activation, type_gate the gating
    ggml_tensor * build_ffn(
             ggml_tensor * cur,
             ggml_tensor * up,
             ggml_tensor * up_b,
             ggml_tensor * up_s,
             ggml_tensor * gate,
             ggml_tensor * gate_b,
             ggml_tensor * gate_s,
             ggml_tensor * down,
             ggml_tensor * down_b,
             ggml_tensor * down_s,
             ggml_tensor * act_scales,
         llm_ffn_op_type   type_op,
       llm_ffn_gate_type   type_gate,
                     int   il) const;

    // mixture-of-experts feed-forward block
    // norm_w: normalize the expert weights; scale_w: scale them by w_scale
    ggml_tensor * build_moe_ffn(
             ggml_tensor * cur,
             ggml_tensor * gate_inp,
             ggml_tensor * up_exps,
             ggml_tensor * gate_exps,
             ggml_tensor * down_exps,
             ggml_tensor * exp_probs_b,
                 int64_t   n_expert,
                 int64_t   n_expert_used,
         llm_ffn_op_type   type_op,
                    bool   norm_w,
                    bool   scale_w,
                   float   w_scale,
  llama_expert_gating_func_type gating_op,
                     int   il) const;

    // graph inputs - each creates the corresponding llm_graph_input_* object,
    // registers it with the result and returns the input tensor
    ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
    ggml_tensor * build_inp_pos() const;
    ggml_tensor * build_inp_attn_scale() const;
    ggml_tensor * build_inp_out_ids() const;
    ggml_tensor * build_inp_mean() const;
    ggml_tensor * build_inp_cls() const;

    ggml_tensor * build_inp_cross_embd() const;
    ggml_tensor * build_inp_pos_bucket_enc() const;
    ggml_tensor * build_inp_pos_bucket_dec() const;
    ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;

    // core multi-head attention: kq_b = optional bias added to the scores,
    // v_mla = optional MLA projection applied to the values
    ggml_tensor * build_attn_mha(
             ggml_tensor * q,
             ggml_tensor * k,
             ggml_tensor * v,
             ggml_tensor * kq_b,
             ggml_tensor * kq_mask,
             ggml_tensor * v_mla,
                   float   kq_scale) const;

    llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;

    // attention without KV cache; wo/wo_b = output projection
    ggml_tensor * build_attn(
            llm_graph_input_attn_no_cache * inp,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur,
            ggml_tensor * k_cur,
            ggml_tensor * v_cur,
            ggml_tensor * kq_b,
            ggml_tensor * v_mla,
                  float   kq_scale,
                    int   il) const;

    llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified() const;

    // attention backed by the unified KV cache
    ggml_tensor * build_attn(
            llm_graph_input_attn_kv_unified * inp,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur,
            ggml_tensor * k_cur,
            ggml_tensor * v_cur,
            ggml_tensor * kq_b,
            ggml_tensor * v_mla,
                  float   kq_scale,
                    int   il) const;

    llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;

    // attention backed by the unified KV cache with sliding-window support
    ggml_tensor * build_attn(
            llm_graph_input_attn_kv_unified_iswa * inp,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur,
            ggml_tensor * k_cur,
            ggml_tensor * v_cur,
            ggml_tensor * kq_b,
            ggml_tensor * v_mla,
                  float   kq_scale,
                    int   il) const;

    llm_graph_input_attn_cross * build_attn_inp_cross() const;

    // cross-attention over the encoder output
    ggml_tensor * build_attn(
            llm_graph_input_attn_cross * inp,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur,
            ggml_tensor * k_cur,
            ggml_tensor * v_cur,
            ggml_tensor * kq_b,
            ggml_tensor * v_mla,
                  float   kq_scale,
                    int   il) const;

    // gather/copy recurrent states according to state_copy
    // get_state_rows allows customizing the row-gather operation
    ggml_tensor * build_rs(
             ggml_tensor * s,
             ggml_tensor * state_copy,
                 int32_t   state_size,
                 int32_t   n_seqs,
                uint32_t   n_kv,
                uint32_t   kv_head,
                uint32_t   kv_size,
                 int32_t   rs_zero,
            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;

    llm_graph_input_rs * build_rs_inp() const;

    // convenience overload using the recurrent memory context from inp
    ggml_tensor * build_rs(
            llm_graph_input_rs * inp,
             ggml_tensor * s,
                 int32_t   state_size,
                 int32_t   n_seqs,
            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;

    // RWKV-style token-shift state load/store for layer il
    ggml_tensor * build_rwkv_token_shift_load(
            llm_graph_input_rs * inp,
            const llama_ubatch & ubatch,
                    int   il) const;

    ggml_tensor * build_rwkv_token_shift_store(
             ggml_tensor * token_shift,
            const llama_ubatch & ubatch,
                    int   il) const;

    // combined input for hybrid (attention + recurrent) memory models
    llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;

    // pooling head over the output embeddings (classification weights optional)
    void build_pooling(
            ggml_tensor * cls,
            ggml_tensor * cls_b,
            ggml_tensor * cls_out,
            ggml_tensor * cls_out_b) const;
};
|
|
|
|
|
|
|
|
// map a relative position (x - y) to a bucket index (T5-style relative attention bias)
int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);
|
|
|