| | #pragma once
|
| |
|
| | #include "common.h"
|
| | #include "llama.h"
|
| |
|
| | #include <string>
|
| | #include <unordered_set>
|
| | #include <list>
|
| | #include <map>
|
| |
|
| |
|
| | #include "server-common.h"
|
| |
|
| | using json = nlohmann::ordered_json;
|
| |
|
// Kind of work item processed by the server's task queue.
enum server_task_type {
    SERVER_TASK_TYPE_COMPLETION,    // text generation request
    SERVER_TASK_TYPE_EMBEDDING,     // embedding extraction request
    SERVER_TASK_TYPE_RERANK,        // document reranking request
    SERVER_TASK_TYPE_INFILL,        // fill-in-the-middle completion request
    SERVER_TASK_TYPE_CANCEL,        // cancel another task (target in server_task::id_target)
    SERVER_TASK_TYPE_NEXT_RESPONSE,
    SERVER_TASK_TYPE_METRICS,       // collect a metrics snapshot
    SERVER_TASK_TYPE_SLOT_SAVE,     // save a slot's state to file (server_task::slot_action)
    SERVER_TASK_TYPE_SLOT_RESTORE,  // restore a slot's state from file
    SERVER_TASK_TYPE_SLOT_ERASE,    // erase a slot's state
    SERVER_TASK_TYPE_GET_LORA,      // query loaded LoRA adapters
    SERVER_TASK_TYPE_SET_LORA,      // set LoRA adapter scales (server_task::set_lora)
};
|
| |
|
| |
|
// Wire format requested for serializing a task result.
enum task_response_type {
    TASK_RESPONSE_TYPE_NONE,       // plain (non-OAI-compatible) response
    TASK_RESPONSE_TYPE_OAI_CHAT,   // OpenAI-style chat completion
    TASK_RESPONSE_TYPE_OAI_CMPL,   // OpenAI-style text completion
    TASK_RESPONSE_TYPE_OAI_RESP,   // OpenAI-style "responses" API
    TASK_RESPONSE_TYPE_OAI_EMBD,   // OpenAI-style embeddings
    TASK_RESPONSE_TYPE_ANTHROPIC,  // Anthropic-style messages
};
|
| |
|
// Reason why token generation stopped.
enum stop_type {
    STOP_TYPE_NONE,   // generation has not stopped yet
    STOP_TYPE_EOS,    // end-of-sequence reached
    STOP_TYPE_WORD,   // a stop word matched (see stopping_word in the final result)
    STOP_TYPE_LIMIT,  // a limit was hit (e.g. n_predict / time limit)
};
|
| |
|
// Per-request parameters controlling generation, limits and response format.
// Parsed from the request JSON by server_task::params_from_json_cmpl().
struct task_params {
    bool stream          = true;  // stream partial results as they are produced
    bool include_usage   = false; // include a usage block in the streamed response
    bool cache_prompt    = true;  // reuse cached prompt tokens when possible
    bool return_tokens   = false; // return raw token ids alongside text
    bool return_progress = false; // stream prompt-processing progress updates

    int32_t n_keep    = 0;  // tokens kept from the start of the prompt on context overflow
    int32_t n_discard = 0;  // tokens discarded on context shift
    int32_t n_predict = -1; // max tokens to generate (-1 presumably = unlimited -- confirm)
    int32_t n_indent  = 0;
    int32_t n_cmpl    = 1;  // number of completions to generate (see server_task::add_child)

    int32_t n_cache_reuse = 0;

    // wall-clock limits in milliseconds (-1 presumably = no limit -- confirm)
    int64_t t_max_prompt_ms  = -1;
    int64_t t_max_predict_ms = -1;

    // per-request LoRA scales (adapter id -> scale) -- assumption, verify against callers
    std::map<int, float> lora;

    std::vector<std::string> antiprompt;      // stop words (see STOP_TYPE_WORD)
    std::vector<std::string> response_fields; // if non-empty, restrict the JSON response to these fields

    bool timings_per_token   = false;
    bool post_sampling_probs = false; // report probabilities measured after sampling

    struct common_params_sampling    sampling;
    struct common_params_speculative speculative;

    // response formatting
    bool verbose = false;
    task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
    std::string oaicompat_model;   // model name echoed in OAI-compatible responses
    std::string oaicompat_cmpl_id; // completion id echoed in OAI-compatible responses

    common_chat_parser_params chat_parser_params;

    // embeddings only: normalization applied to output vectors
    // (2 presumably = L2 norm -- TODO confirm against the embedding code)
    int32_t embd_normalize = 2;

    json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) const;
    json to_json(bool only_metrics = false) const;
};
|
| |
|
| |
|
// Mutable per-task state accumulated while a result is streamed.
// Created via server_task::create_state() and fed to each result's update().
struct task_result_state {
    std::vector<common_chat_msg_diff> diffs; // diffs produced by the latest parse
    common_chat_parser_params chat_parser_params;
    common_chat_msg chat_msg;                // latest parsed chat message
    std::string generated_text;              // text accumulated so far (see update_chat_msg impl)
    std::vector<std::string> generated_tool_call_ids;

    // streaming block state (used by block-oriented response formats)
    bool thinking_block_started = false;
    bool text_block_started = false;

    // stable per-response ids, generated once per task;
    // the const members make this type move/copy-constructible but not assignable
    const std::string oai_resp_id;
    const std::string oai_resp_reasoning_id;
    const std::string oai_resp_message_id;
    std::string oai_resp_fc_id; // function-call id -- presumably set when a tool call appears

    task_result_state(const common_chat_parser_params & chat_parser_params)
        : chat_parser_params(chat_parser_params)
        , oai_resp_id("resp_" + random_string())
        , oai_resp_reasoning_id("rs_" + random_string())
        , oai_resp_message_id("msg_" + random_string()) {}

    // Parse text_added into chat_msg, reporting the resulting diffs;
    // is_partial indicates generation has not finished yet. Defined in the .cpp.
    common_chat_msg update_chat_msg(
        const std::string & text_added,
        bool is_partial,
        std::vector<common_chat_msg_diff> & diffs);
};
|
| |
|
| | struct server_task {
|
| | int id = -1;
|
| |
|
| |
|
| | size_t index = 0;
|
| |
|
| |
|
| | int id_target = -1;
|
| | int id_slot = -1;
|
| |
|
| |
|
| | int id_parent = -1;
|
| |
|
| |
|
| | std::vector<server_task> child_tasks;
|
| |
|
| |
|
| | task_params params;
|
| | server_tokens tokens;
|
| |
|
| |
|
| |
|
| | bool cli = false;
|
| | std::string cli_prompt;
|
| | std::vector<raw_buffer> cli_files;
|
| |
|
| | server_task_type type;
|
| |
|
| |
|
| | struct slot_action {
|
| | int id_slot;
|
| | std::string filename;
|
| | std::string filepath;
|
| | };
|
| | slot_action slot_action;
|
| |
|
| |
|
| | bool metrics_reset_bucket = false;
|
| |
|
| |
|
| | std::map<int, float> set_lora;
|
| |
|
| | server_task() = default;
|
| |
|
| | server_task(server_task_type type) : type(type) {}
|
| |
|
| | int32_t n_tokens() const {
|
| | return tokens.size();
|
| | }
|
| |
|
| | bool need_embd() const {
|
| | switch (type) {
|
| | case SERVER_TASK_TYPE_EMBEDDING:
|
| | case SERVER_TASK_TYPE_RERANK:
|
| | return true;
|
| | default:
|
| | return false;
|
| | }
|
| | }
|
| |
|
| | bool need_logits() const {
|
| | switch (type) {
|
| | case SERVER_TASK_TYPE_COMPLETION:
|
| | case SERVER_TASK_TYPE_INFILL:
|
| | return true;
|
| | default:
|
| | return false;
|
| | }
|
| | }
|
| |
|
| | bool need_sampling() const {
|
| | switch (type) {
|
| | case SERVER_TASK_TYPE_COMPLETION:
|
| | case SERVER_TASK_TYPE_INFILL:
|
| | return true;
|
| | default:
|
| | return false;
|
| | }
|
| | }
|
| |
|
| | static task_params params_from_json_cmpl(
|
| | const llama_vocab * vocab,
|
| | const common_params & params_base,
|
| | const int n_ctx_slot,
|
| | const json & data);
|
| |
|
| |
|
| | static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
|
| | std::unordered_set<int> ids(tasks.size());
|
| | for (size_t i = 0; i < tasks.size(); i++) {
|
| | ids.insert(tasks[i].id);
|
| | for (auto & child : tasks[i].child_tasks) {
|
| | ids.insert(child.id);
|
| | }
|
| | }
|
| | return ids;
|
| | }
|
| |
|
| | void add_child(int id_parent, int id_child) {
|
| | server_task copy;
|
| |
|
| | copy.id = id_child;
|
| | copy.id_parent = id_parent;
|
| | copy.params = params;
|
| | copy.type = type;
|
| | copy.tokens = tokens.clone();
|
| | copy.id_slot = -1;
|
| |
|
| |
|
| |
|
| | if (copy.params.sampling.seed != LLAMA_DEFAULT_SEED) {
|
| | copy.params.sampling.seed += (uint32_t)child_tasks.size() + 1;
|
| | }
|
| |
|
| | child_tasks.push_back(std::move(copy));
|
| | }
|
| |
|
| |
|
| |
|
| | task_result_state create_state() const {
|
| | return task_result_state(params.chat_parser_params);
|
| | }
|
| |
|
| | bool is_parent() const {
|
| | return child_tasks.size() > 0;
|
| | }
|
| |
|
| | bool is_child() const {
|
| | return id_parent != -1;
|
| | }
|
| | };
|
| |
|
| | struct result_timings {
|
| | int32_t cache_n = -1;
|
| |
|
| | int32_t prompt_n = -1;
|
| | double prompt_ms;
|
| | double prompt_per_token_ms;
|
| | double prompt_per_second;
|
| |
|
| | int32_t predicted_n = -1;
|
| | double predicted_ms;
|
| | double predicted_per_token_ms;
|
| | double predicted_per_second;
|
| |
|
| |
|
| | int32_t draft_n = 0;
|
| | int32_t draft_n_accepted = 0;
|
| |
|
| | json to_json() const;
|
| | };
|
| |
|
// Progress of prompt processing, streamed when task_params::return_progress is set.
struct result_prompt_progress {
    int32_t total     = 0; // total prompt tokens
    int32_t cache     = 0; // tokens reused from cache
    int32_t processed = 0; // tokens processed so far
    int64_t time_ms   = 0; // elapsed time in milliseconds
    
    json to_json() const;
};
|
| |
|
// Base class for all results posted back from the server.
struct server_task_result {
    int id      = -1; // id of the originating task
    int id_slot = -1; // slot that produced this result

    // index within a multi-result request (matches server_task::index)
    size_t index = 0;

    virtual bool is_error() {
        // only server_task_result_error overrides this to return true
        return false;
    }
    virtual bool is_stop() {
        // only completion results can be "in progress"; everything else is final
        return true;
    }
    virtual void update(task_result_state &) {
        // default: the result carries no streaming state
    }
    virtual json to_json() = 0;
    virtual ~server_task_result() = default;
};
|
| |
|
| |
|
// owning pointer used to pass polymorphic results around
using server_task_result_ptr = std::unique_ptr<server_task_result>;
|
| |
|
// A single sampled token together with optional probability information.
struct completion_token_output {
    llama_token tok;          // sampled token id
    float prob;               // probability of the sampled token
    std::string text_to_send; // text sent to the client for this token
    // a candidate token considered during sampling
    struct prob_info {
        llama_token tok;
        std::string txt;
        float prob;
    };
    std::vector<prob_info> probs; // top candidates -- presumably filled only when requested

    json to_json(bool post_sampling_probs) const;

    static json probs_vector_to_json(const std::vector<completion_token_output> & probs, bool post_sampling_probs);

    static float logarithm(float x);

    static std::vector<unsigned char> str_to_bytes(const std::string & str);

};
|
| |
|
| | struct server_task_result_cmpl_final : server_task_result {
|
| | std::string content;
|
| | llama_tokens tokens;
|
| |
|
| | bool stream;
|
| | bool include_usage;
|
| | result_timings timings;
|
| | std::string prompt;
|
| |
|
| | bool truncated;
|
| | int32_t n_decoded;
|
| | int32_t n_prompt_tokens;
|
| | int32_t n_tokens_cached;
|
| | bool has_new_line;
|
| | std::string stopping_word;
|
| | stop_type stop = STOP_TYPE_NONE;
|
| |
|
| | bool post_sampling_probs;
|
| | std::vector<completion_token_output> probs_output;
|
| | std::vector<std::string> response_fields;
|
| |
|
| | task_params generation_params;
|
| |
|
| |
|
| | bool verbose = false;
|
| | task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
|
| | std::string oaicompat_model;
|
| | std::string oaicompat_cmpl_id;
|
| | common_chat_msg oaicompat_msg;
|
| |
|
| | std::vector<common_chat_msg_diff> oaicompat_msg_diffs;
|
| | bool is_updated = false;
|
| |
|
| |
|
| | std::string oai_resp_id;
|
| | std::string oai_resp_reasoning_id;
|
| | std::string oai_resp_message_id;
|
| |
|
| | virtual bool is_stop() override {
|
| | return true;
|
| | }
|
| |
|
| | virtual json to_json() override;
|
| |
|
| | virtual void update(task_result_state & state) override {
|
| | is_updated = true;
|
| | oaicompat_msg = state.update_chat_msg(content, false, oaicompat_msg_diffs);
|
| |
|
| | oai_resp_id = state.oai_resp_id;
|
| | oai_resp_reasoning_id = state.oai_resp_reasoning_id;
|
| | oai_resp_message_id = state.oai_resp_message_id;
|
| | }
|
| |
|
| | json to_json_non_oaicompat();
|
| |
|
| | json to_json_oaicompat();
|
| |
|
| | json to_json_oaicompat_chat();
|
| |
|
| | json to_json_oaicompat_chat_stream();
|
| |
|
| | json to_json_oaicompat_resp();
|
| |
|
| | json to_json_oaicompat_resp_stream();
|
| |
|
| | json to_json_anthropic();
|
| |
|
| | json to_json_anthropic_stream();
|
| | };
|
| |
|
| | struct server_task_result_cmpl_partial : server_task_result {
|
| | std::string content;
|
| | llama_tokens tokens;
|
| |
|
| | int32_t n_decoded;
|
| | int32_t n_prompt_tokens;
|
| |
|
| | bool post_sampling_probs;
|
| | bool is_progress = false;
|
| | completion_token_output prob_output;
|
| | result_timings timings;
|
| | result_prompt_progress progress;
|
| |
|
| |
|
| | bool verbose = false;
|
| | task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
|
| | std::string oaicompat_model;
|
| | std::string oaicompat_cmpl_id;
|
| | std::vector<common_chat_msg_diff> oaicompat_msg_diffs;
|
| | bool is_updated = false;
|
| |
|
| |
|
| | bool thinking_block_started = false;
|
| | bool text_block_started = false;
|
| |
|
| |
|
| | std::string oai_resp_id;
|
| | std::string oai_resp_reasoning_id;
|
| | std::string oai_resp_message_id;
|
| | std::string oai_resp_fc_id;
|
| |
|
| |
|
| | bool anthropic_has_reasoning = false;
|
| |
|
| | virtual bool is_stop() override {
|
| | return false;
|
| | }
|
| |
|
| | virtual void update(task_result_state & state) override;
|
| |
|
| | virtual json to_json() override;
|
| |
|
| | json to_json_non_oaicompat();
|
| |
|
| | json to_json_oaicompat();
|
| |
|
| | json to_json_oaicompat_chat();
|
| |
|
| | json to_json_oaicompat_resp();
|
| |
|
| | json to_json_anthropic();
|
| | };
|
| |
|
| | struct server_task_result_embd : server_task_result {
|
| | std::vector<std::vector<float>> embedding;
|
| |
|
| | int32_t n_tokens;
|
| |
|
| |
|
| | task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
|
| |
|
| | virtual json to_json() override;
|
| |
|
| | json to_json_non_oaicompat();
|
| |
|
| | json to_json_oaicompat();
|
| | };
|
| |
|
| | struct server_task_result_rerank : server_task_result {
|
| | float score = -1e6;
|
| |
|
| | int32_t n_tokens;
|
| |
|
| | virtual json to_json() override;
|
| | };
|
| |
|
// Error result: terminates the request with an error payload.
struct server_task_result_error : server_task_result {
    error_type err_type = ERROR_TYPE_SERVER;
    std::string err_msg;

    // extra context -- presumably populated for context-size errors; confirm in callers
    int32_t n_prompt_tokens = 0;
    int32_t n_ctx = 0;

    virtual bool is_error() override {
        return true;
    }

    virtual json to_json() override;
};
|
| |
|
| | struct server_task_result_metrics : server_task_result {
|
| | int n_idle_slots;
|
| | int n_processing_slots;
|
| | int n_tasks_deferred;
|
| | int64_t t_start;
|
| |
|
| |
|
| | uint64_t n_prompt_tokens_processed_total = 0;
|
| | uint64_t t_prompt_processing_total = 0;
|
| | uint64_t n_tokens_predicted_total = 0;
|
| | uint64_t t_tokens_generation_total = 0;
|
| |
|
| | uint64_t n_tokens_max = 0;
|
| |
|
| | uint64_t n_prompt_tokens_processed = 0;
|
| | uint64_t t_prompt_processing = 0;
|
| |
|
| | uint64_t n_tokens_predicted = 0;
|
| | uint64_t t_tokens_generation = 0;
|
| |
|
| | uint64_t n_decode_total = 0;
|
| | uint64_t n_busy_slots_total = 0;
|
| |
|
| |
|
| |
|
| | json slots_data = json::array();
|
| |
|
| | virtual json to_json() override;
|
| | };
|
| |
|
| | struct server_task_result_slot_save_load : server_task_result {
|
| | std::string filename;
|
| | bool is_save;
|
| |
|
| | size_t n_tokens;
|
| | size_t n_bytes;
|
| | double t_ms;
|
| |
|
| | virtual json to_json() override;
|
| | };
|
| |
|
| | struct server_task_result_slot_erase : server_task_result {
|
| | size_t n_erased;
|
| |
|
| | virtual json to_json() override;
|
| | };
|
| |
|
// Result of SERVER_TASK_TYPE_GET_LORA: the list of loaded LoRA adapters.
struct server_task_result_get_lora : server_task_result {
    struct lora {
        common_adapter_lora_info info;
        // activated-LoRA invocation sequence, if the adapter defines one -- confirm
        std::string alora_invocation_string;
        llama_tokens alora_invocation_tokens;
    };
    std::vector<lora> loras;

    virtual json to_json() override;
};
|
| |
|
// Result of SERVER_TASK_TYPE_SET_LORA: a simple acknowledgement payload.
struct server_task_result_apply_lora : server_task_result {
    virtual json to_json() override;
};
|
| |
|
// A saved snapshot covering a range of positions of a prompt's state.
// NOTE(review): pos_min/pos_max/n_tokens are left uninitialized -- callers
// must fill every field before use; confirm whether a sentinel is expected.
struct server_prompt_checkpoint {
    llama_pos pos_min; // first position covered by this checkpoint
    llama_pos pos_max; // last position covered by this checkpoint

    int64_t n_tokens;

    std::vector<uint8_t> data; // serialized state

    // bytes occupied by this checkpoint
    size_t size() const {
        return data.size();
    }
};
|
| |
|
| | struct server_prompt {
|
| | server_tokens tokens;
|
| |
|
| | std::vector<uint8_t> data;
|
| |
|
| | std::list<server_prompt_checkpoint> checkpoints;
|
| |
|
| | size_t size() const {
|
| | size_t res = data.size();
|
| |
|
| | for (const auto & checkpoint : checkpoints) {
|
| | res += checkpoint.size();
|
| | }
|
| |
|
| | return res;
|
| | }
|
| |
|
| | int n_tokens() const {
|
| | return tokens.size();
|
| | }
|
| |
|
| | server_prompt clone() const {
|
| | return server_prompt {
|
| | tokens.clone(),
|
| | data,
|
| | checkpoints
|
| | };
|
| | }
|
| | };
|
| |
|
| | struct server_prompt_cache {
|
| | server_prompt_cache(int32_t limit_size_mib, size_t limit_tokens) {
|
| | this->limit_size = 1024ull*1024ull*(limit_size_mib < 0 ? 0 : limit_size_mib);
|
| | this->limit_tokens = limit_tokens;
|
| | }
|
| |
|
| | std::list<server_prompt> states;
|
| |
|
| |
|
| | size_t limit_size = 0;
|
| |
|
| |
|
| | size_t limit_tokens = 0;
|
| |
|
| | size_t size() const;
|
| |
|
| | size_t n_tokens() const;
|
| |
|
| | server_prompt * alloc(const server_prompt & prompt, size_t state_size);
|
| |
|
| | bool load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot);
|
| |
|
| | void update();
|
| | };
|
| |
|