| | #pragma once
|
| |
|
| | #include "common.h"
|
| | #include "log.h"
|
| | #include "llama.h"
|
| | #include "chat.h"
|
| | #include "mtmd.h"
|
| |
|
| | #define JSON_ASSERT GGML_ASSERT
|
| | #include <nlohmann/json.hpp>
|
| |
|
| | #include <string>
|
| | #include <vector>
|
| | #include <cinttypes>
|
| |
|
| | using json = nlohmann::ordered_json;
|
| |
|
// Per-slot logging helpers. Each message is prefixed with the calling
// function name (truncated to 12 chars), the slot id, and the id of the task
// the slot is currently processing (-1 when slot.task is null).
#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_CNT(slot, fmt, ...) LOG_CNT("" fmt, __VA_ARGS__)
#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)

// Server-wide logging helpers (no slot context): prefix with the calling
// function name only.
#define SRV_INF(fmt, ...) LOG_INF("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_CNT(fmt, ...) LOG_CNT("" fmt, __VA_ARGS__)
#define SRV_WRN(fmt, ...) LOG_WRN("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_ERR(fmt, ...) LOG_ERR("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_DBG(fmt, ...) LOG_DBG("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)

// Raw binary payload (e.g. media file contents received with a request).
using raw_buffer = std::vector<uint8_t>;
|
| |
|
| | template <typename T>
|
| | static T json_value(const json & body, const std::string & key, const T & default_value) {
|
| |
|
| | if (body.contains(key) && !body.at(key).is_null()) {
|
| | try {
|
| | return body.at(key);
|
| | } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
|
| | LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what());
|
| | return default_value;
|
| | }
|
| | } else {
|
| | return default_value;
|
| | }
|
| | }
|
| |
|
| |
|
// Error categories attached to error responses; consumed by
// format_error_response() when building the JSON error payload.
enum error_type {
    ERROR_TYPE_INVALID_REQUEST,     // malformed or semantically invalid request
    ERROR_TYPE_AUTHENTICATION,      // missing or invalid credentials
    ERROR_TYPE_SERVER,              // internal server failure
    ERROR_TYPE_NOT_FOUND,           // requested resource does not exist
    ERROR_TYPE_PERMISSION,          // authenticated but not permitted
    ERROR_TYPE_UNAVAILABLE,         // server temporarily unable to handle the request
    ERROR_TYPE_NOT_SUPPORTED,       // feature not implemented/enabled
    ERROR_TYPE_EXCEED_CONTEXT_SIZE, // prompt does not fit into the context window
};
|
| |
|
| |
|
| | struct server_grammar_trigger {
|
| | common_grammar_trigger value;
|
| |
|
| | server_grammar_trigger() = default;
|
| | server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
|
| | server_grammar_trigger(const json & in) {
|
| | value.type = (common_grammar_trigger_type) in.at("type").get<int>();
|
| | value.value = in.at("value").get<std::string>();
|
| | if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
|
| | value.token = (llama_token) in.at("token").get<int>();
|
| | }
|
| | }
|
| |
|
| | json to_json() const {
|
| | json out {
|
| | {"type", (int) value.type},
|
| | {"value", value.value},
|
| | };
|
| | if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
|
| | out["token"] = (int) value.token;
|
| | }
|
| | return out;
|
| | }
|
| | };
|
| |
|
// Build the JSON error payload for the given error category and message.
json format_error_response(const std::string & message, const enum error_type type);

//
// random id generation
//

std::string random_string();
std::string gen_chatcmplid();   // id for OAI-compat chat completion responses
std::string gen_tool_call_id(); // id for OAI-compat tool call entries

//
// lora utils
//

// NOTE(review): semantics of the helpers below are inferred from their names;
// confirm details against the implementation file.

// whether every adapter in the list is an activated-lora (alora)
bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras);

// whether switching from `current` to `next` requires clearing the KV cache
bool lora_should_clear_cache(
    const std::vector<common_adapter_lora_info> & current,
    const std::vector<common_adapter_lora_info> & next);

// parse a JSON lora spec into a map; presumably adapter index -> scale — confirm
std::map<int, float> parse_lora_request(const json & data);

// compare two adapter configurations for equality
bool are_lora_equal(
    const std::vector<common_adapter_lora_info> & l1,
    const std::vector<common_adapter_lora_info> & l2);

// indices of the adapters that are currently enabled
std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras);
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | |
| | |
| | |
| |
|
// A token sequence that can additionally carry multimodal (mtmd) media
// chunks interleaved with plain text tokens.
struct server_tokens {
    bool has_mtmd = false;

private:
    // media chunks keyed by their index into `tokens`
    // NOTE(review): exact key/position semantics live in the implementation —
    // confirm there before relying on them
    std::map<size_t, mtmd::input_chunk_ptr> map_idx_to_media;

    // the underlying token sequence; cleared together with the media map to
    // keep the two in sync
    llama_tokens tokens;

public:
    server_tokens() = default;
    ~server_tokens() = default;

    // non-copyable: media chunks are held through unique ptrs; use clone()
    // for an explicit deep copy
    server_tokens(const server_tokens&) = delete;
    server_tokens& operator=(const server_tokens&) = delete;

    server_tokens(server_tokens&&) = default;
    server_tokens& operator=(server_tokens&&) = default;

    // read access to individual tokens; the non-const overload returns a
    // copy, so mutation goes through set_token() instead
    llama_token operator[](size_t index) { return tokens[index]; }
    const llama_token& operator[](size_t index) const { return tokens[index]; }

    server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd);
    server_tokens(const llama_tokens & tokens, bool has_mtmd);

    // human-readable dump for debugging/logging
    std::string str() const;

    // position the next token would occupy
    // NOTE(review): meaning of n_tokens (default -1) is defined by the
    // implementation — confirm there
    llama_pos pos_next(int64_t n_tokens = -1) const;

    // number of elements covering positions up to max_pos
    size_t size_up_to_pos(llama_pos max_pos) const;

    // look up the media chunk stored at index `idx`
    const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;

    // append a single text token
    void push_back(llama_token tok);

    // append a media chunk
    void push_back(const mtmd_input_chunk * chunk);

    // append the contents of another server_tokens
    void push_back(server_tokens & tokens);

    // append a batch of plain text tokens
    void insert(const llama_tokens & inp_tokens);

    // direct (read-only) access to the underlying text token vector
    const llama_tokens & get_text_tokens() const;

    // overwrite the token at position `pos`
    void set_token(llama_pos pos, llama_token id);

    size_t size() const { return tokens.size(); }

    bool empty() const { return tokens.empty(); }

    // drop all tokens and their associated media chunks
    void clear() {
        map_idx_to_media.clear();
        tokens.clear();
    }

    // truncate to the first n elements
    void keep_first(size_t n);

    // convert the text tokens back to a string
    std::string detokenize(const llama_context * ctx, bool special) const;

    // length of the longest common prefix with `b`
    size_t get_common_prefix(const server_tokens & b) const;

    // check that all token ids are valid for the model's vocab
    bool validate(const struct llama_context * ctx) const;

    // evaluate the media chunk at `idx` through mtmd; on success,
    // n_tokens_out receives the number of positions consumed
    int32_t process_chunk(
        llama_context * ctx,
        mtmd_context * mctx,
        size_t idx,
        llama_pos pos,
        int32_t seq_id,
        size_t & n_tokens_out) const;

    // explicit deep copy (the copy constructor is deleted)
    server_tokens clone() const;
};
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
//
// JSON / tokenization helpers
//

// true if `data` is a JSON array whose elements are all numbers
bool json_is_array_of_numbers(const json & data);

// true if `data` is a JSON array mixing numbers and strings
// NOTE(review): exact acceptance rules are in the implementation — confirm
bool json_is_array_of_mixed_numbers_strings(const json & data);

// true if `data` is a JSON array containing at least one number
bool json_is_array_and_contains_numbers(const json & data);

// extract the values at the given paths from `js`
// NOTE(review): path syntax is defined by the implementation — confirm there
json json_get_nested_values(const std::vector<std::string> & paths, const json & js);

// tokenize a prompt that may mix strings and pre-tokenized id arrays
llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special);

// UTF-8 validation helper
// NOTE(review): return semantics (count of valid bytes vs. flag) inferred
// from the size_t return type — confirm in the implementation
size_t validate_utf8(const std::string& text);

// tokenize a text prompt together with its media files via mtmd
server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files);

// tokenize one or several input prompts (string, token array, or mixed
// forms) into server_tokens sequences
std::vector<server_tokens> tokenize_input_prompts(
    const llama_vocab * vocab,
    mtmd_context * mctx,
    const json & json_prompt,
    bool add_special,
    bool parse_special);
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | struct server_chat_params {
|
| | bool use_jinja;
|
| | bool prefill_assistant;
|
| | common_reasoning_format reasoning_format;
|
| | std::map<std::string, std::string> chat_template_kwargs;
|
| | common_chat_templates_ptr tmpls;
|
| | bool allow_image;
|
| | bool allow_audio;
|
| | bool enable_thinking = true;
|
| | std::string media_path;
|
| | };
|
| |
|
| |
|
//
// OAI-compatible request parsing / response formatting
//

// parse a /completions-style request body into internal parameters
json oaicompat_completion_params_parse(const json & body);

// parse a /chat/completions-style request body; media referenced by the
// messages is returned through out_files
json oaicompat_chat_params_parse(
    json & body,
    const server_chat_params & opt,
    std::vector<raw_buffer> & out_files);

// translate a Responses-API request body into chat-completion form
json convert_responses_to_chatcmpl(const json & body);

// translate an Anthropic-style request body into OAI chat-completion form
json convert_anthropic_to_oai(const json & body);

// format embeddings as an OAI-compatible response; optionally base64-encoded
json format_embeddings_response_oaicompat(
    const json & request,
    const std::string & model_name,
    const json & embeddings,
    bool use_base64 = false);

// format rerank results; is_tei_format toggles the TEI-style response shape
json format_response_rerank(
    const json & request,
    const std::string & model_name,
    const json & ranks,
    bool is_tei_format,
    std::vector<std::string> & texts,
    int top_n);

// per-token probability data for batch position `idx`
std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx);
|
| |
|
// serialize JSON to a string
// NOTE(review): what makes this "safe" (error/encoding handling) is defined
// by the implementation — confirm there
std::string safe_json_to_str(const json & data);

std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens);
std::string tokens_to_str(const llama_vocab * vocab, const llama_tokens & tokens);

// render a single token for output/logging purposes
std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token);

//
// server-sent-event (SSE) formatting for the streaming APIs
//

std::string format_oai_sse(const json & data);       // OAI chat-completions stream
std::string format_oai_resp_sse(const json & data);  // OAI Responses-API stream
std::string format_anthropic_sse(const json & data); // Anthropic-style stream

bool is_valid_utf8(const std::string & str);

//
// prompt formatting
//

// build an infill (fill-in-the-middle) prompt from prefix/suffix/extra
// context, respecting batch/context limits; spm_infill selects the
// suffix-prefix-middle token order
llama_tokens format_prompt_infill(
    const llama_vocab * vocab,
    const json & input_prefix,
    const json & input_suffix,
    const json & input_extra,
    const int n_batch,
    const int n_predict,
    const int n_ctx,
    const bool spm_infill,
    const llama_tokens & tokens_prompt);

// build a rerank prompt pairing `query` with `doc`
server_tokens format_prompt_rerank(
    const struct llama_model * model,
    const struct llama_vocab * vocab,
    mtmd_context * mctx,
    const std::string & query,
    const std::string & doc);
|
| |
|