| | #pragma once
|
| |
|
| | #include "server-http.h"
|
| | #include "server-task.h"
|
| | #include "server-queue.h"
|
| |
|
| | #include <nlohmann/json_fwd.hpp>
|
| |
|
| | #include <cstddef>
|
| | #include <memory>
|
| | #include <set>
|
| |
|
| | struct server_context_impl; // forward declaration only: the definition is not part of this header, so the type is incomplete here
|
| |
|
| | struct server_context_meta { // plain bundle of server/model metadata; this is the type server_context::get_meta() returns by value. Member order matters for positional aggregate initialization.
|
| | std::string build_info;
|
| | std::string model_name;
|
| | std::set<std::string> model_aliases;
|
| | std::set<std::string> model_tags;
|
| | std::string model_path;
|
| | bool has_mtmd;
|
| | bool has_inp_image;
|
| | bool has_inp_audio;
|
| | json json_webui_settings;
|
| | int slot_n_ctx;
|
| | enum llama_pooling_type pooling_type; // elaborated `enum` specifier; NOTE(review): presumably required because llama.h declares a function with the same name as the enum - confirm before dropping the keyword
|
| |
|
| | // chat-related fields
|
| | server_chat_params & chat_params; // reference member: this struct cannot be default-constructed or copy-assigned, and every copy refers to the same server_chat_params object (its owner is not visible here and must outlive all copies)
|
| | std::map<std::string, bool> chat_template_caps; // NOTE(review): <map> (like <string> and <cstdint>) is not included directly by this header; presumably supplied transitively by the project headers - confirm
|
| |
|
| | // BOS/EOS token text and the fim_* token ids (presumably the fill-in-the-middle special tokens - confirm)
|
| | std::string bos_token_str;
|
| | std::string eos_token_str;
|
| | llama_token fim_pre_token;
|
| | llama_token fim_sub_token;
|
| | llama_token fim_mid_token;
|
| | llama_token fim_pad_token;
|
| | llama_token fim_rep_token;
|
| | llama_token fim_sep_token;
|
| |
|
| | // model_* fields; names suggest vocabulary type/size, training context length, input embedding width, parameter count and model size (unit of model_size is not visible here - confirm)
|
| | enum llama_vocab_type model_vocab_type; // elaborated `enum` specifier kept as written (same caveat as pooling_type above)
|
| | int32_t model_vocab_n_tokens;
|
| | int32_t model_n_ctx_train;
|
| | int32_t model_n_embd_inp;
|
| | uint64_t model_n_params;
|
| | uint64_t model_size;
|
| | };
|
| |
|
| | struct server_context { // public handle for the server; its only data member is `impl`, whose type is merely forward-declared in this header
|
| | std::unique_ptr<server_context_impl> impl; // owning pointer to the implementation object
|
| | // note: the unique_ptr member together with the user-declared destructor makes this type non-copyable and non-movable
|
| | server_context();
|
| | ~server_context(); // declared here rather than defaulted inline: destroying std::unique_ptr<server_context_impl> needs the complete type, which this header does not provide
|
| |
|
| |
|
| | // load a model using `params`; NOTE(review): the bool presumably signals success - confirm in the implementation
|
| | bool load_model(const common_params & params);
|
| |
|
| | // NOTE(review): presumably runs the server's main processing loop; whether it blocks the caller is not visible from this header
|
| | void start_loop();
|
| |
|
| | // NOTE(review): presumably asks the loop started by start_loop() to stop - confirm
|
| | void terminate();
|
| |
|
| |
|
| | // accessor for the llama_context pointer; NOTE(review): nullability and lifetime of the returned pointer are not visible from this header
|
| | llama_context * get_llama_context() const;
|
| |
|
| | // returns a server_response_reader by value
|
| | server_response_reader get_response_reader();
|
| |
|
| |
|
| | // returns a server_context_meta by value; that struct holds a reference member (chat_params), so the returned copy still refers to an object owned elsewhere
|
| | server_context_meta get_meta() const;
|
| | };
|
| |
|
| |
|
| |
|
| | struct server_res_generator; // forward declaration; in this header it is only used through std::unique_ptr<server_res_generator> in server_routes
|
| |
|
| | struct server_routes { // holds one handler member per HTTP endpoint plus the references they work with; everything before `private:` is public because this is a struct
|
| | server_routes(const common_params & params, server_context & ctx_server); // note: this parameter `ctx_server` is a server_context, while the private member of the same name is a const server_context_impl &
|
| |
|
| | void init_routes(); // NOTE(review): presumably assigns the handler_t members below - definition not visible here
|
| |
|
| | // replaces `meta` with a freshly allocated copy of ctx_server.get_meta(); NOTE(review): no locking is visible here - confirm callers serialize this against handlers that read `meta`
|
| | void update_meta(const server_context & ctx_server) {
|
| | this->meta = std::make_unique<server_context_meta>(ctx_server.get_meta());
|
| | }
|
| |
|
| | // request handlers, exposed as public data members of type server_http_context::handler_t
|
| | // names follow <http method>_<endpoint>; NOTE(review): handler_t itself is declared outside this header (presumably a callable type) - confirm
|
| | server_http_context::handler_t get_health;
|
| | server_http_context::handler_t get_metrics;
|
| | server_http_context::handler_t get_slots;
|
| | server_http_context::handler_t post_slots;
|
| | server_http_context::handler_t get_props;
|
| | server_http_context::handler_t post_props;
|
| | server_http_context::handler_t get_api_show;
|
| | server_http_context::handler_t post_infill;
|
| | server_http_context::handler_t post_completions;
|
| | server_http_context::handler_t post_completions_oai;
|
| | server_http_context::handler_t post_chat_completions;
|
| | server_http_context::handler_t post_responses_oai;
|
| | server_http_context::handler_t post_anthropic_messages;
|
| | server_http_context::handler_t post_anthropic_count_tokens;
|
| | server_http_context::handler_t post_apply_template;
|
| | server_http_context::handler_t get_models;
|
| | server_http_context::handler_t post_tokenize;
|
| | server_http_context::handler_t post_detokenize;
|
| | server_http_context::handler_t post_embeddings;
|
| | server_http_context::handler_t post_embeddings_oai;
|
| | server_http_context::handler_t post_rerank;
|
| | server_http_context::handler_t get_lora_adapters;
|
| | server_http_context::handler_t post_lora_adapters;
|
| | private: // internal helpers and state; each helper below returns an owning pointer to a server_res_generator
|
| | std::unique_ptr<server_res_generator> handle_completions_impl(
|
| | const server_http_req & req,
|
| | server_task_type type,
|
| | const json & data,
|
| | const std::vector<raw_buffer> & files,
|
| | task_response_type res_type);
|
| | std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot);
|
| | std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
|
| | std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot); // the request parameter is left unnamed in this declaration
|
| | std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
|
| |
|
| | // the pointee is const, so metadata is never edited in place - update_meta() swaps in a whole new object instead
|
| | std::unique_ptr<const server_context_meta> meta;
|
| |
|
| | const common_params & params; // this and the reference members below must be bound in the constructor's initializer list; declaration order is the initialization order
|
| | const server_context_impl & ctx_server;
|
| |
|
| | server_queue & queue_tasks;
|
| | server_response & queue_results;
|
| | std::unique_ptr<server_res_generator> create_response(bool bypass_sleep = false); // bypass_sleep defaults to false; NOTE(review): its effect is not visible from this header
|
| | };
|
| |
|