Spaces:
Sleeping
Sleeping
Dmitry Beresnev committed on
Commit ·
332826f
1
Parent(s): d9ce859
Refactor the C++ LLM manager into modular components, move Python modules under python/, and keep the current control-plane behavior intact. The C++ server now has clearer separation for config, model lifecycle, runtime services, request parsing, HTTP helpers, and server routing, while the Docker build/runtime paths were updated to compile multiple C++ files and to load Python code from the new package folder.
Browse files- Dockerfile +4 -4
- config.toml.example +3 -0
- cpp/config.cpp +224 -0
- cpp/config.h +13 -0
- cpp/http_helpers.cpp +97 -0
- cpp/http_helpers.h +14 -0
- cpp/llm_manager.cpp +15 -1512
- cpp/llm_manager_types.cpp +20 -0
- cpp/llm_manager_types.h +167 -0
- cpp/model_manager.cpp +253 -0
- cpp/model_manager.h +46 -0
- cpp/request_parsing.cpp +87 -0
- cpp/request_parsing.h +13 -0
- cpp/runtime_components.cpp +352 -0
- cpp/runtime_components.h +149 -0
- cpp/server.cpp +329 -0
- cpp/server.h +35 -0
- app.py → python/app.py +0 -0
- logger.py → python/logger.py +0 -0
Dockerfile
CHANGED
|
@@ -55,9 +55,9 @@ RUN git clone --depth 1 --branch ${LLAMA_CPP_REF} https://github.com/ggerganov/l
|
|
| 55 |
ldd build/bin/llama-server || true
|
| 56 |
|
| 57 |
# Build C++ manager (Boost.Beast + JSON)
|
| 58 |
-
COPY cpp/
|
| 59 |
RUN g++ -std=c++17 -O2 -DBOOST_ERROR_CODE_HEADER_ONLY -pthread \
|
| 60 |
-
/build/
|
| 61 |
|
| 62 |
# Runtime stage
|
| 63 |
FROM debian:bookworm-slim
|
|
@@ -95,8 +95,7 @@ RUN useradd -m -u 1000 user && \
|
|
| 95 |
chown -R user:user /home/user
|
| 96 |
|
| 97 |
# Copy application code
|
| 98 |
-
COPY --chown=user:user
|
| 99 |
-
COPY --chown=user:user logger.py /home/user/logger.py
|
| 100 |
|
| 101 |
USER user
|
| 102 |
WORKDIR /home/user
|
|
@@ -105,6 +104,7 @@ WORKDIR /home/user
|
|
| 105 |
ENV HOME=/home/user \
|
| 106 |
LLAMA_CACHE=/home/user/.cache/llama.cpp \
|
| 107 |
PATH=/home/user/.local/bin:$PATH \
|
|
|
|
| 108 |
PYTHONUNBUFFERED=1
|
| 109 |
|
| 110 |
EXPOSE 7860
|
|
|
|
| 55 |
ldd build/bin/llama-server || true
|
| 56 |
|
| 57 |
# Build C++ manager (Boost.Beast + JSON)
|
| 58 |
+
COPY cpp/ /build/cpp/
|
| 59 |
RUN g++ -std=c++17 -O2 -DBOOST_ERROR_CODE_HEADER_ONLY -pthread \
|
| 60 |
+
/build/cpp/*.cpp -o /build/llm-manager
|
| 61 |
|
| 62 |
# Runtime stage
|
| 63 |
FROM debian:bookworm-slim
|
|
|
|
| 95 |
chown -R user:user /home/user
|
| 96 |
|
| 97 |
# Copy application code
|
| 98 |
+
COPY --chown=user:user python/ /home/user/python/
|
|
|
|
| 99 |
|
| 100 |
USER user
|
| 101 |
WORKDIR /home/user
|
|
|
|
| 104 |
ENV HOME=/home/user \
|
| 105 |
LLAMA_CACHE=/home/user/.cache/llama.cpp \
|
| 106 |
PATH=/home/user/.local/bin:$PATH \
|
| 107 |
+
PYTHONPATH=/home/user/python \
|
| 108 |
PYTHONUNBUFFERED=1
|
| 109 |
|
| 110 |
EXPOSE 7860
|
config.toml.example
CHANGED
|
@@ -35,6 +35,9 @@ retry_after_sec = 5
|
|
| 35 |
[scheduler]
|
| 36 |
max_concurrent = 1
|
| 37 |
|
|
|
|
|
|
|
|
|
|
| 38 |
[rate_limit]
|
| 39 |
requests_per_minute = 60
|
| 40 |
estimated_tokens_per_minute = 6000
|
|
|
|
| 35 |
[scheduler]
|
| 36 |
max_concurrent = 1
|
| 37 |
|
| 38 |
+
[streaming]
|
| 39 |
+
enabled = false
|
| 40 |
+
|
| 41 |
[rate_limit]
|
| 42 |
requests_per_minute = 60
|
| 43 |
estimated_tokens_per_minute = 6000
|
cpp/config.cpp
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "config.h"
|
| 2 |
+
|
| 3 |
+
#include "http_helpers.h"
|
| 4 |
+
|
| 5 |
+
#include <algorithm>
|
| 6 |
+
#include <cstdlib>
|
| 7 |
+
#include <filesystem>
|
| 8 |
+
#include <fstream>
|
| 9 |
+
#include <unordered_map>
|
| 10 |
+
|
| 11 |
+
// Return the value of environment variable `name`, or `fallback` when the
// variable is unset or set to the empty string.
std::string get_env_or(const char *name, const std::string &fallback) {
    if (const char *raw = std::getenv(name); raw != nullptr && raw[0] != '\0') {
        return std::string(raw);
    }
    return fallback;
}
|
| 15 |
+
|
| 16 |
+
// Return environment variable `name` parsed as an int; `fallback` is used when
// the variable is unset, empty, or does not start with a valid integer.
int get_env_int_or(const char *name, int fallback) {
    const char *raw = std::getenv(name);
    if (raw == nullptr || raw[0] == '\0') {
        return fallback;
    }
    try {
        return std::stoi(raw);
    } catch (...) {  // std::invalid_argument or std::out_of_range from stoi
        return fallback;
    }
}
|
| 25 |
+
|
| 26 |
+
// Return a copy of `value` with leading and trailing ASCII whitespace
// (space, tab, CR, LF) removed; all-whitespace input yields "".
std::string trim_copy(const std::string &value) {
    constexpr const char *kWhitespace = " \t\r\n";
    const auto begin = value.find_first_not_of(kWhitespace);
    if (begin == std::string::npos) {
        return "";
    }
    const auto end = value.find_last_not_of(kWhitespace);
    return value.substr(begin, end - begin + 1);
}
|
| 32 |
+
|
| 33 |
+
// Remove exactly one pair of matching surrounding quotes ("..." or '...')
// from `value`; mismatched or absent quotes leave the string untouched.
std::string strip_quotes(const std::string &value) {
    if (value.size() < 2) {
        return value;
    }
    const bool double_quoted = value.front() == '"' && value.back() == '"';
    const bool single_quoted = value.front() == '\'' && value.back() == '\'';
    if (double_quoted || single_quoted) {
        return value.substr(1, value.size() - 2);
    }
    return value;
}
|
| 43 |
+
|
| 44 |
+
// Parse a TOML-style boolean: after stripping surrounding whitespace the text
// must be exactly "true" or "false"; anything else yields `fallback`.
bool parse_bool_or(const std::string &value, bool fallback) {
    const auto begin = value.find_first_not_of(" \t\r\n");
    if (begin == std::string::npos) {
        return fallback;  // empty or all-whitespace input
    }
    const auto end = value.find_last_not_of(" \t\r\n");
    const std::string token = value.substr(begin, end - begin + 1);
    if (token == "true") {
        return true;
    }
    if (token == "false") {
        return false;
    }
    return fallback;
}
|
| 50 |
+
|
| 51 |
+
// Map the trimmed text "admin"/"ADMIN" to Role::ADMIN and "user"/"USER" to
// Role::USER; any other spelling (including mixed case) yields `fallback`.
Role parse_role_or(const std::string &value, Role fallback) {
    const std::string token = trim_copy(value);
    if (token == "admin" || token == "ADMIN") {
        return Role::ADMIN;
    }
    if (token == "user" || token == "USER") {
        return Role::USER;
    }
    return fallback;
}
|
| 57 |
+
|
| 58 |
+
static std::unordered_map<std::string, std::unordered_map<std::string, std::string>>
|
| 59 |
+
parse_simple_toml(const std::string &path) {
|
| 60 |
+
std::unordered_map<std::string, std::unordered_map<std::string, std::string>> out;
|
| 61 |
+
std::ifstream input(path);
|
| 62 |
+
if (!input.is_open()) return out;
|
| 63 |
+
|
| 64 |
+
std::string current_section;
|
| 65 |
+
std::string line;
|
| 66 |
+
while (std::getline(input, line)) {
|
| 67 |
+
auto hash = line.find('#');
|
| 68 |
+
if (hash != std::string::npos) line = line.substr(0, hash);
|
| 69 |
+
line = trim_copy(line);
|
| 70 |
+
if (line.empty()) continue;
|
| 71 |
+
|
| 72 |
+
if (line.front() == '[' && line.back() == ']') {
|
| 73 |
+
current_section = trim_copy(line.substr(1, line.size() - 2));
|
| 74 |
+
continue;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
const auto eq = line.find('=');
|
| 78 |
+
if (eq == std::string::npos) continue;
|
| 79 |
+
|
| 80 |
+
std::string key = trim_copy(line.substr(0, eq));
|
| 81 |
+
std::string value = strip_quotes(trim_copy(line.substr(eq + 1)));
|
| 82 |
+
out[current_section][key] = value;
|
| 83 |
+
}
|
| 84 |
+
return out;
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
static std::vector<ApiKeyRecord> parse_api_keys_toml(const std::string &path) {
|
| 88 |
+
std::vector<ApiKeyRecord> keys;
|
| 89 |
+
std::ifstream input(path);
|
| 90 |
+
if (!input.is_open()) return keys;
|
| 91 |
+
|
| 92 |
+
std::string line;
|
| 93 |
+
bool in_api_key = false;
|
| 94 |
+
ApiKeyRecord current;
|
| 95 |
+
bool has_any_field = false;
|
| 96 |
+
|
| 97 |
+
auto flush_current = [&]() {
|
| 98 |
+
if (has_any_field && !current.key_id.empty() && !current.secret.empty()) {
|
| 99 |
+
keys.push_back(current);
|
| 100 |
+
}
|
| 101 |
+
current = ApiKeyRecord{};
|
| 102 |
+
has_any_field = false;
|
| 103 |
+
};
|
| 104 |
+
|
| 105 |
+
while (std::getline(input, line)) {
|
| 106 |
+
auto hash = line.find('#');
|
| 107 |
+
if (hash != std::string::npos) line = line.substr(0, hash);
|
| 108 |
+
line = trim_copy(line);
|
| 109 |
+
if (line.empty()) continue;
|
| 110 |
+
|
| 111 |
+
if (line == "[[api_keys]]") {
|
| 112 |
+
flush_current();
|
| 113 |
+
in_api_key = true;
|
| 114 |
+
continue;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
if (!in_api_key) continue;
|
| 118 |
+
if (line.front() == '[' && line.back() == ']') {
|
| 119 |
+
flush_current();
|
| 120 |
+
in_api_key = false;
|
| 121 |
+
continue;
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
const auto eq = line.find('=');
|
| 125 |
+
if (eq == std::string::npos) continue;
|
| 126 |
+
|
| 127 |
+
std::string key = trim_copy(line.substr(0, eq));
|
| 128 |
+
std::string value = strip_quotes(trim_copy(line.substr(eq + 1)));
|
| 129 |
+
has_any_field = true;
|
| 130 |
+
|
| 131 |
+
if (key == "key_id") current.key_id = value;
|
| 132 |
+
else if (key == "secret") current.secret = value;
|
| 133 |
+
else if (key == "role") current.role = parse_role_or(value, current.role);
|
| 134 |
+
else if (key == "enabled") current.enabled = parse_bool_or(value, current.enabled);
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
flush_current();
|
| 138 |
+
return keys;
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
// Look up data[section][key]; return `fallback` when the section or key is
// missing, or when the stored value is the empty string.
static std::string get_toml_string_or(
    const std::unordered_map<std::string, std::unordered_map<std::string, std::string>> &data,
    const std::string &section,
    const std::string &key,
    const std::string &fallback) {
    const auto sec = data.find(section);
    if (sec == data.end()) {
        return fallback;
    }
    const auto entry = sec->second.find(key);
    if (entry == sec->second.end() || entry->second.empty()) {
        return fallback;
    }
    return entry->second;
}
|
| 152 |
+
|
| 153 |
+
// Look up data[section][key] and parse it as an int; return `fallback` when
// the entry is missing, empty, or not a valid integer.
static int get_toml_int_or(
    const std::unordered_map<std::string, std::unordered_map<std::string, std::string>> &data,
    const std::string &section,
    const std::string &key,
    int fallback) {
    const auto sec = data.find(section);
    if (sec == data.end()) {
        return fallback;
    }
    const auto entry = sec->second.find(key);
    if (entry == sec->second.end() || entry->second.empty()) {
        return fallback;
    }
    try {
        return std::stoi(entry->second);
    } catch (...) {  // non-numeric or out-of-range text
        return fallback;
    }
}
|
| 168 |
+
|
| 169 |
+
// Build the effective ManagerConfig.
// Precedence per setting: environment variable > value in the TOML config
// file > compiled-in default (the struct's field initializer). The TOML file
// location itself comes from MANAGER_CONFIG (default "config.toml"); a
// missing file is not an error — env vars and defaults then apply.
ManagerConfig load_manager_config() {
    ManagerConfig cfg;

    const std::string config_path = get_env_or("MANAGER_CONFIG", "config.toml");
    std::unordered_map<std::string, std::unordered_map<std::string, std::string>> toml;
    if (std::filesystem::exists(config_path)) {
        toml = parse_simple_toml(config_path);
        log_line("config: loaded " + config_path);
    } else {
        log_line("config: using environment/defaults (file not found: " + config_path + ")");
    }

    // [server] — manager listen address
    cfg.server.host = get_env_or("MANAGER_HOST", get_toml_string_or(toml, "server", "host", cfg.server.host));
    cfg.server.port = get_env_int_or("MANAGER_PORT", get_toml_int_or(toml, "server", "port", cfg.server.port));

    // [worker] — llama-server subprocess settings
    cfg.worker.default_model = get_env_or("DEFAULT_MODEL", get_toml_string_or(toml, "worker", "default_model", cfg.worker.default_model));
    cfg.worker.llama_server_bin = get_env_or("LLAMA_SERVER_BIN", get_toml_string_or(toml, "worker", "llama_server_bin", cfg.worker.llama_server_bin));
    cfg.worker.host = get_env_or("WORKER_HOST", get_toml_string_or(toml, "worker", "host", cfg.worker.host));
    cfg.worker.bind_host = get_env_or("WORKER_BIND_HOST", get_toml_string_or(toml, "worker", "bind_host", cfg.worker.bind_host));
    cfg.worker.base_port = get_env_int_or("WORKER_BASE_PORT", get_toml_int_or(toml, "worker", "base_port", cfg.worker.base_port));
    cfg.worker.switch_timeout_sec = get_env_int_or("SWITCH_TIMEOUT_SEC", get_toml_int_or(toml, "worker", "switch_timeout_sec", cfg.worker.switch_timeout_sec));

    // [llama] — model runtime parameters passed to llama-server
    cfg.llama.n_ctx = get_env_int_or("MODEL_N_CTX", get_toml_int_or(toml, "llama", "n_ctx", cfg.llama.n_ctx));
    cfg.llama.threads = get_env_int_or("MODEL_THREADS", get_toml_int_or(toml, "llama", "threads", cfg.llama.threads));
    cfg.llama.ngl = get_env_int_or("MODEL_NGL", get_toml_int_or(toml, "llama", "ngl", cfg.llama.ngl));
    cfg.llama.batch = get_env_int_or("MODEL_BATCH", get_toml_int_or(toml, "llama", "batch", cfg.llama.batch));
    cfg.llama.ubatch = get_env_int_or("MODEL_UBATCH", get_toml_int_or(toml, "llama", "ubatch", cfg.llama.ubatch));

    // [auth] — header/scheme used to present API keys
    cfg.auth.header = get_env_or("AUTH_HEADER", get_toml_string_or(toml, "auth", "header", cfg.auth.header));
    cfg.auth.scheme = get_env_or("AUTH_SCHEME", get_toml_string_or(toml, "auth", "scheme", cfg.auth.scheme));

    // [limits] — per-request token and time budgets
    cfg.limits.default_max_tokens = get_env_int_or("DEFAULT_MAX_TOKENS", get_toml_int_or(toml, "limits", "default_max_tokens", cfg.limits.default_max_tokens));
    cfg.limits.max_tokens_per_request = get_env_int_or("MAX_TOKENS_PER_REQUEST", get_toml_int_or(toml, "limits", "max_tokens_per_request", cfg.limits.max_tokens_per_request));
    cfg.limits.request_timeout_sec = get_env_int_or("REQUEST_TIMEOUT_SEC", get_toml_int_or(toml, "limits", "request_timeout_sec", cfg.limits.request_timeout_sec));

    // [queue] — max_size is clamped to at least 1 so the queue stays usable
    cfg.queue.max_size = static_cast<size_t>(std::max(
        1,
        get_env_int_or("QUEUE_MAX_SIZE", get_toml_int_or(toml, "queue", "max_size", static_cast<int>(cfg.queue.max_size)))));
    cfg.queue.max_tokens = get_env_int_or("QUEUE_MAX_TOKENS", get_toml_int_or(toml, "queue", "max_tokens", cfg.queue.max_tokens));
    cfg.queue.admin_quota = get_env_int_or("QUEUE_ADMIN_QUOTA", get_toml_int_or(toml, "queue", "admin_quota", cfg.queue.admin_quota));
    cfg.queue.retry_after_sec = get_env_int_or("QUEUE_RETRY_AFTER_SEC", get_toml_int_or(toml, "queue", "retry_after_sec", cfg.queue.retry_after_sec));

    // [rate_limit]
    cfg.rate_limit.requests_per_minute = get_env_int_or("REQUESTS_PER_MINUTE", get_toml_int_or(toml, "rate_limit", "requests_per_minute", cfg.rate_limit.requests_per_minute));
    cfg.rate_limit.estimated_tokens_per_minute = get_env_int_or("ESTIMATED_TOKENS_PER_MINUTE", get_toml_int_or(toml, "rate_limit", "estimated_tokens_per_minute", cfg.rate_limit.estimated_tokens_per_minute));

    // [scheduler]
    cfg.scheduler.max_concurrent = get_env_int_or("SCHEDULER_MAX_CONCURRENT", get_toml_int_or(toml, "scheduler", "max_concurrent", cfg.scheduler.max_concurrent));
    // [streaming] — handled as text so the STREAMING_ENABLED env var can
    // override the TOML value before boolean parsing.
    cfg.streaming.enabled = parse_bool_or(
        get_env_or("STREAMING_ENABLED", get_toml_string_or(toml, "streaming", "enabled", cfg.streaming.enabled ? "true" : "false")),
        cfg.streaming.enabled);

    // API keys live in the same file as [[api_keys]] array-of-tables entries.
    // (config_path can never be empty here — the env lookup falls back to
    // "config.toml" — so the first condition is a defensive extra.)
    if (!config_path.empty() && std::filesystem::exists(config_path)) {
        cfg.api_keys = parse_api_keys_toml(config_path);
    }

    return cfg;
}
|
cpp/config.h
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
|
| 5 |
+
#include "llm_manager_types.h"
|
| 6 |
+
|
| 7 |
+
std::string get_env_or(const char *name, const std::string &fallback);
|
| 8 |
+
int get_env_int_or(const char *name, int fallback);
|
| 9 |
+
std::string trim_copy(const std::string &value);
|
| 10 |
+
std::string strip_quotes(const std::string &value);
|
| 11 |
+
bool parse_bool_or(const std::string &value, bool fallback);
|
| 12 |
+
Role parse_role_or(const std::string &value, Role fallback);
|
| 13 |
+
ManagerConfig load_manager_config();
|
cpp/http_helpers.cpp
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "http_helpers.h"
|
| 2 |
+
|
| 3 |
+
#include <boost/asio.hpp>
|
| 4 |
+
#include <boost/beast/core.hpp>
|
| 5 |
+
#include <boost/beast/http.hpp>
|
| 6 |
+
|
| 7 |
+
#include <ctime>
|
| 8 |
+
#include <iomanip>
|
| 9 |
+
#include <iostream>
|
| 10 |
+
#include <sstream>
|
| 11 |
+
|
| 12 |
+
namespace asio = boost::asio;
|
| 13 |
+
namespace beast = boost::beast;
|
| 14 |
+
namespace http = beast::http;
|
| 15 |
+
|
| 16 |
+
// Current UTC wall-clock time formatted as ISO-8601: "YYYY-MM-DDTHH:MM:SSZ".
std::string now_utc_iso() {
    const std::time_t now = std::time(nullptr);
    std::tm parts{};
    gmtime_r(&now, &parts);  // thread-safe UTC decomposition (POSIX)
    std::ostringstream out;
    out << std::put_time(&parts, "%Y-%m-%dT%H:%M:%SZ");
    return out.str();
}
|
| 24 |
+
|
| 25 |
+
// Emit one timestamped log line to stdout. std::endl flushes each line so
// logs appear promptly under container log capture.
void log_line(const std::string &line) {
    std::cout << "[" << now_utc_iso() << "] " << line << std::endl;
}
|
| 28 |
+
|
| 29 |
+
// Cap `body` at `max_len` characters; longer input is cut and suffixed with
// a "...[truncated]" marker (used to keep log lines bounded).
std::string truncate_body(const std::string &body, size_t max_len) {
    if (body.size() > max_len) {
        return body.substr(0, max_len) + "...[truncated]";
    }
    return body;
}
|
| 33 |
+
|
| 34 |
+
// Extract <id> from a path of the form "/requests/<id>/cancel".
// Returns std::nullopt for any other path shape or an empty id; the id itself
// may contain slashes ("/requests/a/b/cancel" yields "a/b").
std::optional<std::string> extract_cancel_request_id(const std::string &path) {
    static const std::string kPrefix = "/requests/";
    static const std::string kSuffix = "/cancel";
    // Must be strictly longer than prefix+suffix so the id is non-empty.
    if (path.size() <= kPrefix.size() + kSuffix.size()) {
        return std::nullopt;
    }
    if (path.compare(0, kPrefix.size(), kPrefix) != 0) {
        return std::nullopt;
    }
    if (path.compare(path.size() - kSuffix.size(), kSuffix.size(), kSuffix) != 0) {
        return std::nullopt;
    }
    std::string request_id =
        path.substr(kPrefix.size(), path.size() - kPrefix.size() - kSuffix.size());
    if (request_id.empty()) {
        return std::nullopt;  // defensive; unreachable given the size check
    }
    return request_id;
}
|
| 44 |
+
|
| 45 |
+
// Synchronously POST `body` (a JSON chat request) to the worker's
// OpenAI-compatible /v1/chat/completions endpoint and return
// {HTTP status code, response body}.
// Blocking Beast I/O on a per-call io_context; resolve/connect/read failures
// propagate as boost::system::system_error exceptions to the caller.
// NOTE(review): connects to 127.0.0.1 rather than worker.host / the
// WorkerConfig host — confirm workers are always co-located with the manager.
std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body) {
    asio::io_context ioc;
    asio::ip::tcp::resolver resolver(ioc);
    beast::tcp_stream stream(ioc);
    auto const results = resolver.resolve("127.0.0.1", std::to_string(worker.port));
    stream.connect(results);

    http::request<http::string_body> req{http::verb::post, "/v1/chat/completions", 11};  // HTTP/1.1
    req.set(http::field::host, "127.0.0.1");
    req.set(http::field::content_type, "application/json");
    req.set(http::field::user_agent, "llm-manager");
    req.body() = body;
    req.prepare_payload();  // sets Content-Length from the body
    http::write(stream, req);

    beast::flat_buffer buffer;
    http::response<http::string_body> res;
    http::read(stream, buffer, res);
    beast::error_code ec;
    // Best-effort shutdown; the error code is intentionally ignored.
    stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
    return {res.result_int(), res.body()};
}
|
| 67 |
+
|
| 68 |
+
// Synchronously GET `target` from the worker and return status, body, and
// (when present) the Content-Type / Content-Encoding headers so the caller
// can proxy them through unchanged.
// Blocking Beast I/O; network errors propagate as boost::system::system_error.
// NOTE(review): string_view::to_string() was removed in newer Boost releases —
// pin the Boost version or switch to std::string(...) construction.
ProxiedGetResult forward_get_to_worker(const WorkerInfo &worker, const std::string &target) {
    asio::io_context ioc;
    asio::ip::tcp::resolver resolver(ioc);
    beast::tcp_stream stream(ioc);
    auto const results = resolver.resolve("127.0.0.1", std::to_string(worker.port));
    stream.connect(results);

    http::request<http::string_body> req{http::verb::get, target, 11};  // HTTP/1.1
    req.set(http::field::host, "127.0.0.1");
    req.set(http::field::user_agent, "llm-manager");
    // Advertise gzip so the worker may compress; the encoding is forwarded
    // to the client rather than decoded here.
    req.set(http::field::accept_encoding, "gzip, identity");
    http::write(stream, req);

    beast::flat_buffer buffer;
    http::response<http::string_body> res;
    http::read(stream, buffer, res);
    beast::error_code ec;
    // Best-effort shutdown; the error code is intentionally ignored.
    stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);

    ProxiedGetResult out;
    out.status = res.result_int();
    out.body = res.body();
    if (res.base().find(http::field::content_type) != res.base().end()) {
        out.content_type = res.base()[http::field::content_type].to_string();
    }
    if (res.base().find(http::field::content_encoding) != res.base().end()) {
        out.content_encoding = res.base()[http::field::content_encoding].to_string();
    }
    return out;
}
|
cpp/http_helpers.h
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <optional>
|
| 4 |
+
#include <string>
|
| 5 |
+
#include <utility>
|
| 6 |
+
|
| 7 |
+
#include "llm_manager_types.h"
|
| 8 |
+
|
| 9 |
+
std::string now_utc_iso();
|
| 10 |
+
void log_line(const std::string &line);
|
| 11 |
+
std::string truncate_body(const std::string &body, size_t max_len = 2000);
|
| 12 |
+
std::optional<std::string> extract_cancel_request_id(const std::string &path);
|
| 13 |
+
std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body);
|
| 14 |
+
ProxiedGetResult forward_get_to_worker(const WorkerInfo &worker, const std::string &target);
|
cpp/llm_manager.cpp
CHANGED
|
@@ -1,1530 +1,29 @@
|
|
| 1 |
#include <boost/asio.hpp>
|
| 2 |
-
#include <boost/beast/core.hpp>
|
| 3 |
-
#include <boost/beast/http.hpp>
|
| 4 |
-
#include <boost/beast/version.hpp>
|
| 5 |
-
#include <nlohmann/json.hpp>
|
| 6 |
|
| 7 |
-
#include <
|
| 8 |
-
#include <atomic>
|
| 9 |
-
#include <chrono>
|
| 10 |
-
#include <condition_variable>
|
| 11 |
-
#include <csignal>
|
| 12 |
-
#include <cstdlib>
|
| 13 |
-
#include <ctime>
|
| 14 |
-
#include <deque>
|
| 15 |
-
#include <filesystem>
|
| 16 |
-
#include <fstream>
|
| 17 |
-
#include <iomanip>
|
| 18 |
-
#include <iostream>
|
| 19 |
-
#include <memory>
|
| 20 |
-
#include <mutex>
|
| 21 |
-
#include <optional>
|
| 22 |
-
#include <sstream>
|
| 23 |
#include <string>
|
| 24 |
#include <thread>
|
| 25 |
-
#include <
|
| 26 |
-
#include <vector>
|
| 27 |
|
| 28 |
-
#include
|
| 29 |
-
#include
|
| 30 |
-
#include
|
|
|
|
|
|
|
| 31 |
|
| 32 |
namespace asio = boost::asio;
|
| 33 |
-
namespace beast = boost::beast;
|
| 34 |
-
namespace http = beast::http;
|
| 35 |
-
using json = nlohmann::json;
|
| 36 |
-
|
| 37 |
-
static void log_line(const std::string &line);
|
| 38 |
-
|
| 39 |
-
struct ServerConfig {
|
| 40 |
-
std::string host = "0.0.0.0";
|
| 41 |
-
int port = 7860;
|
| 42 |
-
};
|
| 43 |
-
|
| 44 |
-
struct WorkerConfig {
|
| 45 |
-
std::string default_model = "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m";
|
| 46 |
-
std::string llama_server_bin = "/usr/local/bin/llama-server";
|
| 47 |
-
std::string host = "127.0.0.1";
|
| 48 |
-
std::string bind_host = "0.0.0.0";
|
| 49 |
-
int base_port = 8080;
|
| 50 |
-
int switch_timeout_sec = 300;
|
| 51 |
-
};
|
| 52 |
-
|
| 53 |
-
struct LlamaConfig {
|
| 54 |
-
int n_ctx = 8192;
|
| 55 |
-
int threads = 4;
|
| 56 |
-
int ngl = 0;
|
| 57 |
-
int batch = 128;
|
| 58 |
-
int ubatch = 64;
|
| 59 |
-
};
|
| 60 |
-
|
| 61 |
-
enum class Role {
|
| 62 |
-
ADMIN,
|
| 63 |
-
USER
|
| 64 |
-
};
|
| 65 |
-
|
| 66 |
-
struct AuthConfig {
|
| 67 |
-
std::string header = "Authorization";
|
| 68 |
-
std::string scheme = "Bearer";
|
| 69 |
-
};
|
| 70 |
-
|
| 71 |
-
struct LimitsConfig {
|
| 72 |
-
int default_max_tokens = 256;
|
| 73 |
-
int max_tokens_per_request = 2048;
|
| 74 |
-
int request_timeout_sec = 30;
|
| 75 |
-
};
|
| 76 |
-
|
| 77 |
-
struct QueueConfig {
|
| 78 |
-
size_t max_size = 100;
|
| 79 |
-
int max_tokens = 20000;
|
| 80 |
-
int admin_quota = 3;
|
| 81 |
-
int retry_after_sec = 5;
|
| 82 |
-
};
|
| 83 |
-
|
| 84 |
-
struct RateLimitConfig {
|
| 85 |
-
int requests_per_minute = 60;
|
| 86 |
-
int estimated_tokens_per_minute = 6000;
|
| 87 |
-
};
|
| 88 |
-
|
| 89 |
-
struct SchedulerConfig {
|
| 90 |
-
int max_concurrent = 1;
|
| 91 |
-
};
|
| 92 |
-
|
| 93 |
-
struct ApiKeyRecord {
|
| 94 |
-
std::string key_id;
|
| 95 |
-
std::string secret;
|
| 96 |
-
Role role = Role::USER;
|
| 97 |
-
bool enabled = true;
|
| 98 |
-
};
|
| 99 |
-
|
| 100 |
-
struct ManagerConfig {
|
| 101 |
-
ServerConfig server;
|
| 102 |
-
WorkerConfig worker;
|
| 103 |
-
LlamaConfig llama;
|
| 104 |
-
AuthConfig auth;
|
| 105 |
-
LimitsConfig limits;
|
| 106 |
-
QueueConfig queue;
|
| 107 |
-
RateLimitConfig rate_limit;
|
| 108 |
-
SchedulerConfig scheduler;
|
| 109 |
-
std::vector<ApiKeyRecord> api_keys;
|
| 110 |
-
};
|
| 111 |
-
|
| 112 |
-
struct WorkerInfo {
|
| 113 |
-
std::string model;
|
| 114 |
-
int port = 0;
|
| 115 |
-
pid_t pid = -1;
|
| 116 |
-
std::string last_loaded;
|
| 117 |
-
};
|
| 118 |
-
|
| 119 |
-
static std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body);
|
| 120 |
-
|
| 121 |
-
static std::string now_utc_iso() {
|
| 122 |
-
std::time_t t = std::time(nullptr);
|
| 123 |
-
std::tm tm{};
|
| 124 |
-
gmtime_r(&t, &tm);
|
| 125 |
-
std::ostringstream oss;
|
| 126 |
-
oss << std::put_time(&tm, "%Y-%m-%dT%H:%M:%SZ");
|
| 127 |
-
return oss.str();
|
| 128 |
-
}
|
| 129 |
-
|
| 130 |
-
static std::string get_env_or(const char *name, const std::string &fallback) {
|
| 131 |
-
const char *v = std::getenv(name);
|
| 132 |
-
return (v && *v) ? std::string(v) : fallback;
|
| 133 |
-
}
|
| 134 |
-
|
| 135 |
-
static int get_env_int_or(const char *name, int fallback) {
|
| 136 |
-
const char *v = std::getenv(name);
|
| 137 |
-
if (!v || !*v) return fallback;
|
| 138 |
-
try {
|
| 139 |
-
return std::stoi(v);
|
| 140 |
-
} catch (...) {
|
| 141 |
-
return fallback;
|
| 142 |
-
}
|
| 143 |
-
}
|
| 144 |
-
|
| 145 |
-
static std::string trim_copy(const std::string &value) {
|
| 146 |
-
const auto first = value.find_first_not_of(" \t\r\n");
|
| 147 |
-
if (first == std::string::npos) return "";
|
| 148 |
-
const auto last = value.find_last_not_of(" \t\r\n");
|
| 149 |
-
return value.substr(first, last - first + 1);
|
| 150 |
-
}
|
| 151 |
-
|
| 152 |
-
static std::string strip_quotes(const std::string &value) {
|
| 153 |
-
if (value.size() >= 2) {
|
| 154 |
-
const char first = value.front();
|
| 155 |
-
const char last = value.back();
|
| 156 |
-
if ((first == '"' && last == '"') || (first == '\'' && last == '\'')) {
|
| 157 |
-
return value.substr(1, value.size() - 2);
|
| 158 |
-
}
|
| 159 |
-
}
|
| 160 |
-
return value;
|
| 161 |
-
}
|
| 162 |
-
|
| 163 |
-
static bool parse_bool_or(const std::string &value, bool fallback) {
|
| 164 |
-
const std::string normalized = trim_copy(value);
|
| 165 |
-
if (normalized == "true") return true;
|
| 166 |
-
if (normalized == "false") return false;
|
| 167 |
-
return fallback;
|
| 168 |
-
}
|
| 169 |
-
|
| 170 |
-
static Role parse_role_or(const std::string &value, Role fallback) {
|
| 171 |
-
const std::string normalized = trim_copy(value);
|
| 172 |
-
if (normalized == "admin" || normalized == "ADMIN") return Role::ADMIN;
|
| 173 |
-
if (normalized == "user" || normalized == "USER") return Role::USER;
|
| 174 |
-
return fallback;
|
| 175 |
-
}
|
| 176 |
-
|
| 177 |
-
static std::string role_to_string(Role role) {
|
| 178 |
-
return role == Role::ADMIN ? "admin" : "user";
|
| 179 |
-
}
|
| 180 |
-
|
| 181 |
-
enum class Priority {
|
| 182 |
-
ADMIN = 0,
|
| 183 |
-
USER = 1
|
| 184 |
-
};
|
| 185 |
-
|
| 186 |
-
static Priority role_to_priority(Role role) {
|
| 187 |
-
return role == Role::ADMIN ? Priority::ADMIN : Priority::USER;
|
| 188 |
-
}
|
| 189 |
-
|
| 190 |
-
enum class RequestState {
|
| 191 |
-
QUEUED,
|
| 192 |
-
RUNNING,
|
| 193 |
-
CANCELLED,
|
| 194 |
-
FAILED,
|
| 195 |
-
DONE
|
| 196 |
-
};
|
| 197 |
-
|
| 198 |
-
static std::string state_to_string(RequestState state) {
|
| 199 |
-
switch (state) {
|
| 200 |
-
case RequestState::QUEUED: return "queued";
|
| 201 |
-
case RequestState::RUNNING: return "running";
|
| 202 |
-
case RequestState::CANCELLED: return "cancelled";
|
| 203 |
-
case RequestState::FAILED: return "failed";
|
| 204 |
-
case RequestState::DONE: return "done";
|
| 205 |
-
}
|
| 206 |
-
return "unknown";
|
| 207 |
-
}
|
| 208 |
-
|
| 209 |
-
struct TokenEstimate {
|
| 210 |
-
int prompt_tokens = 0;
|
| 211 |
-
int requested_max_tokens = 0;
|
| 212 |
-
int estimated_total_tokens = 0;
|
| 213 |
-
};
|
| 214 |
-
|
| 215 |
-
struct RateLimitDecision {
|
| 216 |
-
bool allowed = true;
|
| 217 |
-
int retry_after_sec = 0;
|
| 218 |
-
std::string error;
|
| 219 |
-
};
|
| 220 |
-
|
| 221 |
-
struct RequestResult {
|
| 222 |
-
int status = 500;
|
| 223 |
-
std::string body;
|
| 224 |
-
std::string content_type = "application/json";
|
| 225 |
-
};
|
| 226 |
-
|
| 227 |
-
struct RequestContext {
|
| 228 |
-
std::string request_id;
|
| 229 |
-
std::string api_key_id;
|
| 230 |
-
Role role = Role::USER;
|
| 231 |
-
Priority priority = Priority::USER;
|
| 232 |
-
TokenEstimate estimate;
|
| 233 |
-
std::string request_body;
|
| 234 |
-
std::atomic<RequestState> state{RequestState::QUEUED};
|
| 235 |
-
std::atomic<bool> cancelled{false};
|
| 236 |
-
std::chrono::steady_clock::time_point created_at{std::chrono::steady_clock::now()};
|
| 237 |
-
std::chrono::steady_clock::time_point enqueue_time{created_at};
|
| 238 |
-
std::chrono::steady_clock::time_point start_time{};
|
| 239 |
-
std::mutex mu;
|
| 240 |
-
std::condition_variable cv;
|
| 241 |
-
bool completed = false;
|
| 242 |
-
RequestResult result;
|
| 243 |
-
};
|
| 244 |
-
|
| 245 |
-
static int estimate_text_tokens_rough(const std::string &text) {
|
| 246 |
-
if (text.empty()) return 0;
|
| 247 |
-
return std::max(1, static_cast<int>((text.size() + 3) / 4));
|
| 248 |
-
}
|
| 249 |
-
|
| 250 |
-
static std::string flatten_json_content(const json &content) {
|
| 251 |
-
if (content.is_string()) {
|
| 252 |
-
return content.get<std::string>();
|
| 253 |
-
}
|
| 254 |
-
if (content.is_array()) {
|
| 255 |
-
std::ostringstream oss;
|
| 256 |
-
bool first = true;
|
| 257 |
-
for (const auto &item : content) {
|
| 258 |
-
std::string part;
|
| 259 |
-
if (item.is_string()) {
|
| 260 |
-
part = item.get<std::string>();
|
| 261 |
-
} else if (item.is_object() && item.contains("text") && item["text"].is_string()) {
|
| 262 |
-
part = item["text"].get<std::string>();
|
| 263 |
-
}
|
| 264 |
-
if (part.empty()) continue;
|
| 265 |
-
if (!first) oss << '\n';
|
| 266 |
-
oss << part;
|
| 267 |
-
first = false;
|
| 268 |
-
}
|
| 269 |
-
return oss.str();
|
| 270 |
-
}
|
| 271 |
-
return "";
|
| 272 |
-
}
|
| 273 |
-
|
| 274 |
-
static std::optional<TokenEstimate> estimate_chat_tokens(
|
| 275 |
-
const json &payload,
|
| 276 |
-
const LimitsConfig &limits,
|
| 277 |
-
std::string &error) {
|
| 278 |
-
if (!payload.is_object()) {
|
| 279 |
-
error = "Expected JSON object";
|
| 280 |
-
return std::nullopt;
|
| 281 |
-
}
|
| 282 |
-
if (!payload.contains("messages") || !payload["messages"].is_array()) {
|
| 283 |
-
error = "Expected 'messages' array";
|
| 284 |
-
return std::nullopt;
|
| 285 |
-
}
|
| 286 |
-
|
| 287 |
-
TokenEstimate estimate;
|
| 288 |
-
estimate.requested_max_tokens = limits.default_max_tokens;
|
| 289 |
-
if (payload.contains("max_tokens")) {
|
| 290 |
-
if (!payload["max_tokens"].is_number_integer()) {
|
| 291 |
-
error = "Expected integer 'max_tokens'";
|
| 292 |
-
return std::nullopt;
|
| 293 |
-
}
|
| 294 |
-
estimate.requested_max_tokens = payload["max_tokens"].get<int>();
|
| 295 |
-
}
|
| 296 |
-
|
| 297 |
-
if (estimate.requested_max_tokens <= 0) {
|
| 298 |
-
error = "'max_tokens' must be > 0";
|
| 299 |
-
return std::nullopt;
|
| 300 |
-
}
|
| 301 |
-
|
| 302 |
-
for (const auto &message : payload["messages"]) {
|
| 303 |
-
if (!message.is_object()) continue;
|
| 304 |
-
if (message.contains("role") && message["role"].is_string()) {
|
| 305 |
-
estimate.prompt_tokens += estimate_text_tokens_rough(message["role"].get<std::string>());
|
| 306 |
-
}
|
| 307 |
-
if (message.contains("content")) {
|
| 308 |
-
estimate.prompt_tokens += estimate_text_tokens_rough(
|
| 309 |
-
flatten_json_content(message["content"]));
|
| 310 |
-
}
|
| 311 |
-
estimate.prompt_tokens += 4;
|
| 312 |
-
}
|
| 313 |
-
|
| 314 |
-
estimate.estimated_total_tokens = estimate.prompt_tokens + estimate.requested_max_tokens;
|
| 315 |
-
if (estimate.estimated_total_tokens > limits.max_tokens_per_request) {
|
| 316 |
-
error = "Estimated request tokens exceed configured limit";
|
| 317 |
-
return std::nullopt;
|
| 318 |
-
}
|
| 319 |
-
|
| 320 |
-
error.clear();
|
| 321 |
-
return estimate;
|
| 322 |
-
}
|
| 323 |
-
|
| 324 |
-
static std::unordered_map<std::string, std::unordered_map<std::string, std::string>>
|
| 325 |
-
parse_simple_toml(const std::string &path) {
|
| 326 |
-
std::unordered_map<std::string, std::unordered_map<std::string, std::string>> out;
|
| 327 |
-
std::ifstream input(path);
|
| 328 |
-
if (!input.is_open()) return out;
|
| 329 |
-
|
| 330 |
-
std::string current_section;
|
| 331 |
-
std::string line;
|
| 332 |
-
while (std::getline(input, line)) {
|
| 333 |
-
auto hash = line.find('#');
|
| 334 |
-
if (hash != std::string::npos) line = line.substr(0, hash);
|
| 335 |
-
line = trim_copy(line);
|
| 336 |
-
if (line.empty()) continue;
|
| 337 |
-
|
| 338 |
-
if (line.front() == '[' && line.back() == ']') {
|
| 339 |
-
current_section = trim_copy(line.substr(1, line.size() - 2));
|
| 340 |
-
continue;
|
| 341 |
-
}
|
| 342 |
-
|
| 343 |
-
const auto eq = line.find('=');
|
| 344 |
-
if (eq == std::string::npos) continue;
|
| 345 |
-
|
| 346 |
-
std::string key = trim_copy(line.substr(0, eq));
|
| 347 |
-
std::string value = strip_quotes(trim_copy(line.substr(eq + 1)));
|
| 348 |
-
out[current_section][key] = value;
|
| 349 |
-
}
|
| 350 |
-
return out;
|
| 351 |
-
}
|
| 352 |
-
|
| 353 |
-
static std::vector<ApiKeyRecord> parse_api_keys_toml(const std::string &path) {
|
| 354 |
-
std::vector<ApiKeyRecord> keys;
|
| 355 |
-
std::ifstream input(path);
|
| 356 |
-
if (!input.is_open()) return keys;
|
| 357 |
-
|
| 358 |
-
std::string line;
|
| 359 |
-
bool in_api_key = false;
|
| 360 |
-
ApiKeyRecord current;
|
| 361 |
-
bool has_any_field = false;
|
| 362 |
-
|
| 363 |
-
auto flush_current = [&]() {
|
| 364 |
-
if (has_any_field && !current.key_id.empty() && !current.secret.empty()) {
|
| 365 |
-
keys.push_back(current);
|
| 366 |
-
}
|
| 367 |
-
current = ApiKeyRecord{};
|
| 368 |
-
has_any_field = false;
|
| 369 |
-
};
|
| 370 |
-
|
| 371 |
-
while (std::getline(input, line)) {
|
| 372 |
-
auto hash = line.find('#');
|
| 373 |
-
if (hash != std::string::npos) line = line.substr(0, hash);
|
| 374 |
-
line = trim_copy(line);
|
| 375 |
-
if (line.empty()) continue;
|
| 376 |
-
|
| 377 |
-
if (line == "[[api_keys]]") {
|
| 378 |
-
flush_current();
|
| 379 |
-
in_api_key = true;
|
| 380 |
-
continue;
|
| 381 |
-
}
|
| 382 |
-
|
| 383 |
-
if (!in_api_key) continue;
|
| 384 |
-
|
| 385 |
-
if (line.front() == '[' && line.back() == ']') {
|
| 386 |
-
flush_current();
|
| 387 |
-
in_api_key = false;
|
| 388 |
-
continue;
|
| 389 |
-
}
|
| 390 |
-
|
| 391 |
-
const auto eq = line.find('=');
|
| 392 |
-
if (eq == std::string::npos) continue;
|
| 393 |
-
|
| 394 |
-
std::string key = trim_copy(line.substr(0, eq));
|
| 395 |
-
std::string value = strip_quotes(trim_copy(line.substr(eq + 1)));
|
| 396 |
-
has_any_field = true;
|
| 397 |
-
|
| 398 |
-
if (key == "key_id") current.key_id = value;
|
| 399 |
-
else if (key == "secret") current.secret = value;
|
| 400 |
-
else if (key == "role") current.role = parse_role_or(value, current.role);
|
| 401 |
-
else if (key == "enabled") current.enabled = parse_bool_or(value, current.enabled);
|
| 402 |
-
}
|
| 403 |
-
|
| 404 |
-
flush_current();
|
| 405 |
-
return keys;
|
| 406 |
-
}
|
| 407 |
-
|
| 408 |
-
// Returns data[section][key], or `fallback` when the section or key is
// missing or the stored value is empty.
static std::string get_toml_string_or(
    const std::unordered_map<std::string, std::unordered_map<std::string, std::string>> &data,
    const std::string &section,
    const std::string &key,
    const std::string &fallback) {
    const auto section_it = data.find(section);
    if (section_it == data.end()) return fallback;
    const auto &entries = section_it->second;
    const auto value_it = entries.find(key);
    if (value_it == entries.end() || value_it->second.empty()) return fallback;
    return value_it->second;
}
|
| 419 |
-
|
| 420 |
-
// Returns data[section][key] parsed as an int, or `fallback` when the
// section/key is missing, the value is empty, out of range, or not a
// complete integer. Unlike a bare std::stoi, a value with trailing garbage
// (e.g. "80abc") falls back instead of being silently half-parsed — a
// malformed config value should not be accepted as a partial number.
static int get_toml_int_or(
    const std::unordered_map<std::string, std::unordered_map<std::string, std::string>> &data,
    const std::string &section,
    const std::string &key,
    int fallback) {
    const auto it = data.find(section);
    if (it == data.end()) return fallback;
    const auto kv = it->second.find(key);
    if (kv == it->second.end() || kv->second.empty()) return fallback;
    try {
        std::size_t consumed = 0;
        const int parsed = std::stoi(kv->second, &consumed);
        if (consumed != kv->second.size()) return fallback;  // trailing junk
        return parsed;
    } catch (...) {
        // std::invalid_argument / std::out_of_range from std::stoi
        return fallback;
    }
}
|
| 435 |
-
|
| 436 |
-
// Builds the effective configuration. Precedence (highest first):
// environment variable, config file value, compiled-in default already
// present in ManagerConfig.
static ManagerConfig load_manager_config() {
    ManagerConfig cfg;

    const std::string config_path = get_env_or("MANAGER_CONFIG", "config.toml");
    std::unordered_map<std::string, std::unordered_map<std::string, std::string>> toml;
    const bool config_file_found = std::filesystem::exists(config_path);
    if (config_file_found) {
        toml = parse_simple_toml(config_path);
        log_line("config: loaded " + config_path);
    } else {
        log_line("config: using environment/defaults (file not found: " + config_path + ")");
    }

    // env var overrides file value overrides built-in default.
    const auto pick_str = [&](const char *env, const char *section, const char *key,
                              const std::string &fallback) {
        return get_env_or(env, get_toml_string_or(toml, section, key, fallback));
    };
    const auto pick_int = [&](const char *env, const char *section, const char *key,
                              int fallback) {
        return get_env_int_or(env, get_toml_int_or(toml, section, key, fallback));
    };

    cfg.server.host = pick_str("MANAGER_HOST", "server", "host", cfg.server.host);
    cfg.server.port = pick_int("MANAGER_PORT", "server", "port", cfg.server.port);

    cfg.worker.default_model = pick_str("DEFAULT_MODEL", "worker", "default_model", cfg.worker.default_model);
    cfg.worker.llama_server_bin = pick_str("LLAMA_SERVER_BIN", "worker", "llama_server_bin", cfg.worker.llama_server_bin);
    cfg.worker.host = pick_str("WORKER_HOST", "worker", "host", cfg.worker.host);
    cfg.worker.bind_host = pick_str("WORKER_BIND_HOST", "worker", "bind_host", cfg.worker.bind_host);
    cfg.worker.base_port = pick_int("WORKER_BASE_PORT", "worker", "base_port", cfg.worker.base_port);
    cfg.worker.switch_timeout_sec = pick_int("SWITCH_TIMEOUT_SEC", "worker", "switch_timeout_sec", cfg.worker.switch_timeout_sec);

    cfg.llama.n_ctx = pick_int("MODEL_N_CTX", "llama", "n_ctx", cfg.llama.n_ctx);
    cfg.llama.threads = pick_int("MODEL_THREADS", "llama", "threads", cfg.llama.threads);
    cfg.llama.ngl = pick_int("MODEL_NGL", "llama", "ngl", cfg.llama.ngl);
    cfg.llama.batch = pick_int("MODEL_BATCH", "llama", "batch", cfg.llama.batch);
    cfg.llama.ubatch = pick_int("MODEL_UBATCH", "llama", "ubatch", cfg.llama.ubatch);

    cfg.auth.header = pick_str("AUTH_HEADER", "auth", "header", cfg.auth.header);
    cfg.auth.scheme = pick_str("AUTH_SCHEME", "auth", "scheme", cfg.auth.scheme);

    cfg.limits.default_max_tokens = pick_int("DEFAULT_MAX_TOKENS", "limits", "default_max_tokens", cfg.limits.default_max_tokens);
    cfg.limits.max_tokens_per_request = pick_int("MAX_TOKENS_PER_REQUEST", "limits", "max_tokens_per_request", cfg.limits.max_tokens_per_request);
    cfg.limits.request_timeout_sec = pick_int("REQUEST_TIMEOUT_SEC", "limits", "request_timeout_sec", cfg.limits.request_timeout_sec);

    // Queue size must stay at least 1 regardless of configuration.
    cfg.queue.max_size = static_cast<size_t>(std::max(
        1,
        pick_int("QUEUE_MAX_SIZE", "queue", "max_size", static_cast<int>(cfg.queue.max_size))));
    cfg.queue.max_tokens = pick_int("QUEUE_MAX_TOKENS", "queue", "max_tokens", cfg.queue.max_tokens);
    cfg.queue.admin_quota = pick_int("QUEUE_ADMIN_QUOTA", "queue", "admin_quota", cfg.queue.admin_quota);
    cfg.queue.retry_after_sec = pick_int("QUEUE_RETRY_AFTER_SEC", "queue", "retry_after_sec", cfg.queue.retry_after_sec);

    cfg.rate_limit.requests_per_minute = pick_int("REQUESTS_PER_MINUTE", "rate_limit", "requests_per_minute", cfg.rate_limit.requests_per_minute);
    cfg.rate_limit.estimated_tokens_per_minute = pick_int("ESTIMATED_TOKENS_PER_MINUTE", "rate_limit", "estimated_tokens_per_minute", cfg.rate_limit.estimated_tokens_per_minute);

    cfg.scheduler.max_concurrent = pick_int("SCHEDULER_MAX_CONCURRENT", "scheduler", "max_concurrent", cfg.scheduler.max_concurrent);

    // API keys come only from the config file, never from the environment.
    if (!config_path.empty() && config_file_found) {
        cfg.api_keys = parse_api_keys_toml(config_path);
    }

    return cfg;
}
|
| 537 |
-
|
| 538 |
-
// Returns true when `pid` refers to a live process.
// kill(pid, 0) performs the existence/permission check without sending a
// signal. EPERM means the process exists but we lack permission to signal
// it — the original treated that as "dead", which misreports live processes;
// count it as alive instead.
static bool is_alive(pid_t pid) {
    if (pid <= 0) return false;
    if (kill(pid, 0) == 0) return true;
    return errno == EPERM;  // exists, but not signalable by us
}
|
| 542 |
-
|
| 543 |
-
// Gracefully stops a worker process: send SIGTERM, poll for exit for up to
// `wait_seconds`, then escalate to SIGKILL and reap the child so no zombie
// is left behind.
static void shutdown_worker(pid_t pid, int wait_seconds = 15) {
    if (pid <= 0) return;

    kill(pid, SIGTERM);
    const auto give_up_at =
        std::chrono::steady_clock::now() + std::chrono::seconds(wait_seconds);
    while (std::chrono::steady_clock::now() < give_up_at) {
        int status = 0;
        if (waitpid(pid, &status, WNOHANG) == pid) {
            return;  // exited within the grace period
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(200));
    }

    // Grace period expired — force-kill and block until reaped.
    kill(pid, SIGKILL);
    int status = 0;
    waitpid(pid, &status, 0);
}
|
| 557 |
-
|
| 558 |
-
class ModelManager {
|
| 559 |
-
public:
|
| 560 |
-
explicit ModelManager(const ManagerConfig &config)
|
| 561 |
-
: _default_model(config.worker.default_model),
|
| 562 |
-
_llama_server_bin(config.worker.llama_server_bin),
|
| 563 |
-
_worker_host(config.worker.host),
|
| 564 |
-
_worker_bind_host(config.worker.bind_host),
|
| 565 |
-
_base_port(config.worker.base_port),
|
| 566 |
-
_switch_timeout_sec(config.worker.switch_timeout_sec),
|
| 567 |
-
_n_ctx(config.llama.n_ctx),
|
| 568 |
-
_n_threads(config.llama.threads),
|
| 569 |
-
_n_gpu_layers(config.llama.ngl),
|
| 570 |
-
_n_batch(config.llama.batch),
|
| 571 |
-
_n_ubatch(config.llama.ubatch),
|
| 572 |
-
_next_port(_base_port) {}
|
| 573 |
-
|
| 574 |
-
bool initialize_default(std::string &error) {
|
| 575 |
-
return switch_model(_default_model, error);
|
| 576 |
-
}
|
| 577 |
-
|
| 578 |
-
bool switch_model(const std::string &model, std::string &error) {
|
| 579 |
-
{
|
| 580 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 581 |
-
if (_switch_in_progress) {
|
| 582 |
-
error = "Switch already in progress";
|
| 583 |
-
return false;
|
| 584 |
-
}
|
| 585 |
-
if (_active && _active->model == model && is_alive(_active->pid)) {
|
| 586 |
-
return true;
|
| 587 |
-
}
|
| 588 |
-
_switch_in_progress = true;
|
| 589 |
-
}
|
| 590 |
-
|
| 591 |
-
std::optional<WorkerInfo> old_worker;
|
| 592 |
-
{
|
| 593 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 594 |
-
if (_active) old_worker = _active;
|
| 595 |
-
}
|
| 596 |
-
|
| 597 |
-
int port = allocate_port();
|
| 598 |
-
pid_t pid = spawn_worker(model, port);
|
| 599 |
-
if (pid <= 0) {
|
| 600 |
-
finish_switch(false);
|
| 601 |
-
error = "Failed to start worker process";
|
| 602 |
-
return false;
|
| 603 |
-
}
|
| 604 |
-
|
| 605 |
-
if (!wait_until_ready(pid, port, _switch_timeout_sec)) {
|
| 606 |
-
shutdown_worker(pid);
|
| 607 |
-
finish_switch(false);
|
| 608 |
-
error = "New model did not become ready in time";
|
| 609 |
-
return false;
|
| 610 |
-
}
|
| 611 |
-
|
| 612 |
-
WorkerInfo new_worker{model, port, pid, now_utc_iso()};
|
| 613 |
-
{
|
| 614 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 615 |
-
_active = new_worker;
|
| 616 |
-
_switch_in_progress = false;
|
| 617 |
-
}
|
| 618 |
-
|
| 619 |
-
if (old_worker && old_worker->pid != pid) {
|
| 620 |
-
shutdown_worker(old_worker->pid);
|
| 621 |
-
}
|
| 622 |
-
return true;
|
| 623 |
-
}
|
| 624 |
-
|
| 625 |
-
bool restart_active(std::string &error) {
|
| 626 |
-
std::optional<WorkerInfo> old_worker;
|
| 627 |
-
std::string model;
|
| 628 |
-
{
|
| 629 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 630 |
-
if (_switch_in_progress) {
|
| 631 |
-
error = "Switch already in progress";
|
| 632 |
-
return false;
|
| 633 |
-
}
|
| 634 |
-
if (!_active || !is_alive(_active->pid)) {
|
| 635 |
-
error = "No active model";
|
| 636 |
-
return false;
|
| 637 |
-
}
|
| 638 |
-
_switch_in_progress = true;
|
| 639 |
-
old_worker = _active;
|
| 640 |
-
model = _active->model;
|
| 641 |
-
}
|
| 642 |
-
|
| 643 |
-
shutdown_worker(old_worker->pid);
|
| 644 |
-
|
| 645 |
-
int port = allocate_port();
|
| 646 |
-
pid_t pid = spawn_worker(model, port);
|
| 647 |
-
if (pid <= 0) {
|
| 648 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 649 |
-
_active = std::nullopt;
|
| 650 |
-
_switch_in_progress = false;
|
| 651 |
-
error = "Failed to start worker process";
|
| 652 |
-
return false;
|
| 653 |
-
}
|
| 654 |
-
|
| 655 |
-
if (!wait_until_ready(pid, port, _switch_timeout_sec)) {
|
| 656 |
-
shutdown_worker(pid);
|
| 657 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 658 |
-
_active = std::nullopt;
|
| 659 |
-
_switch_in_progress = false;
|
| 660 |
-
error = "New model did not become ready in time";
|
| 661 |
-
return false;
|
| 662 |
-
}
|
| 663 |
-
|
| 664 |
-
WorkerInfo new_worker{model, port, pid, now_utc_iso()};
|
| 665 |
-
{
|
| 666 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 667 |
-
_active = new_worker;
|
| 668 |
-
_switch_in_progress = false;
|
| 669 |
-
}
|
| 670 |
-
return true;
|
| 671 |
-
}
|
| 672 |
-
|
| 673 |
-
std::optional<WorkerInfo> active_worker() {
|
| 674 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 675 |
-
if (_active && is_alive(_active->pid)) return _active;
|
| 676 |
-
return std::nullopt;
|
| 677 |
-
}
|
| 678 |
-
|
| 679 |
-
json models_view() {
|
| 680 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 681 |
-
json out;
|
| 682 |
-
out["status"] = (_active && is_alive(_active->pid)) ? "ready" : "no_active_model";
|
| 683 |
-
out["switch_in_progress"] = _switch_in_progress;
|
| 684 |
-
if (_active && is_alive(_active->pid)) {
|
| 685 |
-
out["current_model"] = _active->model;
|
| 686 |
-
out["last_loaded"] = _active->last_loaded;
|
| 687 |
-
out["active_pid"] = _active->pid;
|
| 688 |
-
out["active_port"] = _active->port;
|
| 689 |
-
} else {
|
| 690 |
-
out["current_model"] = nullptr;
|
| 691 |
-
out["last_loaded"] = nullptr;
|
| 692 |
-
out["active_pid"] = nullptr;
|
| 693 |
-
out["active_port"] = nullptr;
|
| 694 |
-
}
|
| 695 |
-
return out;
|
| 696 |
-
}
|
| 697 |
-
|
| 698 |
-
private:
|
| 699 |
-
std::mutex _mu;
|
| 700 |
-
std::optional<WorkerInfo> _active;
|
| 701 |
-
bool _switch_in_progress = false;
|
| 702 |
-
|
| 703 |
-
std::string _default_model;
|
| 704 |
-
std::string _llama_server_bin;
|
| 705 |
-
std::string _worker_host;
|
| 706 |
-
std::string _worker_bind_host;
|
| 707 |
-
int _base_port;
|
| 708 |
-
int _switch_timeout_sec;
|
| 709 |
-
int _n_ctx;
|
| 710 |
-
int _n_threads;
|
| 711 |
-
int _n_gpu_layers;
|
| 712 |
-
int _n_batch;
|
| 713 |
-
int _n_ubatch;
|
| 714 |
-
int _next_port;
|
| 715 |
-
|
| 716 |
-
int allocate_port() {
|
| 717 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 718 |
-
return _next_port++;
|
| 719 |
-
}
|
| 720 |
-
|
| 721 |
-
void finish_switch(bool ok) {
|
| 722 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 723 |
-
if (!ok) _switch_in_progress = false;
|
| 724 |
-
}
|
| 725 |
-
|
| 726 |
-
pid_t spawn_worker(const std::string &model, int port) {
|
| 727 |
-
pid_t pid = fork();
|
| 728 |
-
if (pid < 0) return -1;
|
| 729 |
-
if (pid == 0) {
|
| 730 |
-
setsid();
|
| 731 |
-
std::string port_s = std::to_string(port);
|
| 732 |
-
std::string n_ctx_s = std::to_string(_n_ctx);
|
| 733 |
-
std::string threads_s = std::to_string(_n_threads);
|
| 734 |
-
std::string ngl_s = std::to_string(_n_gpu_layers);
|
| 735 |
-
std::string batch_s = std::to_string(_n_batch);
|
| 736 |
-
std::string ubatch_s = std::to_string(_n_ubatch);
|
| 737 |
-
|
| 738 |
-
std::vector<std::string> args = {
|
| 739 |
-
_llama_server_bin,
|
| 740 |
-
"-hf", model,
|
| 741 |
-
"--host", _worker_bind_host,
|
| 742 |
-
"--port", port_s,
|
| 743 |
-
"-c", n_ctx_s,
|
| 744 |
-
"-t", threads_s,
|
| 745 |
-
"-ngl", ngl_s,
|
| 746 |
-
"--cont-batching",
|
| 747 |
-
"-b", batch_s,
|
| 748 |
-
"--ubatch-size", ubatch_s
|
| 749 |
-
};
|
| 750 |
-
|
| 751 |
-
std::vector<char *> argv;
|
| 752 |
-
argv.reserve(args.size() + 1);
|
| 753 |
-
for (auto &s : args) argv.push_back(const_cast<char *>(s.c_str()));
|
| 754 |
-
argv.push_back(nullptr);
|
| 755 |
-
execvp(argv[0], argv.data());
|
| 756 |
-
_exit(127);
|
| 757 |
-
}
|
| 758 |
-
return pid;
|
| 759 |
-
}
|
| 760 |
-
|
| 761 |
-
bool wait_until_ready(pid_t pid, int port, int timeout_sec) {
|
| 762 |
-
const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(timeout_sec);
|
| 763 |
-
while (std::chrono::steady_clock::now() < deadline) {
|
| 764 |
-
if (!is_alive(pid)) return false;
|
| 765 |
-
try {
|
| 766 |
-
auto [status, _] = http_get(port, "/");
|
| 767 |
-
if (status == 200) return true;
|
| 768 |
-
} catch (...) {
|
| 769 |
-
}
|
| 770 |
-
std::this_thread::sleep_for(std::chrono::milliseconds(800));
|
| 771 |
-
}
|
| 772 |
-
return false;
|
| 773 |
-
}
|
| 774 |
-
|
| 775 |
-
std::pair<int, std::string> http_get(int port, const std::string &target) {
|
| 776 |
-
asio::io_context ioc;
|
| 777 |
-
asio::ip::tcp::resolver resolver(ioc);
|
| 778 |
-
beast::tcp_stream stream(ioc);
|
| 779 |
-
auto const results = resolver.resolve(_worker_host, std::to_string(port));
|
| 780 |
-
stream.connect(results);
|
| 781 |
-
|
| 782 |
-
http::request<http::string_body> req{http::verb::get, target, 11};
|
| 783 |
-
req.set(http::field::host, _worker_host);
|
| 784 |
-
req.set(http::field::user_agent, "llm-manager");
|
| 785 |
-
http::write(stream, req);
|
| 786 |
-
|
| 787 |
-
beast::flat_buffer buffer;
|
| 788 |
-
http::response<http::string_body> res;
|
| 789 |
-
http::read(stream, buffer, res);
|
| 790 |
-
beast::error_code ec;
|
| 791 |
-
stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
|
| 792 |
-
return {res.result_int(), res.body()};
|
| 793 |
-
}
|
| 794 |
-
};
|
| 795 |
-
|
| 796 |
-
// Process-wide monotonic counter used to mint unique request ids.
static std::atomic<uint64_t> g_req_id{1};
|
| 797 |
-
|
| 798 |
-
class RateLimiterStore {
|
| 799 |
-
public:
|
| 800 |
-
explicit RateLimiterStore(const RateLimitConfig &config)
|
| 801 |
-
: _requests_per_minute(std::max(0, config.requests_per_minute)),
|
| 802 |
-
_estimated_tokens_per_minute(std::max(0, config.estimated_tokens_per_minute)) {}
|
| 803 |
-
|
| 804 |
-
RateLimitDecision allow(const std::string &api_key_id, int estimated_tokens) {
|
| 805 |
-
if (_requests_per_minute <= 0 && _estimated_tokens_per_minute <= 0) {
|
| 806 |
-
return {};
|
| 807 |
-
}
|
| 808 |
-
|
| 809 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 810 |
-
auto &bucket = _buckets[api_key_id];
|
| 811 |
-
const auto now = std::chrono::steady_clock::now();
|
| 812 |
-
refill(bucket.request_tokens, bucket.last_request_refill, _requests_per_minute, now);
|
| 813 |
-
refill(bucket.estimated_tokens, bucket.last_estimated_refill, _estimated_tokens_per_minute, now);
|
| 814 |
-
|
| 815 |
-
if (_requests_per_minute > 0 && bucket.request_tokens < 1.0) {
|
| 816 |
-
return {false, 1, "Rate limit exceeded: requests"};
|
| 817 |
-
}
|
| 818 |
-
if (_estimated_tokens_per_minute > 0 && bucket.estimated_tokens < estimated_tokens) {
|
| 819 |
-
return {false, 1, "Rate limit exceeded: estimated tokens"};
|
| 820 |
-
}
|
| 821 |
-
|
| 822 |
-
if (_requests_per_minute > 0) bucket.request_tokens -= 1.0;
|
| 823 |
-
if (_estimated_tokens_per_minute > 0) bucket.estimated_tokens -= estimated_tokens;
|
| 824 |
-
return {};
|
| 825 |
-
}
|
| 826 |
-
|
| 827 |
-
private:
|
| 828 |
-
struct Bucket {
|
| 829 |
-
double request_tokens = 0.0;
|
| 830 |
-
double estimated_tokens = 0.0;
|
| 831 |
-
std::chrono::steady_clock::time_point last_request_refill{};
|
| 832 |
-
std::chrono::steady_clock::time_point last_estimated_refill{};
|
| 833 |
-
};
|
| 834 |
-
|
| 835 |
-
std::mutex _mu;
|
| 836 |
-
std::unordered_map<std::string, Bucket> _buckets;
|
| 837 |
-
int _requests_per_minute;
|
| 838 |
-
int _estimated_tokens_per_minute;
|
| 839 |
-
|
| 840 |
-
static void refill(
|
| 841 |
-
double &tokens,
|
| 842 |
-
std::chrono::steady_clock::time_point &last_refill,
|
| 843 |
-
int limit_per_minute,
|
| 844 |
-
std::chrono::steady_clock::time_point now) {
|
| 845 |
-
if (limit_per_minute <= 0) return;
|
| 846 |
-
if (last_refill.time_since_epoch().count() == 0) {
|
| 847 |
-
tokens = limit_per_minute;
|
| 848 |
-
last_refill = now;
|
| 849 |
-
return;
|
| 850 |
-
}
|
| 851 |
-
const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - last_refill).count();
|
| 852 |
-
if (elapsed <= 0) return;
|
| 853 |
-
const double refill_amount = (static_cast<double>(limit_per_minute) * elapsed) / 60000.0;
|
| 854 |
-
tokens = std::min(static_cast<double>(limit_per_minute), tokens + refill_amount);
|
| 855 |
-
last_refill = now;
|
| 856 |
-
}
|
| 857 |
-
};
|
| 858 |
-
|
| 859 |
-
class RequestRegistry {
|
| 860 |
-
public:
|
| 861 |
-
std::shared_ptr<RequestContext> create(
|
| 862 |
-
const std::string &request_id,
|
| 863 |
-
const ApiKeyRecord &principal,
|
| 864 |
-
const TokenEstimate &estimate,
|
| 865 |
-
const std::string &request_body) {
|
| 866 |
-
auto ctx = std::make_shared<RequestContext>();
|
| 867 |
-
ctx->request_id = request_id;
|
| 868 |
-
ctx->api_key_id = principal.key_id;
|
| 869 |
-
ctx->role = principal.role;
|
| 870 |
-
ctx->priority = role_to_priority(principal.role);
|
| 871 |
-
ctx->estimate = estimate;
|
| 872 |
-
ctx->request_body = request_body;
|
| 873 |
-
ctx->created_at = std::chrono::steady_clock::now();
|
| 874 |
-
ctx->enqueue_time = ctx->created_at;
|
| 875 |
-
|
| 876 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 877 |
-
_requests[request_id] = ctx;
|
| 878 |
-
return ctx;
|
| 879 |
-
}
|
| 880 |
-
|
| 881 |
-
std::shared_ptr<RequestContext> find(const std::string &request_id) const {
|
| 882 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 883 |
-
const auto it = _requests.find(request_id);
|
| 884 |
-
if (it == _requests.end()) return nullptr;
|
| 885 |
-
return it->second;
|
| 886 |
-
}
|
| 887 |
-
|
| 888 |
-
void mark_state(const std::string &request_id, RequestState state) {
|
| 889 |
-
auto ctx = find(request_id);
|
| 890 |
-
if (ctx) ctx->state.store(state);
|
| 891 |
-
}
|
| 892 |
-
|
| 893 |
-
void complete(const std::shared_ptr<RequestContext> &ctx, RequestState state, RequestResult result) {
|
| 894 |
-
{
|
| 895 |
-
std::lock_guard<std::mutex> lock(ctx->mu);
|
| 896 |
-
ctx->state.store(state);
|
| 897 |
-
ctx->result = std::move(result);
|
| 898 |
-
ctx->completed = true;
|
| 899 |
-
}
|
| 900 |
-
ctx->cv.notify_all();
|
| 901 |
-
}
|
| 902 |
-
|
| 903 |
-
std::shared_ptr<RequestContext> cancel_request(const std::string &request_id) {
|
| 904 |
-
auto ctx = find(request_id);
|
| 905 |
-
if (!ctx) return nullptr;
|
| 906 |
-
|
| 907 |
-
ctx->cancelled.store(true);
|
| 908 |
-
const auto state = ctx->state.load();
|
| 909 |
-
if (state == RequestState::QUEUED) {
|
| 910 |
-
complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
|
| 911 |
-
} else if (state == RequestState::RUNNING) {
|
| 912 |
-
ctx->state.store(RequestState::CANCELLED);
|
| 913 |
-
}
|
| 914 |
-
return ctx;
|
| 915 |
-
}
|
| 916 |
-
|
| 917 |
-
std::vector<std::shared_ptr<RequestContext>> cancel_all() {
|
| 918 |
-
std::vector<std::shared_ptr<RequestContext>> out;
|
| 919 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 920 |
-
out.reserve(_requests.size());
|
| 921 |
-
for (auto &[_, ctx] : _requests) {
|
| 922 |
-
ctx->cancelled.store(true);
|
| 923 |
-
const auto state = ctx->state.load();
|
| 924 |
-
if (state == RequestState::QUEUED) {
|
| 925 |
-
{
|
| 926 |
-
std::lock_guard<std::mutex> ctx_lock(ctx->mu);
|
| 927 |
-
ctx->state.store(RequestState::CANCELLED);
|
| 928 |
-
ctx->result = {499, R"({"error":"Request cancelled"})"};
|
| 929 |
-
ctx->completed = true;
|
| 930 |
-
}
|
| 931 |
-
ctx->cv.notify_all();
|
| 932 |
-
} else if (state == RequestState::RUNNING) {
|
| 933 |
-
ctx->state.store(RequestState::CANCELLED);
|
| 934 |
-
}
|
| 935 |
-
out.push_back(ctx);
|
| 936 |
-
}
|
| 937 |
-
return out;
|
| 938 |
-
}
|
| 939 |
-
|
| 940 |
-
private:
|
| 941 |
-
mutable std::mutex _mu;
|
| 942 |
-
std::unordered_map<std::string, std::shared_ptr<RequestContext>> _requests;
|
| 943 |
-
};
|
| 944 |
-
|
| 945 |
-
class PrioritySchedulerQueue {
|
| 946 |
-
public:
|
| 947 |
-
explicit PrioritySchedulerQueue(const QueueConfig &config)
|
| 948 |
-
: _max_size(config.max_size),
|
| 949 |
-
_max_tokens(config.max_tokens),
|
| 950 |
-
_admin_quota(std::max(1, config.admin_quota)),
|
| 951 |
-
_retry_after_sec(std::max(1, config.retry_after_sec)) {}
|
| 952 |
-
|
| 953 |
-
bool try_push(const std::shared_ptr<RequestContext> &ctx) {
|
| 954 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 955 |
-
if (_current_size >= _max_size) return false;
|
| 956 |
-
if (_current_tokens + ctx->estimate.estimated_total_tokens > _max_tokens) return false;
|
| 957 |
-
|
| 958 |
-
if (ctx->priority == Priority::ADMIN) {
|
| 959 |
-
_admin_queue.push_back(ctx);
|
| 960 |
-
} else {
|
| 961 |
-
_user_queue.push_back(ctx);
|
| 962 |
-
}
|
| 963 |
-
++_current_size;
|
| 964 |
-
_current_tokens += ctx->estimate.estimated_total_tokens;
|
| 965 |
-
_cv.notify_one();
|
| 966 |
-
return true;
|
| 967 |
-
}
|
| 968 |
-
|
| 969 |
-
std::shared_ptr<RequestContext> pop_next() {
|
| 970 |
-
std::unique_lock<std::mutex> lock(_mu);
|
| 971 |
-
_cv.wait(lock, [&]() { return _stopped || _current_size > 0; });
|
| 972 |
-
if (_stopped) return nullptr;
|
| 973 |
-
|
| 974 |
-
std::deque<std::shared_ptr<RequestContext>> *selected_queue = nullptr;
|
| 975 |
-
if (!_admin_queue.empty() && (_admin_streak < _admin_quota || _user_queue.empty())) {
|
| 976 |
-
selected_queue = &_admin_queue;
|
| 977 |
-
++_admin_streak;
|
| 978 |
-
} else if (!_user_queue.empty()) {
|
| 979 |
-
selected_queue = &_user_queue;
|
| 980 |
-
_admin_streak = 0;
|
| 981 |
-
} else if (!_admin_queue.empty()) {
|
| 982 |
-
selected_queue = &_admin_queue;
|
| 983 |
-
_admin_streak = 1;
|
| 984 |
-
}
|
| 985 |
-
|
| 986 |
-
if (!selected_queue || selected_queue->empty()) return nullptr;
|
| 987 |
-
|
| 988 |
-
auto best_it = std::min_element(
|
| 989 |
-
selected_queue->begin(),
|
| 990 |
-
selected_queue->end(),
|
| 991 |
-
[](const auto &a, const auto &b) {
|
| 992 |
-
return a->estimate.estimated_total_tokens < b->estimate.estimated_total_tokens;
|
| 993 |
-
});
|
| 994 |
-
auto ctx = *best_it;
|
| 995 |
-
selected_queue->erase(best_it);
|
| 996 |
-
--_current_size;
|
| 997 |
-
_current_tokens -= ctx->estimate.estimated_total_tokens;
|
| 998 |
-
return ctx;
|
| 999 |
-
}
|
| 1000 |
-
|
| 1001 |
-
void stop() {
|
| 1002 |
-
std::lock_guard<std::mutex> lock(_mu);
|
| 1003 |
-
_stopped = true;
|
| 1004 |
-
_cv.notify_all();
|
| 1005 |
-
}
|
| 1006 |
-
|
| 1007 |
-
int retry_after_sec() const {
|
| 1008 |
-
return _retry_after_sec;
|
| 1009 |
-
}
|
| 1010 |
-
|
| 1011 |
-
private:
|
| 1012 |
-
mutable std::mutex _mu;
|
| 1013 |
-
std::condition_variable _cv;
|
| 1014 |
-
std::deque<std::shared_ptr<RequestContext>> _admin_queue;
|
| 1015 |
-
std::deque<std::shared_ptr<RequestContext>> _user_queue;
|
| 1016 |
-
size_t _max_size;
|
| 1017 |
-
size_t _current_size = 0;
|
| 1018 |
-
int _max_tokens;
|
| 1019 |
-
int _current_tokens = 0;
|
| 1020 |
-
int _admin_quota;
|
| 1021 |
-
int _admin_streak = 0;
|
| 1022 |
-
int _retry_after_sec;
|
| 1023 |
-
bool _stopped = false;
|
| 1024 |
-
};
|
| 1025 |
-
|
| 1026 |
-
class Scheduler {
|
| 1027 |
-
public:
|
| 1028 |
-
Scheduler(ModelManager &manager, RequestRegistry ®istry, const QueueConfig &queue_config)
|
| 1029 |
-
: _manager(manager), _registry(registry), _queue(queue_config) {
|
| 1030 |
-
_worker = std::thread([this]() { worker_loop(); });
|
| 1031 |
-
}
|
| 1032 |
-
|
| 1033 |
-
~Scheduler() {
|
| 1034 |
-
_queue.stop();
|
| 1035 |
-
if (_worker.joinable()) _worker.join();
|
| 1036 |
-
}
|
| 1037 |
-
|
| 1038 |
-
bool try_enqueue(const std::shared_ptr<RequestContext> &ctx) {
|
| 1039 |
-
return _queue.try_push(ctx);
|
| 1040 |
-
}
|
| 1041 |
-
|
| 1042 |
-
int retry_after_sec() const {
|
| 1043 |
-
return _queue.retry_after_sec();
|
| 1044 |
-
}
|
| 1045 |
-
|
| 1046 |
-
private:
|
| 1047 |
-
ModelManager &_manager;
|
| 1048 |
-
RequestRegistry &_registry;
|
| 1049 |
-
PrioritySchedulerQueue _queue;
|
| 1050 |
-
std::thread _worker;
|
| 1051 |
-
|
| 1052 |
-
void worker_loop() {
|
| 1053 |
-
for (;;) {
|
| 1054 |
-
auto ctx = _queue.pop_next();
|
| 1055 |
-
if (!ctx) return;
|
| 1056 |
-
|
| 1057 |
-
if (ctx->cancelled.load()) {
|
| 1058 |
-
_registry.complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
|
| 1059 |
-
continue;
|
| 1060 |
-
}
|
| 1061 |
-
|
| 1062 |
-
ctx->state.store(RequestState::RUNNING);
|
| 1063 |
-
ctx->start_time = std::chrono::steady_clock::now();
|
| 1064 |
-
auto worker = _manager.active_worker();
|
| 1065 |
-
if (!worker) {
|
| 1066 |
-
_registry.complete(ctx, RequestState::FAILED, {503, R"({"error":"No active model"})"});
|
| 1067 |
-
continue;
|
| 1068 |
-
}
|
| 1069 |
-
|
| 1070 |
-
try {
|
| 1071 |
-
auto [status, body] = forward_chat(*worker, ctx->request_body);
|
| 1072 |
-
if (ctx->cancelled.load()) {
|
| 1073 |
-
_registry.complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
|
| 1074 |
-
continue;
|
| 1075 |
-
}
|
| 1076 |
-
_registry.complete(ctx, RequestState::DONE, {status, body});
|
| 1077 |
-
} catch (const std::exception &e) {
|
| 1078 |
-
_registry.complete(
|
| 1079 |
-
ctx,
|
| 1080 |
-
RequestState::FAILED,
|
| 1081 |
-
{500, json({{"error", e.what()}}).dump()});
|
| 1082 |
-
}
|
| 1083 |
-
}
|
| 1084 |
-
}
|
| 1085 |
-
};
|
| 1086 |
-
|
| 1087 |
-
class ApiKeyAuth {
|
| 1088 |
-
public:
|
| 1089 |
-
explicit ApiKeyAuth(const ManagerConfig &config)
|
| 1090 |
-
: _header_name(config.auth.header), _scheme(config.auth.scheme) {
|
| 1091 |
-
for (const auto &record : config.api_keys) {
|
| 1092 |
-
_records_by_secret.emplace(record.secret, record);
|
| 1093 |
-
}
|
| 1094 |
-
}
|
| 1095 |
-
|
| 1096 |
-
bool enabled() const {
|
| 1097 |
-
return !_records_by_secret.empty();
|
| 1098 |
-
}
|
| 1099 |
-
|
| 1100 |
-
template <typename Body, typename Allocator>
|
| 1101 |
-
std::optional<ApiKeyRecord> authenticate(
|
| 1102 |
-
const http::request<Body, http::basic_fields<Allocator>> &req,
|
| 1103 |
-
std::string &error) const {
|
| 1104 |
-
if (!enabled()) {
|
| 1105 |
-
error.clear();
|
| 1106 |
-
return ApiKeyRecord{"anonymous", "", Role::ADMIN, true};
|
| 1107 |
-
}
|
| 1108 |
-
|
| 1109 |
-
const auto token = extract_bearer_token(req, error);
|
| 1110 |
-
if (!token) return std::nullopt;
|
| 1111 |
-
|
| 1112 |
-
const auto it = _records_by_secret.find(*token);
|
| 1113 |
-
if (it == _records_by_secret.end()) {
|
| 1114 |
-
error = "Invalid API key";
|
| 1115 |
-
return std::nullopt;
|
| 1116 |
-
}
|
| 1117 |
-
if (!it->second.enabled) {
|
| 1118 |
-
error = "API key disabled";
|
| 1119 |
-
return std::nullopt;
|
| 1120 |
-
}
|
| 1121 |
-
error.clear();
|
| 1122 |
-
return it->second;
|
| 1123 |
-
}
|
| 1124 |
-
|
| 1125 |
-
private:
|
| 1126 |
-
std::string _header_name;
|
| 1127 |
-
std::string _scheme;
|
| 1128 |
-
std::unordered_map<std::string, ApiKeyRecord> _records_by_secret;
|
| 1129 |
-
|
| 1130 |
-
template <typename Body, typename Allocator>
|
| 1131 |
-
std::optional<std::string> extract_bearer_token(
|
| 1132 |
-
const http::request<Body, http::basic_fields<Allocator>> &req,
|
| 1133 |
-
std::string &error) const {
|
| 1134 |
-
const auto header_it = req.find(_header_name);
|
| 1135 |
-
if (header_it == req.end()) {
|
| 1136 |
-
error = "Missing authorization header";
|
| 1137 |
-
return std::nullopt;
|
| 1138 |
-
}
|
| 1139 |
-
|
| 1140 |
-
const std::string value = trim_copy(header_it->value().to_string());
|
| 1141 |
-
const std::string prefix = _scheme + " ";
|
| 1142 |
-
if (value.size() <= prefix.size() || value.rfind(prefix, 0) != 0) {
|
| 1143 |
-
error = "Invalid authorization scheme";
|
| 1144 |
-
return std::nullopt;
|
| 1145 |
-
}
|
| 1146 |
-
|
| 1147 |
-
std::string token = trim_copy(value.substr(prefix.size()));
|
| 1148 |
-
if (token.empty()) {
|
| 1149 |
-
error = "Missing API key";
|
| 1150 |
-
return std::nullopt;
|
| 1151 |
-
}
|
| 1152 |
-
return token;
|
| 1153 |
-
}
|
| 1154 |
-
};
|
| 1155 |
-
|
| 1156 |
-
static void log_line(const std::string &line) {
|
| 1157 |
-
std::cout << "[" << now_utc_iso() << "] " << line << std::endl;
|
| 1158 |
-
}
|
| 1159 |
-
|
| 1160 |
-
static std::string truncate_body(const std::string &body, size_t max_len = 2000) {
|
| 1161 |
-
if (body.size() <= max_len) return body;
|
| 1162 |
-
return body.substr(0, max_len) + "...[truncated]";
|
| 1163 |
-
}
|
| 1164 |
-
|
| 1165 |
-
static std::optional<std::string> extract_cancel_request_id(const std::string &path) {
|
| 1166 |
-
const std::string prefix = "/requests/";
|
| 1167 |
-
const std::string suffix = "/cancel";
|
| 1168 |
-
if (path.size() <= prefix.size() + suffix.size()) return std::nullopt;
|
| 1169 |
-
if (path.rfind(prefix, 0) != 0) return std::nullopt;
|
| 1170 |
-
if (path.substr(path.size() - suffix.size()) != suffix) return std::nullopt;
|
| 1171 |
-
const std::string request_id = path.substr(prefix.size(), path.size() - prefix.size() - suffix.size());
|
| 1172 |
-
if (request_id.empty()) return std::nullopt;
|
| 1173 |
-
return request_id;
|
| 1174 |
-
}
|
| 1175 |
-
|
| 1176 |
-
static std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body) {
|
| 1177 |
-
asio::io_context ioc;
|
| 1178 |
-
asio::ip::tcp::resolver resolver(ioc);
|
| 1179 |
-
beast::tcp_stream stream(ioc);
|
| 1180 |
-
auto const results = resolver.resolve("127.0.0.1", std::to_string(worker.port));
|
| 1181 |
-
stream.connect(results);
|
| 1182 |
-
|
| 1183 |
-
http::request<http::string_body> req{http::verb::post, "/v1/chat/completions", 11};
|
| 1184 |
-
req.set(http::field::host, "127.0.0.1");
|
| 1185 |
-
req.set(http::field::content_type, "application/json");
|
| 1186 |
-
req.set(http::field::user_agent, "llm-manager");
|
| 1187 |
-
req.body() = body;
|
| 1188 |
-
req.prepare_payload();
|
| 1189 |
-
http::write(stream, req);
|
| 1190 |
-
|
| 1191 |
-
beast::flat_buffer buffer;
|
| 1192 |
-
http::response<http::string_body> res;
|
| 1193 |
-
http::read(stream, buffer, res);
|
| 1194 |
-
beast::error_code ec;
|
| 1195 |
-
stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
|
| 1196 |
-
return {res.result_int(), res.body()};
|
| 1197 |
-
}
|
| 1198 |
-
|
| 1199 |
-
struct ProxiedGetResult {
|
| 1200 |
-
int status = 500;
|
| 1201 |
-
std::string body;
|
| 1202 |
-
std::string content_type = "text/plain; charset=utf-8";
|
| 1203 |
-
std::string content_encoding;
|
| 1204 |
-
};
|
| 1205 |
-
|
| 1206 |
-
static ProxiedGetResult forward_get_to_worker(const WorkerInfo &worker,
|
| 1207 |
-
const std::string &target) {
|
| 1208 |
-
asio::io_context ioc;
|
| 1209 |
-
asio::ip::tcp::resolver resolver(ioc);
|
| 1210 |
-
beast::tcp_stream stream(ioc);
|
| 1211 |
-
auto const results = resolver.resolve("127.0.0.1", std::to_string(worker.port));
|
| 1212 |
-
stream.connect(results);
|
| 1213 |
-
|
| 1214 |
-
http::request<http::string_body> req{http::verb::get, target, 11};
|
| 1215 |
-
req.set(http::field::host, "127.0.0.1");
|
| 1216 |
-
req.set(http::field::user_agent, "llm-manager");
|
| 1217 |
-
req.set(http::field::accept_encoding, "gzip, identity");
|
| 1218 |
-
http::write(stream, req);
|
| 1219 |
-
|
| 1220 |
-
beast::flat_buffer buffer;
|
| 1221 |
-
http::response<http::string_body> res;
|
| 1222 |
-
http::read(stream, buffer, res);
|
| 1223 |
-
beast::error_code ec;
|
| 1224 |
-
stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
|
| 1225 |
-
ProxiedGetResult out;
|
| 1226 |
-
out.status = res.result_int();
|
| 1227 |
-
out.body = res.body();
|
| 1228 |
-
if (res.base().find(http::field::content_type) != res.base().end()) {
|
| 1229 |
-
out.content_type = res.base()[http::field::content_type].to_string();
|
| 1230 |
-
}
|
| 1231 |
-
if (res.base().find(http::field::content_encoding) != res.base().end()) {
|
| 1232 |
-
out.content_encoding = res.base()[http::field::content_encoding].to_string();
|
| 1233 |
-
}
|
| 1234 |
-
return out;
|
| 1235 |
-
}
|
| 1236 |
-
|
| 1237 |
-
template <typename Body, typename Allocator>
|
| 1238 |
-
http::response<http::string_body> handle_request(
|
| 1239 |
-
ModelManager &manager,
|
| 1240 |
-
const ManagerConfig &config,
|
| 1241 |
-
const ApiKeyAuth &auth,
|
| 1242 |
-
RateLimiterStore &rate_limiter,
|
| 1243 |
-
RequestRegistry ®istry,
|
| 1244 |
-
Scheduler &scheduler,
|
| 1245 |
-
http::request<Body, http::basic_fields<Allocator>> &&req) {
|
| 1246 |
-
const auto start = std::chrono::steady_clock::now();
|
| 1247 |
-
const auto req_id_num = g_req_id.fetch_add(1);
|
| 1248 |
-
const std::string request_id = std::to_string(req_id_num);
|
| 1249 |
-
const std::string target = req.target().to_string();
|
| 1250 |
-
const std::string method = req.method_string().to_string();
|
| 1251 |
-
const std::string path = target.substr(0, target.find('?'));
|
| 1252 |
-
auto authenticated = std::optional<ApiKeyRecord>{};
|
| 1253 |
-
|
| 1254 |
-
log_line("request_id=" + request_id + " method=" + method + " path=" + target);
|
| 1255 |
-
if constexpr (std::is_same_v<Body, http::string_body>) {
|
| 1256 |
-
if (!req.body().empty()) {
|
| 1257 |
-
log_line("request_id=" + request_id + " body=" + truncate_body(req.body()));
|
| 1258 |
-
}
|
| 1259 |
-
}
|
| 1260 |
-
|
| 1261 |
-
auto json_response = [&](http::status status, const json &obj) {
|
| 1262 |
-
json payload = obj;
|
| 1263 |
-
payload["request_id"] = request_id;
|
| 1264 |
-
http::response<http::string_body> res{status, req.version()};
|
| 1265 |
-
res.set(http::field::content_type, "application/json");
|
| 1266 |
-
res.set(http::field::server, "llm-manager");
|
| 1267 |
-
res.set("X-Request-Id", request_id);
|
| 1268 |
-
res.keep_alive(req.keep_alive());
|
| 1269 |
-
res.body() = payload.dump();
|
| 1270 |
-
res.prepare_payload();
|
| 1271 |
-
auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
|
| 1272 |
-
std::chrono::steady_clock::now() - start).count();
|
| 1273 |
-
log_line("request_id=" + request_id + " status=" + std::to_string(res.result_int()) +
|
| 1274 |
-
" elapsed_ms=" + std::to_string(elapsed_ms));
|
| 1275 |
-
return res;
|
| 1276 |
-
};
|
| 1277 |
-
|
| 1278 |
-
auto json_response_with_retry_after = [&](http::status status, const json &obj, int retry_after_sec) {
|
| 1279 |
-
auto res = json_response(status, obj);
|
| 1280 |
-
res.set(http::field::retry_after, std::to_string(std::max(1, retry_after_sec)));
|
| 1281 |
-
return res;
|
| 1282 |
-
};
|
| 1283 |
-
|
| 1284 |
-
auto ensure_authenticated = [&](Role minimum_role) -> std::optional<http::response<http::string_body>> {
|
| 1285 |
-
std::string auth_error;
|
| 1286 |
-
authenticated = auth.authenticate(req, auth_error);
|
| 1287 |
-
if (!authenticated) {
|
| 1288 |
-
return json_response(http::status::unauthorized, {{"error", auth_error}});
|
| 1289 |
-
}
|
| 1290 |
-
if (minimum_role == Role::ADMIN && authenticated->role != Role::ADMIN) {
|
| 1291 |
-
return json_response(http::status::forbidden, {{"error", "Admin role required"}});
|
| 1292 |
-
}
|
| 1293 |
-
log_line("request_id=" + request_id +
|
| 1294 |
-
" api_key_id=" + authenticated->key_id +
|
| 1295 |
-
" role=" + role_to_string(authenticated->role));
|
| 1296 |
-
return std::nullopt;
|
| 1297 |
-
};
|
| 1298 |
-
|
| 1299 |
-
try {
|
| 1300 |
-
if (path == "/health" && req.method() == http::verb::get) {
|
| 1301 |
-
return json_response(http::status::ok, manager.models_view());
|
| 1302 |
-
}
|
| 1303 |
-
|
| 1304 |
-
if (path == "/models" && req.method() == http::verb::get) {
|
| 1305 |
-
return json_response(http::status::ok, manager.models_view());
|
| 1306 |
-
}
|
| 1307 |
-
|
| 1308 |
-
if (path == "/switch-model" && req.method() == http::verb::post) {
|
| 1309 |
-
if (auto auth_res = ensure_authenticated(Role::ADMIN)) return *auth_res;
|
| 1310 |
-
std::string body(req.body().data(), req.body().size());
|
| 1311 |
-
json j = json::parse(body, nullptr, false);
|
| 1312 |
-
if (j.is_discarded()) {
|
| 1313 |
-
return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
|
| 1314 |
-
}
|
| 1315 |
-
std::string model;
|
| 1316 |
-
if (j.contains("model_name")) model = j["model_name"].get<std::string>();
|
| 1317 |
-
if (j.contains("model")) model = j["model"].get<std::string>();
|
| 1318 |
-
if (model.empty()) {
|
| 1319 |
-
return json_response(http::status::bad_request, {{"error", "Expected 'model' or 'model_name'"}});
|
| 1320 |
-
}
|
| 1321 |
-
|
| 1322 |
-
std::string err;
|
| 1323 |
-
bool ok = manager.switch_model(model, err);
|
| 1324 |
-
if (!ok) {
|
| 1325 |
-
auto status = (err == "Switch already in progress") ? http::status::conflict : http::status::internal_server_error;
|
| 1326 |
-
return json_response(status, {{"status", "error"}, {"error", err}});
|
| 1327 |
-
}
|
| 1328 |
-
auto state = manager.models_view();
|
| 1329 |
-
state["message"] = "Switched model successfully";
|
| 1330 |
-
return json_response(http::status::ok, state);
|
| 1331 |
-
}
|
| 1332 |
-
|
| 1333 |
-
if (path == "/stop" && req.method() == http::verb::post) {
|
| 1334 |
-
if (auto auth_res = ensure_authenticated(Role::ADMIN)) return *auth_res;
|
| 1335 |
-
registry.cancel_all();
|
| 1336 |
-
std::string err;
|
| 1337 |
-
bool ok = manager.restart_active(err);
|
| 1338 |
-
if (!ok) {
|
| 1339 |
-
http::status status = http::status::internal_server_error;
|
| 1340 |
-
if (err == "Switch already in progress") {
|
| 1341 |
-
status = http::status::conflict;
|
| 1342 |
-
} else if (err == "No active model") {
|
| 1343 |
-
status = http::status::service_unavailable;
|
| 1344 |
-
}
|
| 1345 |
-
return json_response(status, {{"status", "error"}, {"error", err}});
|
| 1346 |
-
}
|
| 1347 |
-
auto state = manager.models_view();
|
| 1348 |
-
state["message"] = "Stopped in-flight prompts and restarted model";
|
| 1349 |
-
return json_response(http::status::ok, state);
|
| 1350 |
-
}
|
| 1351 |
-
|
| 1352 |
-
if (req.method() == http::verb::post) {
|
| 1353 |
-
if (auto cancel_id = extract_cancel_request_id(path)) {
|
| 1354 |
-
if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
|
| 1355 |
-
auto ctx = registry.find(*cancel_id);
|
| 1356 |
-
if (!ctx) {
|
| 1357 |
-
return json_response(http::status::not_found, {{"error", "Unknown request id"}});
|
| 1358 |
-
}
|
| 1359 |
-
if (authenticated->role != Role::ADMIN && authenticated->key_id != ctx->api_key_id) {
|
| 1360 |
-
return json_response(http::status::forbidden, {{"error", "Cannot cancel another API key request"}});
|
| 1361 |
-
}
|
| 1362 |
-
|
| 1363 |
-
const auto previous_state = ctx->state.load();
|
| 1364 |
-
registry.cancel_request(*cancel_id);
|
| 1365 |
-
std::string restart_error;
|
| 1366 |
-
bool restarted = true;
|
| 1367 |
-
if (previous_state == RequestState::RUNNING) {
|
| 1368 |
-
restarted = manager.restart_active(restart_error);
|
| 1369 |
-
}
|
| 1370 |
-
|
| 1371 |
-
json payload = {
|
| 1372 |
-
{"cancelled_request_id", *cancel_id},
|
| 1373 |
-
{"state", state_to_string(ctx->state.load())}
|
| 1374 |
-
};
|
| 1375 |
-
if (!restarted) {
|
| 1376 |
-
payload["restart_error"] = restart_error;
|
| 1377 |
-
}
|
| 1378 |
-
return json_response(http::status::ok, payload);
|
| 1379 |
-
}
|
| 1380 |
-
}
|
| 1381 |
-
|
| 1382 |
-
if (path == "/v1/chat/completions" && req.method() == http::verb::post) {
|
| 1383 |
-
if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
|
| 1384 |
-
json payload = json::parse(req.body(), nullptr, false);
|
| 1385 |
-
if (payload.is_discarded()) {
|
| 1386 |
-
return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
|
| 1387 |
-
}
|
| 1388 |
-
std::string token_error;
|
| 1389 |
-
auto estimate = estimate_chat_tokens(payload, config.limits, token_error);
|
| 1390 |
-
if (!estimate) {
|
| 1391 |
-
return json_response(http::status::bad_request, {{"error", token_error}});
|
| 1392 |
-
}
|
| 1393 |
-
log_line("request_id=" + request_id +
|
| 1394 |
-
" prompt_tokens=" + std::to_string(estimate->prompt_tokens) +
|
| 1395 |
-
" max_tokens=" + std::to_string(estimate->requested_max_tokens) +
|
| 1396 |
-
" estimated_total_tokens=" + std::to_string(estimate->estimated_total_tokens));
|
| 1397 |
-
|
| 1398 |
-
auto rate_limit_decision = rate_limiter.allow(
|
| 1399 |
-
authenticated->key_id,
|
| 1400 |
-
estimate->estimated_total_tokens);
|
| 1401 |
-
if (!rate_limit_decision.allowed) {
|
| 1402 |
-
return json_response_with_retry_after(
|
| 1403 |
-
http::status::too_many_requests,
|
| 1404 |
-
{{"error", rate_limit_decision.error}},
|
| 1405 |
-
rate_limit_decision.retry_after_sec);
|
| 1406 |
-
}
|
| 1407 |
-
|
| 1408 |
-
auto ctx = registry.create(request_id, *authenticated, *estimate, req.body());
|
| 1409 |
-
if (!scheduler.try_enqueue(ctx)) {
|
| 1410 |
-
ctx->cancelled.store(true);
|
| 1411 |
-
registry.complete(ctx, RequestState::CANCELLED, {503, R"({"error":"Queue full"})"});
|
| 1412 |
-
return json_response_with_retry_after(
|
| 1413 |
-
http::status::service_unavailable,
|
| 1414 |
-
{{"error", "Queue full"}},
|
| 1415 |
-
scheduler.retry_after_sec());
|
| 1416 |
-
}
|
| 1417 |
-
|
| 1418 |
-
std::unique_lock<std::mutex> lock(ctx->mu);
|
| 1419 |
-
const bool finished = ctx->cv.wait_for(
|
| 1420 |
-
lock,
|
| 1421 |
-
std::chrono::seconds(std::max(1, config.limits.request_timeout_sec)),
|
| 1422 |
-
[&]() { return ctx->completed; });
|
| 1423 |
-
if (!finished) {
|
| 1424 |
-
lock.unlock();
|
| 1425 |
-
registry.cancel_request(request_id);
|
| 1426 |
-
std::string restart_error;
|
| 1427 |
-
bool restarted = true;
|
| 1428 |
-
if (ctx->state.load() == RequestState::RUNNING) {
|
| 1429 |
-
restarted = manager.restart_active(restart_error);
|
| 1430 |
-
}
|
| 1431 |
-
json timeout_payload = {
|
| 1432 |
-
{"error", "Request timed out"},
|
| 1433 |
-
{"state", state_to_string(ctx->state.load())}
|
| 1434 |
-
};
|
| 1435 |
-
if (!restarted) timeout_payload["restart_error"] = restart_error;
|
| 1436 |
-
return json_response(http::status::gateway_timeout, timeout_payload);
|
| 1437 |
-
}
|
| 1438 |
-
|
| 1439 |
-
const auto final_state = ctx->state.load();
|
| 1440 |
-
RequestResult result = ctx->result;
|
| 1441 |
-
lock.unlock();
|
| 1442 |
-
|
| 1443 |
-
if (final_state == RequestState::CANCELLED) {
|
| 1444 |
-
return json_response(http::status::ok, {{"status", "cancelled"}});
|
| 1445 |
-
}
|
| 1446 |
-
|
| 1447 |
-
http::response<http::string_body> res{
|
| 1448 |
-
static_cast<http::status>(result.status), req.version()};
|
| 1449 |
-
res.set(http::field::content_type, result.content_type);
|
| 1450 |
-
res.set(http::field::server, "llm-manager");
|
| 1451 |
-
res.set("X-Request-Id", request_id);
|
| 1452 |
-
res.keep_alive(req.keep_alive());
|
| 1453 |
-
res.body() = result.body;
|
| 1454 |
-
res.prepare_payload();
|
| 1455 |
-
auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
|
| 1456 |
-
std::chrono::steady_clock::now() - start).count();
|
| 1457 |
-
log_line("request_id=" + request_id +
|
| 1458 |
-
" final_state=" + state_to_string(final_state) +
|
| 1459 |
-
" upstream_status=" + std::to_string(result.status) +
|
| 1460 |
-
" elapsed_ms=" + std::to_string(elapsed_ms));
|
| 1461 |
-
return res;
|
| 1462 |
-
}
|
| 1463 |
-
|
| 1464 |
-
// Proxy GET requests not handled by manager endpoints to active llama-server.
|
| 1465 |
-
// This enables llama.cpp UI/static routes (including "/").
|
| 1466 |
-
if (req.method() == http::verb::get) {
|
| 1467 |
-
auto worker = manager.active_worker();
|
| 1468 |
-
if (!worker) {
|
| 1469 |
-
return json_response(http::status::service_unavailable, {{"error", "No active model"}});
|
| 1470 |
-
}
|
| 1471 |
-
auto upstream = forward_get_to_worker(*worker, target);
|
| 1472 |
-
http::response<http::string_body> res{
|
| 1473 |
-
static_cast<http::status>(upstream.status), req.version()};
|
| 1474 |
-
res.set(http::field::content_type, upstream.content_type);
|
| 1475 |
-
if (!upstream.content_encoding.empty()) {
|
| 1476 |
-
res.set(http::field::content_encoding, upstream.content_encoding);
|
| 1477 |
-
}
|
| 1478 |
-
res.set(http::field::server, "llm-manager");
|
| 1479 |
-
res.set("X-Request-Id", request_id);
|
| 1480 |
-
res.keep_alive(req.keep_alive());
|
| 1481 |
-
res.body() = upstream.body;
|
| 1482 |
-
res.prepare_payload();
|
| 1483 |
-
auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
|
| 1484 |
-
std::chrono::steady_clock::now() - start)
|
| 1485 |
-
.count();
|
| 1486 |
-
log_line("request_id=" + request_id +
|
| 1487 |
-
" proxied_get model=" + worker->model +
|
| 1488 |
-
" upstream_status=" + std::to_string(upstream.status) +
|
| 1489 |
-
" elapsed_ms=" + std::to_string(elapsed_ms));
|
| 1490 |
-
return res;
|
| 1491 |
-
}
|
| 1492 |
-
|
| 1493 |
-
return json_response(http::status::not_found, {{"error", "Not found"}});
|
| 1494 |
-
} catch (const std::exception &e) {
|
| 1495 |
-
return json_response(http::status::internal_server_error, {{"error", e.what()}});
|
| 1496 |
-
}
|
| 1497 |
-
}
|
| 1498 |
-
|
| 1499 |
-
void do_session(
|
| 1500 |
-
asio::ip::tcp::socket socket,
|
| 1501 |
-
ModelManager &manager,
|
| 1502 |
-
const ManagerConfig &config,
|
| 1503 |
-
const ApiKeyAuth &auth,
|
| 1504 |
-
RateLimiterStore &rate_limiter,
|
| 1505 |
-
RequestRegistry ®istry,
|
| 1506 |
-
Scheduler &scheduler) {
|
| 1507 |
-
try {
|
| 1508 |
-
beast::flat_buffer buffer;
|
| 1509 |
-
http::request<http::string_body> req;
|
| 1510 |
-
http::read(socket, buffer, req);
|
| 1511 |
-
auto res = handle_request(manager, config, auth, rate_limiter, registry, scheduler, std::move(req));
|
| 1512 |
-
http::write(socket, res);
|
| 1513 |
-
beast::error_code ec;
|
| 1514 |
-
socket.shutdown(asio::ip::tcp::socket::shutdown_send, ec);
|
| 1515 |
-
} catch (...) {
|
| 1516 |
-
}
|
| 1517 |
-
}
|
| 1518 |
|
| 1519 |
int main() {
|
| 1520 |
const ManagerConfig config = load_manager_config();
|
| 1521 |
const auto &bind_host = config.server.host;
|
| 1522 |
const int bind_port = config.server.port;
|
|
|
|
| 1523 |
ModelManager manager(config);
|
| 1524 |
ApiKeyAuth auth(config);
|
| 1525 |
RateLimiterStore rate_limiter(config.rate_limit);
|
| 1526 |
RequestRegistry registry;
|
| 1527 |
-
|
|
|
|
| 1528 |
|
| 1529 |
std::string init_error;
|
| 1530 |
if (auth.enabled()) {
|
|
@@ -1532,6 +31,7 @@ int main() {
|
|
| 1532 |
} else {
|
| 1533 |
log_line("auth: disabled (no configured api keys)");
|
| 1534 |
}
|
|
|
|
| 1535 |
log_line("startup: loading default model");
|
| 1536 |
if (!manager.initialize_default(init_error)) {
|
| 1537 |
log_line("startup: default model failed: " + init_error);
|
|
@@ -1540,7 +40,9 @@ int main() {
|
|
| 1540 |
}
|
| 1541 |
|
| 1542 |
asio::io_context ioc{1};
|
| 1543 |
-
asio::ip::tcp::acceptor acceptor{
|
|
|
|
|
|
|
| 1544 |
log_line("manager listening on " + bind_host + ":" + std::to_string(bind_port));
|
| 1545 |
|
| 1546 |
for (;;) {
|
|
@@ -1554,6 +56,7 @@ int main() {
|
|
| 1554 |
std::cref(auth),
|
| 1555 |
std::ref(rate_limiter),
|
| 1556 |
std::ref(registry),
|
|
|
|
| 1557 |
std::ref(scheduler)).detach();
|
| 1558 |
}
|
| 1559 |
}
|
|
|
|
| 1 |
#include <boost/asio.hpp>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
#include <functional>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
#include <string>
|
| 5 |
#include <thread>
|
| 6 |
+
#include <utility>
|
|
|
|
| 7 |
|
| 8 |
+
#include "config.h"
|
| 9 |
+
#include "http_helpers.h"
|
| 10 |
+
#include "model_manager.h"
|
| 11 |
+
#include "runtime_components.h"
|
| 12 |
+
#include "server.h"
|
| 13 |
|
| 14 |
namespace asio = boost::asio;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
int main() {
|
| 17 |
const ManagerConfig config = load_manager_config();
|
| 18 |
const auto &bind_host = config.server.host;
|
| 19 |
const int bind_port = config.server.port;
|
| 20 |
+
|
| 21 |
ModelManager manager(config);
|
| 22 |
ApiKeyAuth auth(config);
|
| 23 |
RateLimiterStore rate_limiter(config.rate_limit);
|
| 24 |
RequestRegistry registry;
|
| 25 |
+
MetricsRegistry metrics;
|
| 26 |
+
Scheduler scheduler(manager, registry, metrics, config.queue);
|
| 27 |
|
| 28 |
std::string init_error;
|
| 29 |
if (auth.enabled()) {
|
|
|
|
| 31 |
} else {
|
| 32 |
log_line("auth: disabled (no configured api keys)");
|
| 33 |
}
|
| 34 |
+
|
| 35 |
log_line("startup: loading default model");
|
| 36 |
if (!manager.initialize_default(init_error)) {
|
| 37 |
log_line("startup: default model failed: " + init_error);
|
|
|
|
| 40 |
}
|
| 41 |
|
| 42 |
asio::io_context ioc{1};
|
| 43 |
+
asio::ip::tcp::acceptor acceptor{
|
| 44 |
+
ioc,
|
| 45 |
+
{asio::ip::make_address(bind_host), static_cast<unsigned short>(bind_port)}};
|
| 46 |
log_line("manager listening on " + bind_host + ":" + std::to_string(bind_port));
|
| 47 |
|
| 48 |
for (;;) {
|
|
|
|
| 56 |
std::cref(auth),
|
| 57 |
std::ref(rate_limiter),
|
| 58 |
std::ref(registry),
|
| 59 |
+
std::ref(metrics),
|
| 60 |
std::ref(scheduler)).detach();
|
| 61 |
}
|
| 62 |
}
|
cpp/llm_manager_types.cpp
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "llm_manager_types.h"
|
| 2 |
+
|
| 3 |
+
// Human-readable label for a Role; used in JSON status payloads.
std::string role_to_string(Role role) {
  switch (role) {
    case Role::ADMIN:
      return "admin";
    default:
      return "user";
  }
}
|
| 6 |
+
|
| 7 |
+
// Map an authenticated role onto its scheduling priority class.
Priority role_to_priority(Role role) {
  if (role == Role::ADMIN) {
    return Priority::ADMIN;
  }
  return Priority::USER;
}
|
| 10 |
+
|
| 11 |
+
// Human-readable label for a RequestState; "unknown" is the fallback
// for any value outside the enumerated set.
std::string state_to_string(RequestState state) {
  if (state == RequestState::QUEUED) return "queued";
  if (state == RequestState::RUNNING) return "running";
  if (state == RequestState::CANCELLED) return "cancelled";
  if (state == RequestState::FAILED) return "failed";
  if (state == RequestState::DONE) return "done";
  return "unknown";
}
|
cpp/llm_manager_types.h
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include <nlohmann/json.hpp>

#include <sys/types.h>

using json = nlohmann::json;

// Listen address for the manager's own control-plane HTTP server.
struct ServerConfig {
  std::string host = "0.0.0.0";
  int port = 7860;
};

// How llama-server worker processes are launched and reached.
struct WorkerConfig {
  std::string default_model = "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m";  // model loaded at startup
  std::string llama_server_bin = "/usr/local/bin/llama-server";
  std::string host = "127.0.0.1";      // address the manager dials workers on
  std::string bind_host = "0.0.0.0";   // address workers bind to
  int base_port = 8080;                // first worker port; later workers count upward
  int switch_timeout_sec = 300;        // how long a new worker may take to become ready
};

// llama.cpp runtime knobs forwarded to llama-server's command line.
struct LlamaConfig {
  int n_ctx = 8192;  // context window size
  int threads = 4;
  int ngl = 0;       // GPU layers offloaded (0 = CPU only)
  int batch = 128;
  int ubatch = 64;
};

// Caller classes recognised by auth; admins map to higher priority.
enum class Role {
  ADMIN,
  USER
};

// Where/how API credentials arrive on incoming requests.
struct AuthConfig {
  std::string header = "Authorization";
  std::string scheme = "Bearer";
};

// Per-request token and time limits.
struct LimitsConfig {
  int default_max_tokens = 256;       // used when the request omits max_tokens
  int max_tokens_per_request = 2048;  // cap on estimated prompt + completion
  int request_timeout_sec = 30;
};

// Admission limits for the scheduler queue.
struct QueueConfig {
  size_t max_size = 100;
  int max_tokens = 20000;   // aggregate estimated tokens allowed in the queue
  int admin_quota = 3;
  int retry_after_sec = 5;  // Retry-After hint on queue rejection
};

// Per-API-key token-bucket limits; a value <= 0 disables that bucket.
struct RateLimitConfig {
  int requests_per_minute = 60;
  int estimated_tokens_per_minute = 6000;
};

struct SchedulerConfig {
  int max_concurrent = 1;  // simultaneous requests forwarded to the worker
};

struct StreamingConfig {
  bool enabled = false;
};

// One configured API credential.
struct ApiKeyRecord {
  std::string key_id;
  std::string secret;
  Role role = Role::USER;
  bool enabled = true;
};

// Aggregate of every configuration section (built by load_manager_config).
struct ManagerConfig {
  ServerConfig server;
  WorkerConfig worker;
  LlamaConfig llama;
  AuthConfig auth;
  LimitsConfig limits;
  QueueConfig queue;
  RateLimitConfig rate_limit;
  SchedulerConfig scheduler;
  StreamingConfig streaming;
  std::vector<ApiKeyRecord> api_keys;
};

// Identity of a spawned llama-server child process.
struct WorkerInfo {
  std::string model;
  int port = 0;
  pid_t pid = -1;
  std::string last_loaded;  // timestamp string set when the worker became ready
};

// Queue ordering class; lower numeric value drains first.
enum class Priority {
  ADMIN = 0,
  USER = 1
};

// Lifecycle states of a proxied inference request.
enum class RequestState {
  QUEUED,
  RUNNING,
  CANCELLED,
  FAILED,
  DONE
};

// Rough token accounting used for admission control (heuristic, not an
// exact tokenizer count).
struct TokenEstimate {
  int prompt_tokens = 0;
  int requested_max_tokens = 0;
  int estimated_total_tokens = 0;  // prompt_tokens + requested_max_tokens
};

// Outcome of a rate-limiter check; when !allowed, carries a retry hint.
struct RateLimitDecision {
  bool allowed = true;
  int retry_after_sec = 0;
  std::string error;
};

// Point-in-time view of the scheduler queue for status endpoints.
struct QueueSnapshot {
  size_t total_size = 0;
  size_t admin_size = 0;
  size_t user_size = 0;
  int total_tokens = 0;
};

// Final HTTP payload handed back to the waiting client connection.
struct RequestResult {
  int status = 500;
  std::string body;
  std::string content_type = "application/json";
};

// Shared mutable state for one in-flight request.
// `state` and `cancelled` are atomics readable without the lock;
// `completed` and `result` are protected by `mu` and signalled via `cv`.
struct RequestContext {
  std::string request_id;
  std::string api_key_id;
  Role role = Role::USER;
  Priority priority = Priority::USER;
  TokenEstimate estimate;
  std::string request_body;
  std::atomic<RequestState> state{RequestState::QUEUED};
  std::atomic<bool> cancelled{false};
  std::chrono::steady_clock::time_point created_at{std::chrono::steady_clock::now()};
  std::chrono::steady_clock::time_point enqueue_time{created_at};
  std::chrono::steady_clock::time_point start_time{};
  std::mutex mu;
  std::condition_variable cv;
  bool completed = false;
  RequestResult result;
};

// Response relayed from a GET against the worker (health/metrics style).
struct ProxiedGetResult {
  int status = 500;
  std::string body;
  std::string content_type = "text/plain; charset=utf-8";
  std::string content_encoding;
};

std::string role_to_string(Role role);
Priority role_to_priority(Role role);
std::string state_to_string(RequestState state);
|
cpp/model_manager.cpp
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "model_manager.h"
|
| 2 |
+
|
| 3 |
+
#include <boost/asio.hpp>
|
| 4 |
+
#include <boost/beast/core.hpp>
|
| 5 |
+
#include <boost/beast/http.hpp>
|
| 6 |
+
|
| 7 |
+
#include "http_helpers.h"
|
| 8 |
+
|
| 9 |
+
#include <csignal>
|
| 10 |
+
#include <thread>
|
| 11 |
+
#include <unistd.h>
|
| 12 |
+
#include <sys/wait.h>
|
| 13 |
+
|
| 14 |
+
namespace asio = boost::asio;
|
| 15 |
+
namespace beast = boost::beast;
|
| 16 |
+
namespace http = beast::http;
|
| 17 |
+
|
| 18 |
+
// True when `pid` names a process we can signal.  kill(pid, 0) probes
// existence without delivering a signal.
// NOTE(review): a live process owned by another user reports EPERM and
// would be treated as dead here — acceptable for self-spawned workers.
bool is_alive(pid_t pid) {
  if (pid <= 0) {
    return false;
  }
  return kill(pid, 0) == 0;
}
|
| 22 |
+
|
| 23 |
+
// Stop a worker process: send SIGTERM, poll for exit for up to
// `wait_seconds`, then escalate to SIGKILL and reap the child.
void shutdown_worker(pid_t pid, int wait_seconds) {
  if (pid <= 0) return;
  kill(pid, SIGTERM);
  const auto give_up_at =
      std::chrono::steady_clock::now() + std::chrono::seconds(wait_seconds);
  while (std::chrono::steady_clock::now() < give_up_at) {
    int status = 0;
    if (waitpid(pid, &status, WNOHANG) == pid) {
      return;  // child exited and has been reaped
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(200));
  }
  // Graceful window expired: force-kill and reap synchronously.
  kill(pid, SIGKILL);
  int status = 0;
  waitpid(pid, &status, 0);
}
|
| 37 |
+
|
| 38 |
+
// Copy the immutable worker/llama launch settings out of the config.
// The port allocator starts at worker.base_port and only counts upward.
ModelManager::ModelManager(const ManagerConfig &config)
    : default_model_(config.worker.default_model),
      llama_server_bin_(config.worker.llama_server_bin),
      worker_host_(config.worker.host),
      worker_bind_host_(config.worker.bind_host),
      base_port_(config.worker.base_port),
      switch_timeout_sec_(config.worker.switch_timeout_sec),
      n_ctx_(config.llama.n_ctx),
      n_threads_(config.llama.threads),
      n_gpu_layers_(config.llama.ngl),
      n_batch_(config.llama.batch),
      n_ubatch_(config.llama.ubatch),
      next_port_(base_port_) {}
|
| 51 |
+
|
| 52 |
+
// Load the configured default model at startup by delegating to the
// regular switch path.  Returns false with `error` set on failure.
bool ModelManager::initialize_default(std::string &error) {
  return switch_model(default_model_, error);
}
|
| 55 |
+
|
| 56 |
+
bool ModelManager::switch_model(const std::string &model, std::string &error) {
|
| 57 |
+
{
|
| 58 |
+
std::lock_guard<std::mutex> lock(mu_);
|
| 59 |
+
if (switch_in_progress_) {
|
| 60 |
+
error = "Switch already in progress";
|
| 61 |
+
return false;
|
| 62 |
+
}
|
| 63 |
+
if (active_ && active_->model == model && is_alive(active_->pid)) {
|
| 64 |
+
return true;
|
| 65 |
+
}
|
| 66 |
+
switch_in_progress_ = true;
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
std::optional<WorkerInfo> old_worker;
|
| 70 |
+
{
|
| 71 |
+
std::lock_guard<std::mutex> lock(mu_);
|
| 72 |
+
if (active_) old_worker = active_;
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
int port = allocate_port();
|
| 76 |
+
pid_t pid = spawn_worker(model, port);
|
| 77 |
+
if (pid <= 0) {
|
| 78 |
+
finish_switch(false);
|
| 79 |
+
error = "Failed to start worker process";
|
| 80 |
+
return false;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
if (!wait_until_ready(pid, port, switch_timeout_sec_)) {
|
| 84 |
+
shutdown_worker(pid);
|
| 85 |
+
finish_switch(false);
|
| 86 |
+
error = "New model did not become ready in time";
|
| 87 |
+
return false;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
WorkerInfo new_worker{model, port, pid, now_utc_iso()};
|
| 91 |
+
{
|
| 92 |
+
std::lock_guard<std::mutex> lock(mu_);
|
| 93 |
+
active_ = new_worker;
|
| 94 |
+
switch_in_progress_ = false;
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
if (old_worker && old_worker->pid != pid) {
|
| 98 |
+
shutdown_worker(old_worker->pid);
|
| 99 |
+
}
|
| 100 |
+
return true;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
// Restart the currently active model in a fresh process.
// Unlike switch_model, the old worker is killed BEFORE the replacement
// is spawned, so there is a serving gap for the duration of the reload.
// On any failure `active_` is cleared (no worker remains) and `error`
// describes the reason.
bool ModelManager::restart_active(std::string &error) {
  std::optional<WorkerInfo> old_worker;
  std::string model;
  {
    std::lock_guard<std::mutex> lock(mu_);
    if (switch_in_progress_) {
      error = "Switch already in progress";
      return false;
    }
    if (!active_ || !is_alive(active_->pid)) {
      error = "No active model";
      return false;
    }
    // Claim the switch flag and snapshot what we need under one lock.
    switch_in_progress_ = true;
    old_worker = active_;
    model = active_->model;
  }

  // Kill first, then respawn: frees the old worker's memory before the
  // replacement loads the model.
  shutdown_worker(old_worker->pid);

  int port = allocate_port();
  pid_t pid = spawn_worker(model, port);
  if (pid <= 0) {
    std::lock_guard<std::mutex> lock(mu_);
    active_ = std::nullopt;
    switch_in_progress_ = false;
    error = "Failed to start worker process";
    return false;
  }

  if (!wait_until_ready(pid, port, switch_timeout_sec_)) {
    shutdown_worker(pid);
    std::lock_guard<std::mutex> lock(mu_);
    active_ = std::nullopt;
    switch_in_progress_ = false;
    error = "New model did not become ready in time";
    return false;
  }

  WorkerInfo new_worker{model, port, pid, now_utc_iso()};
  {
    std::lock_guard<std::mutex> lock(mu_);
    active_ = new_worker;
    switch_in_progress_ = false;
  }
  return true;
}
|
| 150 |
+
|
| 151 |
+
std::optional<WorkerInfo> ModelManager::active_worker() {
|
| 152 |
+
std::lock_guard<std::mutex> lock(mu_);
|
| 153 |
+
if (active_ && is_alive(active_->pid)) return active_;
|
| 154 |
+
return std::nullopt;
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
// Build the status document for the models endpoint.
// Liveness is evaluated exactly once so "status" and the detail fields
// cannot disagree: the original probed is_alive() up to three times,
// and a worker dying between probes could yield status "ready" with
// null detail fields.
json ModelManager::models_view() {
  std::lock_guard<std::mutex> lock(mu_);
  const bool ready = active_.has_value() && is_alive(active_->pid);

  json out;
  out["status"] = ready ? "ready" : "no_active_model";
  out["switch_in_progress"] = switch_in_progress_;
  if (ready) {
    out["current_model"] = active_->model;
    out["last_loaded"] = active_->last_loaded;
    out["active_pid"] = active_->pid;
    out["active_port"] = active_->port;
  } else {
    out["current_model"] = nullptr;
    out["last_loaded"] = nullptr;
    out["active_pid"] = nullptr;
    out["active_port"] = nullptr;
  }
  return out;
}
|
| 175 |
+
|
| 176 |
+
// Hand out the next worker port.  Ports count up from base_port_ and
// are never reused; fine for a long-lived, low-churn manager.
int ModelManager::allocate_port() {
  std::lock_guard<std::mutex> lock(mu_);
  return next_port_++;
}
|
| 180 |
+
|
| 181 |
+
// Clear the switch flag after a FAILED attempt.  On success the caller
// clears the flag itself while installing the new worker under the same
// lock, so the `ok == true` branch intentionally does nothing here.
void ModelManager::finish_switch(bool ok) {
  std::lock_guard<std::mutex> lock(mu_);
  if (!ok) switch_in_progress_ = false;
}
|
| 185 |
+
|
| 186 |
+
// Fork/exec a llama-server child serving `model` on `port`.
// Returns the child's pid, or -1 if fork() failed.  The child calls
// setsid() so it gets its own session and is detached from the
// manager's controlling terminal; if execvp fails it exits with 127
// (the conventional "command not found" status).
// NOTE(review): std::to_string/vector allocate between fork() and
// exec(), which is not strictly async-signal-safe in a multithreaded
// parent — confirm this is acceptable for this process model.
pid_t ModelManager::spawn_worker(const std::string &model, int port) {
  pid_t pid = fork();
  if (pid < 0) return -1;
  if (pid == 0) {
    // --- child process ---
    setsid();
    std::string port_s = std::to_string(port);
    std::string n_ctx_s = std::to_string(n_ctx_);
    std::string threads_s = std::to_string(n_threads_);
    std::string ngl_s = std::to_string(n_gpu_layers_);
    std::string batch_s = std::to_string(n_batch_);
    std::string ubatch_s = std::to_string(n_ubatch_);

    // llama-server command line: model via HF reference, bind address,
    // context/thread/GPU-layer/batch sizes, continuous batching on.
    std::vector<std::string> args = {
        llama_server_bin_,
        "-hf", model,
        "--host", worker_bind_host_,
        "--port", port_s,
        "-c", n_ctx_s,
        "-t", threads_s,
        "-ngl", ngl_s,
        "--cont-batching",
        "-b", batch_s,
        "--ubatch-size", ubatch_s
    };

    // execvp wants a NULL-terminated char* array; const_cast is the
    // standard idiom since execvp does not modify its arguments.
    std::vector<char *> argv;
    argv.reserve(args.size() + 1);
    for (auto &s : args) argv.push_back(const_cast<char *>(s.c_str()));
    argv.push_back(nullptr);
    execvp(argv[0], argv.data());
    _exit(127);
  }
  return pid;
}
|
| 220 |
+
|
| 221 |
+
bool ModelManager::wait_until_ready(pid_t pid, int port, int timeout_sec) {
|
| 222 |
+
const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(timeout_sec);
|
| 223 |
+
while (std::chrono::steady_clock::now() < deadline) {
|
| 224 |
+
if (!is_alive(pid)) return false;
|
| 225 |
+
try {
|
| 226 |
+
auto [status, _] = http_get(port, "/");
|
| 227 |
+
if (status == 200) return true;
|
| 228 |
+
} catch (...) {
|
| 229 |
+
}
|
| 230 |
+
std::this_thread::sleep_for(std::chrono::milliseconds(800));
|
| 231 |
+
}
|
| 232 |
+
return false;
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
// Blocking HTTP/1.1 GET against the worker at worker_host_:port.
// Returns {status code, body}.  Throws (boost::system::system_error)
// on resolve/connect/IO failure; callers such as wait_until_ready
// catch and retry.
// NOTE(review): no connect/read timeout is set on the stream — a hung
// worker could stall the caller indefinitely; consider
// beast::tcp_stream::expires_after.
std::pair<int, std::string> ModelManager::http_get(int port, const std::string &target) {
  asio::io_context ioc;
  asio::ip::tcp::resolver resolver(ioc);
  beast::tcp_stream stream(ioc);
  auto const results = resolver.resolve(worker_host_, std::to_string(port));
  stream.connect(results);

  http::request<http::string_body> req{http::verb::get, target, 11};
  req.set(http::field::host, worker_host_);
  req.set(http::field::user_agent, "llm-manager");
  http::write(stream, req);

  beast::flat_buffer buffer;
  http::response<http::string_body> res;
  http::read(stream, buffer, res);
  // Best-effort shutdown: the error code is ignored on purpose since we
  // already have the full response.
  beast::error_code ec;
  stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
  return {res.result_int(), res.body()};
}
|
cpp/model_manager.h
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once

#include <mutex>
#include <optional>
#include <string>
#include <utility>

#include "llm_manager_types.h"

// Owns the llama-server worker process: spawning, readiness probing,
// model switching and restart.  Public methods synchronise on the
// internal mutex; at most one switch/restart runs at a time.
class ModelManager {
public:
  explicit ModelManager(const ManagerConfig &config);

  // Load the configured default model; false with `error` set on failure.
  bool initialize_default(std::string &error);
  // Replace the active worker with one serving `model` (no-op if the
  // same model is already live).
  bool switch_model(const std::string &model, std::string &error);
  // Restart the current model in a fresh process.
  bool restart_active(std::string &error);
  // Snapshot of the live worker, if any.
  std::optional<WorkerInfo> active_worker();
  // JSON status document for the models endpoint.
  json models_view();

private:
  std::mutex mu_;  // guards active_, switch_in_progress_, next_port_
  std::optional<WorkerInfo> active_;
  bool switch_in_progress_ = false;

  std::string default_model_;
  std::string llama_server_bin_;
  std::string worker_host_;       // address the manager dials workers on
  std::string worker_bind_host_;  // address workers bind to
  int base_port_;
  int switch_timeout_sec_;
  int n_ctx_;
  int n_threads_;
  int n_gpu_layers_;
  int n_batch_;
  int n_ubatch_;
  int next_port_;  // monotonically increasing port allocator

  int allocate_port();
  void finish_switch(bool ok);
  pid_t spawn_worker(const std::string &model, int port);
  bool wait_until_ready(pid_t pid, int port, int timeout_sec);
  std::pair<int, std::string> http_get(int port, const std::string &target);
};

// True when `pid` names a signalable process (kill(pid, 0) probe).
bool is_alive(pid_t pid);
// SIGTERM, poll up to `wait_seconds`, then SIGKILL and reap.
void shutdown_worker(pid_t pid, int wait_seconds = 15);
|
cpp/request_parsing.cpp
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "request_parsing.h"
|
| 2 |
+
|
| 3 |
+
#include <algorithm>
|
| 4 |
+
#include <sstream>
|
| 5 |
+
|
| 6 |
+
// Cheap ~4-characters-per-token heuristic: empty text costs zero, any
// non-empty text costs at least one token.
static int estimate_text_tokens_rough(const std::string &text) {
  if (text.empty()) {
    return 0;
  }
  const auto approx = (text.size() + 3) / 4;  // ceil(len / 4)
  return std::max(1, static_cast<int>(approx));
}
|
| 10 |
+
|
| 11 |
+
static std::string flatten_json_content(const json &content) {
|
| 12 |
+
if (content.is_string()) {
|
| 13 |
+
return content.get<std::string>();
|
| 14 |
+
}
|
| 15 |
+
if (content.is_array()) {
|
| 16 |
+
std::ostringstream oss;
|
| 17 |
+
bool first = true;
|
| 18 |
+
for (const auto &item : content) {
|
| 19 |
+
std::string part;
|
| 20 |
+
if (item.is_string()) {
|
| 21 |
+
part = item.get<std::string>();
|
| 22 |
+
} else if (item.is_object() && item.contains("text") && item["text"].is_string()) {
|
| 23 |
+
part = item["text"].get<std::string>();
|
| 24 |
+
}
|
| 25 |
+
if (part.empty()) continue;
|
| 26 |
+
if (!first) oss << '\n';
|
| 27 |
+
oss << part;
|
| 28 |
+
first = false;
|
| 29 |
+
}
|
| 30 |
+
return oss.str();
|
| 31 |
+
}
|
| 32 |
+
return "";
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
// Validate an OpenAI-style chat payload and build a rough token
// estimate for admission control.  Returns nullopt with `error` set on
// malformed input or when the estimate exceeds the configured cap.
// The order of checks is part of the contract (it determines which
// error message the client sees first).
std::optional<TokenEstimate> estimate_chat_tokens(
    const json &payload,
    const LimitsConfig &limits,
    std::string &error) {
  if (!payload.is_object()) {
    error = "Expected JSON object";
    return std::nullopt;
  }
  if (!payload.contains("messages") || !payload["messages"].is_array()) {
    error = "Expected 'messages' array";
    return std::nullopt;
  }

  TokenEstimate estimate;
  // max_tokens falls back to the configured default when omitted.
  estimate.requested_max_tokens = limits.default_max_tokens;
  if (payload.contains("max_tokens")) {
    if (!payload["max_tokens"].is_number_integer()) {
      error = "Expected integer 'max_tokens'";
      return std::nullopt;
    }
    estimate.requested_max_tokens = payload["max_tokens"].get<int>();
  }

  if (estimate.requested_max_tokens <= 0) {
    error = "'max_tokens' must be > 0";
    return std::nullopt;
  }

  // Rough per-message accounting: role text + flattened content text,
  // plus a flat overhead of 4 tokens per message.
  for (const auto &message : payload["messages"]) {
    if (!message.is_object()) continue;
    if (message.contains("role") && message["role"].is_string()) {
      estimate.prompt_tokens += estimate_text_tokens_rough(message["role"].get<std::string>());
    }
    if (message.contains("content")) {
      estimate.prompt_tokens += estimate_text_tokens_rough(flatten_json_content(message["content"]));
    }
    estimate.prompt_tokens += 4;
  }

  estimate.estimated_total_tokens = estimate.prompt_tokens + estimate.requested_max_tokens;
  if (estimate.estimated_total_tokens > limits.max_tokens_per_request) {
    error = "Estimated request tokens exceed configured limit";
    return std::nullopt;
  }

  error.clear();
  return estimate;
}
|
| 83 |
+
|
| 84 |
+
bool request_stream_enabled(const json &payload) {
|
| 85 |
+
if (!payload.is_object() || !payload.contains("stream")) return false;
|
| 86 |
+
return payload["stream"].is_boolean() && payload["stream"].get<bool>();
|
| 87 |
+
}
|
cpp/request_parsing.h
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once

#include <optional>
#include <string>

#include "llm_manager_types.h"

// Validate an OpenAI-style chat payload and produce a rough token
// estimate (heuristic, not an exact tokenizer count).  Returns nullopt
// and sets `error` when the payload is malformed or exceeds the limits.
std::optional<TokenEstimate> estimate_chat_tokens(
    const json &payload,
    const LimitsConfig &limits,
    std::string &error);

// True only when the payload explicitly contains `"stream": true`.
bool request_stream_enabled(const json &payload);
|
cpp/runtime_components.cpp
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "runtime_components.h"
|
| 2 |
+
|
| 3 |
+
#include "config.h"
|
| 4 |
+
#include "http_helpers.h"
|
| 5 |
+
#include "model_manager.h"
|
| 6 |
+
|
| 7 |
+
#include <algorithm>
|
| 8 |
+
#include <sstream>
|
| 9 |
+
|
| 10 |
+
// Negative configured limits are clamped to 0, which disables the
// corresponding bucket in allow().
RateLimiterStore::RateLimiterStore(const RateLimitConfig &config)
    : requests_per_minute_(std::max(0, config.requests_per_minute)),
      estimated_tokens_per_minute_(std::max(0, config.estimated_tokens_per_minute)) {}
|
| 13 |
+
|
| 14 |
+
// Check and debit the caller's per-key token buckets.
// Limits that are <= 0 are skipped; if both are disabled this is a
// no-op that always allows.  Buckets are debited only when BOTH checks
// pass, so a rejected request consumes nothing.
// NOTE(review): retry_after_sec is hard-coded to 1 rather than derived
// from the actual bucket deficit — confirm that's the intended hint.
RateLimitDecision RateLimiterStore::allow(const std::string &api_key_id, int estimated_tokens) {
  if (requests_per_minute_ <= 0 && estimated_tokens_per_minute_ <= 0) return {};

  std::lock_guard<std::mutex> lock(mu_);
  auto &bucket = buckets_[api_key_id];  // creates a fresh bucket on first sight of the key
  const auto now = std::chrono::steady_clock::now();
  refill(bucket.request_tokens, bucket.last_request_refill, requests_per_minute_, now);
  refill(bucket.estimated_tokens, bucket.last_estimated_refill, estimated_tokens_per_minute_, now);

  if (requests_per_minute_ > 0 && bucket.request_tokens < 1.0) {
    return {false, 1, "Rate limit exceeded: requests"};
  }
  if (estimated_tokens_per_minute_ > 0 && bucket.estimated_tokens < estimated_tokens) {
    return {false, 1, "Rate limit exceeded: estimated tokens"};
  }

  // Both checks passed: debit each enabled bucket.
  if (requests_per_minute_ > 0) bucket.request_tokens -= 1.0;
  if (estimated_tokens_per_minute_ > 0) bucket.estimated_tokens -= estimated_tokens;
  return {};
}
|
| 34 |
+
|
| 35 |
+
// Token-bucket refill helper (callers hold mu_).
// A default-constructed time_point (epoch count 0) marks an untouched
// bucket: it is primed to the full per-minute allowance on first use.
// Otherwise tokens accrue linearly with elapsed milliseconds, capped at
// the per-minute limit.
void RateLimiterStore::refill(
    double &tokens,
    std::chrono::steady_clock::time_point &last_refill,
    int limit_per_minute,
    std::chrono::steady_clock::time_point now) {
  if (limit_per_minute <= 0) return;
  if (last_refill.time_since_epoch().count() == 0) {
    tokens = limit_per_minute;
    last_refill = now;
    return;
  }
  const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - last_refill).count();
  if (elapsed <= 0) return;
  // limit_per_minute tokens per 60000 ms, pro-rated to elapsed time.
  const double refill_amount = (static_cast<double>(limit_per_minute) * elapsed) / 60000.0;
  tokens = std::min(static_cast<double>(limit_per_minute), tokens + refill_amount);
  last_refill = now;
}
| 52 |
+
|
| 53 |
+
std::shared_ptr<RequestContext> RequestRegistry::create(
|
| 54 |
+
const std::string &request_id,
|
| 55 |
+
const ApiKeyRecord &principal,
|
| 56 |
+
const TokenEstimate &estimate,
|
| 57 |
+
const std::string &request_body) {
|
| 58 |
+
auto ctx = std::make_shared<RequestContext>();
|
| 59 |
+
ctx->request_id = request_id;
|
| 60 |
+
ctx->api_key_id = principal.key_id;
|
| 61 |
+
ctx->role = principal.role;
|
| 62 |
+
ctx->priority = role_to_priority(principal.role);
|
| 63 |
+
ctx->estimate = estimate;
|
| 64 |
+
ctx->request_body = request_body;
|
| 65 |
+
ctx->created_at = std::chrono::steady_clock::now();
|
| 66 |
+
ctx->enqueue_time = ctx->created_at;
|
| 67 |
+
|
| 68 |
+
std::lock_guard<std::mutex> lock(mu_);
|
| 69 |
+
requests_[request_id] = ctx;
|
| 70 |
+
return ctx;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
std::shared_ptr<RequestContext> RequestRegistry::find(const std::string &request_id) const {
|
| 74 |
+
std::lock_guard<std::mutex> lock(mu_);
|
| 75 |
+
const auto it = requests_.find(request_id);
|
| 76 |
+
if (it == requests_.end()) return nullptr;
|
| 77 |
+
return it->second;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
// Publish the final result for a request and wake any thread blocked on
// ctx->cv.  The state/result/completed triple is written under ctx->mu
// so waiters never observe a half-finished result.
void RequestRegistry::complete(const std::shared_ptr<RequestContext> &ctx, RequestState state, RequestResult result) {
  {
    std::lock_guard<std::mutex> lock(ctx->mu);
    ctx->state.store(state);
    ctx->result = std::move(result);
    ctx->completed = true;
  }
  ctx->cv.notify_all();
}
|
| 89 |
+
|
| 90 |
+
// Mark a single request cancelled.
// QUEUED requests are completed immediately with status 499 (client
// closed request); RUNNING requests only have their state flipped —
// the executing thread is expected to observe `cancelled` and finish
// on its own.  Returns nullptr when the id is unknown.
std::shared_ptr<RequestContext> RequestRegistry::cancel_request(const std::string &request_id) {
  auto ctx = find(request_id);
  if (!ctx) return nullptr;

  ctx->cancelled.store(true);
  const auto state = ctx->state.load();
  if (state == RequestState::QUEUED) {
    complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
  } else if (state == RequestState::RUNNING) {
    ctx->state.store(RequestState::CANCELLED);
  }
  return ctx;
}
|
| 103 |
+
|
| 104 |
+
std::vector<std::shared_ptr<RequestContext>> RequestRegistry::cancel_all() {
|
| 105 |
+
std::vector<std::shared_ptr<RequestContext>> out;
|
| 106 |
+
std::lock_guard<std::mutex> lock(mu_);
|
| 107 |
+
out.reserve(requests_.size());
|
| 108 |
+
for (auto &[_, ctx] : requests_) {
|
| 109 |
+
ctx->cancelled.store(true);
|
| 110 |
+
const auto state = ctx->state.load();
|
| 111 |
+
if (state == RequestState::QUEUED) {
|
| 112 |
+
{
|
| 113 |
+
std::lock_guard<std::mutex> ctx_lock(ctx->mu);
|
| 114 |
+
ctx->state.store(RequestState::CANCELLED);
|
| 115 |
+
ctx->result = {499, R"({"error":"Request cancelled"})"};
|
| 116 |
+
ctx->completed = true;
|
| 117 |
+
}
|
| 118 |
+
ctx->cv.notify_all();
|
| 119 |
+
} else if (state == RequestState::RUNNING) {
|
| 120 |
+
ctx->state.store(RequestState::CANCELLED);
|
| 121 |
+
}
|
| 122 |
+
out.push_back(ctx);
|
| 123 |
+
}
|
| 124 |
+
return out;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
// Thin atomic-counter mutators; all are lock-free and safe from any thread.
void MetricsRegistry::inc_requests_total() { requests_total_.fetch_add(1); }
void MetricsRegistry::inc_requests_inflight() { requests_inflight_.fetch_add(1); }
void MetricsRegistry::dec_requests_inflight() { requests_inflight_.fetch_sub(1); }
void MetricsRegistry::inc_queue_rejected_total() { queue_rejected_total_.fetch_add(1); }
void MetricsRegistry::inc_rate_limited_total() { rate_limited_total_.fetch_add(1); }
// `delta` defaults to 1 (see header); bulk cancellations pass the batch size.
void MetricsRegistry::add_cancellations_total(uint64_t delta) { cancellations_total_.fetch_add(delta); }
void MetricsRegistry::inc_switch_total() { switch_total_.fetch_add(1); }
void MetricsRegistry::inc_worker_restarts_total() { worker_restarts_total_.fetch_add(1); }
|
| 135 |
+
|
| 136 |
+
// Records one end-to-end request latency sample as a (sum, count) pair;
// consumers derive the average as total / samples.
void MetricsRegistry::observe_request_latency_ms(int64_t value) {
    request_latency_ms_total_.fetch_add(value);
    request_latency_samples_.fetch_add(1);
}

// Records one queue-wait sample (enqueue -> start of execution) the same way.
void MetricsRegistry::observe_queue_wait_ms(int64_t value) {
    queue_wait_ms_total_.fetch_add(value);
    queue_wait_samples_.fetch_add(1);
}
|
| 145 |
+
|
| 146 |
+
std::string MetricsRegistry::render_prometheus(const QueueSnapshot &queue, ModelManager &manager) const {
|
| 147 |
+
std::ostringstream oss;
|
| 148 |
+
oss << "llm_manager_requests_total " << requests_total_.load() << '\n';
|
| 149 |
+
oss << "llm_manager_requests_inflight " << requests_inflight_.load() << '\n';
|
| 150 |
+
oss << "llm_manager_request_latency_ms_total " << request_latency_ms_total_.load() << '\n';
|
| 151 |
+
oss << "llm_manager_request_latency_ms_samples " << request_latency_samples_.load() << '\n';
|
| 152 |
+
oss << "llm_manager_queue_size " << queue.total_size << '\n';
|
| 153 |
+
oss << "llm_manager_queue_admin_size " << queue.admin_size << '\n';
|
| 154 |
+
oss << "llm_manager_queue_user_size " << queue.user_size << '\n';
|
| 155 |
+
oss << "llm_manager_queue_tokens " << queue.total_tokens << '\n';
|
| 156 |
+
oss << "llm_manager_queue_rejected_total " << queue_rejected_total_.load() << '\n';
|
| 157 |
+
oss << "llm_manager_rate_limited_total " << rate_limited_total_.load() << '\n';
|
| 158 |
+
oss << "llm_manager_queue_wait_time_ms_total " << queue_wait_ms_total_.load() << '\n';
|
| 159 |
+
oss << "llm_manager_queue_wait_time_ms_samples " << queue_wait_samples_.load() << '\n';
|
| 160 |
+
oss << "llm_manager_cancellations_total " << cancellations_total_.load() << '\n';
|
| 161 |
+
oss << "llm_manager_switch_total " << switch_total_.load() << '\n';
|
| 162 |
+
oss << "llm_manager_worker_restarts_total " << worker_restarts_total_.load() << '\n';
|
| 163 |
+
const auto active = manager.active_worker();
|
| 164 |
+
oss << "llm_manager_active_worker " << (active ? 1 : 0) << '\n';
|
| 165 |
+
return oss.str();
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
// Queue limits come straight from config; the admin fairness quota and the
// Retry-After hint are clamped to at least 1 so a zero/negative config value
// cannot disable fairness or produce an invalid HTTP header.
PrioritySchedulerQueue::PrioritySchedulerQueue(const QueueConfig &config)
    : max_size_(config.max_size),
      max_tokens_(config.max_tokens),
      admin_quota_(std::max(1, config.admin_quota)),
      retry_after_sec_(std::max(1, config.retry_after_sec)) {}
|
| 173 |
+
|
| 174 |
+
bool PrioritySchedulerQueue::try_push(const std::shared_ptr<RequestContext> &ctx) {
|
| 175 |
+
std::lock_guard<std::mutex> lock(mu_);
|
| 176 |
+
if (current_size_ >= max_size_) return false;
|
| 177 |
+
if (current_tokens_ + ctx->estimate.estimated_total_tokens > max_tokens_) return false;
|
| 178 |
+
|
| 179 |
+
if (ctx->priority == Priority::ADMIN) admin_queue_.push_back(ctx);
|
| 180 |
+
else user_queue_.push_back(ctx);
|
| 181 |
+
|
| 182 |
+
++current_size_;
|
| 183 |
+
current_tokens_ += ctx->estimate.estimated_total_tokens;
|
| 184 |
+
cv_.notify_one();
|
| 185 |
+
return true;
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
// Blocks until a request is available (or the queue is stopped) and returns
// the next one to run. Lane selection: admins are preferred until they have
// taken `admin_quota_` consecutive turns while users are waiting, which
// bounds user starvation. Within the chosen lane the request with the
// smallest estimated token total is served first (shortest-job-first).
// Returns nullptr only after stop().
std::shared_ptr<RequestContext> PrioritySchedulerQueue::pop_next() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [&]() { return stopped_ || current_size_ > 0; });
    if (stopped_) return nullptr;

    std::deque<std::shared_ptr<RequestContext>> *selected_queue = nullptr;
    if (!admin_queue_.empty() && (admin_streak_ < admin_quota_ || user_queue_.empty())) {
        selected_queue = &admin_queue_;
        ++admin_streak_;
    } else if (!user_queue_.empty()) {
        selected_queue = &user_queue_;
        admin_streak_ = 0;  // a user turn resets the admin streak
    } else if (!admin_queue_.empty()) {
        // NOTE(review): this branch looks unreachable — if user_queue_ is
        // empty and admin_queue_ is not, the first condition already fires.
        // Kept as a defensive fallback.
        selected_queue = &admin_queue_;
        admin_streak_ = 1;
    }

    // Defensive: cv predicate guarantees current_size_ > 0 here.
    if (!selected_queue || selected_queue->empty()) return nullptr;

    // Shortest-job-first within the lane: O(n) scan over the deque.
    auto best_it = std::min_element(
        selected_queue->begin(),
        selected_queue->end(),
        [](const auto &a, const auto &b) {
            return a->estimate.estimated_total_tokens < b->estimate.estimated_total_tokens;
        });
    auto ctx = *best_it;
    selected_queue->erase(best_it);
    --current_size_;
    current_tokens_ -= ctx->estimate.estimated_total_tokens;
    return ctx;
}
|
| 219 |
+
|
| 220 |
+
// Signals shutdown: wakes the scheduler worker so pop_next() can return
// nullptr. Queued contexts are not drained here.
void PrioritySchedulerQueue::stop() {
    std::lock_guard<std::mutex> lock(mu_);
    stopped_ = true;
    cv_.notify_all();
}

// Retry-After hint (seconds) for rejected enqueues; clamped >= 1 in the ctor.
int PrioritySchedulerQueue::retry_after_sec() const {
    return retry_after_sec_;
}

// Consistent point-in-time view of queue depths and token load, for metrics.
QueueSnapshot PrioritySchedulerQueue::snapshot() const {
    std::lock_guard<std::mutex> lock(mu_);
    return QueueSnapshot{current_size_, admin_queue_.size(), user_queue_.size(), current_tokens_};
}
|
| 234 |
+
|
| 235 |
+
// Wires the scheduler to its collaborators and starts the single worker
// thread. The thread is launched last, after queue_ is constructed, so
// worker_loop() never observes a partially built object.
Scheduler::Scheduler(
    ModelManager &manager,
    RequestRegistry &registry,
    MetricsRegistry &metrics,
    const QueueConfig &queue_config)
    : manager_(manager), registry_(registry), metrics_(metrics), queue_(queue_config) {
    worker_ = std::thread([this]() { worker_loop(); });
}

// Orderly shutdown: stop() wakes the worker out of pop_next(), then join.
Scheduler::~Scheduler() {
    queue_.stop();
    if (worker_.joinable()) worker_.join();
}

// Forwards admission to the priority queue; false means "queue full" (503).
bool Scheduler::try_enqueue(const std::shared_ptr<RequestContext> &ctx) {
    return queue_.try_push(ctx);
}

// Retry-After hint, delegated to the queue configuration.
int Scheduler::retry_after_sec() const {
    return queue_.retry_after_sec();
}

// Point-in-time queue statistics, delegated to the queue.
QueueSnapshot Scheduler::snapshot() const {
    return queue_.snapshot();
}
|
| 260 |
+
|
| 261 |
+
// Single consumer loop: drains the priority queue and executes requests one
// at a time against the active llama.cpp worker. Exits when pop_next()
// returns nullptr (queue stopped).
void Scheduler::worker_loop() {
    for (;;) {
        auto ctx = queue_.pop_next();
        if (!ctx) return;  // shutdown

        // Cancelled while still queued: finish with 499 without running it.
        if (ctx->cancelled.load()) {
            registry_.complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
            continue;
        }

        ctx->state.store(RequestState::RUNNING);
        ctx->start_time = std::chrono::steady_clock::now();
        metrics_.observe_queue_wait_ms(
            std::chrono::duration_cast<std::chrono::milliseconds>(ctx->start_time - ctx->enqueue_time).count());

        // Resolve the worker per request, since the active model can change.
        auto worker = manager_.active_worker();
        if (!worker) {
            registry_.complete(ctx, RequestState::FAILED, {503, R"({"error":"No active model"})"});
            continue;
        }

        try {
            auto [status, body] = forward_chat(*worker, ctx->request_body);
            // Re-check cancellation: a cancel that raced the upstream call
            // wins, and the upstream result is discarded.
            if (ctx->cancelled.load()) {
                registry_.complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
                continue;
            }
            registry_.complete(ctx, RequestState::DONE, {status, body});
        } catch (const std::exception &e) {
            // forward_chat failure surfaces as a 500 with the exception text.
            registry_.complete(ctx, RequestState::FAILED, {500, json({{"error", e.what()}}).dump()});
        }
    }
}
|
| 294 |
+
|
| 295 |
+
// Indexes configured API keys by secret for O(1) authentication lookups.
// Header name and scheme (e.g. "Authorization" / "Bearer") come from config.
ApiKeyAuth::ApiKeyAuth(const ManagerConfig &config)
    : header_name_(config.auth.header), scheme_(config.auth.scheme) {
    for (const auto &record : config.api_keys) {
        records_by_secret_.emplace(record.secret, record);
    }
}

// Auth is active only when at least one key is configured; with no keys,
// authenticate() grants anonymous admin access (open mode).
bool ApiKeyAuth::enabled() const {
    return !records_by_secret_.empty();
}
|
| 305 |
+
|
| 306 |
+
std::optional<ApiKeyRecord> ApiKeyAuth::authenticate(
|
| 307 |
+
const http::request<http::string_body> &req,
|
| 308 |
+
std::string &error) const {
|
| 309 |
+
if (!enabled()) {
|
| 310 |
+
error.clear();
|
| 311 |
+
return ApiKeyRecord{"anonymous", "", Role::ADMIN, true};
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
const auto token = extract_bearer_token(req, error);
|
| 315 |
+
if (!token) return std::nullopt;
|
| 316 |
+
|
| 317 |
+
const auto it = records_by_secret_.find(*token);
|
| 318 |
+
if (it == records_by_secret_.end()) {
|
| 319 |
+
error = "Invalid API key";
|
| 320 |
+
return std::nullopt;
|
| 321 |
+
}
|
| 322 |
+
if (!it->second.enabled) {
|
| 323 |
+
error = "API key disabled";
|
| 324 |
+
return std::nullopt;
|
| 325 |
+
}
|
| 326 |
+
error.clear();
|
| 327 |
+
return it->second;
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
std::optional<std::string> ApiKeyAuth::extract_bearer_token(
|
| 331 |
+
const http::request<http::string_body> &req,
|
| 332 |
+
std::string &error) const {
|
| 333 |
+
const auto header_it = req.find(header_name_);
|
| 334 |
+
if (header_it == req.end()) {
|
| 335 |
+
error = "Missing authorization header";
|
| 336 |
+
return std::nullopt;
|
| 337 |
+
}
|
| 338 |
+
|
| 339 |
+
const std::string value = trim_copy(header_it->value().to_string());
|
| 340 |
+
const std::string prefix = scheme_ + " ";
|
| 341 |
+
if (value.size() <= prefix.size() || value.rfind(prefix, 0) != 0) {
|
| 342 |
+
error = "Invalid authorization scheme";
|
| 343 |
+
return std::nullopt;
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
std::string token = trim_copy(value.substr(prefix.size()));
|
| 347 |
+
if (token.empty()) {
|
| 348 |
+
error = "Missing API key";
|
| 349 |
+
return std::nullopt;
|
| 350 |
+
}
|
| 351 |
+
return token;
|
| 352 |
+
}
|
cpp/runtime_components.h
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <boost/beast/http.hpp>
|
| 4 |
+
|
| 5 |
+
#include <deque>
|
| 6 |
+
#include <memory>
|
| 7 |
+
#include <thread>
|
| 8 |
+
#include <unordered_map>
|
| 9 |
+
|
| 10 |
+
#include "llm_manager_types.h"
|
| 11 |
+
|
| 12 |
+
class ModelManager;
|
| 13 |
+
|
| 14 |
+
namespace http = boost::beast::http;
|
| 15 |
+
|
| 16 |
+
// Per-API-key rate limiter with two continuously refilled token buckets:
// one budget for request count and one for estimated token volume, each
// expressed per minute.
class RateLimiterStore {
public:
    explicit RateLimiterStore(const RateLimitConfig &config);
    // Checks (and on success debits) both buckets for one request.
    // Returns the decision; on rejection it carries an error message and a
    // Retry-After hint — presumably via RateLimitDecision's fields, see
    // llm_manager_types.h.
    RateLimitDecision allow(const std::string &api_key_id, int estimated_tokens);

private:
    // One pair of buckets per API key; fractional balances allow a smooth,
    // elapsed-time-proportional refill rather than a fixed-window reset.
    struct Bucket {
        double request_tokens = 0.0;
        double estimated_tokens = 0.0;
        std::chrono::steady_clock::time_point last_request_refill{};
        std::chrono::steady_clock::time_point last_estimated_refill{};
    };

    std::mutex mu_;  // guards buckets_
    std::unordered_map<std::string, Bucket> buckets_;
    int requests_per_minute_;
    int estimated_tokens_per_minute_;

    // Tops up `tokens` in proportion to the time elapsed since last_refill,
    // capped at limit_per_minute, and advances last_refill to `now`.
    static void refill(
        double &tokens,
        std::chrono::steady_clock::time_point &last_refill,
        int limit_per_minute,
        std::chrono::steady_clock::time_point now);
};
|
| 40 |
+
|
| 41 |
+
// Thread-safe registry of all live requests, keyed by request id.
// Owns the completion protocol: finished requests are finalized via
// complete(), which signals the waiting HTTP handler through the context's
// condition variable.
class RequestRegistry {
public:
    // Builds, initializes and registers a context for a new request.
    std::shared_ptr<RequestContext> create(
        const std::string &request_id,
        const ApiKeyRecord &principal,
        const TokenEstimate &estimate,
        const std::string &request_body);
    // Returns the context for `request_id`, or nullptr if unknown.
    std::shared_ptr<RequestContext> find(const std::string &request_id) const;
    // Stores the terminal state/result and wakes all waiters on the context.
    void complete(const std::shared_ptr<RequestContext> &ctx, RequestState state, RequestResult result);
    // Cancels one request: QUEUED requests finish with 499 immediately;
    // RUNNING ones are flipped to CANCELLED for the worker to finalize.
    std::shared_ptr<RequestContext> cancel_request(const std::string &request_id);
    // Cancels every tracked request and returns the affected contexts.
    std::vector<std::shared_ptr<RequestContext>> cancel_all();

private:
    mutable std::mutex mu_;  // guards requests_
    std::unordered_map<std::string, std::shared_ptr<RequestContext>> requests_;
};
|
| 57 |
+
|
| 58 |
+
// Lock-free counters and (sum, count) latency accumulators for the manager,
// rendered on demand in Prometheus text exposition format.
class MetricsRegistry {
public:
    void inc_requests_total();
    void inc_requests_inflight();
    void dec_requests_inflight();
    void inc_queue_rejected_total();
    void inc_rate_limited_total();
    // Bulk cancellations (e.g. /stop) pass the batch size as `delta`.
    void add_cancellations_total(uint64_t delta = 1);
    void inc_switch_total();
    void inc_worker_restarts_total();
    // Latency histograms are approximated as running (total, samples) pairs.
    void observe_request_latency_ms(int64_t value);
    void observe_queue_wait_ms(int64_t value);
    // Emits one "<name> <value>" line per metric, combining these counters
    // with a queue snapshot and the manager's active-worker flag.
    std::string render_prometheus(const QueueSnapshot &queue, ModelManager &manager) const;

private:
    std::atomic<uint64_t> requests_total_{0};
    // Signed: inc/dec pairs could transiently observe below-zero on misuse.
    std::atomic<int64_t> requests_inflight_{0};
    std::atomic<uint64_t> request_latency_ms_total_{0};
    std::atomic<uint64_t> request_latency_samples_{0};
    std::atomic<uint64_t> queue_rejected_total_{0};
    std::atomic<uint64_t> rate_limited_total_{0};
    std::atomic<uint64_t> queue_wait_ms_total_{0};
    std::atomic<uint64_t> queue_wait_samples_{0};
    std::atomic<uint64_t> cancellations_total_{0};
    std::atomic<uint64_t> switch_total_{0};
    std::atomic<uint64_t> worker_restarts_total_{0};
};
|
| 85 |
+
|
| 86 |
+
// Bounded two-lane (admin/user) request queue with an anti-starvation admin
// quota and shortest-estimated-job-first selection within a lane. Bounded
// both by request count and by aggregate estimated tokens.
class PrioritySchedulerQueue {
public:
    explicit PrioritySchedulerQueue(const QueueConfig &config);
    // Non-blocking admission; false = full (caller responds 503).
    bool try_push(const std::shared_ptr<RequestContext> &ctx);
    // Blocks for the next request; nullptr only after stop().
    std::shared_ptr<RequestContext> pop_next();
    // Signals shutdown and wakes the consumer.
    void stop();
    // Retry-After hint (seconds) for rejected enqueues.
    int retry_after_sec() const;
    // Point-in-time depths and token load for metrics.
    QueueSnapshot snapshot() const;

private:
    mutable std::mutex mu_;  // guards all state below
    std::condition_variable cv_;
    std::deque<std::shared_ptr<RequestContext>> admin_queue_;
    std::deque<std::shared_ptr<RequestContext>> user_queue_;
    size_t max_size_;
    size_t current_size_ = 0;
    int max_tokens_;      // cap on summed estimated tokens across both lanes
    int current_tokens_ = 0;
    int admin_quota_;     // max consecutive admin turns while users wait
    int admin_streak_ = 0;
    int retry_after_sec_;
    bool stopped_ = false;
};
|
| 109 |
+
|
| 110 |
+
// Owns the single worker thread that drains the priority queue and executes
// requests against the active model, completing them through the registry.
// Construction starts the thread; destruction stops the queue and joins.
class Scheduler {
public:
    Scheduler(
        ModelManager &manager,
        RequestRegistry &registry,
        MetricsRegistry &metrics,
        const QueueConfig &queue_config);
    ~Scheduler();

    // Non-blocking admission; false means queue full (respond 503).
    bool try_enqueue(const std::shared_ptr<RequestContext> &ctx);
    // Retry-After hint for rejected enqueues.
    int retry_after_sec() const;
    // Queue statistics for /queue/metrics.
    QueueSnapshot snapshot() const;

private:
    ModelManager &manager_;      // resolves the active llama.cpp worker
    RequestRegistry &registry_;  // completion / cancellation bookkeeping
    MetricsRegistry &metrics_;   // queue-wait observations
    PrioritySchedulerQueue queue_;
    std::thread worker_;

    // Consumer loop body; exits when the queue is stopped.
    void worker_loop();
};
|
| 132 |
+
|
| 133 |
+
// Authenticates requests via a configurable auth header and scheme
// (e.g. "Authorization: Bearer <secret>"). With no keys configured, auth is
// disabled and callers are treated as anonymous admins.
class ApiKeyAuth {
public:
    explicit ApiKeyAuth(const ManagerConfig &config);
    // True when at least one API key is configured.
    bool enabled() const;
    // Returns the caller's key record, or nullopt with `error` filled.
    std::optional<ApiKeyRecord> authenticate(
        const http::request<http::string_body> &req,
        std::string &error) const;

private:
    std::string header_name_;  // e.g. "Authorization"
    std::string scheme_;       // e.g. "Bearer"
    // Secret -> record index for O(1) lookup during authenticate().
    std::unordered_map<std::string, ApiKeyRecord> records_by_secret_;

    // Extracts the "<scheme> <token>" token from the header, trimming
    // whitespace; nullopt with `error` filled on any malformation.
    std::optional<std::string> extract_bearer_token(
        const http::request<http::string_body> &req,
        std::string &error) const;
};
|
cpp/server.cpp
ADDED
|
@@ -0,0 +1,329 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "server.h"
|
| 2 |
+
|
| 3 |
+
#include <boost/beast/core.hpp>
|
| 4 |
+
|
| 5 |
+
#include "http_helpers.h"
|
| 6 |
+
#include "model_manager.h"
|
| 7 |
+
#include "request_parsing.h"
|
| 8 |
+
#include "runtime_components.h"
|
| 9 |
+
|
| 10 |
+
#include <algorithm>
|
| 11 |
+
#include <atomic>
|
| 12 |
+
|
| 13 |
+
namespace beast = boost::beast;
|
| 14 |
+
namespace http = beast::http;
|
| 15 |
+
|
| 16 |
+
static std::atomic<uint64_t> g_req_id{1};
|
| 17 |
+
|
| 18 |
+
http::response<http::string_body> handle_request(
|
| 19 |
+
ModelManager &manager,
|
| 20 |
+
const ManagerConfig &config,
|
| 21 |
+
const ApiKeyAuth &auth,
|
| 22 |
+
RateLimiterStore &rate_limiter,
|
| 23 |
+
RequestRegistry ®istry,
|
| 24 |
+
MetricsRegistry &metrics,
|
| 25 |
+
Scheduler &scheduler,
|
| 26 |
+
http::request<http::string_body> &&req) {
|
| 27 |
+
const auto start = std::chrono::steady_clock::now();
|
| 28 |
+
const auto req_id_num = g_req_id.fetch_add(1);
|
| 29 |
+
const std::string request_id = std::to_string(req_id_num);
|
| 30 |
+
const std::string target = req.target().to_string();
|
| 31 |
+
const std::string method = req.method_string().to_string();
|
| 32 |
+
const std::string path = target.substr(0, target.find('?'));
|
| 33 |
+
auto authenticated = std::optional<ApiKeyRecord>{};
|
| 34 |
+
|
| 35 |
+
metrics.inc_requests_total();
|
| 36 |
+
metrics.inc_requests_inflight();
|
| 37 |
+
struct InflightGuard {
|
| 38 |
+
MetricsRegistry &metrics;
|
| 39 |
+
~InflightGuard() { metrics.dec_requests_inflight(); }
|
| 40 |
+
} inflight_guard{metrics};
|
| 41 |
+
|
| 42 |
+
log_line("request_id=" + request_id + " method=" + method + " path=" + target);
|
| 43 |
+
if (!req.body().empty()) {
|
| 44 |
+
log_line("request_id=" + request_id + " body=" + truncate_body(req.body()));
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
auto json_response = [&](http::status status, const json &obj) {
|
| 48 |
+
json payload = obj;
|
| 49 |
+
payload["request_id"] = request_id;
|
| 50 |
+
http::response<http::string_body> res{status, req.version()};
|
| 51 |
+
res.set(http::field::content_type, "application/json");
|
| 52 |
+
res.set(http::field::server, "llm-manager");
|
| 53 |
+
res.set("X-Request-Id", request_id);
|
| 54 |
+
res.keep_alive(req.keep_alive());
|
| 55 |
+
res.body() = payload.dump();
|
| 56 |
+
res.prepare_payload();
|
| 57 |
+
auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
|
| 58 |
+
std::chrono::steady_clock::now() - start).count();
|
| 59 |
+
metrics.observe_request_latency_ms(elapsed_ms);
|
| 60 |
+
log_line("request_id=" + request_id + " status=" + std::to_string(res.result_int()) +
|
| 61 |
+
" elapsed_ms=" + std::to_string(elapsed_ms));
|
| 62 |
+
return res;
|
| 63 |
+
};
|
| 64 |
+
|
| 65 |
+
auto json_response_with_retry_after = [&](http::status status, const json &obj, int retry_after_sec) {
|
| 66 |
+
auto res = json_response(status, obj);
|
| 67 |
+
res.set(http::field::retry_after, std::to_string(std::max(1, retry_after_sec)));
|
| 68 |
+
return res;
|
| 69 |
+
};
|
| 70 |
+
|
| 71 |
+
auto ensure_authenticated = [&](Role minimum_role) -> std::optional<http::response<http::string_body>> {
|
| 72 |
+
std::string auth_error;
|
| 73 |
+
authenticated = auth.authenticate(req, auth_error);
|
| 74 |
+
if (!authenticated) {
|
| 75 |
+
return json_response(http::status::unauthorized, {{"error", auth_error}});
|
| 76 |
+
}
|
| 77 |
+
if (minimum_role == Role::ADMIN && authenticated->role != Role::ADMIN) {
|
| 78 |
+
return json_response(http::status::forbidden, {{"error", "Admin role required"}});
|
| 79 |
+
}
|
| 80 |
+
log_line("request_id=" + request_id +
|
| 81 |
+
" api_key_id=" + authenticated->key_id +
|
| 82 |
+
" role=" + role_to_string(authenticated->role));
|
| 83 |
+
return std::nullopt;
|
| 84 |
+
};
|
| 85 |
+
|
| 86 |
+
try {
|
| 87 |
+
if (path == "/health" && req.method() == http::verb::get) {
|
| 88 |
+
return json_response(http::status::ok, manager.models_view());
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
if (path == "/models" && req.method() == http::verb::get) {
|
| 92 |
+
return json_response(http::status::ok, manager.models_view());
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
if (path == "/queue/metrics" && req.method() == http::verb::get) {
|
| 96 |
+
http::response<http::string_body> res{http::status::ok, req.version()};
|
| 97 |
+
res.set(http::field::content_type, "text/plain; version=0.0.4; charset=utf-8");
|
| 98 |
+
res.set(http::field::server, "llm-manager");
|
| 99 |
+
res.set("X-Request-Id", request_id);
|
| 100 |
+
res.keep_alive(req.keep_alive());
|
| 101 |
+
res.body() = metrics.render_prometheus(scheduler.snapshot(), manager);
|
| 102 |
+
res.prepare_payload();
|
| 103 |
+
auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
|
| 104 |
+
std::chrono::steady_clock::now() - start).count();
|
| 105 |
+
metrics.observe_request_latency_ms(elapsed_ms);
|
| 106 |
+
log_line("request_id=" + request_id + " status=" + std::to_string(res.result_int()) +
|
| 107 |
+
" elapsed_ms=" + std::to_string(elapsed_ms));
|
| 108 |
+
return res;
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
if (path == "/switch-model" && req.method() == http::verb::post) {
|
| 112 |
+
if (auto auth_res = ensure_authenticated(Role::ADMIN)) return *auth_res;
|
| 113 |
+
json j = json::parse(req.body(), nullptr, false);
|
| 114 |
+
if (j.is_discarded()) {
|
| 115 |
+
return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
|
| 116 |
+
}
|
| 117 |
+
std::string model;
|
| 118 |
+
if (j.contains("model_name")) model = j["model_name"].get<std::string>();
|
| 119 |
+
if (j.contains("model")) model = j["model"].get<std::string>();
|
| 120 |
+
if (model.empty()) {
|
| 121 |
+
return json_response(http::status::bad_request, {{"error", "Expected 'model' or 'model_name'"}});
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
std::string err;
|
| 125 |
+
bool ok = manager.switch_model(model, err);
|
| 126 |
+
if (!ok) {
|
| 127 |
+
auto status = (err == "Switch already in progress")
|
| 128 |
+
? http::status::conflict
|
| 129 |
+
: http::status::internal_server_error;
|
| 130 |
+
return json_response(status, {{"status", "error"}, {"error", err}});
|
| 131 |
+
}
|
| 132 |
+
auto state = manager.models_view();
|
| 133 |
+
state["message"] = "Switched model successfully";
|
| 134 |
+
metrics.inc_switch_total();
|
| 135 |
+
return json_response(http::status::ok, state);
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
if (path == "/stop" && req.method() == http::verb::post) {
|
| 139 |
+
if (auto auth_res = ensure_authenticated(Role::ADMIN)) return *auth_res;
|
| 140 |
+
const auto cancelled = registry.cancel_all();
|
| 141 |
+
metrics.add_cancellations_total(cancelled.size());
|
| 142 |
+
std::string err;
|
| 143 |
+
bool ok = manager.restart_active(err);
|
| 144 |
+
if (!ok) {
|
| 145 |
+
http::status status = http::status::internal_server_error;
|
| 146 |
+
if (err == "Switch already in progress") status = http::status::conflict;
|
| 147 |
+
else if (err == "No active model") status = http::status::service_unavailable;
|
| 148 |
+
return json_response(status, {{"status", "error"}, {"error", err}});
|
| 149 |
+
}
|
| 150 |
+
auto state = manager.models_view();
|
| 151 |
+
state["message"] = "Stopped in-flight prompts and restarted model";
|
| 152 |
+
metrics.inc_worker_restarts_total();
|
| 153 |
+
return json_response(http::status::ok, state);
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
if (req.method() == http::verb::post) {
|
| 157 |
+
if (auto cancel_id = extract_cancel_request_id(path)) {
|
| 158 |
+
if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
|
| 159 |
+
auto ctx = registry.find(*cancel_id);
|
| 160 |
+
if (!ctx) {
|
| 161 |
+
return json_response(http::status::not_found, {{"error", "Unknown request id"}});
|
| 162 |
+
}
|
| 163 |
+
if (authenticated->role != Role::ADMIN && authenticated->key_id != ctx->api_key_id) {
|
| 164 |
+
return json_response(http::status::forbidden, {{"error", "Cannot cancel another API key request"}});
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
const auto previous_state = ctx->state.load();
|
| 168 |
+
registry.cancel_request(*cancel_id);
|
| 169 |
+
metrics.add_cancellations_total();
|
| 170 |
+
std::string restart_error;
|
| 171 |
+
bool restarted = true;
|
| 172 |
+
if (previous_state == RequestState::RUNNING) {
|
| 173 |
+
restarted = manager.restart_active(restart_error);
|
| 174 |
+
if (restarted) metrics.inc_worker_restarts_total();
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
json payload = {
|
| 178 |
+
{"cancelled_request_id", *cancel_id},
|
| 179 |
+
{"state", state_to_string(ctx->state.load())}
|
| 180 |
+
};
|
| 181 |
+
if (!restarted) payload["restart_error"] = restart_error;
|
| 182 |
+
return json_response(http::status::ok, payload);
|
| 183 |
+
}
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
if (path == "/v1/chat/completions" && req.method() == http::verb::post) {
|
| 187 |
+
if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
|
| 188 |
+
json payload = json::parse(req.body(), nullptr, false);
|
| 189 |
+
if (payload.is_discarded()) {
|
| 190 |
+
return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
|
| 191 |
+
}
|
| 192 |
+
if (request_stream_enabled(payload)) {
|
| 193 |
+
if (!config.streaming.enabled) {
|
| 194 |
+
return json_response(http::status::not_implemented, {{"error", "Streaming is disabled"}});
|
| 195 |
+
}
|
| 196 |
+
return json_response(http::status::not_implemented, {{"error", "Streaming relay is not implemented yet"}});
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
std::string token_error;
|
| 200 |
+
auto estimate = estimate_chat_tokens(payload, config.limits, token_error);
|
| 201 |
+
if (!estimate) {
|
| 202 |
+
return json_response(http::status::bad_request, {{"error", token_error}});
|
| 203 |
+
}
|
| 204 |
+
log_line("request_id=" + request_id +
|
| 205 |
+
" prompt_tokens=" + std::to_string(estimate->prompt_tokens) +
|
| 206 |
+
" max_tokens=" + std::to_string(estimate->requested_max_tokens) +
|
| 207 |
+
" estimated_total_tokens=" + std::to_string(estimate->estimated_total_tokens));
|
| 208 |
+
|
| 209 |
+
auto rate_limit_decision = rate_limiter.allow(authenticated->key_id, estimate->estimated_total_tokens);
|
| 210 |
+
if (!rate_limit_decision.allowed) {
|
| 211 |
+
metrics.inc_rate_limited_total();
|
| 212 |
+
return json_response_with_retry_after(
|
| 213 |
+
http::status::too_many_requests,
|
| 214 |
+
{{"error", rate_limit_decision.error}},
|
| 215 |
+
rate_limit_decision.retry_after_sec);
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
auto ctx = registry.create(request_id, *authenticated, *estimate, req.body());
|
| 219 |
+
if (!scheduler.try_enqueue(ctx)) {
|
| 220 |
+
ctx->cancelled.store(true);
|
| 221 |
+
registry.complete(ctx, RequestState::CANCELLED, {503, R"({"error":"Queue full"})"});
|
| 222 |
+
metrics.inc_queue_rejected_total();
|
| 223 |
+
return json_response_with_retry_after(
|
| 224 |
+
http::status::service_unavailable,
|
| 225 |
+
{{"error", "Queue full"}},
|
| 226 |
+
scheduler.retry_after_sec());
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
std::unique_lock<std::mutex> lock(ctx->mu);
|
| 230 |
+
const bool finished = ctx->cv.wait_for(
|
| 231 |
+
lock,
|
| 232 |
+
std::chrono::seconds(std::max(1, config.limits.request_timeout_sec)),
|
| 233 |
+
[&]() { return ctx->completed; });
|
| 234 |
+
if (!finished) {
|
| 235 |
+
lock.unlock();
|
| 236 |
+
registry.cancel_request(request_id);
|
| 237 |
+
metrics.add_cancellations_total();
|
| 238 |
+
std::string restart_error;
|
| 239 |
+
bool restarted = true;
|
| 240 |
+
if (ctx->state.load() == RequestState::RUNNING) {
|
| 241 |
+
restarted = manager.restart_active(restart_error);
|
| 242 |
+
if (restarted) metrics.inc_worker_restarts_total();
|
| 243 |
+
}
|
| 244 |
+
json timeout_payload = {
|
| 245 |
+
{"error", "Request timed out"},
|
| 246 |
+
{"state", state_to_string(ctx->state.load())}
|
| 247 |
+
};
|
| 248 |
+
if (!restarted) timeout_payload["restart_error"] = restart_error;
|
| 249 |
+
return json_response(http::status::gateway_timeout, timeout_payload);
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
const auto final_state = ctx->state.load();
|
| 253 |
+
RequestResult result = ctx->result;
|
| 254 |
+
lock.unlock();
|
| 255 |
+
|
| 256 |
+
if (final_state == RequestState::CANCELLED) {
|
| 257 |
+
return json_response(http::status::ok, {{"status", "cancelled"}});
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
http::response<http::string_body> res{
|
| 261 |
+
static_cast<http::status>(result.status), req.version()};
|
| 262 |
+
res.set(http::field::content_type, result.content_type);
|
| 263 |
+
res.set(http::field::server, "llm-manager");
|
| 264 |
+
res.set("X-Request-Id", request_id);
|
| 265 |
+
res.keep_alive(req.keep_alive());
|
| 266 |
+
res.body() = result.body;
|
| 267 |
+
res.prepare_payload();
|
| 268 |
+
auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
|
| 269 |
+
std::chrono::steady_clock::now() - start).count();
|
| 270 |
+
metrics.observe_request_latency_ms(elapsed_ms);
|
| 271 |
+
log_line("request_id=" + request_id +
|
| 272 |
+
" final_state=" + state_to_string(final_state) +
|
| 273 |
+
" upstream_status=" + std::to_string(result.status) +
|
| 274 |
+
" elapsed_ms=" + std::to_string(elapsed_ms));
|
| 275 |
+
return res;
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
if (req.method() == http::verb::get) {
|
| 279 |
+
auto worker = manager.active_worker();
|
| 280 |
+
if (!worker) {
|
| 281 |
+
return json_response(http::status::service_unavailable, {{"error", "No active model"}});
|
| 282 |
+
}
|
| 283 |
+
auto upstream = forward_get_to_worker(*worker, target);
|
| 284 |
+
http::response<http::string_body> res{
|
| 285 |
+
static_cast<http::status>(upstream.status), req.version()};
|
| 286 |
+
res.set(http::field::content_type, upstream.content_type);
|
| 287 |
+
if (!upstream.content_encoding.empty()) {
|
| 288 |
+
res.set(http::field::content_encoding, upstream.content_encoding);
|
| 289 |
+
}
|
| 290 |
+
res.set(http::field::server, "llm-manager");
|
| 291 |
+
res.set("X-Request-Id", request_id);
|
| 292 |
+
res.keep_alive(req.keep_alive());
|
| 293 |
+
res.body() = upstream.body;
|
| 294 |
+
res.prepare_payload();
|
| 295 |
+
auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
|
| 296 |
+
std::chrono::steady_clock::now() - start).count();
|
| 297 |
+
log_line("request_id=" + request_id +
|
| 298 |
+
" proxied_get model=" + worker->model +
|
| 299 |
+
" upstream_status=" + std::to_string(upstream.status) +
|
| 300 |
+
" elapsed_ms=" + std::to_string(elapsed_ms));
|
| 301 |
+
return res;
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
return json_response(http::status::not_found, {{"error", "Not found"}});
|
| 305 |
+
} catch (const std::exception &e) {
|
| 306 |
+
return json_response(http::status::internal_server_error, {{"error", e.what()}});
|
| 307 |
+
}
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
void do_session(
|
| 311 |
+
boost::asio::ip::tcp::socket socket,
|
| 312 |
+
ModelManager &manager,
|
| 313 |
+
const ManagerConfig &config,
|
| 314 |
+
const ApiKeyAuth &auth,
|
| 315 |
+
RateLimiterStore &rate_limiter,
|
| 316 |
+
RequestRegistry ®istry,
|
| 317 |
+
MetricsRegistry &metrics,
|
| 318 |
+
Scheduler &scheduler) {
|
| 319 |
+
try {
|
| 320 |
+
beast::flat_buffer buffer;
|
| 321 |
+
http::request<http::string_body> req;
|
| 322 |
+
http::read(socket, buffer, req);
|
| 323 |
+
auto res = handle_request(manager, config, auth, rate_limiter, registry, metrics, scheduler, std::move(req));
|
| 324 |
+
http::write(socket, res);
|
| 325 |
+
beast::error_code ec;
|
| 326 |
+
socket.shutdown(boost::asio::ip::tcp::socket::shutdown_send, ec);
|
| 327 |
+
} catch (...) {
|
| 328 |
+
}
|
| 329 |
+
}
|
cpp/server.h
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once

// HTTP front-end for the LLM manager control plane.
//
// handle_request() implements the routing / authentication / token-estimation /
// rate-limiting / scheduling pipeline for one already-parsed request;
// do_session() owns a single accepted connection and drives one
// read -> handle -> write cycle over it.

// Only the TCP pieces of Asio are declared here; avoid pulling the whole of
// <boost/asio.hpp> into a widely-included header.
#include <boost/asio/ip/tcp.hpp>
#include <boost/beast/http.hpp>

#include "llm_manager_types.h"

// Collaborators are taken by reference only, so forward declarations keep
// this header independent of their full definitions.
class ModelManager;
class ApiKeyAuth;
class RateLimiterStore;
class RequestRegistry;
class MetricsRegistry;
class Scheduler;

// NOTE(review): a namespace alias at header scope leaks into every includer.
// Kept as-is because existing translation units may rely on `http` being
// available after including this header — confirm before removing.
namespace http = boost::beast::http;

// Route one request through auth, token estimation, rate limiting and the
// scheduler, returning the final response to send back to the client.
// The request is consumed (taken by rvalue reference).
http::response<http::string_body> handle_request(
    ModelManager &manager,
    const ManagerConfig &config,
    const ApiKeyAuth &auth,
    RateLimiterStore &rate_limiter,
    RequestRegistry &registry,
    MetricsRegistry &metrics,
    Scheduler &scheduler,
    http::request<http::string_body> &&req);

// Handle a single accepted connection: read one request, answer it, then
// shut down the sending side of the socket. Takes ownership of the socket.
void do_session(
    boost::asio::ip::tcp::socket socket,
    ModelManager &manager,
    const ManagerConfig &config,
    const ApiKeyAuth &auth,
    RateLimiterStore &rate_limiter,
    RequestRegistry &registry,
    MetricsRegistry &metrics,
    Scheduler &scheduler);
app.py → python/app.py
RENAMED
|
File without changes
|
logger.py → python/logger.py
RENAMED
|
File without changes
|