Dmitry Beresnev committed on
Commit
332826f
·
1 Parent(s): d9ce859

Refactor the C++ LLM manager into modular components, move Python modules under python/, and keep the current control-plane behavior intact. The C++ server now has clearer separation for config, model lifecycle, runtime services, request parsing, HTTP helpers, and server routing; the Docker build/runtime paths were updated to compile multiple C++ files and to load Python code from the new package folder.

Browse files
Dockerfile CHANGED
@@ -55,9 +55,9 @@ RUN git clone --depth 1 --branch ${LLAMA_CPP_REF} https://github.com/ggerganov/l
55
  ldd build/bin/llama-server || true
56
 
57
  # Build C++ manager (Boost.Beast + JSON)
58
- COPY cpp/llm_manager.cpp /build/llm_manager.cpp
59
  RUN g++ -std=c++17 -O2 -DBOOST_ERROR_CODE_HEADER_ONLY -pthread \
60
- /build/llm_manager.cpp -o /build/llm-manager
61
 
62
  # Runtime stage
63
  FROM debian:bookworm-slim
@@ -95,8 +95,7 @@ RUN useradd -m -u 1000 user && \
95
  chown -R user:user /home/user
96
 
97
  # Copy application code
98
- COPY --chown=user:user app.py /home/user/app.py
99
- COPY --chown=user:user logger.py /home/user/logger.py
100
 
101
  USER user
102
  WORKDIR /home/user
@@ -105,6 +104,7 @@ WORKDIR /home/user
105
  ENV HOME=/home/user \
106
  LLAMA_CACHE=/home/user/.cache/llama.cpp \
107
  PATH=/home/user/.local/bin:$PATH \
 
108
  PYTHONUNBUFFERED=1
109
 
110
  EXPOSE 7860
 
55
  ldd build/bin/llama-server || true
56
 
57
  # Build C++ manager (Boost.Beast + JSON)
58
+ COPY cpp/ /build/cpp/
59
  RUN g++ -std=c++17 -O2 -DBOOST_ERROR_CODE_HEADER_ONLY -pthread \
60
+ /build/cpp/*.cpp -o /build/llm-manager
61
 
62
  # Runtime stage
63
  FROM debian:bookworm-slim
 
95
  chown -R user:user /home/user
96
 
97
  # Copy application code
98
+ COPY --chown=user:user python/ /home/user/python/
 
99
 
100
  USER user
101
  WORKDIR /home/user
 
104
  ENV HOME=/home/user \
105
  LLAMA_CACHE=/home/user/.cache/llama.cpp \
106
  PATH=/home/user/.local/bin:$PATH \
107
+ PYTHONPATH=/home/user/python \
108
  PYTHONUNBUFFERED=1
109
 
110
  EXPOSE 7860
config.toml.example CHANGED
@@ -35,6 +35,9 @@ retry_after_sec = 5
35
  [scheduler]
36
  max_concurrent = 1
37
 
 
 
 
38
  [rate_limit]
39
  requests_per_minute = 60
40
  estimated_tokens_per_minute = 6000
 
35
  [scheduler]
36
  max_concurrent = 1
37
 
38
+ [streaming]
39
+ enabled = false
40
+
41
  [rate_limit]
42
  requests_per_minute = 60
43
  estimated_tokens_per_minute = 6000
cpp/config.cpp ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "config.h"
2
+
3
+ #include "http_helpers.h"
4
+
5
+ #include <algorithm>
6
+ #include <cstdlib>
7
+ #include <filesystem>
8
+ #include <fstream>
9
+ #include <unordered_map>
10
+
11
+ std::string get_env_or(const char *name, const std::string &fallback) {
12
+ const char *v = std::getenv(name);
13
+ return (v && *v) ? std::string(v) : fallback;
14
+ }
15
+
16
+ int get_env_int_or(const char *name, int fallback) {
17
+ const char *v = std::getenv(name);
18
+ if (!v || !*v) return fallback;
19
+ try {
20
+ return std::stoi(v);
21
+ } catch (...) {
22
+ return fallback;
23
+ }
24
+ }
25
+
26
+ std::string trim_copy(const std::string &value) {
27
+ const auto first = value.find_first_not_of(" \t\r\n");
28
+ if (first == std::string::npos) return "";
29
+ const auto last = value.find_last_not_of(" \t\r\n");
30
+ return value.substr(first, last - first + 1);
31
+ }
32
+
33
+ std::string strip_quotes(const std::string &value) {
34
+ if (value.size() >= 2) {
35
+ const char first = value.front();
36
+ const char last = value.back();
37
+ if ((first == '"' && last == '"') || (first == '\'' && last == '\'')) {
38
+ return value.substr(1, value.size() - 2);
39
+ }
40
+ }
41
+ return value;
42
+ }
43
+
44
+ bool parse_bool_or(const std::string &value, bool fallback) {
45
+ const std::string normalized = trim_copy(value);
46
+ if (normalized == "true") return true;
47
+ if (normalized == "false") return false;
48
+ return fallback;
49
+ }
50
+
51
+ Role parse_role_or(const std::string &value, Role fallback) {
52
+ const std::string normalized = trim_copy(value);
53
+ if (normalized == "admin" || normalized == "ADMIN") return Role::ADMIN;
54
+ if (normalized == "user" || normalized == "USER") return Role::USER;
55
+ return fallback;
56
+ }
57
+
58
+ static std::unordered_map<std::string, std::unordered_map<std::string, std::string>>
59
+ parse_simple_toml(const std::string &path) {
60
+ std::unordered_map<std::string, std::unordered_map<std::string, std::string>> out;
61
+ std::ifstream input(path);
62
+ if (!input.is_open()) return out;
63
+
64
+ std::string current_section;
65
+ std::string line;
66
+ while (std::getline(input, line)) {
67
+ auto hash = line.find('#');
68
+ if (hash != std::string::npos) line = line.substr(0, hash);
69
+ line = trim_copy(line);
70
+ if (line.empty()) continue;
71
+
72
+ if (line.front() == '[' && line.back() == ']') {
73
+ current_section = trim_copy(line.substr(1, line.size() - 2));
74
+ continue;
75
+ }
76
+
77
+ const auto eq = line.find('=');
78
+ if (eq == std::string::npos) continue;
79
+
80
+ std::string key = trim_copy(line.substr(0, eq));
81
+ std::string value = strip_quotes(trim_copy(line.substr(eq + 1)));
82
+ out[current_section][key] = value;
83
+ }
84
+ return out;
85
+ }
86
+
87
+ static std::vector<ApiKeyRecord> parse_api_keys_toml(const std::string &path) {
88
+ std::vector<ApiKeyRecord> keys;
89
+ std::ifstream input(path);
90
+ if (!input.is_open()) return keys;
91
+
92
+ std::string line;
93
+ bool in_api_key = false;
94
+ ApiKeyRecord current;
95
+ bool has_any_field = false;
96
+
97
+ auto flush_current = [&]() {
98
+ if (has_any_field && !current.key_id.empty() && !current.secret.empty()) {
99
+ keys.push_back(current);
100
+ }
101
+ current = ApiKeyRecord{};
102
+ has_any_field = false;
103
+ };
104
+
105
+ while (std::getline(input, line)) {
106
+ auto hash = line.find('#');
107
+ if (hash != std::string::npos) line = line.substr(0, hash);
108
+ line = trim_copy(line);
109
+ if (line.empty()) continue;
110
+
111
+ if (line == "[[api_keys]]") {
112
+ flush_current();
113
+ in_api_key = true;
114
+ continue;
115
+ }
116
+
117
+ if (!in_api_key) continue;
118
+ if (line.front() == '[' && line.back() == ']') {
119
+ flush_current();
120
+ in_api_key = false;
121
+ continue;
122
+ }
123
+
124
+ const auto eq = line.find('=');
125
+ if (eq == std::string::npos) continue;
126
+
127
+ std::string key = trim_copy(line.substr(0, eq));
128
+ std::string value = strip_quotes(trim_copy(line.substr(eq + 1)));
129
+ has_any_field = true;
130
+
131
+ if (key == "key_id") current.key_id = value;
132
+ else if (key == "secret") current.secret = value;
133
+ else if (key == "role") current.role = parse_role_or(value, current.role);
134
+ else if (key == "enabled") current.enabled = parse_bool_or(value, current.enabled);
135
+ }
136
+
137
+ flush_current();
138
+ return keys;
139
+ }
140
+
141
+ static std::string get_toml_string_or(
142
+ const std::unordered_map<std::string, std::unordered_map<std::string, std::string>> &data,
143
+ const std::string &section,
144
+ const std::string &key,
145
+ const std::string &fallback) {
146
+ const auto it = data.find(section);
147
+ if (it == data.end()) return fallback;
148
+ const auto kv = it->second.find(key);
149
+ if (kv == it->second.end() || kv->second.empty()) return fallback;
150
+ return kv->second;
151
+ }
152
+
153
+ static int get_toml_int_or(
154
+ const std::unordered_map<std::string, std::unordered_map<std::string, std::string>> &data,
155
+ const std::string &section,
156
+ const std::string &key,
157
+ int fallback) {
158
+ const auto it = data.find(section);
159
+ if (it == data.end()) return fallback;
160
+ const auto kv = it->second.find(key);
161
+ if (kv == it->second.end() || kv->second.empty()) return fallback;
162
+ try {
163
+ return std::stoi(kv->second);
164
+ } catch (...) {
165
+ return fallback;
166
+ }
167
+ }
168
+
169
+ ManagerConfig load_manager_config() {
170
+ ManagerConfig cfg;
171
+
172
+ const std::string config_path = get_env_or("MANAGER_CONFIG", "config.toml");
173
+ std::unordered_map<std::string, std::unordered_map<std::string, std::string>> toml;
174
+ if (std::filesystem::exists(config_path)) {
175
+ toml = parse_simple_toml(config_path);
176
+ log_line("config: loaded " + config_path);
177
+ } else {
178
+ log_line("config: using environment/defaults (file not found: " + config_path + ")");
179
+ }
180
+
181
+ cfg.server.host = get_env_or("MANAGER_HOST", get_toml_string_or(toml, "server", "host", cfg.server.host));
182
+ cfg.server.port = get_env_int_or("MANAGER_PORT", get_toml_int_or(toml, "server", "port", cfg.server.port));
183
+
184
+ cfg.worker.default_model = get_env_or("DEFAULT_MODEL", get_toml_string_or(toml, "worker", "default_model", cfg.worker.default_model));
185
+ cfg.worker.llama_server_bin = get_env_or("LLAMA_SERVER_BIN", get_toml_string_or(toml, "worker", "llama_server_bin", cfg.worker.llama_server_bin));
186
+ cfg.worker.host = get_env_or("WORKER_HOST", get_toml_string_or(toml, "worker", "host", cfg.worker.host));
187
+ cfg.worker.bind_host = get_env_or("WORKER_BIND_HOST", get_toml_string_or(toml, "worker", "bind_host", cfg.worker.bind_host));
188
+ cfg.worker.base_port = get_env_int_or("WORKER_BASE_PORT", get_toml_int_or(toml, "worker", "base_port", cfg.worker.base_port));
189
+ cfg.worker.switch_timeout_sec = get_env_int_or("SWITCH_TIMEOUT_SEC", get_toml_int_or(toml, "worker", "switch_timeout_sec", cfg.worker.switch_timeout_sec));
190
+
191
+ cfg.llama.n_ctx = get_env_int_or("MODEL_N_CTX", get_toml_int_or(toml, "llama", "n_ctx", cfg.llama.n_ctx));
192
+ cfg.llama.threads = get_env_int_or("MODEL_THREADS", get_toml_int_or(toml, "llama", "threads", cfg.llama.threads));
193
+ cfg.llama.ngl = get_env_int_or("MODEL_NGL", get_toml_int_or(toml, "llama", "ngl", cfg.llama.ngl));
194
+ cfg.llama.batch = get_env_int_or("MODEL_BATCH", get_toml_int_or(toml, "llama", "batch", cfg.llama.batch));
195
+ cfg.llama.ubatch = get_env_int_or("MODEL_UBATCH", get_toml_int_or(toml, "llama", "ubatch", cfg.llama.ubatch));
196
+
197
+ cfg.auth.header = get_env_or("AUTH_HEADER", get_toml_string_or(toml, "auth", "header", cfg.auth.header));
198
+ cfg.auth.scheme = get_env_or("AUTH_SCHEME", get_toml_string_or(toml, "auth", "scheme", cfg.auth.scheme));
199
+
200
+ cfg.limits.default_max_tokens = get_env_int_or("DEFAULT_MAX_TOKENS", get_toml_int_or(toml, "limits", "default_max_tokens", cfg.limits.default_max_tokens));
201
+ cfg.limits.max_tokens_per_request = get_env_int_or("MAX_TOKENS_PER_REQUEST", get_toml_int_or(toml, "limits", "max_tokens_per_request", cfg.limits.max_tokens_per_request));
202
+ cfg.limits.request_timeout_sec = get_env_int_or("REQUEST_TIMEOUT_SEC", get_toml_int_or(toml, "limits", "request_timeout_sec", cfg.limits.request_timeout_sec));
203
+
204
+ cfg.queue.max_size = static_cast<size_t>(std::max(
205
+ 1,
206
+ get_env_int_or("QUEUE_MAX_SIZE", get_toml_int_or(toml, "queue", "max_size", static_cast<int>(cfg.queue.max_size)))));
207
+ cfg.queue.max_tokens = get_env_int_or("QUEUE_MAX_TOKENS", get_toml_int_or(toml, "queue", "max_tokens", cfg.queue.max_tokens));
208
+ cfg.queue.admin_quota = get_env_int_or("QUEUE_ADMIN_QUOTA", get_toml_int_or(toml, "queue", "admin_quota", cfg.queue.admin_quota));
209
+ cfg.queue.retry_after_sec = get_env_int_or("QUEUE_RETRY_AFTER_SEC", get_toml_int_or(toml, "queue", "retry_after_sec", cfg.queue.retry_after_sec));
210
+
211
+ cfg.rate_limit.requests_per_minute = get_env_int_or("REQUESTS_PER_MINUTE", get_toml_int_or(toml, "rate_limit", "requests_per_minute", cfg.rate_limit.requests_per_minute));
212
+ cfg.rate_limit.estimated_tokens_per_minute = get_env_int_or("ESTIMATED_TOKENS_PER_MINUTE", get_toml_int_or(toml, "rate_limit", "estimated_tokens_per_minute", cfg.rate_limit.estimated_tokens_per_minute));
213
+
214
+ cfg.scheduler.max_concurrent = get_env_int_or("SCHEDULER_MAX_CONCURRENT", get_toml_int_or(toml, "scheduler", "max_concurrent", cfg.scheduler.max_concurrent));
215
+ cfg.streaming.enabled = parse_bool_or(
216
+ get_env_or("STREAMING_ENABLED", get_toml_string_or(toml, "streaming", "enabled", cfg.streaming.enabled ? "true" : "false")),
217
+ cfg.streaming.enabled);
218
+
219
+ if (!config_path.empty() && std::filesystem::exists(config_path)) {
220
+ cfg.api_keys = parse_api_keys_toml(config_path);
221
+ }
222
+
223
+ return cfg;
224
+ }
cpp/config.h ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <string>
4
+
5
+ #include "llm_manager_types.h"
6
+
7
+ std::string get_env_or(const char *name, const std::string &fallback);
8
+ int get_env_int_or(const char *name, int fallback);
9
+ std::string trim_copy(const std::string &value);
10
+ std::string strip_quotes(const std::string &value);
11
+ bool parse_bool_or(const std::string &value, bool fallback);
12
+ Role parse_role_or(const std::string &value, Role fallback);
13
+ ManagerConfig load_manager_config();
cpp/http_helpers.cpp ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "http_helpers.h"
2
+
3
+ #include <boost/asio.hpp>
4
+ #include <boost/beast/core.hpp>
5
+ #include <boost/beast/http.hpp>
6
+
7
+ #include <ctime>
8
+ #include <iomanip>
9
+ #include <iostream>
10
+ #include <sstream>
11
+
12
+ namespace asio = boost::asio;
13
+ namespace beast = boost::beast;
14
+ namespace http = beast::http;
15
+
16
+ std::string now_utc_iso() {
17
+ std::time_t t = std::time(nullptr);
18
+ std::tm tm{};
19
+ gmtime_r(&t, &tm);
20
+ std::ostringstream oss;
21
+ oss << std::put_time(&tm, "%Y-%m-%dT%H:%M:%SZ");
22
+ return oss.str();
23
+ }
24
+
25
+ void log_line(const std::string &line) {
26
+ std::cout << "[" << now_utc_iso() << "] " << line << std::endl;
27
+ }
28
+
29
+ std::string truncate_body(const std::string &body, size_t max_len) {
30
+ if (body.size() <= max_len) return body;
31
+ return body.substr(0, max_len) + "...[truncated]";
32
+ }
33
+
34
+ std::optional<std::string> extract_cancel_request_id(const std::string &path) {
35
+ const std::string prefix = "/requests/";
36
+ const std::string suffix = "/cancel";
37
+ if (path.size() <= prefix.size() + suffix.size()) return std::nullopt;
38
+ if (path.rfind(prefix, 0) != 0) return std::nullopt;
39
+ if (path.substr(path.size() - suffix.size()) != suffix) return std::nullopt;
40
+ const std::string request_id = path.substr(prefix.size(), path.size() - prefix.size() - suffix.size());
41
+ if (request_id.empty()) return std::nullopt;
42
+ return request_id;
43
+ }
44
+
45
+ std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body) {
46
+ asio::io_context ioc;
47
+ asio::ip::tcp::resolver resolver(ioc);
48
+ beast::tcp_stream stream(ioc);
49
+ auto const results = resolver.resolve("127.0.0.1", std::to_string(worker.port));
50
+ stream.connect(results);
51
+
52
+ http::request<http::string_body> req{http::verb::post, "/v1/chat/completions", 11};
53
+ req.set(http::field::host, "127.0.0.1");
54
+ req.set(http::field::content_type, "application/json");
55
+ req.set(http::field::user_agent, "llm-manager");
56
+ req.body() = body;
57
+ req.prepare_payload();
58
+ http::write(stream, req);
59
+
60
+ beast::flat_buffer buffer;
61
+ http::response<http::string_body> res;
62
+ http::read(stream, buffer, res);
63
+ beast::error_code ec;
64
+ stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
65
+ return {res.result_int(), res.body()};
66
+ }
67
+
68
+ ProxiedGetResult forward_get_to_worker(const WorkerInfo &worker, const std::string &target) {
69
+ asio::io_context ioc;
70
+ asio::ip::tcp::resolver resolver(ioc);
71
+ beast::tcp_stream stream(ioc);
72
+ auto const results = resolver.resolve("127.0.0.1", std::to_string(worker.port));
73
+ stream.connect(results);
74
+
75
+ http::request<http::string_body> req{http::verb::get, target, 11};
76
+ req.set(http::field::host, "127.0.0.1");
77
+ req.set(http::field::user_agent, "llm-manager");
78
+ req.set(http::field::accept_encoding, "gzip, identity");
79
+ http::write(stream, req);
80
+
81
+ beast::flat_buffer buffer;
82
+ http::response<http::string_body> res;
83
+ http::read(stream, buffer, res);
84
+ beast::error_code ec;
85
+ stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
86
+
87
+ ProxiedGetResult out;
88
+ out.status = res.result_int();
89
+ out.body = res.body();
90
+ if (res.base().find(http::field::content_type) != res.base().end()) {
91
+ out.content_type = res.base()[http::field::content_type].to_string();
92
+ }
93
+ if (res.base().find(http::field::content_encoding) != res.base().end()) {
94
+ out.content_encoding = res.base()[http::field::content_encoding].to_string();
95
+ }
96
+ return out;
97
+ }
cpp/http_helpers.h ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <optional>
4
+ #include <string>
5
+ #include <utility>
6
+
7
+ #include "llm_manager_types.h"
8
+
9
+ std::string now_utc_iso();
10
+ void log_line(const std::string &line);
11
+ std::string truncate_body(const std::string &body, size_t max_len = 2000);
12
+ std::optional<std::string> extract_cancel_request_id(const std::string &path);
13
+ std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body);
14
+ ProxiedGetResult forward_get_to_worker(const WorkerInfo &worker, const std::string &target);
cpp/llm_manager.cpp CHANGED
@@ -1,1530 +1,29 @@
1
  #include <boost/asio.hpp>
2
- #include <boost/beast/core.hpp>
3
- #include <boost/beast/http.hpp>
4
- #include <boost/beast/version.hpp>
5
- #include <nlohmann/json.hpp>
6
 
7
- #include <algorithm>
8
- #include <atomic>
9
- #include <chrono>
10
- #include <condition_variable>
11
- #include <csignal>
12
- #include <cstdlib>
13
- #include <ctime>
14
- #include <deque>
15
- #include <filesystem>
16
- #include <fstream>
17
- #include <iomanip>
18
- #include <iostream>
19
- #include <memory>
20
- #include <mutex>
21
- #include <optional>
22
- #include <sstream>
23
  #include <string>
24
  #include <thread>
25
- #include <unordered_map>
26
- #include <vector>
27
 
28
- #include <sys/types.h>
29
- #include <sys/wait.h>
30
- #include <unistd.h>
 
 
31
 
32
  namespace asio = boost::asio;
33
- namespace beast = boost::beast;
34
- namespace http = beast::http;
35
- using json = nlohmann::json;
36
-
37
- static void log_line(const std::string &line);
38
-
39
- struct ServerConfig {
40
- std::string host = "0.0.0.0";
41
- int port = 7860;
42
- };
43
-
44
- struct WorkerConfig {
45
- std::string default_model = "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m";
46
- std::string llama_server_bin = "/usr/local/bin/llama-server";
47
- std::string host = "127.0.0.1";
48
- std::string bind_host = "0.0.0.0";
49
- int base_port = 8080;
50
- int switch_timeout_sec = 300;
51
- };
52
-
53
- struct LlamaConfig {
54
- int n_ctx = 8192;
55
- int threads = 4;
56
- int ngl = 0;
57
- int batch = 128;
58
- int ubatch = 64;
59
- };
60
-
61
- enum class Role {
62
- ADMIN,
63
- USER
64
- };
65
-
66
- struct AuthConfig {
67
- std::string header = "Authorization";
68
- std::string scheme = "Bearer";
69
- };
70
-
71
- struct LimitsConfig {
72
- int default_max_tokens = 256;
73
- int max_tokens_per_request = 2048;
74
- int request_timeout_sec = 30;
75
- };
76
-
77
- struct QueueConfig {
78
- size_t max_size = 100;
79
- int max_tokens = 20000;
80
- int admin_quota = 3;
81
- int retry_after_sec = 5;
82
- };
83
-
84
- struct RateLimitConfig {
85
- int requests_per_minute = 60;
86
- int estimated_tokens_per_minute = 6000;
87
- };
88
-
89
- struct SchedulerConfig {
90
- int max_concurrent = 1;
91
- };
92
-
93
- struct ApiKeyRecord {
94
- std::string key_id;
95
- std::string secret;
96
- Role role = Role::USER;
97
- bool enabled = true;
98
- };
99
-
100
- struct ManagerConfig {
101
- ServerConfig server;
102
- WorkerConfig worker;
103
- LlamaConfig llama;
104
- AuthConfig auth;
105
- LimitsConfig limits;
106
- QueueConfig queue;
107
- RateLimitConfig rate_limit;
108
- SchedulerConfig scheduler;
109
- std::vector<ApiKeyRecord> api_keys;
110
- };
111
-
112
- struct WorkerInfo {
113
- std::string model;
114
- int port = 0;
115
- pid_t pid = -1;
116
- std::string last_loaded;
117
- };
118
-
119
- static std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body);
120
-
121
- static std::string now_utc_iso() {
122
- std::time_t t = std::time(nullptr);
123
- std::tm tm{};
124
- gmtime_r(&t, &tm);
125
- std::ostringstream oss;
126
- oss << std::put_time(&tm, "%Y-%m-%dT%H:%M:%SZ");
127
- return oss.str();
128
- }
129
-
130
- static std::string get_env_or(const char *name, const std::string &fallback) {
131
- const char *v = std::getenv(name);
132
- return (v && *v) ? std::string(v) : fallback;
133
- }
134
-
135
- static int get_env_int_or(const char *name, int fallback) {
136
- const char *v = std::getenv(name);
137
- if (!v || !*v) return fallback;
138
- try {
139
- return std::stoi(v);
140
- } catch (...) {
141
- return fallback;
142
- }
143
- }
144
-
145
- static std::string trim_copy(const std::string &value) {
146
- const auto first = value.find_first_not_of(" \t\r\n");
147
- if (first == std::string::npos) return "";
148
- const auto last = value.find_last_not_of(" \t\r\n");
149
- return value.substr(first, last - first + 1);
150
- }
151
-
152
- static std::string strip_quotes(const std::string &value) {
153
- if (value.size() >= 2) {
154
- const char first = value.front();
155
- const char last = value.back();
156
- if ((first == '"' && last == '"') || (first == '\'' && last == '\'')) {
157
- return value.substr(1, value.size() - 2);
158
- }
159
- }
160
- return value;
161
- }
162
-
163
- static bool parse_bool_or(const std::string &value, bool fallback) {
164
- const std::string normalized = trim_copy(value);
165
- if (normalized == "true") return true;
166
- if (normalized == "false") return false;
167
- return fallback;
168
- }
169
-
170
- static Role parse_role_or(const std::string &value, Role fallback) {
171
- const std::string normalized = trim_copy(value);
172
- if (normalized == "admin" || normalized == "ADMIN") return Role::ADMIN;
173
- if (normalized == "user" || normalized == "USER") return Role::USER;
174
- return fallback;
175
- }
176
-
177
- static std::string role_to_string(Role role) {
178
- return role == Role::ADMIN ? "admin" : "user";
179
- }
180
-
181
- enum class Priority {
182
- ADMIN = 0,
183
- USER = 1
184
- };
185
-
186
- static Priority role_to_priority(Role role) {
187
- return role == Role::ADMIN ? Priority::ADMIN : Priority::USER;
188
- }
189
-
190
- enum class RequestState {
191
- QUEUED,
192
- RUNNING,
193
- CANCELLED,
194
- FAILED,
195
- DONE
196
- };
197
-
198
- static std::string state_to_string(RequestState state) {
199
- switch (state) {
200
- case RequestState::QUEUED: return "queued";
201
- case RequestState::RUNNING: return "running";
202
- case RequestState::CANCELLED: return "cancelled";
203
- case RequestState::FAILED: return "failed";
204
- case RequestState::DONE: return "done";
205
- }
206
- return "unknown";
207
- }
208
-
209
- struct TokenEstimate {
210
- int prompt_tokens = 0;
211
- int requested_max_tokens = 0;
212
- int estimated_total_tokens = 0;
213
- };
214
-
215
- struct RateLimitDecision {
216
- bool allowed = true;
217
- int retry_after_sec = 0;
218
- std::string error;
219
- };
220
-
221
- struct RequestResult {
222
- int status = 500;
223
- std::string body;
224
- std::string content_type = "application/json";
225
- };
226
-
227
- struct RequestContext {
228
- std::string request_id;
229
- std::string api_key_id;
230
- Role role = Role::USER;
231
- Priority priority = Priority::USER;
232
- TokenEstimate estimate;
233
- std::string request_body;
234
- std::atomic<RequestState> state{RequestState::QUEUED};
235
- std::atomic<bool> cancelled{false};
236
- std::chrono::steady_clock::time_point created_at{std::chrono::steady_clock::now()};
237
- std::chrono::steady_clock::time_point enqueue_time{created_at};
238
- std::chrono::steady_clock::time_point start_time{};
239
- std::mutex mu;
240
- std::condition_variable cv;
241
- bool completed = false;
242
- RequestResult result;
243
- };
244
-
245
- static int estimate_text_tokens_rough(const std::string &text) {
246
- if (text.empty()) return 0;
247
- return std::max(1, static_cast<int>((text.size() + 3) / 4));
248
- }
249
-
250
- static std::string flatten_json_content(const json &content) {
251
- if (content.is_string()) {
252
- return content.get<std::string>();
253
- }
254
- if (content.is_array()) {
255
- std::ostringstream oss;
256
- bool first = true;
257
- for (const auto &item : content) {
258
- std::string part;
259
- if (item.is_string()) {
260
- part = item.get<std::string>();
261
- } else if (item.is_object() && item.contains("text") && item["text"].is_string()) {
262
- part = item["text"].get<std::string>();
263
- }
264
- if (part.empty()) continue;
265
- if (!first) oss << '\n';
266
- oss << part;
267
- first = false;
268
- }
269
- return oss.str();
270
- }
271
- return "";
272
- }
273
-
274
- static std::optional<TokenEstimate> estimate_chat_tokens(
275
- const json &payload,
276
- const LimitsConfig &limits,
277
- std::string &error) {
278
- if (!payload.is_object()) {
279
- error = "Expected JSON object";
280
- return std::nullopt;
281
- }
282
- if (!payload.contains("messages") || !payload["messages"].is_array()) {
283
- error = "Expected 'messages' array";
284
- return std::nullopt;
285
- }
286
-
287
- TokenEstimate estimate;
288
- estimate.requested_max_tokens = limits.default_max_tokens;
289
- if (payload.contains("max_tokens")) {
290
- if (!payload["max_tokens"].is_number_integer()) {
291
- error = "Expected integer 'max_tokens'";
292
- return std::nullopt;
293
- }
294
- estimate.requested_max_tokens = payload["max_tokens"].get<int>();
295
- }
296
-
297
- if (estimate.requested_max_tokens <= 0) {
298
- error = "'max_tokens' must be > 0";
299
- return std::nullopt;
300
- }
301
-
302
- for (const auto &message : payload["messages"]) {
303
- if (!message.is_object()) continue;
304
- if (message.contains("role") && message["role"].is_string()) {
305
- estimate.prompt_tokens += estimate_text_tokens_rough(message["role"].get<std::string>());
306
- }
307
- if (message.contains("content")) {
308
- estimate.prompt_tokens += estimate_text_tokens_rough(
309
- flatten_json_content(message["content"]));
310
- }
311
- estimate.prompt_tokens += 4;
312
- }
313
-
314
- estimate.estimated_total_tokens = estimate.prompt_tokens + estimate.requested_max_tokens;
315
- if (estimate.estimated_total_tokens > limits.max_tokens_per_request) {
316
- error = "Estimated request tokens exceed configured limit";
317
- return std::nullopt;
318
- }
319
-
320
- error.clear();
321
- return estimate;
322
- }
323
-
324
- static std::unordered_map<std::string, std::unordered_map<std::string, std::string>>
325
- parse_simple_toml(const std::string &path) {
326
- std::unordered_map<std::string, std::unordered_map<std::string, std::string>> out;
327
- std::ifstream input(path);
328
- if (!input.is_open()) return out;
329
-
330
- std::string current_section;
331
- std::string line;
332
- while (std::getline(input, line)) {
333
- auto hash = line.find('#');
334
- if (hash != std::string::npos) line = line.substr(0, hash);
335
- line = trim_copy(line);
336
- if (line.empty()) continue;
337
-
338
- if (line.front() == '[' && line.back() == ']') {
339
- current_section = trim_copy(line.substr(1, line.size() - 2));
340
- continue;
341
- }
342
-
343
- const auto eq = line.find('=');
344
- if (eq == std::string::npos) continue;
345
-
346
- std::string key = trim_copy(line.substr(0, eq));
347
- std::string value = strip_quotes(trim_copy(line.substr(eq + 1)));
348
- out[current_section][key] = value;
349
- }
350
- return out;
351
- }
352
-
353
- static std::vector<ApiKeyRecord> parse_api_keys_toml(const std::string &path) {
354
- std::vector<ApiKeyRecord> keys;
355
- std::ifstream input(path);
356
- if (!input.is_open()) return keys;
357
-
358
- std::string line;
359
- bool in_api_key = false;
360
- ApiKeyRecord current;
361
- bool has_any_field = false;
362
-
363
- auto flush_current = [&]() {
364
- if (has_any_field && !current.key_id.empty() && !current.secret.empty()) {
365
- keys.push_back(current);
366
- }
367
- current = ApiKeyRecord{};
368
- has_any_field = false;
369
- };
370
-
371
- while (std::getline(input, line)) {
372
- auto hash = line.find('#');
373
- if (hash != std::string::npos) line = line.substr(0, hash);
374
- line = trim_copy(line);
375
- if (line.empty()) continue;
376
-
377
- if (line == "[[api_keys]]") {
378
- flush_current();
379
- in_api_key = true;
380
- continue;
381
- }
382
-
383
- if (!in_api_key) continue;
384
-
385
- if (line.front() == '[' && line.back() == ']') {
386
- flush_current();
387
- in_api_key = false;
388
- continue;
389
- }
390
-
391
- const auto eq = line.find('=');
392
- if (eq == std::string::npos) continue;
393
-
394
- std::string key = trim_copy(line.substr(0, eq));
395
- std::string value = strip_quotes(trim_copy(line.substr(eq + 1)));
396
- has_any_field = true;
397
-
398
- if (key == "key_id") current.key_id = value;
399
- else if (key == "secret") current.secret = value;
400
- else if (key == "role") current.role = parse_role_or(value, current.role);
401
- else if (key == "enabled") current.enabled = parse_bool_or(value, current.enabled);
402
- }
403
-
404
- flush_current();
405
- return keys;
406
- }
407
-
408
- static std::string get_toml_string_or(
409
- const std::unordered_map<std::string, std::unordered_map<std::string, std::string>> &data,
410
- const std::string &section,
411
- const std::string &key,
412
- const std::string &fallback) {
413
- const auto it = data.find(section);
414
- if (it == data.end()) return fallback;
415
- const auto kv = it->second.find(key);
416
- if (kv == it->second.end() || kv->second.empty()) return fallback;
417
- return kv->second;
418
- }
419
-
420
- static int get_toml_int_or(
421
- const std::unordered_map<std::string, std::unordered_map<std::string, std::string>> &data,
422
- const std::string &section,
423
- const std::string &key,
424
- int fallback) {
425
- const auto it = data.find(section);
426
- if (it == data.end()) return fallback;
427
- const auto kv = it->second.find(key);
428
- if (kv == it->second.end() || kv->second.empty()) return fallback;
429
- try {
430
- return std::stoi(kv->second);
431
- } catch (...) {
432
- return fallback;
433
- }
434
- }
435
-
436
- static ManagerConfig load_manager_config() {
437
- ManagerConfig cfg;
438
-
439
- const std::string config_path = get_env_or("MANAGER_CONFIG", "config.toml");
440
- std::unordered_map<std::string, std::unordered_map<std::string, std::string>> toml;
441
- if (std::filesystem::exists(config_path)) {
442
- toml = parse_simple_toml(config_path);
443
- log_line("config: loaded " + config_path);
444
- } else {
445
- log_line("config: using environment/defaults (file not found: " + config_path + ")");
446
- }
447
-
448
- cfg.server.host = get_env_or(
449
- "MANAGER_HOST",
450
- get_toml_string_or(toml, "server", "host", cfg.server.host));
451
- cfg.server.port = get_env_int_or(
452
- "MANAGER_PORT",
453
- get_toml_int_or(toml, "server", "port", cfg.server.port));
454
-
455
- cfg.worker.default_model = get_env_or(
456
- "DEFAULT_MODEL",
457
- get_toml_string_or(toml, "worker", "default_model", cfg.worker.default_model));
458
- cfg.worker.llama_server_bin = get_env_or(
459
- "LLAMA_SERVER_BIN",
460
- get_toml_string_or(toml, "worker", "llama_server_bin", cfg.worker.llama_server_bin));
461
- cfg.worker.host = get_env_or(
462
- "WORKER_HOST",
463
- get_toml_string_or(toml, "worker", "host", cfg.worker.host));
464
- cfg.worker.bind_host = get_env_or(
465
- "WORKER_BIND_HOST",
466
- get_toml_string_or(toml, "worker", "bind_host", cfg.worker.bind_host));
467
- cfg.worker.base_port = get_env_int_or(
468
- "WORKER_BASE_PORT",
469
- get_toml_int_or(toml, "worker", "base_port", cfg.worker.base_port));
470
- cfg.worker.switch_timeout_sec = get_env_int_or(
471
- "SWITCH_TIMEOUT_SEC",
472
- get_toml_int_or(toml, "worker", "switch_timeout_sec", cfg.worker.switch_timeout_sec));
473
-
474
- cfg.llama.n_ctx = get_env_int_or(
475
- "MODEL_N_CTX",
476
- get_toml_int_or(toml, "llama", "n_ctx", cfg.llama.n_ctx));
477
- cfg.llama.threads = get_env_int_or(
478
- "MODEL_THREADS",
479
- get_toml_int_or(toml, "llama", "threads", cfg.llama.threads));
480
- cfg.llama.ngl = get_env_int_or(
481
- "MODEL_NGL",
482
- get_toml_int_or(toml, "llama", "ngl", cfg.llama.ngl));
483
- cfg.llama.batch = get_env_int_or(
484
- "MODEL_BATCH",
485
- get_toml_int_or(toml, "llama", "batch", cfg.llama.batch));
486
- cfg.llama.ubatch = get_env_int_or(
487
- "MODEL_UBATCH",
488
- get_toml_int_or(toml, "llama", "ubatch", cfg.llama.ubatch));
489
-
490
- cfg.auth.header = get_env_or(
491
- "AUTH_HEADER",
492
- get_toml_string_or(toml, "auth", "header", cfg.auth.header));
493
- cfg.auth.scheme = get_env_or(
494
- "AUTH_SCHEME",
495
- get_toml_string_or(toml, "auth", "scheme", cfg.auth.scheme));
496
-
497
- cfg.limits.default_max_tokens = get_env_int_or(
498
- "DEFAULT_MAX_TOKENS",
499
- get_toml_int_or(toml, "limits", "default_max_tokens", cfg.limits.default_max_tokens));
500
- cfg.limits.max_tokens_per_request = get_env_int_or(
501
- "MAX_TOKENS_PER_REQUEST",
502
- get_toml_int_or(toml, "limits", "max_tokens_per_request", cfg.limits.max_tokens_per_request));
503
- cfg.limits.request_timeout_sec = get_env_int_or(
504
- "REQUEST_TIMEOUT_SEC",
505
- get_toml_int_or(toml, "limits", "request_timeout_sec", cfg.limits.request_timeout_sec));
506
-
507
- cfg.queue.max_size = static_cast<size_t>(std::max(
508
- 1,
509
- get_env_int_or("QUEUE_MAX_SIZE", get_toml_int_or(toml, "queue", "max_size", static_cast<int>(cfg.queue.max_size)))));
510
- cfg.queue.max_tokens = get_env_int_or(
511
- "QUEUE_MAX_TOKENS",
512
- get_toml_int_or(toml, "queue", "max_tokens", cfg.queue.max_tokens));
513
- cfg.queue.admin_quota = get_env_int_or(
514
- "QUEUE_ADMIN_QUOTA",
515
- get_toml_int_or(toml, "queue", "admin_quota", cfg.queue.admin_quota));
516
- cfg.queue.retry_after_sec = get_env_int_or(
517
- "QUEUE_RETRY_AFTER_SEC",
518
- get_toml_int_or(toml, "queue", "retry_after_sec", cfg.queue.retry_after_sec));
519
-
520
- cfg.rate_limit.requests_per_minute = get_env_int_or(
521
- "REQUESTS_PER_MINUTE",
522
- get_toml_int_or(toml, "rate_limit", "requests_per_minute", cfg.rate_limit.requests_per_minute));
523
- cfg.rate_limit.estimated_tokens_per_minute = get_env_int_or(
524
- "ESTIMATED_TOKENS_PER_MINUTE",
525
- get_toml_int_or(toml, "rate_limit", "estimated_tokens_per_minute", cfg.rate_limit.estimated_tokens_per_minute));
526
-
527
- cfg.scheduler.max_concurrent = get_env_int_or(
528
- "SCHEDULER_MAX_CONCURRENT",
529
- get_toml_int_or(toml, "scheduler", "max_concurrent", cfg.scheduler.max_concurrent));
530
-
531
- if (!config_path.empty() && std::filesystem::exists(config_path)) {
532
- cfg.api_keys = parse_api_keys_toml(config_path);
533
- }
534
-
535
- return cfg;
536
- }
537
-
538
// True when `pid` refers to a process we can signal: kill with signal 0 probes
// existence/permission without delivering anything. Non-positive pids are
// never considered alive.
static bool is_alive(pid_t pid) {
    return pid > 0 && kill(pid, 0) == 0;
}
542
-
543
// Gracefully stop a worker process: send SIGTERM, poll for exit for up to
// `wait_seconds`, and escalate to SIGKILL (followed by a blocking reap) if it
// does not exit in time. Safe to call with a non-positive pid (no-op).
static void shutdown_worker(pid_t pid, int wait_seconds = 15) {
    if (pid <= 0) return;
    kill(pid, SIGTERM);
    const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(wait_seconds);
    while (std::chrono::steady_clock::now() < deadline) {
        int status = 0;
        // WNOHANG: non-blocking reap so the loop can keep checking the deadline.
        pid_t r = waitpid(pid, &status, WNOHANG);
        if (r == pid) return;
        std::this_thread::sleep_for(std::chrono::milliseconds(200));
    }
    // Grace period expired: force-kill, then block until the zombie is reaped.
    kill(pid, SIGKILL);
    int status = 0;
    waitpid(pid, &status, 0);
}
557
-
558
// Owns the lifecycle of the single active llama-server worker process:
// spawning (fork/exec), HTTP readiness polling, hot model switching,
// restarting, and status reporting. Shared state (_active,
// _switch_in_progress, _next_port) is guarded by `_mu`; the slow operations
// (spawn, readiness wait, shutdown) run outside the lock.
class ModelManager {
public:
    explicit ModelManager(const ManagerConfig &config)
        : _default_model(config.worker.default_model),
          _llama_server_bin(config.worker.llama_server_bin),
          _worker_host(config.worker.host),
          _worker_bind_host(config.worker.bind_host),
          _base_port(config.worker.base_port),
          _switch_timeout_sec(config.worker.switch_timeout_sec),
          _n_ctx(config.llama.n_ctx),
          _n_threads(config.llama.threads),
          _n_gpu_layers(config.llama.ngl),
          _n_batch(config.llama.batch),
          _n_ubatch(config.llama.ubatch),
          _next_port(_base_port) {}

    // Start the configured default model; blocks until ready or timed out.
    bool initialize_default(std::string &error) {
        return switch_model(_default_model, error);
    }

    // Switch to `model`: start a new worker, wait for readiness, promote it,
    // then shut down the old worker. The old worker keeps serving while the
    // new one warms up. Succeeds immediately if `model` is already active and
    // alive; fails with "Switch already in progress" if another switch is
    // in flight (only one at a time).
    bool switch_model(const std::string &model, std::string &error) {
        {
            std::lock_guard<std::mutex> lock(_mu);
            if (_switch_in_progress) {
                error = "Switch already in progress";
                return false;
            }
            if (_active && _active->model == model && is_alive(_active->pid)) {
                return true;
            }
            _switch_in_progress = true;
        }

        std::optional<WorkerInfo> old_worker;
        {
            std::lock_guard<std::mutex> lock(_mu);
            if (_active) old_worker = _active;
        }

        int port = allocate_port();
        pid_t pid = spawn_worker(model, port);
        if (pid <= 0) {
            finish_switch(false);
            error = "Failed to start worker process";
            return false;
        }

        if (!wait_until_ready(pid, port, _switch_timeout_sec)) {
            shutdown_worker(pid);
            finish_switch(false);
            error = "New model did not become ready in time";
            return false;
        }

        WorkerInfo new_worker{model, port, pid, now_utc_iso()};
        {
            std::lock_guard<std::mutex> lock(_mu);
            _active = new_worker;
            _switch_in_progress = false;
        }

        // Only kill the old worker after the replacement has been promoted.
        if (old_worker && old_worker->pid != pid) {
            shutdown_worker(old_worker->pid);
        }
        return true;
    }

    // Restart the currently active model. Unlike switch_model(), the old
    // worker is stopped *before* the replacement starts (brief downtime),
    // which also aborts whatever the worker was processing. On failure the
    // active worker is cleared entirely.
    bool restart_active(std::string &error) {
        std::optional<WorkerInfo> old_worker;
        std::string model;
        {
            std::lock_guard<std::mutex> lock(_mu);
            if (_switch_in_progress) {
                error = "Switch already in progress";
                return false;
            }
            if (!_active || !is_alive(_active->pid)) {
                error = "No active model";
                return false;
            }
            _switch_in_progress = true;
            old_worker = _active;
            model = _active->model;
        }

        shutdown_worker(old_worker->pid);

        int port = allocate_port();
        pid_t pid = spawn_worker(model, port);
        if (pid <= 0) {
            std::lock_guard<std::mutex> lock(_mu);
            _active = std::nullopt;
            _switch_in_progress = false;
            error = "Failed to start worker process";
            return false;
        }

        if (!wait_until_ready(pid, port, _switch_timeout_sec)) {
            shutdown_worker(pid);
            std::lock_guard<std::mutex> lock(_mu);
            _active = std::nullopt;
            _switch_in_progress = false;
            error = "New model did not become ready in time";
            return false;
        }

        WorkerInfo new_worker{model, port, pid, now_utc_iso()};
        {
            std::lock_guard<std::mutex> lock(_mu);
            _active = new_worker;
            _switch_in_progress = false;
        }
        return true;
    }

    // Snapshot of the active worker, or nullopt when none is alive.
    std::optional<WorkerInfo> active_worker() {
        std::lock_guard<std::mutex> lock(_mu);
        if (_active && is_alive(_active->pid)) return _active;
        return std::nullopt;
    }

    // JSON status document served by the control-plane endpoints: current
    // model, pid/port, load time, and whether a switch is in flight.
    json models_view() {
        std::lock_guard<std::mutex> lock(_mu);
        json out;
        out["status"] = (_active && is_alive(_active->pid)) ? "ready" : "no_active_model";
        out["switch_in_progress"] = _switch_in_progress;
        if (_active && is_alive(_active->pid)) {
            out["current_model"] = _active->model;
            out["last_loaded"] = _active->last_loaded;
            out["active_pid"] = _active->pid;
            out["active_port"] = _active->port;
        } else {
            out["current_model"] = nullptr;
            out["last_loaded"] = nullptr;
            out["active_pid"] = nullptr;
            out["active_port"] = nullptr;
        }
        return out;
    }

private:
    std::mutex _mu;  // guards _active, _switch_in_progress, _next_port
    std::optional<WorkerInfo> _active;
    bool _switch_in_progress = false;

    std::string _default_model;
    std::string _llama_server_bin;
    std::string _worker_host;       // host used for readiness probes
    std::string _worker_bind_host;  // host llama-server binds to
    int _base_port;
    int _switch_timeout_sec;
    int _n_ctx;
    int _n_threads;
    int _n_gpu_layers;
    int _n_batch;
    int _n_ubatch;
    int _next_port;  // ports increase monotonically; never reused in-process

    // Hand out the next worker port. Each new worker gets a fresh port so the
    // old and new workers can coexist during a switch.
    int allocate_port() {
        std::lock_guard<std::mutex> lock(_mu);
        return _next_port++;
    }

    // Clear the in-progress flag on a failed switch. The success path clears
    // it inline while publishing the new worker, so `ok == true` is a no-op.
    void finish_switch(bool ok) {
        std::lock_guard<std::mutex> lock(_mu);
        if (!ok) _switch_in_progress = false;
    }

    // fork/exec a llama-server child for `model` on `port`. The child calls
    // setsid() to detach into its own session and _exit(127) if exec fails.
    pid_t spawn_worker(const std::string &model, int port) {
        pid_t pid = fork();
        if (pid < 0) return -1;
        if (pid == 0) {
            setsid();
            std::string port_s = std::to_string(port);
            std::string n_ctx_s = std::to_string(_n_ctx);
            std::string threads_s = std::to_string(_n_threads);
            std::string ngl_s = std::to_string(_n_gpu_layers);
            std::string batch_s = std::to_string(_n_batch);
            std::string ubatch_s = std::to_string(_n_ubatch);

            // -hf: model is pulled by its Hugging Face repo id.
            std::vector<std::string> args = {
                _llama_server_bin,
                "-hf", model,
                "--host", _worker_bind_host,
                "--port", port_s,
                "-c", n_ctx_s,
                "-t", threads_s,
                "-ngl", ngl_s,
                "--cont-batching",
                "-b", batch_s,
                "--ubatch-size", ubatch_s
            };

            std::vector<char *> argv;
            argv.reserve(args.size() + 1);
            for (auto &s : args) argv.push_back(const_cast<char *>(s.c_str()));
            argv.push_back(nullptr);
            execvp(argv[0], argv.data());
            _exit(127);
        }
        return pid;
    }

    // Poll GET / on the worker until it answers 200, the child dies, or the
    // timeout elapses. Connection errors during startup are expected and
    // swallowed.
    bool wait_until_ready(pid_t pid, int port, int timeout_sec) {
        const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(timeout_sec);
        while (std::chrono::steady_clock::now() < deadline) {
            if (!is_alive(pid)) return false;
            try {
                auto [status, _] = http_get(port, "/");
                if (status == 200) return true;
            } catch (...) {
            }
            std::this_thread::sleep_for(std::chrono::milliseconds(800));
        }
        return false;
    }

    // Minimal blocking HTTP/1.1 GET against the worker (Boost.Beast).
    // Returns {status, body}; throws on connect/read failure.
    std::pair<int, std::string> http_get(int port, const std::string &target) {
        asio::io_context ioc;
        asio::ip::tcp::resolver resolver(ioc);
        beast::tcp_stream stream(ioc);
        auto const results = resolver.resolve(_worker_host, std::to_string(port));
        stream.connect(results);

        http::request<http::string_body> req{http::verb::get, target, 11};
        req.set(http::field::host, _worker_host);
        req.set(http::field::user_agent, "llm-manager");
        http::write(stream, req);

        beast::flat_buffer buffer;
        http::response<http::string_body> res;
        http::read(stream, buffer, res);
        beast::error_code ec;
        // Best-effort shutdown; the error code is intentionally ignored.
        stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
        return {res.result_int(), res.body()};
    }
};
795
-
796
// Monotonic counter assigning a unique id to every incoming HTTP request.
static std::atomic<uint64_t> g_req_id{1};
797
-
798
// Per-API-key token-bucket rate limiting on two axes: requests per minute and
// estimated tokens per minute. A configured limit of <= 0 disables that axis;
// when both are disabled, allow() is a no-op that always admits.
class RateLimiterStore {
public:
    explicit RateLimiterStore(const RateLimitConfig &config)
        : _requests_per_minute(std::max(0, config.requests_per_minute)),
          _estimated_tokens_per_minute(std::max(0, config.estimated_tokens_per_minute)) {}

    // Check-and-consume: returns a default (allowing) decision, or a denial
    // when either bucket lacks capacity. Buckets are refilled lazily from
    // elapsed wall-clock time on every call; nothing is debited on denial.
    RateLimitDecision allow(const std::string &api_key_id, int estimated_tokens) {
        if (_requests_per_minute <= 0 && _estimated_tokens_per_minute <= 0) {
            return {};
        }

        std::lock_guard<std::mutex> lock(_mu);
        // operator[] creates a fresh (never-refilled) bucket on first use.
        auto &bucket = _buckets[api_key_id];
        const auto now = std::chrono::steady_clock::now();
        refill(bucket.request_tokens, bucket.last_request_refill, _requests_per_minute, now);
        refill(bucket.estimated_tokens, bucket.last_estimated_refill, _estimated_tokens_per_minute, now);

        if (_requests_per_minute > 0 && bucket.request_tokens < 1.0) {
            return {false, 1, "Rate limit exceeded: requests"};
        }
        if (_estimated_tokens_per_minute > 0 && bucket.estimated_tokens < estimated_tokens) {
            return {false, 1, "Rate limit exceeded: estimated tokens"};
        }

        // Both checks passed: debit each enabled bucket.
        if (_requests_per_minute > 0) bucket.request_tokens -= 1.0;
        if (_estimated_tokens_per_minute > 0) bucket.estimated_tokens -= estimated_tokens;
        return {};
    }

private:
    // One pair of buckets per API key id.
    struct Bucket {
        double request_tokens = 0.0;
        double estimated_tokens = 0.0;
        std::chrono::steady_clock::time_point last_request_refill{};
        std::chrono::steady_clock::time_point last_estimated_refill{};
    };

    std::mutex _mu;  // guards _buckets
    std::unordered_map<std::string, Bucket> _buckets;
    int _requests_per_minute;
    int _estimated_tokens_per_minute;

    // Continuous refill: a never-touched bucket (default time_point) starts
    // full; afterwards tokens accrue proportionally to elapsed milliseconds,
    // capped at the per-minute limit.
    static void refill(
        double &tokens,
        std::chrono::steady_clock::time_point &last_refill,
        int limit_per_minute,
        std::chrono::steady_clock::time_point now) {
        if (limit_per_minute <= 0) return;
        if (last_refill.time_since_epoch().count() == 0) {
            tokens = limit_per_minute;
            last_refill = now;
            return;
        }
        const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - last_refill).count();
        if (elapsed <= 0) return;
        const double refill_amount = (static_cast<double>(limit_per_minute) * elapsed) / 60000.0;
        tokens = std::min(static_cast<double>(limit_per_minute), tokens + refill_amount);
        last_refill = now;
    }
};
858
-
859
// Tracks every request by id so handlers can await completion and the cancel
// endpoints can find them. Completion is signalled through each context's own
// mutex/condition variable. NOTE(review): nothing in this class ever erases
// entries from _requests — confirm pruning happens elsewhere, otherwise the
// map grows for the lifetime of the process.
class RequestRegistry {
public:
    // Register a new context populated from the authenticated principal and
    // the token estimate; created/enqueue timestamps are set to "now".
    std::shared_ptr<RequestContext> create(
        const std::string &request_id,
        const ApiKeyRecord &principal,
        const TokenEstimate &estimate,
        const std::string &request_body) {
        auto ctx = std::make_shared<RequestContext>();
        ctx->request_id = request_id;
        ctx->api_key_id = principal.key_id;
        ctx->role = principal.role;
        ctx->priority = role_to_priority(principal.role);
        ctx->estimate = estimate;
        ctx->request_body = request_body;
        ctx->created_at = std::chrono::steady_clock::now();
        ctx->enqueue_time = ctx->created_at;

        std::lock_guard<std::mutex> lock(_mu);
        _requests[request_id] = ctx;
        return ctx;
    }

    // Look up a context by id; nullptr when unknown.
    std::shared_ptr<RequestContext> find(const std::string &request_id) const {
        std::lock_guard<std::mutex> lock(_mu);
        const auto it = _requests.find(request_id);
        if (it == _requests.end()) return nullptr;
        return it->second;
    }

    // Overwrite a request's state; no-op for unknown ids.
    void mark_state(const std::string &request_id, RequestState state) {
        auto ctx = find(request_id);
        if (ctx) ctx->state.store(state);
    }

    // Publish the final state/result under the context's own mutex and wake
    // every thread blocked on ctx->cv.
    void complete(const std::shared_ptr<RequestContext> &ctx, RequestState state, RequestResult result) {
        {
            std::lock_guard<std::mutex> lock(ctx->mu);
            ctx->state.store(state);
            ctx->result = std::move(result);
            ctx->completed = true;
        }
        ctx->cv.notify_all();
    }

    // Cancel one request. QUEUED requests are completed immediately with 499;
    // RUNNING requests are only flagged/marked — the scheduler observes the
    // cancelled flag and produces the final completion.
    std::shared_ptr<RequestContext> cancel_request(const std::string &request_id) {
        auto ctx = find(request_id);
        if (!ctx) return nullptr;

        ctx->cancelled.store(true);
        const auto state = ctx->state.load();
        if (state == RequestState::QUEUED) {
            complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
        } else if (state == RequestState::RUNNING) {
            ctx->state.store(RequestState::CANCELLED);
        }
        return ctx;
    }

    // Cancel everything (used by /stop) and return all contexts. The QUEUED
    // branch has the same effect as complete(), performed inline while
    // iterating under _mu.
    std::vector<std::shared_ptr<RequestContext>> cancel_all() {
        std::vector<std::shared_ptr<RequestContext>> out;
        std::lock_guard<std::mutex> lock(_mu);
        out.reserve(_requests.size());
        for (auto &[_, ctx] : _requests) {
            ctx->cancelled.store(true);
            const auto state = ctx->state.load();
            if (state == RequestState::QUEUED) {
                {
                    std::lock_guard<std::mutex> ctx_lock(ctx->mu);
                    ctx->state.store(RequestState::CANCELLED);
                    ctx->result = {499, R"({"error":"Request cancelled"})"};
                    ctx->completed = true;
                }
                ctx->cv.notify_all();
            } else if (state == RequestState::RUNNING) {
                ctx->state.store(RequestState::CANCELLED);
            }
            out.push_back(ctx);
        }
        return out;
    }

private:
    mutable std::mutex _mu;  // guards _requests
    std::unordered_map<std::string, std::shared_ptr<RequestContext>> _requests;
};
944
-
945
// Bounded two-level queue feeding the scheduler thread. Admission is capped
// both by request count and by the sum of estimated tokens. Admin requests
// are preferred but limited to `admin_quota` consecutive pops so user
// requests cannot starve; within the chosen level, the request with the
// smallest estimated token total is served first (shortest-job-first).
class PrioritySchedulerQueue {
public:
    explicit PrioritySchedulerQueue(const QueueConfig &config)
        : _max_size(config.max_size),
          _max_tokens(config.max_tokens),
          _admin_quota(std::max(1, config.admin_quota)),
          _retry_after_sec(std::max(1, config.retry_after_sec)) {}

    // Non-blocking admission; false when either capacity bound would be
    // exceeded (caller responds 503/429 with Retry-After).
    bool try_push(const std::shared_ptr<RequestContext> &ctx) {
        std::lock_guard<std::mutex> lock(_mu);
        if (_current_size >= _max_size) return false;
        if (_current_tokens + ctx->estimate.estimated_total_tokens > _max_tokens) return false;

        if (ctx->priority == Priority::ADMIN) {
            _admin_queue.push_back(ctx);
        } else {
            _user_queue.push_back(ctx);
        }
        ++_current_size;
        _current_tokens += ctx->estimate.estimated_total_tokens;
        _cv.notify_one();
        return true;
    }

    // Block until a request is available; returns nullptr once stop() has
    // been called.
    std::shared_ptr<RequestContext> pop_next() {
        std::unique_lock<std::mutex> lock(_mu);
        _cv.wait(lock, [&]() { return _stopped || _current_size > 0; });
        if (_stopped) return nullptr;

        // Level selection: admin while under its consecutive-pop quota (or
        // when there are no user requests); otherwise user, which resets the
        // admin streak.
        std::deque<std::shared_ptr<RequestContext>> *selected_queue = nullptr;
        if (!_admin_queue.empty() && (_admin_streak < _admin_quota || _user_queue.empty())) {
            selected_queue = &_admin_queue;
            ++_admin_streak;
        } else if (!_user_queue.empty()) {
            selected_queue = &_user_queue;
            _admin_streak = 0;
        } else if (!_admin_queue.empty()) {
            selected_queue = &_admin_queue;
            _admin_streak = 1;
        }

        if (!selected_queue || selected_queue->empty()) return nullptr;

        // Shortest-job-first within the selected level.
        auto best_it = std::min_element(
            selected_queue->begin(),
            selected_queue->end(),
            [](const auto &a, const auto &b) {
                return a->estimate.estimated_total_tokens < b->estimate.estimated_total_tokens;
            });
        auto ctx = *best_it;
        selected_queue->erase(best_it);
        --_current_size;
        _current_tokens -= ctx->estimate.estimated_total_tokens;
        return ctx;
    }

    // Wake all waiters and make every future pop_next() return nullptr.
    void stop() {
        std::lock_guard<std::mutex> lock(_mu);
        _stopped = true;
        _cv.notify_all();
    }

    // Retry-After value (seconds) advertised when admission fails.
    int retry_after_sec() const {
        return _retry_after_sec;
    }

private:
    mutable std::mutex _mu;
    std::condition_variable _cv;
    std::deque<std::shared_ptr<RequestContext>> _admin_queue;
    std::deque<std::shared_ptr<RequestContext>> _user_queue;
    size_t _max_size;
    size_t _current_size = 0;   // total entries across both deques
    int _max_tokens;
    int _current_tokens = 0;    // sum of estimated tokens currently queued
    int _admin_quota;
    int _admin_streak = 0;      // consecutive admin pops since the last user pop
    int _retry_after_sec;
    bool _stopped = false;
};
1025
-
1026
// Single-threaded executor: drains the priority queue and forwards each
// request to the active llama-server worker, publishing the outcome through
// the registry. The worker thread lives for the Scheduler's lifetime and is
// stopped/joined by the destructor.
class Scheduler {
public:
    Scheduler(ModelManager &manager, RequestRegistry &registry, const QueueConfig &queue_config)
        : _manager(manager), _registry(registry), _queue(queue_config) {
        _worker = std::thread([this]() { worker_loop(); });
    }

    // Stop the queue (unblocking pop_next) and join the worker thread.
    ~Scheduler() {
        _queue.stop();
        if (_worker.joinable()) _worker.join();
    }

    // Non-blocking admission; false when the queue is at capacity.
    bool try_enqueue(const std::shared_ptr<RequestContext> &ctx) {
        return _queue.try_push(ctx);
    }

    // Retry-After hint (seconds) for rejected requests.
    int retry_after_sec() const {
        return _queue.retry_after_sec();
    }

private:
    ModelManager &_manager;
    RequestRegistry &_registry;
    PrioritySchedulerQueue _queue;
    std::thread _worker;

    // Runs until pop_next() returns nullptr (queue stopped). Cancellation is
    // checked both before forwarding and after the upstream call returns, so
    // a request cancelled mid-flight is reported to the client as 499 even
    // though the worker already processed it.
    void worker_loop() {
        for (;;) {
            auto ctx = _queue.pop_next();
            if (!ctx) return;

            if (ctx->cancelled.load()) {
                _registry.complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
                continue;
            }

            ctx->state.store(RequestState::RUNNING);
            ctx->start_time = std::chrono::steady_clock::now();
            auto worker = _manager.active_worker();
            if (!worker) {
                _registry.complete(ctx, RequestState::FAILED, {503, R"({"error":"No active model"})"});
                continue;
            }

            try {
                auto [status, body] = forward_chat(*worker, ctx->request_body);
                if (ctx->cancelled.load()) {
                    _registry.complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
                    continue;
                }
                _registry.complete(ctx, RequestState::DONE, {status, body});
            } catch (const std::exception &e) {
                // I/O failure talking to the worker is surfaced as a 500.
                _registry.complete(
                    ctx,
                    RequestState::FAILED,
                    {500, json({{"error", e.what()}}).dump()});
            }
        }
    }
};
1086
-
1087
- class ApiKeyAuth {
1088
- public:
1089
- explicit ApiKeyAuth(const ManagerConfig &config)
1090
- : _header_name(config.auth.header), _scheme(config.auth.scheme) {
1091
- for (const auto &record : config.api_keys) {
1092
- _records_by_secret.emplace(record.secret, record);
1093
- }
1094
- }
1095
-
1096
- bool enabled() const {
1097
- return !_records_by_secret.empty();
1098
- }
1099
-
1100
- template <typename Body, typename Allocator>
1101
- std::optional<ApiKeyRecord> authenticate(
1102
- const http::request<Body, http::basic_fields<Allocator>> &req,
1103
- std::string &error) const {
1104
- if (!enabled()) {
1105
- error.clear();
1106
- return ApiKeyRecord{"anonymous", "", Role::ADMIN, true};
1107
- }
1108
-
1109
- const auto token = extract_bearer_token(req, error);
1110
- if (!token) return std::nullopt;
1111
-
1112
- const auto it = _records_by_secret.find(*token);
1113
- if (it == _records_by_secret.end()) {
1114
- error = "Invalid API key";
1115
- return std::nullopt;
1116
- }
1117
- if (!it->second.enabled) {
1118
- error = "API key disabled";
1119
- return std::nullopt;
1120
- }
1121
- error.clear();
1122
- return it->second;
1123
- }
1124
-
1125
- private:
1126
- std::string _header_name;
1127
- std::string _scheme;
1128
- std::unordered_map<std::string, ApiKeyRecord> _records_by_secret;
1129
-
1130
- template <typename Body, typename Allocator>
1131
- std::optional<std::string> extract_bearer_token(
1132
- const http::request<Body, http::basic_fields<Allocator>> &req,
1133
- std::string &error) const {
1134
- const auto header_it = req.find(_header_name);
1135
- if (header_it == req.end()) {
1136
- error = "Missing authorization header";
1137
- return std::nullopt;
1138
- }
1139
-
1140
- const std::string value = trim_copy(header_it->value().to_string());
1141
- const std::string prefix = _scheme + " ";
1142
- if (value.size() <= prefix.size() || value.rfind(prefix, 0) != 0) {
1143
- error = "Invalid authorization scheme";
1144
- return std::nullopt;
1145
- }
1146
-
1147
- std::string token = trim_copy(value.substr(prefix.size()));
1148
- if (token.empty()) {
1149
- error = "Missing API key";
1150
- return std::nullopt;
1151
- }
1152
- return token;
1153
- }
1154
- };
1155
-
1156
- static void log_line(const std::string &line) {
1157
- std::cout << "[" << now_utc_iso() << "] " << line << std::endl;
1158
- }
1159
-
1160
// Cap `body` at `max_len` bytes for logging, appending a marker whenever
// anything was actually cut off.
static std::string truncate_body(const std::string &body, size_t max_len = 2000) {
    return body.size() <= max_len
        ? body
        : body.substr(0, max_len) + "...[truncated]";
}
1164
-
1165
// Parse "/requests/<id>/cancel" and return <id>, or nullopt when the path
// does not match or the id would be empty. The id may itself contain '/'.
static std::optional<std::string> extract_cancel_request_id(const std::string &path) {
    const std::string prefix = "/requests/";
    const std::string suffix = "/cancel";
    const size_t overhead = prefix.size() + suffix.size();
    if (path.size() <= overhead) return std::nullopt;
    const bool has_prefix = path.compare(0, prefix.size(), prefix) == 0;
    const bool has_suffix = path.compare(path.size() - suffix.size(), suffix.size(), suffix) == 0;
    if (!has_prefix || !has_suffix) return std::nullopt;
    std::string request_id = path.substr(prefix.size(), path.size() - overhead);
    if (request_id.empty()) return std::nullopt;
    return request_id;
}
1175
-
1176
- static std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body) {
1177
- asio::io_context ioc;
1178
- asio::ip::tcp::resolver resolver(ioc);
1179
- beast::tcp_stream stream(ioc);
1180
- auto const results = resolver.resolve("127.0.0.1", std::to_string(worker.port));
1181
- stream.connect(results);
1182
-
1183
- http::request<http::string_body> req{http::verb::post, "/v1/chat/completions", 11};
1184
- req.set(http::field::host, "127.0.0.1");
1185
- req.set(http::field::content_type, "application/json");
1186
- req.set(http::field::user_agent, "llm-manager");
1187
- req.body() = body;
1188
- req.prepare_payload();
1189
- http::write(stream, req);
1190
-
1191
- beast::flat_buffer buffer;
1192
- http::response<http::string_body> res;
1193
- http::read(stream, buffer, res);
1194
- beast::error_code ec;
1195
- stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
1196
- return {res.result_int(), res.body()};
1197
- }
1198
-
1199
// Result of proxying a GET to the worker: status and body plus the upstream
// Content-Type / Content-Encoding headers so the response can be relayed to
// the client without re-encoding.
struct ProxiedGetResult {
    int status = 500;
    std::string body;
    std::string content_type = "text/plain; charset=utf-8";
    // Empty when the worker sent no Content-Encoding header.
    std::string content_encoding;
};
1205
-
1206
- static ProxiedGetResult forward_get_to_worker(const WorkerInfo &worker,
1207
- const std::string &target) {
1208
- asio::io_context ioc;
1209
- asio::ip::tcp::resolver resolver(ioc);
1210
- beast::tcp_stream stream(ioc);
1211
- auto const results = resolver.resolve("127.0.0.1", std::to_string(worker.port));
1212
- stream.connect(results);
1213
-
1214
- http::request<http::string_body> req{http::verb::get, target, 11};
1215
- req.set(http::field::host, "127.0.0.1");
1216
- req.set(http::field::user_agent, "llm-manager");
1217
- req.set(http::field::accept_encoding, "gzip, identity");
1218
- http::write(stream, req);
1219
-
1220
- beast::flat_buffer buffer;
1221
- http::response<http::string_body> res;
1222
- http::read(stream, buffer, res);
1223
- beast::error_code ec;
1224
- stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
1225
- ProxiedGetResult out;
1226
- out.status = res.result_int();
1227
- out.body = res.body();
1228
- if (res.base().find(http::field::content_type) != res.base().end()) {
1229
- out.content_type = res.base()[http::field::content_type].to_string();
1230
- }
1231
- if (res.base().find(http::field::content_encoding) != res.base().end()) {
1232
- out.content_encoding = res.base()[http::field::content_encoding].to_string();
1233
- }
1234
- return out;
1235
- }
1236
-
1237
- template <typename Body, typename Allocator>
1238
- http::response<http::string_body> handle_request(
1239
- ModelManager &manager,
1240
- const ManagerConfig &config,
1241
- const ApiKeyAuth &auth,
1242
- RateLimiterStore &rate_limiter,
1243
- RequestRegistry &registry,
1244
- Scheduler &scheduler,
1245
- http::request<Body, http::basic_fields<Allocator>> &&req) {
1246
- const auto start = std::chrono::steady_clock::now();
1247
- const auto req_id_num = g_req_id.fetch_add(1);
1248
- const std::string request_id = std::to_string(req_id_num);
1249
- const std::string target = req.target().to_string();
1250
- const std::string method = req.method_string().to_string();
1251
- const std::string path = target.substr(0, target.find('?'));
1252
- auto authenticated = std::optional<ApiKeyRecord>{};
1253
-
1254
- log_line("request_id=" + request_id + " method=" + method + " path=" + target);
1255
- if constexpr (std::is_same_v<Body, http::string_body>) {
1256
- if (!req.body().empty()) {
1257
- log_line("request_id=" + request_id + " body=" + truncate_body(req.body()));
1258
- }
1259
- }
1260
-
1261
- auto json_response = [&](http::status status, const json &obj) {
1262
- json payload = obj;
1263
- payload["request_id"] = request_id;
1264
- http::response<http::string_body> res{status, req.version()};
1265
- res.set(http::field::content_type, "application/json");
1266
- res.set(http::field::server, "llm-manager");
1267
- res.set("X-Request-Id", request_id);
1268
- res.keep_alive(req.keep_alive());
1269
- res.body() = payload.dump();
1270
- res.prepare_payload();
1271
- auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
1272
- std::chrono::steady_clock::now() - start).count();
1273
- log_line("request_id=" + request_id + " status=" + std::to_string(res.result_int()) +
1274
- " elapsed_ms=" + std::to_string(elapsed_ms));
1275
- return res;
1276
- };
1277
-
1278
- auto json_response_with_retry_after = [&](http::status status, const json &obj, int retry_after_sec) {
1279
- auto res = json_response(status, obj);
1280
- res.set(http::field::retry_after, std::to_string(std::max(1, retry_after_sec)));
1281
- return res;
1282
- };
1283
-
1284
- auto ensure_authenticated = [&](Role minimum_role) -> std::optional<http::response<http::string_body>> {
1285
- std::string auth_error;
1286
- authenticated = auth.authenticate(req, auth_error);
1287
- if (!authenticated) {
1288
- return json_response(http::status::unauthorized, {{"error", auth_error}});
1289
- }
1290
- if (minimum_role == Role::ADMIN && authenticated->role != Role::ADMIN) {
1291
- return json_response(http::status::forbidden, {{"error", "Admin role required"}});
1292
- }
1293
- log_line("request_id=" + request_id +
1294
- " api_key_id=" + authenticated->key_id +
1295
- " role=" + role_to_string(authenticated->role));
1296
- return std::nullopt;
1297
- };
1298
-
1299
- try {
1300
- if (path == "/health" && req.method() == http::verb::get) {
1301
- return json_response(http::status::ok, manager.models_view());
1302
- }
1303
-
1304
- if (path == "/models" && req.method() == http::verb::get) {
1305
- return json_response(http::status::ok, manager.models_view());
1306
- }
1307
-
1308
- if (path == "/switch-model" && req.method() == http::verb::post) {
1309
- if (auto auth_res = ensure_authenticated(Role::ADMIN)) return *auth_res;
1310
- std::string body(req.body().data(), req.body().size());
1311
- json j = json::parse(body, nullptr, false);
1312
- if (j.is_discarded()) {
1313
- return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
1314
- }
1315
- std::string model;
1316
- if (j.contains("model_name")) model = j["model_name"].get<std::string>();
1317
- if (j.contains("model")) model = j["model"].get<std::string>();
1318
- if (model.empty()) {
1319
- return json_response(http::status::bad_request, {{"error", "Expected 'model' or 'model_name'"}});
1320
- }
1321
-
1322
- std::string err;
1323
- bool ok = manager.switch_model(model, err);
1324
- if (!ok) {
1325
- auto status = (err == "Switch already in progress") ? http::status::conflict : http::status::internal_server_error;
1326
- return json_response(status, {{"status", "error"}, {"error", err}});
1327
- }
1328
- auto state = manager.models_view();
1329
- state["message"] = "Switched model successfully";
1330
- return json_response(http::status::ok, state);
1331
- }
1332
-
1333
- if (path == "/stop" && req.method() == http::verb::post) {
1334
- if (auto auth_res = ensure_authenticated(Role::ADMIN)) return *auth_res;
1335
- registry.cancel_all();
1336
- std::string err;
1337
- bool ok = manager.restart_active(err);
1338
- if (!ok) {
1339
- http::status status = http::status::internal_server_error;
1340
- if (err == "Switch already in progress") {
1341
- status = http::status::conflict;
1342
- } else if (err == "No active model") {
1343
- status = http::status::service_unavailable;
1344
- }
1345
- return json_response(status, {{"status", "error"}, {"error", err}});
1346
- }
1347
- auto state = manager.models_view();
1348
- state["message"] = "Stopped in-flight prompts and restarted model";
1349
- return json_response(http::status::ok, state);
1350
- }
1351
-
1352
- if (req.method() == http::verb::post) {
1353
- if (auto cancel_id = extract_cancel_request_id(path)) {
1354
- if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
1355
- auto ctx = registry.find(*cancel_id);
1356
- if (!ctx) {
1357
- return json_response(http::status::not_found, {{"error", "Unknown request id"}});
1358
- }
1359
- if (authenticated->role != Role::ADMIN && authenticated->key_id != ctx->api_key_id) {
1360
- return json_response(http::status::forbidden, {{"error", "Cannot cancel another API key request"}});
1361
- }
1362
-
1363
- const auto previous_state = ctx->state.load();
1364
- registry.cancel_request(*cancel_id);
1365
- std::string restart_error;
1366
- bool restarted = true;
1367
- if (previous_state == RequestState::RUNNING) {
1368
- restarted = manager.restart_active(restart_error);
1369
- }
1370
-
1371
- json payload = {
1372
- {"cancelled_request_id", *cancel_id},
1373
- {"state", state_to_string(ctx->state.load())}
1374
- };
1375
- if (!restarted) {
1376
- payload["restart_error"] = restart_error;
1377
- }
1378
- return json_response(http::status::ok, payload);
1379
- }
1380
- }
1381
-
1382
- if (path == "/v1/chat/completions" && req.method() == http::verb::post) {
1383
- if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
1384
- json payload = json::parse(req.body(), nullptr, false);
1385
- if (payload.is_discarded()) {
1386
- return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
1387
- }
1388
- std::string token_error;
1389
- auto estimate = estimate_chat_tokens(payload, config.limits, token_error);
1390
- if (!estimate) {
1391
- return json_response(http::status::bad_request, {{"error", token_error}});
1392
- }
1393
- log_line("request_id=" + request_id +
1394
- " prompt_tokens=" + std::to_string(estimate->prompt_tokens) +
1395
- " max_tokens=" + std::to_string(estimate->requested_max_tokens) +
1396
- " estimated_total_tokens=" + std::to_string(estimate->estimated_total_tokens));
1397
-
1398
- auto rate_limit_decision = rate_limiter.allow(
1399
- authenticated->key_id,
1400
- estimate->estimated_total_tokens);
1401
- if (!rate_limit_decision.allowed) {
1402
- return json_response_with_retry_after(
1403
- http::status::too_many_requests,
1404
- {{"error", rate_limit_decision.error}},
1405
- rate_limit_decision.retry_after_sec);
1406
- }
1407
-
1408
- auto ctx = registry.create(request_id, *authenticated, *estimate, req.body());
1409
- if (!scheduler.try_enqueue(ctx)) {
1410
- ctx->cancelled.store(true);
1411
- registry.complete(ctx, RequestState::CANCELLED, {503, R"({"error":"Queue full"})"});
1412
- return json_response_with_retry_after(
1413
- http::status::service_unavailable,
1414
- {{"error", "Queue full"}},
1415
- scheduler.retry_after_sec());
1416
- }
1417
-
1418
- std::unique_lock<std::mutex> lock(ctx->mu);
1419
- const bool finished = ctx->cv.wait_for(
1420
- lock,
1421
- std::chrono::seconds(std::max(1, config.limits.request_timeout_sec)),
1422
- [&]() { return ctx->completed; });
1423
- if (!finished) {
1424
- lock.unlock();
1425
- registry.cancel_request(request_id);
1426
- std::string restart_error;
1427
- bool restarted = true;
1428
- if (ctx->state.load() == RequestState::RUNNING) {
1429
- restarted = manager.restart_active(restart_error);
1430
- }
1431
- json timeout_payload = {
1432
- {"error", "Request timed out"},
1433
- {"state", state_to_string(ctx->state.load())}
1434
- };
1435
- if (!restarted) timeout_payload["restart_error"] = restart_error;
1436
- return json_response(http::status::gateway_timeout, timeout_payload);
1437
- }
1438
-
1439
- const auto final_state = ctx->state.load();
1440
- RequestResult result = ctx->result;
1441
- lock.unlock();
1442
-
1443
- if (final_state == RequestState::CANCELLED) {
1444
- return json_response(http::status::ok, {{"status", "cancelled"}});
1445
- }
1446
-
1447
- http::response<http::string_body> res{
1448
- static_cast<http::status>(result.status), req.version()};
1449
- res.set(http::field::content_type, result.content_type);
1450
- res.set(http::field::server, "llm-manager");
1451
- res.set("X-Request-Id", request_id);
1452
- res.keep_alive(req.keep_alive());
1453
- res.body() = result.body;
1454
- res.prepare_payload();
1455
- auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
1456
- std::chrono::steady_clock::now() - start).count();
1457
- log_line("request_id=" + request_id +
1458
- " final_state=" + state_to_string(final_state) +
1459
- " upstream_status=" + std::to_string(result.status) +
1460
- " elapsed_ms=" + std::to_string(elapsed_ms));
1461
- return res;
1462
- }
1463
-
1464
- // Proxy GET requests not handled by manager endpoints to active llama-server.
1465
- // This enables llama.cpp UI/static routes (including "/").
1466
- if (req.method() == http::verb::get) {
1467
- auto worker = manager.active_worker();
1468
- if (!worker) {
1469
- return json_response(http::status::service_unavailable, {{"error", "No active model"}});
1470
- }
1471
- auto upstream = forward_get_to_worker(*worker, target);
1472
- http::response<http::string_body> res{
1473
- static_cast<http::status>(upstream.status), req.version()};
1474
- res.set(http::field::content_type, upstream.content_type);
1475
- if (!upstream.content_encoding.empty()) {
1476
- res.set(http::field::content_encoding, upstream.content_encoding);
1477
- }
1478
- res.set(http::field::server, "llm-manager");
1479
- res.set("X-Request-Id", request_id);
1480
- res.keep_alive(req.keep_alive());
1481
- res.body() = upstream.body;
1482
- res.prepare_payload();
1483
- auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
1484
- std::chrono::steady_clock::now() - start)
1485
- .count();
1486
- log_line("request_id=" + request_id +
1487
- " proxied_get model=" + worker->model +
1488
- " upstream_status=" + std::to_string(upstream.status) +
1489
- " elapsed_ms=" + std::to_string(elapsed_ms));
1490
- return res;
1491
- }
1492
-
1493
- return json_response(http::status::not_found, {{"error", "Not found"}});
1494
- } catch (const std::exception &e) {
1495
- return json_response(http::status::internal_server_error, {{"error", e.what()}});
1496
- }
1497
- }
1498
-
1499
- void do_session(
1500
- asio::ip::tcp::socket socket,
1501
- ModelManager &manager,
1502
- const ManagerConfig &config,
1503
- const ApiKeyAuth &auth,
1504
- RateLimiterStore &rate_limiter,
1505
- RequestRegistry &registry,
1506
- Scheduler &scheduler) {
1507
- try {
1508
- beast::flat_buffer buffer;
1509
- http::request<http::string_body> req;
1510
- http::read(socket, buffer, req);
1511
- auto res = handle_request(manager, config, auth, rate_limiter, registry, scheduler, std::move(req));
1512
- http::write(socket, res);
1513
- beast::error_code ec;
1514
- socket.shutdown(asio::ip::tcp::socket::shutdown_send, ec);
1515
- } catch (...) {
1516
- }
1517
- }
1518
 
1519
  int main() {
1520
  const ManagerConfig config = load_manager_config();
1521
  const auto &bind_host = config.server.host;
1522
  const int bind_port = config.server.port;
 
1523
  ModelManager manager(config);
1524
  ApiKeyAuth auth(config);
1525
  RateLimiterStore rate_limiter(config.rate_limit);
1526
  RequestRegistry registry;
1527
- Scheduler scheduler(manager, registry, config.queue);
 
1528
 
1529
  std::string init_error;
1530
  if (auth.enabled()) {
@@ -1532,6 +31,7 @@ int main() {
1532
  } else {
1533
  log_line("auth: disabled (no configured api keys)");
1534
  }
 
1535
  log_line("startup: loading default model");
1536
  if (!manager.initialize_default(init_error)) {
1537
  log_line("startup: default model failed: " + init_error);
@@ -1540,7 +40,9 @@ int main() {
1540
  }
1541
 
1542
  asio::io_context ioc{1};
1543
- asio::ip::tcp::acceptor acceptor{ioc, {asio::ip::make_address(bind_host), static_cast<unsigned short>(bind_port)}};
 
 
1544
  log_line("manager listening on " + bind_host + ":" + std::to_string(bind_port));
1545
 
1546
  for (;;) {
@@ -1554,6 +56,7 @@ int main() {
1554
  std::cref(auth),
1555
  std::ref(rate_limiter),
1556
  std::ref(registry),
 
1557
  std::ref(scheduler)).detach();
1558
  }
1559
  }
 
1
  #include <boost/asio.hpp>
 
 
 
 
2
 
3
+ #include <functional>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  #include <string>
5
  #include <thread>
6
+ #include <utility>
 
7
 
8
+ #include "config.h"
9
+ #include "http_helpers.h"
10
+ #include "model_manager.h"
11
+ #include "runtime_components.h"
12
+ #include "server.h"
13
 
14
  namespace asio = boost::asio;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  int main() {
17
  const ManagerConfig config = load_manager_config();
18
  const auto &bind_host = config.server.host;
19
  const int bind_port = config.server.port;
20
+
21
  ModelManager manager(config);
22
  ApiKeyAuth auth(config);
23
  RateLimiterStore rate_limiter(config.rate_limit);
24
  RequestRegistry registry;
25
+ MetricsRegistry metrics;
26
+ Scheduler scheduler(manager, registry, metrics, config.queue);
27
 
28
  std::string init_error;
29
  if (auth.enabled()) {
 
31
  } else {
32
  log_line("auth: disabled (no configured api keys)");
33
  }
34
+
35
  log_line("startup: loading default model");
36
  if (!manager.initialize_default(init_error)) {
37
  log_line("startup: default model failed: " + init_error);
 
40
  }
41
 
42
  asio::io_context ioc{1};
43
+ asio::ip::tcp::acceptor acceptor{
44
+ ioc,
45
+ {asio::ip::make_address(bind_host), static_cast<unsigned short>(bind_port)}};
46
  log_line("manager listening on " + bind_host + ":" + std::to_string(bind_port));
47
 
48
  for (;;) {
 
56
  std::cref(auth),
57
  std::ref(rate_limiter),
58
  std::ref(registry),
59
+ std::ref(metrics),
60
  std::ref(scheduler)).detach();
61
  }
62
  }
cpp/llm_manager_types.cpp ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "llm_manager_types.h"
2
+
3
+ std::string role_to_string(Role role) {
4
+ return role == Role::ADMIN ? "admin" : "user";
5
+ }
6
+
7
+ Priority role_to_priority(Role role) {
8
+ return role == Role::ADMIN ? Priority::ADMIN : Priority::USER;
9
+ }
10
+
11
+ std::string state_to_string(RequestState state) {
12
+ switch (state) {
13
+ case RequestState::QUEUED: return "queued";
14
+ case RequestState::RUNNING: return "running";
15
+ case RequestState::CANCELLED: return "cancelled";
16
+ case RequestState::FAILED: return "failed";
17
+ case RequestState::DONE: return "done";
18
+ }
19
+ return "unknown";
20
+ }
cpp/llm_manager_types.h ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include <nlohmann/json.hpp>

#include <sys/types.h>

using json = nlohmann::json;

// Listen address/port for the manager's public HTTP endpoint.
struct ServerConfig {
  std::string host = "0.0.0.0";
  int port = 7860;
};

// How llama-server worker processes are launched and supervised.
struct WorkerConfig {
  std::string default_model = "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m";  // loaded at startup
  std::string llama_server_bin = "/usr/local/bin/llama-server";
  std::string host = "127.0.0.1";      // address the manager dials for health checks / proxying
  std::string bind_host = "0.0.0.0";   // address the worker binds (--host)
  int base_port = 8080;                // first worker port; later workers count upward
  int switch_timeout_sec = 300;        // how long to wait for a new worker to become ready
};

// llama.cpp runtime knobs forwarded to llama-server.
struct LlamaConfig {
  int n_ctx = 8192;   // context window (-c)
  int threads = 4;    // CPU threads (-t)
  int ngl = 0;        // GPU layers (-ngl)
  int batch = 128;    // batch size (-b)
  int ubatch = 64;    // micro-batch size (--ubatch-size)
};

// Authorization tiers for API keys.
enum class Role {
  ADMIN,
  USER
};

// Where/how the API key is presented on incoming requests.
struct AuthConfig {
  std::string header = "Authorization";
  std::string scheme = "Bearer";
};

// Per-request token and timeout limits.
struct LimitsConfig {
  int default_max_tokens = 256;       // used when a payload omits max_tokens
  int max_tokens_per_request = 2048;  // cap on prompt + completion estimate
  int request_timeout_sec = 30;       // wall-clock wait before a request is abandoned
};

// Bounds on the shared scheduling queue.
struct QueueConfig {
  size_t max_size = 100;
  int max_tokens = 20000;
  int admin_quota = 3;
  int retry_after_sec = 5;   // Retry-After hint when the queue rejects work
};

// Per-API-key rate limits; values of 0 disable the corresponding limit.
struct RateLimitConfig {
  int requests_per_minute = 60;
  int estimated_tokens_per_minute = 6000;
};

struct SchedulerConfig {
  int max_concurrent = 1;   // requests executed against the worker at once
};

struct StreamingConfig {
  bool enabled = false;
};

// One configured API key record.
struct ApiKeyRecord {
  std::string key_id;
  std::string secret;
  Role role = Role::USER;
  bool enabled = true;
};

// Aggregate of every configuration section.
struct ManagerConfig {
  ServerConfig server;
  WorkerConfig worker;
  LlamaConfig llama;
  AuthConfig auth;
  LimitsConfig limits;
  QueueConfig queue;
  RateLimitConfig rate_limit;
  SchedulerConfig scheduler;
  StreamingConfig streaming;
  std::vector<ApiKeyRecord> api_keys;
};

// A running llama-server child process.
struct WorkerInfo {
  std::string model;
  int port = 0;
  pid_t pid = -1;
  std::string last_loaded;   // timestamp string set when the worker became active
};

// Queue priority buckets; lower value is served first.
enum class Priority {
  ADMIN = 0,
  USER = 1
};

// Lifecycle of a proxied inference request.
enum class RequestState {
  QUEUED,
  RUNNING,
  CANCELLED,
  FAILED,
  DONE
};

// Rough token accounting used for limit checks and rate limiting.
struct TokenEstimate {
  int prompt_tokens = 0;
  int requested_max_tokens = 0;
  int estimated_total_tokens = 0;
};

// Outcome of a rate-limiter check.
struct RateLimitDecision {
  bool allowed = true;
  int retry_after_sec = 0;
  std::string error;   // populated only when !allowed
};

// Point-in-time view of the scheduler queue.
struct QueueSnapshot {
  size_t total_size = 0;
  size_t admin_size = 0;
  size_t user_size = 0;
  int total_tokens = 0;
};

// Upstream response captured for a completed request.
struct RequestResult {
  int status = 500;
  std::string body;
  std::string content_type = "application/json";
};

// Shared state for one in-flight request. The HTTP handler waits on `cv`
// until the worker side sets `completed` and fills `result` (both guarded
// by `mu`).
struct RequestContext {
  std::string request_id;
  std::string api_key_id;
  Role role = Role::USER;
  Priority priority = Priority::USER;
  TokenEstimate estimate;
  std::string request_body;
  std::atomic<RequestState> state{RequestState::QUEUED};
  std::atomic<bool> cancelled{false};
  std::chrono::steady_clock::time_point created_at{std::chrono::steady_clock::now()};
  std::chrono::steady_clock::time_point enqueue_time{created_at};
  std::chrono::steady_clock::time_point start_time{};
  std::mutex mu;                // guards completed/result together with cv
  std::condition_variable cv;
  bool completed = false;
  RequestResult result;
};

// Response captured when proxying a GET to the worker (UI/static routes).
struct ProxiedGetResult {
  int status = 500;
  std::string body;
  std::string content_type = "text/plain; charset=utf-8";
  std::string content_encoding;   // forwarded when the worker compressed the body
};

std::string role_to_string(Role role);
Priority role_to_priority(Role role);
std::string state_to_string(RequestState state);
cpp/model_manager.cpp ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "model_manager.h"
2
+
3
+ #include <boost/asio.hpp>
4
+ #include <boost/beast/core.hpp>
5
+ #include <boost/beast/http.hpp>
6
+
7
+ #include "http_helpers.h"
8
+
9
+ #include <csignal>
10
+ #include <thread>
11
+ #include <unistd.h>
12
+ #include <sys/wait.h>
13
+
14
+ namespace asio = boost::asio;
15
+ namespace beast = boost::beast;
16
+ namespace http = beast::http;
17
+
18
// True when `pid` refers to a live process we may signal. Signal 0 probes
// for existence without delivering anything.
bool is_alive(pid_t pid) {
  if (pid > 0 && kill(pid, 0) == 0) {
    return true;
  }
  return false;
}
22
+
23
// Terminate a worker process: send SIGTERM, poll for exit for up to
// `wait_seconds`, then escalate to SIGKILL. The child is always reaped so
// no zombie is left behind.
void shutdown_worker(pid_t pid, int wait_seconds) {
  if (pid <= 0) {
    return;
  }
  kill(pid, SIGTERM);
  const auto give_up_at =
      std::chrono::steady_clock::now() + std::chrono::seconds(wait_seconds);
  while (std::chrono::steady_clock::now() < give_up_at) {
    int wait_status = 0;
    if (waitpid(pid, &wait_status, WNOHANG) == pid) {
      return;  // child exited and has been reaped
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(200));
  }
  // Still running after the grace period: force-kill and reap.
  kill(pid, SIGKILL);
  int wait_status = 0;
  waitpid(pid, &wait_status, 0);
}
37
+
38
// Copy worker/llama settings out of the immutable config. No process is
// spawned here (see initialize_default); port allocation starts at
// worker.base_port and counts upward.
ModelManager::ModelManager(const ManagerConfig &config)
    : default_model_(config.worker.default_model),
      llama_server_bin_(config.worker.llama_server_bin),
      worker_host_(config.worker.host),
      worker_bind_host_(config.worker.bind_host),
      base_port_(config.worker.base_port),
      switch_timeout_sec_(config.worker.switch_timeout_sec),
      n_ctx_(config.llama.n_ctx),
      n_threads_(config.llama.threads),
      n_gpu_layers_(config.llama.ngl),
      n_batch_(config.llama.batch),
      n_ubatch_(config.llama.ubatch),
      next_port_(base_port_) {}
51
+
52
// Spawn the configured default model at startup (blocking until ready).
// Returns false and fills `error` when the worker never became ready.
bool ModelManager::initialize_default(std::string &error) {
  return switch_model(default_model_, error);
}
55
+
56
+ bool ModelManager::switch_model(const std::string &model, std::string &error) {
57
+ {
58
+ std::lock_guard<std::mutex> lock(mu_);
59
+ if (switch_in_progress_) {
60
+ error = "Switch already in progress";
61
+ return false;
62
+ }
63
+ if (active_ && active_->model == model && is_alive(active_->pid)) {
64
+ return true;
65
+ }
66
+ switch_in_progress_ = true;
67
+ }
68
+
69
+ std::optional<WorkerInfo> old_worker;
70
+ {
71
+ std::lock_guard<std::mutex> lock(mu_);
72
+ if (active_) old_worker = active_;
73
+ }
74
+
75
+ int port = allocate_port();
76
+ pid_t pid = spawn_worker(model, port);
77
+ if (pid <= 0) {
78
+ finish_switch(false);
79
+ error = "Failed to start worker process";
80
+ return false;
81
+ }
82
+
83
+ if (!wait_until_ready(pid, port, switch_timeout_sec_)) {
84
+ shutdown_worker(pid);
85
+ finish_switch(false);
86
+ error = "New model did not become ready in time";
87
+ return false;
88
+ }
89
+
90
+ WorkerInfo new_worker{model, port, pid, now_utc_iso()};
91
+ {
92
+ std::lock_guard<std::mutex> lock(mu_);
93
+ active_ = new_worker;
94
+ switch_in_progress_ = false;
95
+ }
96
+
97
+ if (old_worker && old_worker->pid != pid) {
98
+ shutdown_worker(old_worker->pid);
99
+ }
100
+ return true;
101
+ }
102
+
103
// Kill the active worker and start a fresh one serving the same model.
// Used after cancellations/timeouts to drop any in-flight generation.
// Unlike switch_model, the old worker is stopped BEFORE the new one starts;
// on failure no active worker remains (active_ is cleared).
bool ModelManager::restart_active(std::string &error) {
  std::optional<WorkerInfo> old_worker;
  std::string model;
  {
    std::lock_guard<std::mutex> lock(mu_);
    if (switch_in_progress_) {
      error = "Switch already in progress";
      return false;
    }
    if (!active_ || !is_alive(active_->pid)) {
      error = "No active model";
      return false;
    }
    switch_in_progress_ = true;  // serializes against concurrent switch/restart
    old_worker = active_;
    model = active_->model;
  }

  // Stop the old worker first — only one worker runs at a time on restart.
  shutdown_worker(old_worker->pid);

  int port = allocate_port();
  pid_t pid = spawn_worker(model, port);
  if (pid <= 0) {
    std::lock_guard<std::mutex> lock(mu_);
    active_ = std::nullopt;      // old worker is already dead at this point
    switch_in_progress_ = false;
    error = "Failed to start worker process";
    return false;
  }

  if (!wait_until_ready(pid, port, switch_timeout_sec_)) {
    shutdown_worker(pid);
    std::lock_guard<std::mutex> lock(mu_);
    active_ = std::nullopt;
    switch_in_progress_ = false;
    error = "New model did not become ready in time";
    return false;
  }

  WorkerInfo new_worker{model, port, pid, now_utc_iso()};
  {
    std::lock_guard<std::mutex> lock(mu_);
    active_ = new_worker;
    switch_in_progress_ = false;
  }
  return true;
}
150
+
151
+ std::optional<WorkerInfo> ModelManager::active_worker() {
152
+ std::lock_guard<std::mutex> lock(mu_);
153
+ if (active_ && is_alive(active_->pid)) return active_;
154
+ return std::nullopt;
155
+ }
156
+
157
+ json ModelManager::models_view() {
158
+ std::lock_guard<std::mutex> lock(mu_);
159
+ json out;
160
+ out["status"] = (active_ && is_alive(active_->pid)) ? "ready" : "no_active_model";
161
+ out["switch_in_progress"] = switch_in_progress_;
162
+ if (active_ && is_alive(active_->pid)) {
163
+ out["current_model"] = active_->model;
164
+ out["last_loaded"] = active_->last_loaded;
165
+ out["active_pid"] = active_->pid;
166
+ out["active_port"] = active_->port;
167
+ } else {
168
+ out["current_model"] = nullptr;
169
+ out["last_loaded"] = nullptr;
170
+ out["active_pid"] = nullptr;
171
+ out["active_port"] = nullptr;
172
+ }
173
+ return out;
174
+ }
175
+
176
// Hand out the next worker port. Monotonically increasing from base_port_;
// ports are never reused within a process lifetime.
int ModelManager::allocate_port() {
  std::lock_guard<std::mutex> lock(mu_);
  return next_port_++;
}
180
+
181
+ void ModelManager::finish_switch(bool ok) {
182
+ std::lock_guard<std::mutex> lock(mu_);
183
+ if (!ok) switch_in_progress_ = false;
184
+ }
185
+
186
// Fork/exec a llama-server child serving `model` on `port`.
// Returns the child's pid, or -1 if fork failed. The child calls setsid()
// to detach into its own session before exec'ing.
pid_t ModelManager::spawn_worker(const std::string &model, int port) {
  pid_t pid = fork();
  if (pid < 0) return -1;
  if (pid == 0) {
    // Child process: build argv for llama-server and replace the image.
    setsid();
    std::string port_s = std::to_string(port);
    std::string n_ctx_s = std::to_string(n_ctx_);
    std::string threads_s = std::to_string(n_threads_);
    std::string ngl_s = std::to_string(n_gpu_layers_);
    std::string batch_s = std::to_string(n_batch_);
    std::string ubatch_s = std::to_string(n_ubatch_);

    std::vector<std::string> args = {
        llama_server_bin_,
        "-hf", model,
        "--host", worker_bind_host_,
        "--port", port_s,
        "-c", n_ctx_s,
        "-t", threads_s,
        "-ngl", ngl_s,
        "--cont-batching",
        "-b", batch_s,
        "--ubatch-size", ubatch_s
    };

    std::vector<char *> argv;
    argv.reserve(args.size() + 1);
    for (auto &s : args) argv.push_back(const_cast<char *>(s.c_str()));
    argv.push_back(nullptr);  // execvp requires a null-terminated vector
    execvp(argv[0], argv.data());
    _exit(127);  // exec failed; 127 mirrors the shell's "command not found"
  }
  return pid;
}
220
+
221
+ bool ModelManager::wait_until_ready(pid_t pid, int port, int timeout_sec) {
222
+ const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(timeout_sec);
223
+ while (std::chrono::steady_clock::now() < deadline) {
224
+ if (!is_alive(pid)) return false;
225
+ try {
226
+ auto [status, _] = http_get(port, "/");
227
+ if (status == 200) return true;
228
+ } catch (...) {
229
+ }
230
+ std::this_thread::sleep_for(std::chrono::milliseconds(800));
231
+ }
232
+ return false;
233
+ }
234
+
235
// Blocking HTTP/1.1 GET against the local worker at `port`.
// Returns {status_code, body}; throws on resolve/connect/IO errors (callers
// such as wait_until_ready catch and retry).
std::pair<int, std::string> ModelManager::http_get(int port, const std::string &target) {
  asio::io_context ioc;
  asio::ip::tcp::resolver resolver(ioc);
  beast::tcp_stream stream(ioc);
  auto const results = resolver.resolve(worker_host_, std::to_string(port));
  stream.connect(results);

  http::request<http::string_body> req{http::verb::get, target, 11};
  req.set(http::field::host, worker_host_);
  req.set(http::field::user_agent, "llm-manager");
  http::write(stream, req);

  beast::flat_buffer buffer;
  http::response<http::string_body> res;
  http::read(stream, buffer, res);
  beast::error_code ec;  // shutdown errors are ignored — response already read
  stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
  return {res.result_int(), res.body()};
}
cpp/model_manager.h ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once

#include <mutex>
#include <optional>
#include <string>
#include <utility>

#include "llm_manager_types.h"

// Supervises the single active llama-server worker process: spawning,
// readiness polling, model switching, restarts, and status reporting.
// Public methods are safe to call from multiple threads.
class ModelManager {
public:
  explicit ModelManager(const ManagerConfig &config);

  // Load the configured default model (blocking). False + `error` on failure.
  bool initialize_default(std::string &error);
  // Replace the active worker with one serving `model`; no-op if it already is.
  bool switch_model(const std::string &model, std::string &error);
  // Kill and respawn the current worker with the same model.
  bool restart_active(std::string &error);
  // Snapshot of the live worker, or nullopt when none is running.
  std::optional<WorkerInfo> active_worker();
  // JSON status document for /health and /models.
  json models_view();

private:
  std::mutex mu_;                    // guards active_/switch_in_progress_/next_port_
  std::optional<WorkerInfo> active_;
  bool switch_in_progress_ = false;  // serializes switch/restart operations

  // Immutable settings copied from ManagerConfig at construction.
  std::string default_model_;
  std::string llama_server_bin_;
  std::string worker_host_;
  std::string worker_bind_host_;
  int base_port_;
  int switch_timeout_sec_;
  int n_ctx_;
  int n_threads_;
  int n_gpu_layers_;
  int n_batch_;
  int n_ubatch_;
  int next_port_;                    // next port handed to a spawned worker

  int allocate_port();
  void finish_switch(bool ok);       // clears switch_in_progress_ on failure paths
  pid_t spawn_worker(const std::string &model, int port);
  bool wait_until_ready(pid_t pid, int port, int timeout_sec);
  std::pair<int, std::string> http_get(int port, const std::string &target);
};

// True when `pid` is a live process we can signal.
bool is_alive(pid_t pid);
// SIGTERM, then after `wait_seconds` SIGKILL, a worker; always reaps the child.
void shutdown_worker(pid_t pid, int wait_seconds = 15);
cpp/request_parsing.cpp ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "request_parsing.h"
2
+
3
+ #include <algorithm>
4
+ #include <sstream>
5
+
6
// Crude token estimate: roughly 4 characters per token, rounded up, with a
// minimum of 1 token for any non-empty text.
static int estimate_text_tokens_rough(const std::string &text) {
  if (text.empty()) {
    return 0;
  }
  const int approx = static_cast<int>((text.size() + 3) / 4);
  return approx < 1 ? 1 : approx;
}
10
+
11
+ static std::string flatten_json_content(const json &content) {
12
+ if (content.is_string()) {
13
+ return content.get<std::string>();
14
+ }
15
+ if (content.is_array()) {
16
+ std::ostringstream oss;
17
+ bool first = true;
18
+ for (const auto &item : content) {
19
+ std::string part;
20
+ if (item.is_string()) {
21
+ part = item.get<std::string>();
22
+ } else if (item.is_object() && item.contains("text") && item["text"].is_string()) {
23
+ part = item["text"].get<std::string>();
24
+ }
25
+ if (part.empty()) continue;
26
+ if (!first) oss << '\n';
27
+ oss << part;
28
+ first = false;
29
+ }
30
+ return oss.str();
31
+ }
32
+ return "";
33
+ }
34
+
35
// Estimate the token budget of a chat-completions payload without running a
// real tokenizer (uses the ~4-chars-per-token heuristic in
// estimate_text_tokens_rough). Validates the payload shape ('messages'
// array; optional integer 'max_tokens' > 0) and the configured per-request
// cap. On any failure sets `error` and returns nullopt; on success `error`
// is cleared.
std::optional<TokenEstimate> estimate_chat_tokens(
    const json &payload,
    const LimitsConfig &limits,
    std::string &error) {
  if (!payload.is_object()) {
    error = "Expected JSON object";
    return std::nullopt;
  }
  if (!payload.contains("messages") || !payload["messages"].is_array()) {
    error = "Expected 'messages' array";
    return std::nullopt;
  }

  TokenEstimate estimate;
  estimate.requested_max_tokens = limits.default_max_tokens;  // payload may override below
  if (payload.contains("max_tokens")) {
    if (!payload["max_tokens"].is_number_integer()) {
      error = "Expected integer 'max_tokens'";
      return std::nullopt;
    }
    estimate.requested_max_tokens = payload["max_tokens"].get<int>();
  }

  if (estimate.requested_max_tokens <= 0) {
    error = "'max_tokens' must be > 0";
    return std::nullopt;
  }

  // Sum rough token counts for each message's role and content.
  for (const auto &message : payload["messages"]) {
    if (!message.is_object()) continue;
    if (message.contains("role") && message["role"].is_string()) {
      estimate.prompt_tokens += estimate_text_tokens_rough(message["role"].get<std::string>());
    }
    if (message.contains("content")) {
      estimate.prompt_tokens += estimate_text_tokens_rough(flatten_json_content(message["content"]));
    }
    estimate.prompt_tokens += 4;  // per-message formatting overhead
  }

  estimate.estimated_total_tokens = estimate.prompt_tokens + estimate.requested_max_tokens;
  if (estimate.estimated_total_tokens > limits.max_tokens_per_request) {
    error = "Estimated request tokens exceed configured limit";
    return std::nullopt;
  }

  error.clear();
  return estimate;
}
83
+
84
+ bool request_stream_enabled(const json &payload) {
85
+ if (!payload.is_object() || !payload.contains("stream")) return false;
86
+ return payload["stream"].is_boolean() && payload["stream"].get<bool>();
87
+ }
cpp/request_parsing.h ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once

#include <optional>
#include <string>

#include "llm_manager_types.h"

// Estimate prompt/completion tokens for an OpenAI-style chat payload and
// validate them against `limits`. Returns nullopt and sets `error` when the
// payload is malformed or exceeds the configured budget.
std::optional<TokenEstimate> estimate_chat_tokens(
    const json &payload,
    const LimitsConfig &limits,
    std::string &error);

// True only when the payload explicitly sets "stream": true.
bool request_stream_enabled(const json &payload);
cpp/runtime_components.cpp ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include "runtime_components.h"

#include "config.h"
#include "http_helpers.h"
#include "model_manager.h"

#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstddef>
#include <sstream>
9
+
10
+ RateLimiterStore::RateLimiterStore(const RateLimitConfig &config)
11
+ : requests_per_minute_(std::max(0, config.requests_per_minute)),
12
+ estimated_tokens_per_minute_(std::max(0, config.estimated_tokens_per_minute)) {}
13
+
14
+ RateLimitDecision RateLimiterStore::allow(const std::string &api_key_id, int estimated_tokens) {
15
+ if (requests_per_minute_ <= 0 && estimated_tokens_per_minute_ <= 0) return {};
16
+
17
+ std::lock_guard<std::mutex> lock(mu_);
18
+ auto &bucket = buckets_[api_key_id];
19
+ const auto now = std::chrono::steady_clock::now();
20
+ refill(bucket.request_tokens, bucket.last_request_refill, requests_per_minute_, now);
21
+ refill(bucket.estimated_tokens, bucket.last_estimated_refill, estimated_tokens_per_minute_, now);
22
+
23
+ if (requests_per_minute_ > 0 && bucket.request_tokens < 1.0) {
24
+ return {false, 1, "Rate limit exceeded: requests"};
25
+ }
26
+ if (estimated_tokens_per_minute_ > 0 && bucket.estimated_tokens < estimated_tokens) {
27
+ return {false, 1, "Rate limit exceeded: estimated tokens"};
28
+ }
29
+
30
+ if (requests_per_minute_ > 0) bucket.request_tokens -= 1.0;
31
+ if (estimated_tokens_per_minute_ > 0) bucket.estimated_tokens -= estimated_tokens;
32
+ return {};
33
+ }
34
+
35
+ void RateLimiterStore::refill(
36
+ double &tokens,
37
+ std::chrono::steady_clock::time_point &last_refill,
38
+ int limit_per_minute,
39
+ std::chrono::steady_clock::time_point now) {
40
+ if (limit_per_minute <= 0) return;
41
+ if (last_refill.time_since_epoch().count() == 0) {
42
+ tokens = limit_per_minute;
43
+ last_refill = now;
44
+ return;
45
+ }
46
+ const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - last_refill).count();
47
+ if (elapsed <= 0) return;
48
+ const double refill_amount = (static_cast<double>(limit_per_minute) * elapsed) / 60000.0;
49
+ tokens = std::min(static_cast<double>(limit_per_minute), tokens + refill_amount);
50
+ last_refill = now;
51
+ }
52
+
53
+ std::shared_ptr<RequestContext> RequestRegistry::create(
54
+ const std::string &request_id,
55
+ const ApiKeyRecord &principal,
56
+ const TokenEstimate &estimate,
57
+ const std::string &request_body) {
58
+ auto ctx = std::make_shared<RequestContext>();
59
+ ctx->request_id = request_id;
60
+ ctx->api_key_id = principal.key_id;
61
+ ctx->role = principal.role;
62
+ ctx->priority = role_to_priority(principal.role);
63
+ ctx->estimate = estimate;
64
+ ctx->request_body = request_body;
65
+ ctx->created_at = std::chrono::steady_clock::now();
66
+ ctx->enqueue_time = ctx->created_at;
67
+
68
+ std::lock_guard<std::mutex> lock(mu_);
69
+ requests_[request_id] = ctx;
70
+ return ctx;
71
+ }
72
+
73
+ std::shared_ptr<RequestContext> RequestRegistry::find(const std::string &request_id) const {
74
+ std::lock_guard<std::mutex> lock(mu_);
75
+ const auto it = requests_.find(request_id);
76
+ if (it == requests_.end()) return nullptr;
77
+ return it->second;
78
+ }
79
+
80
+ void RequestRegistry::complete(const std::shared_ptr<RequestContext> &ctx, RequestState state, RequestResult result) {
81
+ {
82
+ std::lock_guard<std::mutex> lock(ctx->mu);
83
+ ctx->state.store(state);
84
+ ctx->result = std::move(result);
85
+ ctx->completed = true;
86
+ }
87
+ ctx->cv.notify_all();
88
+ }
89
+
90
+ std::shared_ptr<RequestContext> RequestRegistry::cancel_request(const std::string &request_id) {
91
+ auto ctx = find(request_id);
92
+ if (!ctx) return nullptr;
93
+
94
+ ctx->cancelled.store(true);
95
+ const auto state = ctx->state.load();
96
+ if (state == RequestState::QUEUED) {
97
+ complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
98
+ } else if (state == RequestState::RUNNING) {
99
+ ctx->state.store(RequestState::CANCELLED);
100
+ }
101
+ return ctx;
102
+ }
103
+
104
+ std::vector<std::shared_ptr<RequestContext>> RequestRegistry::cancel_all() {
105
+ std::vector<std::shared_ptr<RequestContext>> out;
106
+ std::lock_guard<std::mutex> lock(mu_);
107
+ out.reserve(requests_.size());
108
+ for (auto &[_, ctx] : requests_) {
109
+ ctx->cancelled.store(true);
110
+ const auto state = ctx->state.load();
111
+ if (state == RequestState::QUEUED) {
112
+ {
113
+ std::lock_guard<std::mutex> ctx_lock(ctx->mu);
114
+ ctx->state.store(RequestState::CANCELLED);
115
+ ctx->result = {499, R"({"error":"Request cancelled"})"};
116
+ ctx->completed = true;
117
+ }
118
+ ctx->cv.notify_all();
119
+ } else if (state == RequestState::RUNNING) {
120
+ ctx->state.store(RequestState::CANCELLED);
121
+ }
122
+ out.push_back(ctx);
123
+ }
124
+ return out;
125
+ }
126
+
127
+ void MetricsRegistry::inc_requests_total() { requests_total_.fetch_add(1); }
128
+ void MetricsRegistry::inc_requests_inflight() { requests_inflight_.fetch_add(1); }
129
+ void MetricsRegistry::dec_requests_inflight() { requests_inflight_.fetch_sub(1); }
130
+ void MetricsRegistry::inc_queue_rejected_total() { queue_rejected_total_.fetch_add(1); }
131
+ void MetricsRegistry::inc_rate_limited_total() { rate_limited_total_.fetch_add(1); }
132
+ void MetricsRegistry::add_cancellations_total(uint64_t delta) { cancellations_total_.fetch_add(delta); }
133
+ void MetricsRegistry::inc_switch_total() { switch_total_.fetch_add(1); }
134
+ void MetricsRegistry::inc_worker_restarts_total() { worker_restarts_total_.fetch_add(1); }
135
+
136
+ void MetricsRegistry::observe_request_latency_ms(int64_t value) {
137
+ request_latency_ms_total_.fetch_add(value);
138
+ request_latency_samples_.fetch_add(1);
139
+ }
140
+
141
+ void MetricsRegistry::observe_queue_wait_ms(int64_t value) {
142
+ queue_wait_ms_total_.fetch_add(value);
143
+ queue_wait_samples_.fetch_add(1);
144
+ }
145
+
146
+ std::string MetricsRegistry::render_prometheus(const QueueSnapshot &queue, ModelManager &manager) const {
147
+ std::ostringstream oss;
148
+ oss << "llm_manager_requests_total " << requests_total_.load() << '\n';
149
+ oss << "llm_manager_requests_inflight " << requests_inflight_.load() << '\n';
150
+ oss << "llm_manager_request_latency_ms_total " << request_latency_ms_total_.load() << '\n';
151
+ oss << "llm_manager_request_latency_ms_samples " << request_latency_samples_.load() << '\n';
152
+ oss << "llm_manager_queue_size " << queue.total_size << '\n';
153
+ oss << "llm_manager_queue_admin_size " << queue.admin_size << '\n';
154
+ oss << "llm_manager_queue_user_size " << queue.user_size << '\n';
155
+ oss << "llm_manager_queue_tokens " << queue.total_tokens << '\n';
156
+ oss << "llm_manager_queue_rejected_total " << queue_rejected_total_.load() << '\n';
157
+ oss << "llm_manager_rate_limited_total " << rate_limited_total_.load() << '\n';
158
+ oss << "llm_manager_queue_wait_time_ms_total " << queue_wait_ms_total_.load() << '\n';
159
+ oss << "llm_manager_queue_wait_time_ms_samples " << queue_wait_samples_.load() << '\n';
160
+ oss << "llm_manager_cancellations_total " << cancellations_total_.load() << '\n';
161
+ oss << "llm_manager_switch_total " << switch_total_.load() << '\n';
162
+ oss << "llm_manager_worker_restarts_total " << worker_restarts_total_.load() << '\n';
163
+ const auto active = manager.active_worker();
164
+ oss << "llm_manager_active_worker " << (active ? 1 : 0) << '\n';
165
+ return oss.str();
166
+ }
167
+
168
+ PrioritySchedulerQueue::PrioritySchedulerQueue(const QueueConfig &config)
169
+ : max_size_(config.max_size),
170
+ max_tokens_(config.max_tokens),
171
+ admin_quota_(std::max(1, config.admin_quota)),
172
+ retry_after_sec_(std::max(1, config.retry_after_sec)) {}
173
+
174
+ bool PrioritySchedulerQueue::try_push(const std::shared_ptr<RequestContext> &ctx) {
175
+ std::lock_guard<std::mutex> lock(mu_);
176
+ if (current_size_ >= max_size_) return false;
177
+ if (current_tokens_ + ctx->estimate.estimated_total_tokens > max_tokens_) return false;
178
+
179
+ if (ctx->priority == Priority::ADMIN) admin_queue_.push_back(ctx);
180
+ else user_queue_.push_back(ctx);
181
+
182
+ ++current_size_;
183
+ current_tokens_ += ctx->estimate.estimated_total_tokens;
184
+ cv_.notify_one();
185
+ return true;
186
+ }
187
+
188
+ std::shared_ptr<RequestContext> PrioritySchedulerQueue::pop_next() {
189
+ std::unique_lock<std::mutex> lock(mu_);
190
+ cv_.wait(lock, [&]() { return stopped_ || current_size_ > 0; });
191
+ if (stopped_) return nullptr;
192
+
193
+ std::deque<std::shared_ptr<RequestContext>> *selected_queue = nullptr;
194
+ if (!admin_queue_.empty() && (admin_streak_ < admin_quota_ || user_queue_.empty())) {
195
+ selected_queue = &admin_queue_;
196
+ ++admin_streak_;
197
+ } else if (!user_queue_.empty()) {
198
+ selected_queue = &user_queue_;
199
+ admin_streak_ = 0;
200
+ } else if (!admin_queue_.empty()) {
201
+ selected_queue = &admin_queue_;
202
+ admin_streak_ = 1;
203
+ }
204
+
205
+ if (!selected_queue || selected_queue->empty()) return nullptr;
206
+
207
+ auto best_it = std::min_element(
208
+ selected_queue->begin(),
209
+ selected_queue->end(),
210
+ [](const auto &a, const auto &b) {
211
+ return a->estimate.estimated_total_tokens < b->estimate.estimated_total_tokens;
212
+ });
213
+ auto ctx = *best_it;
214
+ selected_queue->erase(best_it);
215
+ --current_size_;
216
+ current_tokens_ -= ctx->estimate.estimated_total_tokens;
217
+ return ctx;
218
+ }
219
+
220
+ void PrioritySchedulerQueue::stop() {
221
+ std::lock_guard<std::mutex> lock(mu_);
222
+ stopped_ = true;
223
+ cv_.notify_all();
224
+ }
225
+
226
+ int PrioritySchedulerQueue::retry_after_sec() const {
227
+ return retry_after_sec_;
228
+ }
229
+
230
+ QueueSnapshot PrioritySchedulerQueue::snapshot() const {
231
+ std::lock_guard<std::mutex> lock(mu_);
232
+ return QueueSnapshot{current_size_, admin_queue_.size(), user_queue_.size(), current_tokens_};
233
+ }
234
+
235
+ Scheduler::Scheduler(
236
+ ModelManager &manager,
237
+ RequestRegistry &registry,
238
+ MetricsRegistry &metrics,
239
+ const QueueConfig &queue_config)
240
+ : manager_(manager), registry_(registry), metrics_(metrics), queue_(queue_config) {
241
+ worker_ = std::thread([this]() { worker_loop(); });
242
+ }
243
+
244
+ Scheduler::~Scheduler() {
245
+ queue_.stop();
246
+ if (worker_.joinable()) worker_.join();
247
+ }
248
+
249
+ bool Scheduler::try_enqueue(const std::shared_ptr<RequestContext> &ctx) {
250
+ return queue_.try_push(ctx);
251
+ }
252
+
253
+ int Scheduler::retry_after_sec() const {
254
+ return queue_.retry_after_sec();
255
+ }
256
+
257
+ QueueSnapshot Scheduler::snapshot() const {
258
+ return queue_.snapshot();
259
+ }
260
+
261
+ void Scheduler::worker_loop() {
262
+ for (;;) {
263
+ auto ctx = queue_.pop_next();
264
+ if (!ctx) return;
265
+
266
+ if (ctx->cancelled.load()) {
267
+ registry_.complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
268
+ continue;
269
+ }
270
+
271
+ ctx->state.store(RequestState::RUNNING);
272
+ ctx->start_time = std::chrono::steady_clock::now();
273
+ metrics_.observe_queue_wait_ms(
274
+ std::chrono::duration_cast<std::chrono::milliseconds>(ctx->start_time - ctx->enqueue_time).count());
275
+
276
+ auto worker = manager_.active_worker();
277
+ if (!worker) {
278
+ registry_.complete(ctx, RequestState::FAILED, {503, R"({"error":"No active model"})"});
279
+ continue;
280
+ }
281
+
282
+ try {
283
+ auto [status, body] = forward_chat(*worker, ctx->request_body);
284
+ if (ctx->cancelled.load()) {
285
+ registry_.complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
286
+ continue;
287
+ }
288
+ registry_.complete(ctx, RequestState::DONE, {status, body});
289
+ } catch (const std::exception &e) {
290
+ registry_.complete(ctx, RequestState::FAILED, {500, json({{"error", e.what()}}).dump()});
291
+ }
292
+ }
293
+ }
294
+
295
+ ApiKeyAuth::ApiKeyAuth(const ManagerConfig &config)
296
+ : header_name_(config.auth.header), scheme_(config.auth.scheme) {
297
+ for (const auto &record : config.api_keys) {
298
+ records_by_secret_.emplace(record.secret, record);
299
+ }
300
+ }
301
+
302
+ bool ApiKeyAuth::enabled() const {
303
+ return !records_by_secret_.empty();
304
+ }
305
+
306
+ std::optional<ApiKeyRecord> ApiKeyAuth::authenticate(
307
+ const http::request<http::string_body> &req,
308
+ std::string &error) const {
309
+ if (!enabled()) {
310
+ error.clear();
311
+ return ApiKeyRecord{"anonymous", "", Role::ADMIN, true};
312
+ }
313
+
314
+ const auto token = extract_bearer_token(req, error);
315
+ if (!token) return std::nullopt;
316
+
317
+ const auto it = records_by_secret_.find(*token);
318
+ if (it == records_by_secret_.end()) {
319
+ error = "Invalid API key";
320
+ return std::nullopt;
321
+ }
322
+ if (!it->second.enabled) {
323
+ error = "API key disabled";
324
+ return std::nullopt;
325
+ }
326
+ error.clear();
327
+ return it->second;
328
+ }
329
+
330
+ std::optional<std::string> ApiKeyAuth::extract_bearer_token(
331
+ const http::request<http::string_body> &req,
332
+ std::string &error) const {
333
+ const auto header_it = req.find(header_name_);
334
+ if (header_it == req.end()) {
335
+ error = "Missing authorization header";
336
+ return std::nullopt;
337
+ }
338
+
339
+ const std::string value = trim_copy(header_it->value().to_string());
340
+ const std::string prefix = scheme_ + " ";
341
+ if (value.size() <= prefix.size() || value.rfind(prefix, 0) != 0) {
342
+ error = "Invalid authorization scheme";
343
+ return std::nullopt;
344
+ }
345
+
346
+ std::string token = trim_copy(value.substr(prefix.size()));
347
+ if (token.empty()) {
348
+ error = "Missing API key";
349
+ return std::nullopt;
350
+ }
351
+ return token;
352
+ }
cpp/runtime_components.h ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <boost/beast/http.hpp>
4
+
5
+ #include <deque>
6
+ #include <memory>
7
+ #include <thread>
8
+ #include <unordered_map>
9
+
10
+ #include "llm_manager_types.h"
11
+
12
+ class ModelManager;
13
+
14
+ namespace http = boost::beast::http;
15
+
16
+ class RateLimiterStore {
17
+ public:
18
+ explicit RateLimiterStore(const RateLimitConfig &config);
19
+ RateLimitDecision allow(const std::string &api_key_id, int estimated_tokens);
20
+
21
+ private:
22
+ struct Bucket {
23
+ double request_tokens = 0.0;
24
+ double estimated_tokens = 0.0;
25
+ std::chrono::steady_clock::time_point last_request_refill{};
26
+ std::chrono::steady_clock::time_point last_estimated_refill{};
27
+ };
28
+
29
+ std::mutex mu_;
30
+ std::unordered_map<std::string, Bucket> buckets_;
31
+ int requests_per_minute_;
32
+ int estimated_tokens_per_minute_;
33
+
34
+ static void refill(
35
+ double &tokens,
36
+ std::chrono::steady_clock::time_point &last_refill,
37
+ int limit_per_minute,
38
+ std::chrono::steady_clock::time_point now);
39
+ };
40
+
41
+ class RequestRegistry {
42
+ public:
43
+ std::shared_ptr<RequestContext> create(
44
+ const std::string &request_id,
45
+ const ApiKeyRecord &principal,
46
+ const TokenEstimate &estimate,
47
+ const std::string &request_body);
48
+ std::shared_ptr<RequestContext> find(const std::string &request_id) const;
49
+ void complete(const std::shared_ptr<RequestContext> &ctx, RequestState state, RequestResult result);
50
+ std::shared_ptr<RequestContext> cancel_request(const std::string &request_id);
51
+ std::vector<std::shared_ptr<RequestContext>> cancel_all();
52
+
53
+ private:
54
+ mutable std::mutex mu_;
55
+ std::unordered_map<std::string, std::shared_ptr<RequestContext>> requests_;
56
+ };
57
+
58
+ class MetricsRegistry {
59
+ public:
60
+ void inc_requests_total();
61
+ void inc_requests_inflight();
62
+ void dec_requests_inflight();
63
+ void inc_queue_rejected_total();
64
+ void inc_rate_limited_total();
65
+ void add_cancellations_total(uint64_t delta = 1);
66
+ void inc_switch_total();
67
+ void inc_worker_restarts_total();
68
+ void observe_request_latency_ms(int64_t value);
69
+ void observe_queue_wait_ms(int64_t value);
70
+ std::string render_prometheus(const QueueSnapshot &queue, ModelManager &manager) const;
71
+
72
+ private:
73
+ std::atomic<uint64_t> requests_total_{0};
74
+ std::atomic<int64_t> requests_inflight_{0};
75
+ std::atomic<uint64_t> request_latency_ms_total_{0};
76
+ std::atomic<uint64_t> request_latency_samples_{0};
77
+ std::atomic<uint64_t> queue_rejected_total_{0};
78
+ std::atomic<uint64_t> rate_limited_total_{0};
79
+ std::atomic<uint64_t> queue_wait_ms_total_{0};
80
+ std::atomic<uint64_t> queue_wait_samples_{0};
81
+ std::atomic<uint64_t> cancellations_total_{0};
82
+ std::atomic<uint64_t> switch_total_{0};
83
+ std::atomic<uint64_t> worker_restarts_total_{0};
84
+ };
85
+
86
+ class PrioritySchedulerQueue {
87
+ public:
88
+ explicit PrioritySchedulerQueue(const QueueConfig &config);
89
+ bool try_push(const std::shared_ptr<RequestContext> &ctx);
90
+ std::shared_ptr<RequestContext> pop_next();
91
+ void stop();
92
+ int retry_after_sec() const;
93
+ QueueSnapshot snapshot() const;
94
+
95
+ private:
96
+ mutable std::mutex mu_;
97
+ std::condition_variable cv_;
98
+ std::deque<std::shared_ptr<RequestContext>> admin_queue_;
99
+ std::deque<std::shared_ptr<RequestContext>> user_queue_;
100
+ size_t max_size_;
101
+ size_t current_size_ = 0;
102
+ int max_tokens_;
103
+ int current_tokens_ = 0;
104
+ int admin_quota_;
105
+ int admin_streak_ = 0;
106
+ int retry_after_sec_;
107
+ bool stopped_ = false;
108
+ };
109
+
110
+ class Scheduler {
111
+ public:
112
+ Scheduler(
113
+ ModelManager &manager,
114
+ RequestRegistry &registry,
115
+ MetricsRegistry &metrics,
116
+ const QueueConfig &queue_config);
117
+ ~Scheduler();
118
+
119
+ bool try_enqueue(const std::shared_ptr<RequestContext> &ctx);
120
+ int retry_after_sec() const;
121
+ QueueSnapshot snapshot() const;
122
+
123
+ private:
124
+ ModelManager &manager_;
125
+ RequestRegistry &registry_;
126
+ MetricsRegistry &metrics_;
127
+ PrioritySchedulerQueue queue_;
128
+ std::thread worker_;
129
+
130
+ void worker_loop();
131
+ };
132
+
133
+ class ApiKeyAuth {
134
+ public:
135
+ explicit ApiKeyAuth(const ManagerConfig &config);
136
+ bool enabled() const;
137
+ std::optional<ApiKeyRecord> authenticate(
138
+ const http::request<http::string_body> &req,
139
+ std::string &error) const;
140
+
141
+ private:
142
+ std::string header_name_;
143
+ std::string scheme_;
144
+ std::unordered_map<std::string, ApiKeyRecord> records_by_secret_;
145
+
146
+ std::optional<std::string> extract_bearer_token(
147
+ const http::request<http::string_body> &req,
148
+ std::string &error) const;
149
+ };
cpp/server.cpp ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "server.h"
2
+
3
+ #include <boost/beast/core.hpp>
4
+
5
+ #include "http_helpers.h"
6
+ #include "model_manager.h"
7
+ #include "request_parsing.h"
8
+ #include "runtime_components.h"
9
+
10
+ #include <algorithm>
11
+ #include <atomic>
12
+
13
+ namespace beast = boost::beast;
14
+ namespace http = beast::http;
15
+
16
+ static std::atomic<uint64_t> g_req_id{1};
17
+
18
+ http::response<http::string_body> handle_request(
19
+ ModelManager &manager,
20
+ const ManagerConfig &config,
21
+ const ApiKeyAuth &auth,
22
+ RateLimiterStore &rate_limiter,
23
+ RequestRegistry &registry,
24
+ MetricsRegistry &metrics,
25
+ Scheduler &scheduler,
26
+ http::request<http::string_body> &&req) {
27
+ const auto start = std::chrono::steady_clock::now();
28
+ const auto req_id_num = g_req_id.fetch_add(1);
29
+ const std::string request_id = std::to_string(req_id_num);
30
+ const std::string target = req.target().to_string();
31
+ const std::string method = req.method_string().to_string();
32
+ const std::string path = target.substr(0, target.find('?'));
33
+ auto authenticated = std::optional<ApiKeyRecord>{};
34
+
35
+ metrics.inc_requests_total();
36
+ metrics.inc_requests_inflight();
37
+ struct InflightGuard {
38
+ MetricsRegistry &metrics;
39
+ ~InflightGuard() { metrics.dec_requests_inflight(); }
40
+ } inflight_guard{metrics};
41
+
42
+ log_line("request_id=" + request_id + " method=" + method + " path=" + target);
43
+ if (!req.body().empty()) {
44
+ log_line("request_id=" + request_id + " body=" + truncate_body(req.body()));
45
+ }
46
+
47
+ auto json_response = [&](http::status status, const json &obj) {
48
+ json payload = obj;
49
+ payload["request_id"] = request_id;
50
+ http::response<http::string_body> res{status, req.version()};
51
+ res.set(http::field::content_type, "application/json");
52
+ res.set(http::field::server, "llm-manager");
53
+ res.set("X-Request-Id", request_id);
54
+ res.keep_alive(req.keep_alive());
55
+ res.body() = payload.dump();
56
+ res.prepare_payload();
57
+ auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
58
+ std::chrono::steady_clock::now() - start).count();
59
+ metrics.observe_request_latency_ms(elapsed_ms);
60
+ log_line("request_id=" + request_id + " status=" + std::to_string(res.result_int()) +
61
+ " elapsed_ms=" + std::to_string(elapsed_ms));
62
+ return res;
63
+ };
64
+
65
+ auto json_response_with_retry_after = [&](http::status status, const json &obj, int retry_after_sec) {
66
+ auto res = json_response(status, obj);
67
+ res.set(http::field::retry_after, std::to_string(std::max(1, retry_after_sec)));
68
+ return res;
69
+ };
70
+
71
+ auto ensure_authenticated = [&](Role minimum_role) -> std::optional<http::response<http::string_body>> {
72
+ std::string auth_error;
73
+ authenticated = auth.authenticate(req, auth_error);
74
+ if (!authenticated) {
75
+ return json_response(http::status::unauthorized, {{"error", auth_error}});
76
+ }
77
+ if (minimum_role == Role::ADMIN && authenticated->role != Role::ADMIN) {
78
+ return json_response(http::status::forbidden, {{"error", "Admin role required"}});
79
+ }
80
+ log_line("request_id=" + request_id +
81
+ " api_key_id=" + authenticated->key_id +
82
+ " role=" + role_to_string(authenticated->role));
83
+ return std::nullopt;
84
+ };
85
+
86
+ try {
87
+ if (path == "/health" && req.method() == http::verb::get) {
88
+ return json_response(http::status::ok, manager.models_view());
89
+ }
90
+
91
+ if (path == "/models" && req.method() == http::verb::get) {
92
+ return json_response(http::status::ok, manager.models_view());
93
+ }
94
+
95
+ if (path == "/queue/metrics" && req.method() == http::verb::get) {
96
+ http::response<http::string_body> res{http::status::ok, req.version()};
97
+ res.set(http::field::content_type, "text/plain; version=0.0.4; charset=utf-8");
98
+ res.set(http::field::server, "llm-manager");
99
+ res.set("X-Request-Id", request_id);
100
+ res.keep_alive(req.keep_alive());
101
+ res.body() = metrics.render_prometheus(scheduler.snapshot(), manager);
102
+ res.prepare_payload();
103
+ auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
104
+ std::chrono::steady_clock::now() - start).count();
105
+ metrics.observe_request_latency_ms(elapsed_ms);
106
+ log_line("request_id=" + request_id + " status=" + std::to_string(res.result_int()) +
107
+ " elapsed_ms=" + std::to_string(elapsed_ms));
108
+ return res;
109
+ }
110
+
111
+ if (path == "/switch-model" && req.method() == http::verb::post) {
112
+ if (auto auth_res = ensure_authenticated(Role::ADMIN)) return *auth_res;
113
+ json j = json::parse(req.body(), nullptr, false);
114
+ if (j.is_discarded()) {
115
+ return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
116
+ }
117
+ std::string model;
118
+ if (j.contains("model_name")) model = j["model_name"].get<std::string>();
119
+ if (j.contains("model")) model = j["model"].get<std::string>();
120
+ if (model.empty()) {
121
+ return json_response(http::status::bad_request, {{"error", "Expected 'model' or 'model_name'"}});
122
+ }
123
+
124
+ std::string err;
125
+ bool ok = manager.switch_model(model, err);
126
+ if (!ok) {
127
+ auto status = (err == "Switch already in progress")
128
+ ? http::status::conflict
129
+ : http::status::internal_server_error;
130
+ return json_response(status, {{"status", "error"}, {"error", err}});
131
+ }
132
+ auto state = manager.models_view();
133
+ state["message"] = "Switched model successfully";
134
+ metrics.inc_switch_total();
135
+ return json_response(http::status::ok, state);
136
+ }
137
+
138
+ if (path == "/stop" && req.method() == http::verb::post) {
139
+ if (auto auth_res = ensure_authenticated(Role::ADMIN)) return *auth_res;
140
+ const auto cancelled = registry.cancel_all();
141
+ metrics.add_cancellations_total(cancelled.size());
142
+ std::string err;
143
+ bool ok = manager.restart_active(err);
144
+ if (!ok) {
145
+ http::status status = http::status::internal_server_error;
146
+ if (err == "Switch already in progress") status = http::status::conflict;
147
+ else if (err == "No active model") status = http::status::service_unavailable;
148
+ return json_response(status, {{"status", "error"}, {"error", err}});
149
+ }
150
+ auto state = manager.models_view();
151
+ state["message"] = "Stopped in-flight prompts and restarted model";
152
+ metrics.inc_worker_restarts_total();
153
+ return json_response(http::status::ok, state);
154
+ }
155
+
156
+ if (req.method() == http::verb::post) {
157
+ if (auto cancel_id = extract_cancel_request_id(path)) {
158
+ if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
159
+ auto ctx = registry.find(*cancel_id);
160
+ if (!ctx) {
161
+ return json_response(http::status::not_found, {{"error", "Unknown request id"}});
162
+ }
163
+ if (authenticated->role != Role::ADMIN && authenticated->key_id != ctx->api_key_id) {
164
+ return json_response(http::status::forbidden, {{"error", "Cannot cancel another API key request"}});
165
+ }
166
+
167
+ const auto previous_state = ctx->state.load();
168
+ registry.cancel_request(*cancel_id);
169
+ metrics.add_cancellations_total();
170
+ std::string restart_error;
171
+ bool restarted = true;
172
+ if (previous_state == RequestState::RUNNING) {
173
+ restarted = manager.restart_active(restart_error);
174
+ if (restarted) metrics.inc_worker_restarts_total();
175
+ }
176
+
177
+ json payload = {
178
+ {"cancelled_request_id", *cancel_id},
179
+ {"state", state_to_string(ctx->state.load())}
180
+ };
181
+ if (!restarted) payload["restart_error"] = restart_error;
182
+ return json_response(http::status::ok, payload);
183
+ }
184
+ }
185
+
186
+ if (path == "/v1/chat/completions" && req.method() == http::verb::post) {
187
+ if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
188
+ json payload = json::parse(req.body(), nullptr, false);
189
+ if (payload.is_discarded()) {
190
+ return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
191
+ }
192
+ if (request_stream_enabled(payload)) {
193
+ if (!config.streaming.enabled) {
194
+ return json_response(http::status::not_implemented, {{"error", "Streaming is disabled"}});
195
+ }
196
+ return json_response(http::status::not_implemented, {{"error", "Streaming relay is not implemented yet"}});
197
+ }
198
+
199
+ std::string token_error;
200
+ auto estimate = estimate_chat_tokens(payload, config.limits, token_error);
201
+ if (!estimate) {
202
+ return json_response(http::status::bad_request, {{"error", token_error}});
203
+ }
204
+ log_line("request_id=" + request_id +
205
+ " prompt_tokens=" + std::to_string(estimate->prompt_tokens) +
206
+ " max_tokens=" + std::to_string(estimate->requested_max_tokens) +
207
+ " estimated_total_tokens=" + std::to_string(estimate->estimated_total_tokens));
208
+
209
+ auto rate_limit_decision = rate_limiter.allow(authenticated->key_id, estimate->estimated_total_tokens);
210
+ if (!rate_limit_decision.allowed) {
211
+ metrics.inc_rate_limited_total();
212
+ return json_response_with_retry_after(
213
+ http::status::too_many_requests,
214
+ {{"error", rate_limit_decision.error}},
215
+ rate_limit_decision.retry_after_sec);
216
+ }
217
+
218
+ auto ctx = registry.create(request_id, *authenticated, *estimate, req.body());
219
+ if (!scheduler.try_enqueue(ctx)) {
220
+ ctx->cancelled.store(true);
221
+ registry.complete(ctx, RequestState::CANCELLED, {503, R"({"error":"Queue full"})"});
222
+ metrics.inc_queue_rejected_total();
223
+ return json_response_with_retry_after(
224
+ http::status::service_unavailable,
225
+ {{"error", "Queue full"}},
226
+ scheduler.retry_after_sec());
227
+ }
228
+
229
+ std::unique_lock<std::mutex> lock(ctx->mu);
230
+ const bool finished = ctx->cv.wait_for(
231
+ lock,
232
+ std::chrono::seconds(std::max(1, config.limits.request_timeout_sec)),
233
+ [&]() { return ctx->completed; });
234
+ if (!finished) {
235
+ lock.unlock();
236
+ registry.cancel_request(request_id);
237
+ metrics.add_cancellations_total();
238
+ std::string restart_error;
239
+ bool restarted = true;
240
+ if (ctx->state.load() == RequestState::RUNNING) {
241
+ restarted = manager.restart_active(restart_error);
242
+ if (restarted) metrics.inc_worker_restarts_total();
243
+ }
244
+ json timeout_payload = {
245
+ {"error", "Request timed out"},
246
+ {"state", state_to_string(ctx->state.load())}
247
+ };
248
+ if (!restarted) timeout_payload["restart_error"] = restart_error;
249
+ return json_response(http::status::gateway_timeout, timeout_payload);
250
+ }
251
+
252
+ const auto final_state = ctx->state.load();
253
+ RequestResult result = ctx->result;
254
+ lock.unlock();
255
+
256
+ if (final_state == RequestState::CANCELLED) {
257
+ return json_response(http::status::ok, {{"status", "cancelled"}});
258
+ }
259
+
260
+ http::response<http::string_body> res{
261
+ static_cast<http::status>(result.status), req.version()};
262
+ res.set(http::field::content_type, result.content_type);
263
+ res.set(http::field::server, "llm-manager");
264
+ res.set("X-Request-Id", request_id);
265
+ res.keep_alive(req.keep_alive());
266
+ res.body() = result.body;
267
+ res.prepare_payload();
268
+ auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
269
+ std::chrono::steady_clock::now() - start).count();
270
+ metrics.observe_request_latency_ms(elapsed_ms);
271
+ log_line("request_id=" + request_id +
272
+ " final_state=" + state_to_string(final_state) +
273
+ " upstream_status=" + std::to_string(result.status) +
274
+ " elapsed_ms=" + std::to_string(elapsed_ms));
275
+ return res;
276
+ }
277
+
278
+ if (req.method() == http::verb::get) {
279
+ auto worker = manager.active_worker();
280
+ if (!worker) {
281
+ return json_response(http::status::service_unavailable, {{"error", "No active model"}});
282
+ }
283
+ auto upstream = forward_get_to_worker(*worker, target);
284
+ http::response<http::string_body> res{
285
+ static_cast<http::status>(upstream.status), req.version()};
286
+ res.set(http::field::content_type, upstream.content_type);
287
+ if (!upstream.content_encoding.empty()) {
288
+ res.set(http::field::content_encoding, upstream.content_encoding);
289
+ }
290
+ res.set(http::field::server, "llm-manager");
291
+ res.set("X-Request-Id", request_id);
292
+ res.keep_alive(req.keep_alive());
293
+ res.body() = upstream.body;
294
+ res.prepare_payload();
295
+ auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
296
+ std::chrono::steady_clock::now() - start).count();
297
+ log_line("request_id=" + request_id +
298
+ " proxied_get model=" + worker->model +
299
+ " upstream_status=" + std::to_string(upstream.status) +
300
+ " elapsed_ms=" + std::to_string(elapsed_ms));
301
+ return res;
302
+ }
303
+
304
+ return json_response(http::status::not_found, {{"error", "Not found"}});
305
+ } catch (const std::exception &e) {
306
+ return json_response(http::status::internal_server_error, {{"error", e.what()}});
307
+ }
308
+ }
309
+
310
+ void do_session(
311
+ boost::asio::ip::tcp::socket socket,
312
+ ModelManager &manager,
313
+ const ManagerConfig &config,
314
+ const ApiKeyAuth &auth,
315
+ RateLimiterStore &rate_limiter,
316
+ RequestRegistry &registry,
317
+ MetricsRegistry &metrics,
318
+ Scheduler &scheduler) {
319
+ try {
320
+ beast::flat_buffer buffer;
321
+ http::request<http::string_body> req;
322
+ http::read(socket, buffer, req);
323
+ auto res = handle_request(manager, config, auth, rate_limiter, registry, metrics, scheduler, std::move(req));
324
+ http::write(socket, res);
325
+ beast::error_code ec;
326
+ socket.shutdown(boost::asio::ip::tcp::socket::shutdown_send, ec);
327
+ } catch (...) {
328
+ }
329
+ }
cpp/server.h ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <boost/asio.hpp>
4
+ #include <boost/beast/http.hpp>
5
+
6
+ #include "llm_manager_types.h"
7
+
8
+ class ModelManager;
9
+ class ApiKeyAuth;
10
+ class RateLimiterStore;
11
+ class RequestRegistry;
12
+ class MetricsRegistry;
13
+ class Scheduler;
14
+
15
+ namespace http = boost::beast::http;
16
+
17
+ http::response<http::string_body> handle_request(
18
+ ModelManager &manager,
19
+ const ManagerConfig &config,
20
+ const ApiKeyAuth &auth,
21
+ RateLimiterStore &rate_limiter,
22
+ RequestRegistry &registry,
23
+ MetricsRegistry &metrics,
24
+ Scheduler &scheduler,
25
+ http::request<http::string_body> &&req);
26
+
27
+ void do_session(
28
+ boost::asio::ip::tcp::socket socket,
29
+ ModelManager &manager,
30
+ const ManagerConfig &config,
31
+ const ApiKeyAuth &auth,
32
+ RateLimiterStore &rate_limiter,
33
+ RequestRegistry &registry,
34
+ MetricsRegistry &metrics,
35
+ Scheduler &scheduler);
app.py → python/app.py RENAMED
File without changes
logger.py → python/logger.py RENAMED
File without changes