Dmitry Beresnev committed on
Commit
332826f
·
1 Parent(s): d9ce859

Refactor the C++ LLM manager into modular components, move Python modules under python/, and keep the current control-plane behavior intact. The C++ server now has clearer separation for config, model lifecycle, runtime services, request parsing, HTTP helpers, and server routing; the Docker build/runtime paths were updated to compile multiple C++ files and to load Python code from the new package folder.

Browse files
Dockerfile CHANGED
@@ -55,9 +55,9 @@ RUN git clone --depth 1 --branch ${LLAMA_CPP_REF} https://github.com/ggerganov/l
55
  ldd build/bin/llama-server || true
56
 
57
  # Build C++ manager (Boost.Beast + JSON)
58
- COPY cpp/llm_manager.cpp /build/llm_manager.cpp
59
  RUN g++ -std=c++17 -O2 -DBOOST_ERROR_CODE_HEADER_ONLY -pthread \
60
- /build/llm_manager.cpp -o /build/llm-manager
61
 
62
  # Runtime stage
63
  FROM debian:bookworm-slim
@@ -95,8 +95,7 @@ RUN useradd -m -u 1000 user && \
95
  chown -R user:user /home/user
96
 
97
  # Copy application code
98
- COPY --chown=user:user app.py /home/user/app.py
99
- COPY --chown=user:user logger.py /home/user/logger.py
100
 
101
  USER user
102
  WORKDIR /home/user
@@ -105,6 +104,7 @@ WORKDIR /home/user
105
  ENV HOME=/home/user \
106
  LLAMA_CACHE=/home/user/.cache/llama.cpp \
107
  PATH=/home/user/.local/bin:$PATH \
 
108
  PYTHONUNBUFFERED=1
109
 
110
  EXPOSE 7860
 
55
  ldd build/bin/llama-server || true
56
 
57
  # Build C++ manager (Boost.Beast + JSON)
58
+ COPY cpp/ /build/cpp/
59
  RUN g++ -std=c++17 -O2 -DBOOST_ERROR_CODE_HEADER_ONLY -pthread \
60
+ /build/cpp/*.cpp -o /build/llm-manager
61
 
62
  # Runtime stage
63
  FROM debian:bookworm-slim
 
95
  chown -R user:user /home/user
96
 
97
  # Copy application code
98
+ COPY --chown=user:user python/ /home/user/python/
 
99
 
100
  USER user
101
  WORKDIR /home/user
 
104
  ENV HOME=/home/user \
105
  LLAMA_CACHE=/home/user/.cache/llama.cpp \
106
  PATH=/home/user/.local/bin:$PATH \
107
+ PYTHONPATH=/home/user/python \
108
  PYTHONUNBUFFERED=1
109
 
110
  EXPOSE 7860
config.toml.example CHANGED
@@ -35,6 +35,9 @@ retry_after_sec = 5
35
  [scheduler]
36
  max_concurrent = 1
37
 
 
 
 
38
  [rate_limit]
39
  requests_per_minute = 60
40
  estimated_tokens_per_minute = 6000
 
35
  [scheduler]
36
  max_concurrent = 1
37
 
38
+ [streaming]
39
+ enabled = false
40
+
41
  [rate_limit]
42
  requests_per_minute = 60
43
  estimated_tokens_per_minute = 6000
cpp/config.cpp ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "config.h"
2
+
3
+ #include "http_helpers.h"
4
+
5
+ #include <algorithm>
6
+ #include <cstdlib>
7
+ #include <filesystem>
8
+ #include <fstream>
9
+ #include <unordered_map>
10
+
11
+ std::string get_env_or(const char *name, const std::string &fallback) {
12
+ const char *v = std::getenv(name);
13
+ return (v && *v) ? std::string(v) : fallback;
14
+ }
15
+
16
+ int get_env_int_or(const char *name, int fallback) {
17
+ const char *v = std::getenv(name);
18
+ if (!v || !*v) return fallback;
19
+ try {
20
+ return std::stoi(v);
21
+ } catch (...) {
22
+ return fallback;
23
+ }
24
+ }
25
+
26
+ std::string trim_copy(const std::string &value) {
27
+ const auto first = value.find_first_not_of(" \t\r\n");
28
+ if (first == std::string::npos) return "";
29
+ const auto last = value.find_last_not_of(" \t\r\n");
30
+ return value.substr(first, last - first + 1);
31
+ }
32
+
33
+ std::string strip_quotes(const std::string &value) {
34
+ if (value.size() >= 2) {
35
+ const char first = value.front();
36
+ const char last = value.back();
37
+ if ((first == '"' && last == '"') || (first == '\'' && last == '\'')) {
38
+ return value.substr(1, value.size() - 2);
39
+ }
40
+ }
41
+ return value;
42
+ }
43
+
44
+ bool parse_bool_or(const std::string &value, bool fallback) {
45
+ const std::string normalized = trim_copy(value);
46
+ if (normalized == "true") return true;
47
+ if (normalized == "false") return false;
48
+ return fallback;
49
+ }
50
+
51
+ Role parse_role_or(const std::string &value, Role fallback) {
52
+ const std::string normalized = trim_copy(value);
53
+ if (normalized == "admin" || normalized == "ADMIN") return Role::ADMIN;
54
+ if (normalized == "user" || normalized == "USER") return Role::USER;
55
+ return fallback;
56
+ }
57
+
58
+ static std::unordered_map<std::string, std::unordered_map<std::string, std::string>>
59
+ parse_simple_toml(const std::string &path) {
60
+ std::unordered_map<std::string, std::unordered_map<std::string, std::string>> out;
61
+ std::ifstream input(path);
62
+ if (!input.is_open()) return out;
63
+
64
+ std::string current_section;
65
+ std::string line;
66
+ while (std::getline(input, line)) {
67
+ auto hash = line.find('#');
68
+ if (hash != std::string::npos) line = line.substr(0, hash);
69
+ line = trim_copy(line);
70
+ if (line.empty()) continue;
71
+
72
+ if (line.front() == '[' && line.back() == ']') {
73
+ current_section = trim_copy(line.substr(1, line.size() - 2));
74
+ continue;
75
+ }
76
+
77
+ const auto eq = line.find('=');
78
+ if (eq == std::string::npos) continue;
79
+
80
+ std::string key = trim_copy(line.substr(0, eq));
81
+ std::string value = strip_quotes(trim_copy(line.substr(eq + 1)));
82
+ out[current_section][key] = value;
83
+ }
84
+ return out;
85
+ }
86
+
87
+ static std::vector<ApiKeyRecord> parse_api_keys_toml(const std::string &path) {
88
+ std::vector<ApiKeyRecord> keys;
89
+ std::ifstream input(path);
90
+ if (!input.is_open()) return keys;
91
+
92
+ std::string line;
93
+ bool in_api_key = false;
94
+ ApiKeyRecord current;
95
+ bool has_any_field = false;
96
+
97
+ auto flush_current = [&]() {
98
+ if (has_any_field && !current.key_id.empty() && !current.secret.empty()) {
99
+ keys.push_back(current);
100
+ }
101
+ current = ApiKeyRecord{};
102
+ has_any_field = false;
103
+ };
104
+
105
+ while (std::getline(input, line)) {
106
+ auto hash = line.find('#');
107
+ if (hash != std::string::npos) line = line.substr(0, hash);
108
+ line = trim_copy(line);
109
+ if (line.empty()) continue;
110
+
111
+ if (line == "[[api_keys]]") {
112
+ flush_current();
113
+ in_api_key = true;
114
+ continue;
115
+ }
116
+
117
+ if (!in_api_key) continue;
118
+ if (line.front() == '[' && line.back() == ']') {
119
+ flush_current();
120
+ in_api_key = false;
121
+ continue;
122
+ }
123
+
124
+ const auto eq = line.find('=');
125
+ if (eq == std::string::npos) continue;
126
+
127
+ std::string key = trim_copy(line.substr(0, eq));
128
+ std::string value = strip_quotes(trim_copy(line.substr(eq + 1)));
129
+ has_any_field = true;
130
+
131
+ if (key == "key_id") current.key_id = value;
132
+ else if (key == "secret") current.secret = value;
133
+ else if (key == "role") current.role = parse_role_or(value, current.role);
134
+ else if (key == "enabled") current.enabled = parse_bool_or(value, current.enabled);
135
+ }
136
+
137
+ flush_current();
138
+ return keys;
139
+ }
140
+
141
+ static std::string get_toml_string_or(
142
+ const std::unordered_map<std::string, std::unordered_map<std::string, std::string>> &data,
143
+ const std::string &section,
144
+ const std::string &key,
145
+ const std::string &fallback) {
146
+ const auto it = data.find(section);
147
+ if (it == data.end()) return fallback;
148
+ const auto kv = it->second.find(key);
149
+ if (kv == it->second.end() || kv->second.empty()) return fallback;
150
+ return kv->second;
151
+ }
152
+
153
+ static int get_toml_int_or(
154
+ const std::unordered_map<std::string, std::unordered_map<std::string, std::string>> &data,
155
+ const std::string &section,
156
+ const std::string &key,
157
+ int fallback) {
158
+ const auto it = data.find(section);
159
+ if (it == data.end()) return fallback;
160
+ const auto kv = it->second.find(key);
161
+ if (kv == it->second.end() || kv->second.empty()) return fallback;
162
+ try {
163
+ return std::stoi(kv->second);
164
+ } catch (...) {
165
+ return fallback;
166
+ }
167
+ }
168
+
169
+ ManagerConfig load_manager_config() {
170
+ ManagerConfig cfg;
171
+
172
+ const std::string config_path = get_env_or("MANAGER_CONFIG", "config.toml");
173
+ std::unordered_map<std::string, std::unordered_map<std::string, std::string>> toml;
174
+ if (std::filesystem::exists(config_path)) {
175
+ toml = parse_simple_toml(config_path);
176
+ log_line("config: loaded " + config_path);
177
+ } else {
178
+ log_line("config: using environment/defaults (file not found: " + config_path + ")");
179
+ }
180
+
181
+ cfg.server.host = get_env_or("MANAGER_HOST", get_toml_string_or(toml, "server", "host", cfg.server.host));
182
+ cfg.server.port = get_env_int_or("MANAGER_PORT", get_toml_int_or(toml, "server", "port", cfg.server.port));
183
+
184
+ cfg.worker.default_model = get_env_or("DEFAULT_MODEL", get_toml_string_or(toml, "worker", "default_model", cfg.worker.default_model));
185
+ cfg.worker.llama_server_bin = get_env_or("LLAMA_SERVER_BIN", get_toml_string_or(toml, "worker", "llama_server_bin", cfg.worker.llama_server_bin));
186
+ cfg.worker.host = get_env_or("WORKER_HOST", get_toml_string_or(toml, "worker", "host", cfg.worker.host));
187
+ cfg.worker.bind_host = get_env_or("WORKER_BIND_HOST", get_toml_string_or(toml, "worker", "bind_host", cfg.worker.bind_host));
188
+ cfg.worker.base_port = get_env_int_or("WORKER_BASE_PORT", get_toml_int_or(toml, "worker", "base_port", cfg.worker.base_port));
189
+ cfg.worker.switch_timeout_sec = get_env_int_or("SWITCH_TIMEOUT_SEC", get_toml_int_or(toml, "worker", "switch_timeout_sec", cfg.worker.switch_timeout_sec));
190
+
191
+ cfg.llama.n_ctx = get_env_int_or("MODEL_N_CTX", get_toml_int_or(toml, "llama", "n_ctx", cfg.llama.n_ctx));
192
+ cfg.llama.threads = get_env_int_or("MODEL_THREADS", get_toml_int_or(toml, "llama", "threads", cfg.llama.threads));
193
+ cfg.llama.ngl = get_env_int_or("MODEL_NGL", get_toml_int_or(toml, "llama", "ngl", cfg.llama.ngl));
194
+ cfg.llama.batch = get_env_int_or("MODEL_BATCH", get_toml_int_or(toml, "llama", "batch", cfg.llama.batch));
195
+ cfg.llama.ubatch = get_env_int_or("MODEL_UBATCH", get_toml_int_or(toml, "llama", "ubatch", cfg.llama.ubatch));
196
+
197
+ cfg.auth.header = get_env_or("AUTH_HEADER", get_toml_string_or(toml, "auth", "header", cfg.auth.header));
198
+ cfg.auth.scheme = get_env_or("AUTH_SCHEME", get_toml_string_or(toml, "auth", "scheme", cfg.auth.scheme));
199
+
200
+ cfg.limits.default_max_tokens = get_env_int_or("DEFAULT_MAX_TOKENS", get_toml_int_or(toml, "limits", "default_max_tokens", cfg.limits.default_max_tokens));
201
+ cfg.limits.max_tokens_per_request = get_env_int_or("MAX_TOKENS_PER_REQUEST", get_toml_int_or(toml, "limits", "max_tokens_per_request", cfg.limits.max_tokens_per_request));
202
+ cfg.limits.request_timeout_sec = get_env_int_or("REQUEST_TIMEOUT_SEC", get_toml_int_or(toml, "limits", "request_timeout_sec", cfg.limits.request_timeout_sec));
203
+
204
+ cfg.queue.max_size = static_cast<size_t>(std::max(
205
+ 1,
206
+ get_env_int_or("QUEUE_MAX_SIZE", get_toml_int_or(toml, "queue", "max_size", static_cast<int>(cfg.queue.max_size)))));
207
+ cfg.queue.max_tokens = get_env_int_or("QUEUE_MAX_TOKENS", get_toml_int_or(toml, "queue", "max_tokens", cfg.queue.max_tokens));
208
+ cfg.queue.admin_quota = get_env_int_or("QUEUE_ADMIN_QUOTA", get_toml_int_or(toml, "queue", "admin_quota", cfg.queue.admin_quota));
209
+ cfg.queue.retry_after_sec = get_env_int_or("QUEUE_RETRY_AFTER_SEC", get_toml_int_or(toml, "queue", "retry_after_sec", cfg.queue.retry_after_sec));
210
+
211
+ cfg.rate_limit.requests_per_minute = get_env_int_or("REQUESTS_PER_MINUTE", get_toml_int_or(toml, "rate_limit", "requests_per_minute", cfg.rate_limit.requests_per_minute));
212
+ cfg.rate_limit.estimated_tokens_per_minute = get_env_int_or("ESTIMATED_TOKENS_PER_MINUTE", get_toml_int_or(toml, "rate_limit", "estimated_tokens_per_minute", cfg.rate_limit.estimated_tokens_per_minute));
213
+
214
+ cfg.scheduler.max_concurrent = get_env_int_or("SCHEDULER_MAX_CONCURRENT", get_toml_int_or(toml, "scheduler", "max_concurrent", cfg.scheduler.max_concurrent));
215
+ cfg.streaming.enabled = parse_bool_or(
216
+ get_env_or("STREAMING_ENABLED", get_toml_string_or(toml, "streaming", "enabled", cfg.streaming.enabled ? "true" : "false")),
217
+ cfg.streaming.enabled);
218
+
219
+ if (!config_path.empty() && std::filesystem::exists(config_path)) {
220
+ cfg.api_keys = parse_api_keys_toml(config_path);
221
+ }
222
+
223
+ return cfg;
224
+ }
cpp/config.h ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <string>
4
+
5
+ #include "llm_manager_types.h"
6
+
7
+ std::string get_env_or(const char *name, const std::string &fallback);
8
+ int get_env_int_or(const char *name, int fallback);
9
+ std::string trim_copy(const std::string &value);
10
+ std::string strip_quotes(const std::string &value);
11
+ bool parse_bool_or(const std::string &value, bool fallback);
12
+ Role parse_role_or(const std::string &value, Role fallback);
13
+ ManagerConfig load_manager_config();
cpp/http_helpers.cpp ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "http_helpers.h"
2
+
3
+ #include <boost/asio.hpp>
4
+ #include <boost/beast/core.hpp>
5
+ #include <boost/beast/http.hpp>
6
+
7
+ #include <ctime>
8
+ #include <iomanip>
9
+ #include <iostream>
10
+ #include <sstream>
11
+
12
+ namespace asio = boost::asio;
13
+ namespace beast = boost::beast;
14
+ namespace http = beast::http;
15
+
16
+ std::string now_utc_iso() {
17
+ std::time_t t = std::time(nullptr);
18
+ std::tm tm{};
19
+ gmtime_r(&t, &tm);
20
+ std::ostringstream oss;
21
+ oss << std::put_time(&tm, "%Y-%m-%dT%H:%M:%SZ");
22
+ return oss.str();
23
+ }
24
+
25
+ void log_line(const std::string &line) {
26
+ std::cout << "[" << now_utc_iso() << "] " << line << std::endl;
27
+ }
28
+
29
+ std::string truncate_body(const std::string &body, size_t max_len) {
30
+ if (body.size() <= max_len) return body;
31
+ return body.substr(0, max_len) + "...[truncated]";
32
+ }
33
+
34
+ std::optional<std::string> extract_cancel_request_id(const std::string &path) {
35
+ const std::string prefix = "/requests/";
36
+ const std::string suffix = "/cancel";
37
+ if (path.size() <= prefix.size() + suffix.size()) return std::nullopt;
38
+ if (path.rfind(prefix, 0) != 0) return std::nullopt;
39
+ if (path.substr(path.size() - suffix.size()) != suffix) return std::nullopt;
40
+ const std::string request_id = path.substr(prefix.size(), path.size() - prefix.size() - suffix.size());
41
+ if (request_id.empty()) return std::nullopt;
42
+ return request_id;
43
+ }
44
+
45
+ std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body) {
46
+ asio::io_context ioc;
47
+ asio::ip::tcp::resolver resolver(ioc);
48
+ beast::tcp_stream stream(ioc);
49
+ auto const results = resolver.resolve("127.0.0.1", std::to_string(worker.port));
50
+ stream.connect(results);
51
+
52
+ http::request<http::string_body> req{http::verb::post, "/v1/chat/completions", 11};
53
+ req.set(http::field::host, "127.0.0.1");
54
+ req.set(http::field::content_type, "application/json");
55
+ req.set(http::field::user_agent, "llm-manager");
56
+ req.body() = body;
57
+ req.prepare_payload();
58
+ http::write(stream, req);
59
+
60
+ beast::flat_buffer buffer;
61
+ http::response<http::string_body> res;
62
+ http::read(stream, buffer, res);
63
+ beast::error_code ec;
64
+ stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
65
+ return {res.result_int(), res.body()};
66
+ }
67
+
68
+ ProxiedGetResult forward_get_to_worker(const WorkerInfo &worker, const std::string &target) {
69
+ asio::io_context ioc;
70
+ asio::ip::tcp::resolver resolver(ioc);
71
+ beast::tcp_stream stream(ioc);
72
+ auto const results = resolver.resolve("127.0.0.1", std::to_string(worker.port));
73
+ stream.connect(results);
74
+
75
+ http::request<http::string_body> req{http::verb::get, target, 11};
76
+ req.set(http::field::host, "127.0.0.1");
77
+ req.set(http::field::user_agent, "llm-manager");
78
+ req.set(http::field::accept_encoding, "gzip, identity");
79
+ http::write(stream, req);
80
+
81
+ beast::flat_buffer buffer;
82
+ http::response<http::string_body> res;
83
+ http::read(stream, buffer, res);
84
+ beast::error_code ec;
85
+ stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
86
+
87
+ ProxiedGetResult out;
88
+ out.status = res.result_int();
89
+ out.body = res.body();
90
+ if (res.base().find(http::field::content_type) != res.base().end()) {
91
+ out.content_type = res.base()[http::field::content_type].to_string();
92
+ }
93
+ if (res.base().find(http::field::content_encoding) != res.base().end()) {
94
+ out.content_encoding = res.base()[http::field::content_encoding].to_string();
95
+ }
96
+ return out;
97
+ }
cpp/http_helpers.h ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <optional>
4
+ #include <string>
5
+ #include <utility>
6
+
7
+ #include "llm_manager_types.h"
8
+
9
+ std::string now_utc_iso();
10
+ void log_line(const std::string &line);
11
+ std::string truncate_body(const std::string &body, size_t max_len = 2000);
12
+ std::optional<std::string> extract_cancel_request_id(const std::string &path);
13
+ std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body);
14
+ ProxiedGetResult forward_get_to_worker(const WorkerInfo &worker, const std::string &target);
cpp/llm_manager.cpp CHANGED
@@ -1,1530 +1,29 @@
1
  #include <boost/asio.hpp>
2
- #include <boost/beast/core.hpp>
3
- #include <boost/beast/http.hpp>
4
- #include <boost/beast/version.hpp>
5
- #include <nlohmann/json.hpp>
6
 
7
- #include <algorithm>
8
- #include <atomic>
9
- #include <chrono>
10
- #include <condition_variable>
11
- #include <csignal>
12
- #include <cstdlib>
13
- #include <ctime>
14
- #include <deque>
15
- #include <filesystem>
16
- #include <fstream>
17
- #include <iomanip>
18
- #include <iostream>
19
- #include <memory>
20
- #include <mutex>
21
- #include <optional>
22
- #include <sstream>
23
  #include <string>
24
  #include <thread>
25
- #include <unordered_map>
26
- #include <vector>
27
 
28
- #include <sys/types.h>
29
- #include <sys/wait.h>
30
- #include <unistd.h>
 
 
31
 
32
  namespace asio = boost::asio;
33
- namespace beast = boost::beast;
34
- namespace http = beast::http;
35
- using json = nlohmann::json;
36
-
37
- static void log_line(const std::string &line);
38
-
39
- struct ServerConfig {
40
- std::string host = "0.0.0.0";
41
- int port = 7860;
42
- };
43
-
44
- struct WorkerConfig {
45
- std::string default_model = "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m";
46
- std::string llama_server_bin = "/usr/local/bin/llama-server";
47
- std::string host = "127.0.0.1";
48
- std::string bind_host = "0.0.0.0";
49
- int base_port = 8080;
50
- int switch_timeout_sec = 300;
51
- };
52
-
53
- struct LlamaConfig {
54
- int n_ctx = 8192;
55
- int threads = 4;
56
- int ngl = 0;
57
- int batch = 128;
58
- int ubatch = 64;
59
- };
60
-
61
- enum class Role {
62
- ADMIN,
63
- USER
64
- };
65
-
66
- struct AuthConfig {
67
- std::string header = "Authorization";
68
- std::string scheme = "Bearer";
69
- };
70
-
71
- struct LimitsConfig {
72
- int default_max_tokens = 256;
73
- int max_tokens_per_request = 2048;
74
- int request_timeout_sec = 30;
75
- };
76
-
77
- struct QueueConfig {
78
- size_t max_size = 100;
79
- int max_tokens = 20000;
80
- int admin_quota = 3;
81
- int retry_after_sec = 5;
82
- };
83
-
84
- struct RateLimitConfig {
85
- int requests_per_minute = 60;
86
- int estimated_tokens_per_minute = 6000;
87
- };
88
-
89
- struct SchedulerConfig {
90
- int max_concurrent = 1;
91
- };
92
-
93
- struct ApiKeyRecord {
94
- std::string key_id;
95
- std::string secret;
96
- Role role = Role::USER;
97
- bool enabled = true;
98
- };
99
-
100
- struct ManagerConfig {
101
- ServerConfig server;
102
- WorkerConfig worker;
103
- LlamaConfig llama;
104
- AuthConfig auth;
105
- LimitsConfig limits;
106
- QueueConfig queue;
107
- RateLimitConfig rate_limit;
108
- SchedulerConfig scheduler;
109
- std::vector<ApiKeyRecord> api_keys;
110
- };
111
-
112
- struct WorkerInfo {
113
- std::string model;
114
- int port = 0;
115
- pid_t pid = -1;
116
- std::string last_loaded;
117
- };
118
-
119
- static std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body);
120
-
121
- static std::string now_utc_iso() {
122
- std::time_t t = std::time(nullptr);
123
- std::tm tm{};
124
- gmtime_r(&t, &tm);
125
- std::ostringstream oss;
126
- oss << std::put_time(&tm, "%Y-%m-%dT%H:%M:%SZ");
127
- return oss.str();
128
- }
129
-
130
- static std::string get_env_or(const char *name, const std::string &fallback) {
131
- const char *v = std::getenv(name);
132
- return (v && *v) ? std::string(v) : fallback;
133
- }
134
-
135
- static int get_env_int_or(const char *name, int fallback) {
136
- const char *v = std::getenv(name);
137
- if (!v || !*v) return fallback;
138
- try {
139
- return std::stoi(v);
140
- } catch (...) {
141
- return fallback;
142
- }
143
- }
144
-
145
- static std::string trim_copy(const std::string &value) {
146
- const auto first = value.find_first_not_of(" \t\r\n");
147
- if (first == std::string::npos) return "";
148
- const auto last = value.find_last_not_of(" \t\r\n");
149
- return value.substr(first, last - first + 1);
150
- }
151
-
152
- static std::string strip_quotes(const std::string &value) {
153
- if (value.size() >= 2) {
154
- const char first = value.front();
155
- const char last = value.back();
156
- if ((first == '"' && last == '"') || (first == '\'' && last == '\'')) {
157
- return value.substr(1, value.size() - 2);
158
- }
159
- }
160
- return value;
161
- }
162
-
163
- static bool parse_bool_or(const std::string &value, bool fallback) {
164
- const std::string normalized = trim_copy(value);
165
- if (normalized == "true") return true;
166
- if (normalized == "false") return false;
167
- return fallback;
168
- }
169
-
170
- static Role parse_role_or(const std::string &value, Role fallback) {
171
- const std::string normalized = trim_copy(value);
172
- if (normalized == "admin" || normalized == "ADMIN") return Role::ADMIN;
173
- if (normalized == "user" || normalized == "USER") return Role::USER;
174
- return fallback;
175
- }
176
-
177
- static std::string role_to_string(Role role) {
178
- return role == Role::ADMIN ? "admin" : "user";
179
- }
180
-
181
- enum class Priority {
182
- ADMIN = 0,
183
- USER = 1
184
- };
185
-
186
- static Priority role_to_priority(Role role) {
187
- return role == Role::ADMIN ? Priority::ADMIN : Priority::USER;
188
- }
189
-
190
- enum class RequestState {
191
- QUEUED,
192
- RUNNING,
193
- CANCELLED,
194
- FAILED,
195
- DONE
196
- };
197
-
198
- static std::string state_to_string(RequestState state) {
199
- switch (state) {
200
- case RequestState::QUEUED: return "queued";
201
- case RequestState::RUNNING: return "running";
202
- case RequestState::CANCELLED: return "cancelled";
203
- case RequestState::FAILED: return "failed";
204
- case RequestState::DONE: return "done";
205
- }
206
- return "unknown";
207
- }
208
-
209
- struct TokenEstimate {
210
- int prompt_tokens = 0;
211
- int requested_max_tokens = 0;
212
- int estimated_total_tokens = 0;
213
- };
214
-
215
- struct RateLimitDecision {
216
- bool allowed = true;
217
- int retry_after_sec = 0;
218
- std::string error;
219
- };
220
-
221
- struct RequestResult {
222
- int status = 500;
223
- std::string body;
224
- std::string content_type = "application/json";
225
- };
226
-
227
- struct RequestContext {
228
- std::string request_id;
229
- std::string api_key_id;
230
- Role role = Role::USER;
231
- Priority priority = Priority::USER;
232
- TokenEstimate estimate;
233
- std::string request_body;
234
- std::atomic<RequestState> state{RequestState::QUEUED};
235
- std::atomic<bool> cancelled{false};
236
- std::chrono::steady_clock::time_point created_at{std::chrono::steady_clock::now()};
237
- std::chrono::steady_clock::time_point enqueue_time{created_at};
238
- std::chrono::steady_clock::time_point start_time{};
239
- std::mutex mu;
240
- std::condition_variable cv;
241
- bool completed = false;
242
- RequestResult result;
243
- };
244
-
245
- static int estimate_text_tokens_rough(const std::string &text) {
246
- if (text.empty()) return 0;
247
- return std::max(1, static_cast<int>((text.size() + 3) / 4));
248
- }
249
-
250
- static std::string flatten_json_content(const json &content) {
251
- if (content.is_string()) {
252
- return content.get<std::string>();
253
- }
254
- if (content.is_array()) {
255
- std::ostringstream oss;
256
- bool first = true;
257
- for (const auto &item : content) {
258
- std::string part;
259
- if (item.is_string()) {
260
- part = item.get<std::string>();
261
- } else if (item.is_object() && item.contains("text") && item["text"].is_string()) {
262
- part = item["text"].get<std::string>();
263
- }
264
- if (part.empty()) continue;
265
- if (!first) oss << '\n';
266
- oss << part;
267
- first = false;
268
- }
269
- return oss.str();
270
- }
271
- return "";
272
- }
273
-
274
- static std::optional<TokenEstimate> estimate_chat_tokens(
275
- const json &payload,
276
- const LimitsConfig &limits,
277
- std::string &error) {
278
- if (!payload.is_object()) {
279
- error = "Expected JSON object";
280
- return std::nullopt;
281
- }
282
- if (!payload.contains("messages") || !payload["messages"].is_array()) {
283
- error = "Expected 'messages' array";
284
- return std::nullopt;
285
- }
286
-
287
- TokenEstimate estimate;
288
- estimate.requested_max_tokens = limits.default_max_tokens;
289
- if (payload.contains("max_tokens")) {
290
- if (!payload["max_tokens"].is_number_integer()) {
291
- error = "Expected integer 'max_tokens'";
292
- return std::nullopt;
293
- }
294
- estimate.requested_max_tokens = payload["max_tokens"].get<int>();
295
- }
296
-
297
- if (estimate.requested_max_tokens <= 0) {
298
- error = "'max_tokens' must be > 0";
299
- return std::nullopt;
300
- }
301
-
302
- for (const auto &message : payload["messages"]) {
303
- if (!message.is_object()) continue;
304
- if (message.contains("role") && message["role"].is_string()) {
305
- estimate.prompt_tokens += estimate_text_tokens_rough(message["role"].get<std::string>());
306
- }
307
- if (message.contains("content")) {
308
- estimate.prompt_tokens += estimate_text_tokens_rough(
309
- flatten_json_content(message["content"]));
310
- }
311
- estimate.prompt_tokens += 4;
312
- }
313
-
314
- estimate.estimated_total_tokens = estimate.prompt_tokens + estimate.requested_max_tokens;
315
- if (estimate.estimated_total_tokens > limits.max_tokens_per_request) {
316
- error = "Estimated request tokens exceed configured limit";
317
- return std::nullopt;
318
- }
319
-
320
- error.clear();
321
- return estimate;
322
- }
323
-
324
- static std::unordered_map<std::string, std::unordered_map<std::string, std::string>>
325
- parse_simple_toml(const std::string &path) {
326
- std::unordered_map<std::string, std::unordered_map<std::string, std::string>> out;
327
- std::ifstream input(path);
328
- if (!input.is_open()) return out;
329
-
330
- std::string current_section;
331
- std::string line;
332
- while (std::getline(input, line)) {
333
- auto hash = line.find('#');
334
- if (hash != std::string::npos) line = line.substr(0, hash);
335
- line = trim_copy(line);
336
- if (line.empty()) continue;
337
-
338
- if (line.front() == '[' && line.back() == ']') {
339
- current_section = trim_copy(line.substr(1, line.size() - 2));
340
- continue;
341
- }
342
-
343
- const auto eq = line.find('=');
344
- if (eq == std::string::npos) continue;
345
-
346
- std::string key = trim_copy(line.substr(0, eq));
347
- std::string value = strip_quotes(trim_copy(line.substr(eq + 1)));
348
- out[current_section][key] = value;
349
- }
350
- return out;
351
- }
352
-
353
- static std::vector<ApiKeyRecord> parse_api_keys_toml(const std::string &path) {
354
- std::vector<ApiKeyRecord> keys;
355
- std::ifstream input(path);
356
- if (!input.is_open()) return keys;
357
-
358
- std::string line;
359
- bool in_api_key = false;
360
- ApiKeyRecord current;
361
- bool has_any_field = false;
362
-
363
- auto flush_current = [&]() {
364
- if (has_any_field && !current.key_id.empty() && !current.secret.empty()) {
365
- keys.push_back(current);
366
- }
367
- current = ApiKeyRecord{};
368
- has_any_field = false;
369
- };
370
-
371
- while (std::getline(input, line)) {
372
- auto hash = line.find('#');
373
- if (hash != std::string::npos) line = line.substr(0, hash);
374
- line = trim_copy(line);
375
- if (line.empty()) continue;
376
-
377
- if (line == "[[api_keys]]") {
378
- flush_current();
379
- in_api_key = true;
380
- continue;
381
- }
382
-
383
- if (!in_api_key) continue;
384
-
385
- if (line.front() == '[' && line.back() == ']') {
386
- flush_current();
387
- in_api_key = false;
388
- continue;
389
- }
390
-
391
- const auto eq = line.find('=');
392
- if (eq == std::string::npos) continue;
393
-
394
- std::string key = trim_copy(line.substr(0, eq));
395
- std::string value = strip_quotes(trim_copy(line.substr(eq + 1)));
396
- has_any_field = true;
397
-
398
- if (key == "key_id") current.key_id = value;
399
- else if (key == "secret") current.secret = value;
400
- else if (key == "role") current.role = parse_role_or(value, current.role);
401
- else if (key == "enabled") current.enabled = parse_bool_or(value, current.enabled);
402
- }
403
-
404
- flush_current();
405
- return keys;
406
- }
407
-
408
- static std::string get_toml_string_or(
409
- const std::unordered_map<std::string, std::unordered_map<std::string, std::string>> &data,
410
- const std::string &section,
411
- const std::string &key,
412
- const std::string &fallback) {
413
- const auto it = data.find(section);
414
- if (it == data.end()) return fallback;
415
- const auto kv = it->second.find(key);
416
- if (kv == it->second.end() || kv->second.empty()) return fallback;
417
- return kv->second;
418
- }
419
-
420
- static int get_toml_int_or(
421
- const std::unordered_map<std::string, std::unordered_map<std::string, std::string>> &data,
422
- const std::string &section,
423
- const std::string &key,
424
- int fallback) {
425
- const auto it = data.find(section);
426
- if (it == data.end()) return fallback;
427
- const auto kv = it->second.find(key);
428
- if (kv == it->second.end() || kv->second.empty()) return fallback;
429
- try {
430
- return std::stoi(kv->second);
431
- } catch (...) {
432
- return fallback;
433
- }
434
- }
435
-
436
- static ManagerConfig load_manager_config() {
437
- ManagerConfig cfg;
438
-
439
- const std::string config_path = get_env_or("MANAGER_CONFIG", "config.toml");
440
- std::unordered_map<std::string, std::unordered_map<std::string, std::string>> toml;
441
- if (std::filesystem::exists(config_path)) {
442
- toml = parse_simple_toml(config_path);
443
- log_line("config: loaded " + config_path);
444
- } else {
445
- log_line("config: using environment/defaults (file not found: " + config_path + ")");
446
- }
447
-
448
- cfg.server.host = get_env_or(
449
- "MANAGER_HOST",
450
- get_toml_string_or(toml, "server", "host", cfg.server.host));
451
- cfg.server.port = get_env_int_or(
452
- "MANAGER_PORT",
453
- get_toml_int_or(toml, "server", "port", cfg.server.port));
454
-
455
- cfg.worker.default_model = get_env_or(
456
- "DEFAULT_MODEL",
457
- get_toml_string_or(toml, "worker", "default_model", cfg.worker.default_model));
458
- cfg.worker.llama_server_bin = get_env_or(
459
- "LLAMA_SERVER_BIN",
460
- get_toml_string_or(toml, "worker", "llama_server_bin", cfg.worker.llama_server_bin));
461
- cfg.worker.host = get_env_or(
462
- "WORKER_HOST",
463
- get_toml_string_or(toml, "worker", "host", cfg.worker.host));
464
- cfg.worker.bind_host = get_env_or(
465
- "WORKER_BIND_HOST",
466
- get_toml_string_or(toml, "worker", "bind_host", cfg.worker.bind_host));
467
- cfg.worker.base_port = get_env_int_or(
468
- "WORKER_BASE_PORT",
469
- get_toml_int_or(toml, "worker", "base_port", cfg.worker.base_port));
470
- cfg.worker.switch_timeout_sec = get_env_int_or(
471
- "SWITCH_TIMEOUT_SEC",
472
- get_toml_int_or(toml, "worker", "switch_timeout_sec", cfg.worker.switch_timeout_sec));
473
-
474
- cfg.llama.n_ctx = get_env_int_or(
475
- "MODEL_N_CTX",
476
- get_toml_int_or(toml, "llama", "n_ctx", cfg.llama.n_ctx));
477
- cfg.llama.threads = get_env_int_or(
478
- "MODEL_THREADS",
479
- get_toml_int_or(toml, "llama", "threads", cfg.llama.threads));
480
- cfg.llama.ngl = get_env_int_or(
481
- "MODEL_NGL",
482
- get_toml_int_or(toml, "llama", "ngl", cfg.llama.ngl));
483
- cfg.llama.batch = get_env_int_or(
484
- "MODEL_BATCH",
485
- get_toml_int_or(toml, "llama", "batch", cfg.llama.batch));
486
- cfg.llama.ubatch = get_env_int_or(
487
- "MODEL_UBATCH",
488
- get_toml_int_or(toml, "llama", "ubatch", cfg.llama.ubatch));
489
-
490
- cfg.auth.header = get_env_or(
491
- "AUTH_HEADER",
492
- get_toml_string_or(toml, "auth", "header", cfg.auth.header));
493
- cfg.auth.scheme = get_env_or(
494
- "AUTH_SCHEME",
495
- get_toml_string_or(toml, "auth", "scheme", cfg.auth.scheme));
496
-
497
- cfg.limits.default_max_tokens = get_env_int_or(
498
- "DEFAULT_MAX_TOKENS",
499
- get_toml_int_or(toml, "limits", "default_max_tokens", cfg.limits.default_max_tokens));
500
- cfg.limits.max_tokens_per_request = get_env_int_or(
501
- "MAX_TOKENS_PER_REQUEST",
502
- get_toml_int_or(toml, "limits", "max_tokens_per_request", cfg.limits.max_tokens_per_request));
503
- cfg.limits.request_timeout_sec = get_env_int_or(
504
- "REQUEST_TIMEOUT_SEC",
505
- get_toml_int_or(toml, "limits", "request_timeout_sec", cfg.limits.request_timeout_sec));
506
-
507
- cfg.queue.max_size = static_cast<size_t>(std::max(
508
- 1,
509
- get_env_int_or("QUEUE_MAX_SIZE", get_toml_int_or(toml, "queue", "max_size", static_cast<int>(cfg.queue.max_size)))));
510
- cfg.queue.max_tokens = get_env_int_or(
511
- "QUEUE_MAX_TOKENS",
512
- get_toml_int_or(toml, "queue", "max_tokens", cfg.queue.max_tokens));
513
- cfg.queue.admin_quota = get_env_int_or(
514
- "QUEUE_ADMIN_QUOTA",
515
- get_toml_int_or(toml, "queue", "admin_quota", cfg.queue.admin_quota));
516
- cfg.queue.retry_after_sec = get_env_int_or(
517
- "QUEUE_RETRY_AFTER_SEC",
518
- get_toml_int_or(toml, "queue", "retry_after_sec", cfg.queue.retry_after_sec));
519
-
520
- cfg.rate_limit.requests_per_minute = get_env_int_or(
521
- "REQUESTS_PER_MINUTE",
522
- get_toml_int_or(toml, "rate_limit", "requests_per_minute", cfg.rate_limit.requests_per_minute));
523
- cfg.rate_limit.estimated_tokens_per_minute = get_env_int_or(
524
- "ESTIMATED_TOKENS_PER_MINUTE",
525
- get_toml_int_or(toml, "rate_limit", "estimated_tokens_per_minute", cfg.rate_limit.estimated_tokens_per_minute));
526
-
527
- cfg.scheduler.max_concurrent = get_env_int_or(
528
- "SCHEDULER_MAX_CONCURRENT",
529
- get_toml_int_or(toml, "scheduler", "max_concurrent", cfg.scheduler.max_concurrent));
530
-
531
- if (!config_path.empty() && std::filesystem::exists(config_path)) {
532
- cfg.api_keys = parse_api_keys_toml(config_path);
533
- }
534
-
535
- return cfg;
536
- }
537
-
538
// True when `pid` refers to a process we can signal: kill with signal 0 probes
// existence/permission without delivering anything. Non-positive pids are
// never considered alive.
static bool is_alive(pid_t pid) {
    return pid > 0 && kill(pid, 0) == 0;
}
542
-
543
// Gracefully stop a worker process: send SIGTERM, poll for exit for up to
// `wait_seconds`, and escalate to SIGKILL (followed by a blocking reap) if it
// does not exit in time. Safe to call with a non-positive pid (no-op).
static void shutdown_worker(pid_t pid, int wait_seconds = 15) {
    if (pid <= 0) return;
    kill(pid, SIGTERM);
    const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(wait_seconds);
    while (std::chrono::steady_clock::now() < deadline) {
        int status = 0;
        // WNOHANG: non-blocking reap so the loop can keep checking the deadline.
        pid_t r = waitpid(pid, &status, WNOHANG);
        if (r == pid) return;
        std::this_thread::sleep_for(std::chrono::milliseconds(200));
    }
    // Grace period expired: force-kill, then block until the zombie is reaped.
    kill(pid, SIGKILL);
    int status = 0;
    waitpid(pid, &status, 0);
}
557
-
558
// Owns the lifecycle of the single active llama-server worker process:
// spawning (fork/exec), HTTP readiness polling, hot model switching,
// restarting, and status reporting. Shared state (_active,
// _switch_in_progress, _next_port) is guarded by `_mu`; the slow operations
// (spawn, readiness wait, shutdown) run outside the lock.
class ModelManager {
public:
    explicit ModelManager(const ManagerConfig &config)
        : _default_model(config.worker.default_model),
          _llama_server_bin(config.worker.llama_server_bin),
          _worker_host(config.worker.host),
          _worker_bind_host(config.worker.bind_host),
          _base_port(config.worker.base_port),
          _switch_timeout_sec(config.worker.switch_timeout_sec),
          _n_ctx(config.llama.n_ctx),
          _n_threads(config.llama.threads),
          _n_gpu_layers(config.llama.ngl),
          _n_batch(config.llama.batch),
          _n_ubatch(config.llama.ubatch),
          _next_port(_base_port) {}

    // Start the configured default model; blocks until ready or timed out.
    bool initialize_default(std::string &error) {
        return switch_model(_default_model, error);
    }

    // Switch to `model`: start a new worker, wait for readiness, promote it,
    // then shut down the old worker. The old worker keeps serving while the
    // new one warms up. Succeeds immediately if `model` is already active and
    // alive; fails with "Switch already in progress" if another switch is
    // in flight (only one at a time).
    bool switch_model(const std::string &model, std::string &error) {
        {
            std::lock_guard<std::mutex> lock(_mu);
            if (_switch_in_progress) {
                error = "Switch already in progress";
                return false;
            }
            if (_active && _active->model == model && is_alive(_active->pid)) {
                return true;
            }
            _switch_in_progress = true;
        }

        std::optional<WorkerInfo> old_worker;
        {
            std::lock_guard<std::mutex> lock(_mu);
            if (_active) old_worker = _active;
        }

        int port = allocate_port();
        pid_t pid = spawn_worker(model, port);
        if (pid <= 0) {
            finish_switch(false);
            error = "Failed to start worker process";
            return false;
        }

        if (!wait_until_ready(pid, port, _switch_timeout_sec)) {
            shutdown_worker(pid);
            finish_switch(false);
            error = "New model did not become ready in time";
            return false;
        }

        WorkerInfo new_worker{model, port, pid, now_utc_iso()};
        {
            std::lock_guard<std::mutex> lock(_mu);
            _active = new_worker;
            _switch_in_progress = false;
        }

        // Only kill the old worker after the replacement has been promoted.
        if (old_worker && old_worker->pid != pid) {
            shutdown_worker(old_worker->pid);
        }
        return true;
    }

    // Restart the currently active model. Unlike switch_model(), the old
    // worker is stopped *before* the replacement starts (brief downtime),
    // which also aborts whatever the worker was processing. On failure the
    // active worker is cleared entirely.
    bool restart_active(std::string &error) {
        std::optional<WorkerInfo> old_worker;
        std::string model;
        {
            std::lock_guard<std::mutex> lock(_mu);
            if (_switch_in_progress) {
                error = "Switch already in progress";
                return false;
            }
            if (!_active || !is_alive(_active->pid)) {
                error = "No active model";
                return false;
            }
            _switch_in_progress = true;
            old_worker = _active;
            model = _active->model;
        }

        shutdown_worker(old_worker->pid);

        int port = allocate_port();
        pid_t pid = spawn_worker(model, port);
        if (pid <= 0) {
            std::lock_guard<std::mutex> lock(_mu);
            _active = std::nullopt;
            _switch_in_progress = false;
            error = "Failed to start worker process";
            return false;
        }

        if (!wait_until_ready(pid, port, _switch_timeout_sec)) {
            shutdown_worker(pid);
            std::lock_guard<std::mutex> lock(_mu);
            _active = std::nullopt;
            _switch_in_progress = false;
            error = "New model did not become ready in time";
            return false;
        }

        WorkerInfo new_worker{model, port, pid, now_utc_iso()};
        {
            std::lock_guard<std::mutex> lock(_mu);
            _active = new_worker;
            _switch_in_progress = false;
        }
        return true;
    }

    // Snapshot of the active worker, or nullopt when none is alive.
    std::optional<WorkerInfo> active_worker() {
        std::lock_guard<std::mutex> lock(_mu);
        if (_active && is_alive(_active->pid)) return _active;
        return std::nullopt;
    }

    // JSON status document served by the control-plane endpoints: current
    // model, pid/port, load time, and whether a switch is in flight.
    json models_view() {
        std::lock_guard<std::mutex> lock(_mu);
        json out;
        out["status"] = (_active && is_alive(_active->pid)) ? "ready" : "no_active_model";
        out["switch_in_progress"] = _switch_in_progress;
        if (_active && is_alive(_active->pid)) {
            out["current_model"] = _active->model;
            out["last_loaded"] = _active->last_loaded;
            out["active_pid"] = _active->pid;
            out["active_port"] = _active->port;
        } else {
            out["current_model"] = nullptr;
            out["last_loaded"] = nullptr;
            out["active_pid"] = nullptr;
            out["active_port"] = nullptr;
        }
        return out;
    }

private:
    std::mutex _mu;  // guards _active, _switch_in_progress, _next_port
    std::optional<WorkerInfo> _active;
    bool _switch_in_progress = false;

    std::string _default_model;
    std::string _llama_server_bin;
    std::string _worker_host;       // host used for readiness probes
    std::string _worker_bind_host;  // host llama-server binds to
    int _base_port;
    int _switch_timeout_sec;
    int _n_ctx;
    int _n_threads;
    int _n_gpu_layers;
    int _n_batch;
    int _n_ubatch;
    int _next_port;  // ports increase monotonically; never reused in-process

    // Hand out the next worker port. Each new worker gets a fresh port so the
    // old and new workers can coexist during a switch.
    int allocate_port() {
        std::lock_guard<std::mutex> lock(_mu);
        return _next_port++;
    }

    // Clear the in-progress flag on a failed switch. The success path clears
    // it inline while publishing the new worker, so `ok == true` is a no-op.
    void finish_switch(bool ok) {
        std::lock_guard<std::mutex> lock(_mu);
        if (!ok) _switch_in_progress = false;
    }

    // fork/exec a llama-server child for `model` on `port`. The child calls
    // setsid() to detach into its own session and _exit(127) if exec fails.
    pid_t spawn_worker(const std::string &model, int port) {
        pid_t pid = fork();
        if (pid < 0) return -1;
        if (pid == 0) {
            setsid();
            std::string port_s = std::to_string(port);
            std::string n_ctx_s = std::to_string(_n_ctx);
            std::string threads_s = std::to_string(_n_threads);
            std::string ngl_s = std::to_string(_n_gpu_layers);
            std::string batch_s = std::to_string(_n_batch);
            std::string ubatch_s = std::to_string(_n_ubatch);

            // -hf: model is pulled by its Hugging Face repo id.
            std::vector<std::string> args = {
                _llama_server_bin,
                "-hf", model,
                "--host", _worker_bind_host,
                "--port", port_s,
                "-c", n_ctx_s,
                "-t", threads_s,
                "-ngl", ngl_s,
                "--cont-batching",
                "-b", batch_s,
                "--ubatch-size", ubatch_s
            };

            std::vector<char *> argv;
            argv.reserve(args.size() + 1);
            for (auto &s : args) argv.push_back(const_cast<char *>(s.c_str()));
            argv.push_back(nullptr);
            execvp(argv[0], argv.data());
            _exit(127);
        }
        return pid;
    }

    // Poll GET / on the worker until it answers 200, the child dies, or the
    // timeout elapses. Connection errors during startup are expected and
    // swallowed.
    bool wait_until_ready(pid_t pid, int port, int timeout_sec) {
        const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(timeout_sec);
        while (std::chrono::steady_clock::now() < deadline) {
            if (!is_alive(pid)) return false;
            try {
                auto [status, _] = http_get(port, "/");
                if (status == 200) return true;
            } catch (...) {
            }
            std::this_thread::sleep_for(std::chrono::milliseconds(800));
        }
        return false;
    }

    // Minimal blocking HTTP/1.1 GET against the worker (Boost.Beast).
    // Returns {status, body}; throws on connect/read failure.
    std::pair<int, std::string> http_get(int port, const std::string &target) {
        asio::io_context ioc;
        asio::ip::tcp::resolver resolver(ioc);
        beast::tcp_stream stream(ioc);
        auto const results = resolver.resolve(_worker_host, std::to_string(port));
        stream.connect(results);

        http::request<http::string_body> req{http::verb::get, target, 11};
        req.set(http::field::host, _worker_host);
        req.set(http::field::user_agent, "llm-manager");
        http::write(stream, req);

        beast::flat_buffer buffer;
        http::response<http::string_body> res;
        http::read(stream, buffer, res);
        beast::error_code ec;
        // Best-effort shutdown; the error code is intentionally ignored.
        stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
        return {res.result_int(), res.body()};
    }
};
795
-
796
// Monotonic counter assigning a unique id to every incoming HTTP request.
static std::atomic<uint64_t> g_req_id{1};
797
-
798
// Per-API-key token-bucket rate limiting on two axes: requests per minute and
// estimated tokens per minute. A configured limit of <= 0 disables that axis;
// when both are disabled, allow() is a no-op that always admits.
class RateLimiterStore {
public:
    explicit RateLimiterStore(const RateLimitConfig &config)
        : _requests_per_minute(std::max(0, config.requests_per_minute)),
          _estimated_tokens_per_minute(std::max(0, config.estimated_tokens_per_minute)) {}

    // Check-and-consume: returns a default (allowing) decision, or a denial
    // when either bucket lacks capacity. Buckets are refilled lazily from
    // elapsed wall-clock time on every call; nothing is debited on denial.
    RateLimitDecision allow(const std::string &api_key_id, int estimated_tokens) {
        if (_requests_per_minute <= 0 && _estimated_tokens_per_minute <= 0) {
            return {};
        }

        std::lock_guard<std::mutex> lock(_mu);
        // operator[] creates a fresh (never-refilled) bucket on first use.
        auto &bucket = _buckets[api_key_id];
        const auto now = std::chrono::steady_clock::now();
        refill(bucket.request_tokens, bucket.last_request_refill, _requests_per_minute, now);
        refill(bucket.estimated_tokens, bucket.last_estimated_refill, _estimated_tokens_per_minute, now);

        if (_requests_per_minute > 0 && bucket.request_tokens < 1.0) {
            return {false, 1, "Rate limit exceeded: requests"};
        }
        if (_estimated_tokens_per_minute > 0 && bucket.estimated_tokens < estimated_tokens) {
            return {false, 1, "Rate limit exceeded: estimated tokens"};
        }

        // Both checks passed: debit each enabled bucket.
        if (_requests_per_minute > 0) bucket.request_tokens -= 1.0;
        if (_estimated_tokens_per_minute > 0) bucket.estimated_tokens -= estimated_tokens;
        return {};
    }

private:
    // One pair of buckets per API key id.
    struct Bucket {
        double request_tokens = 0.0;
        double estimated_tokens = 0.0;
        std::chrono::steady_clock::time_point last_request_refill{};
        std::chrono::steady_clock::time_point last_estimated_refill{};
    };

    std::mutex _mu;  // guards _buckets
    std::unordered_map<std::string, Bucket> _buckets;
    int _requests_per_minute;
    int _estimated_tokens_per_minute;

    // Continuous refill: a never-touched bucket (default time_point) starts
    // full; afterwards tokens accrue proportionally to elapsed milliseconds,
    // capped at the per-minute limit.
    static void refill(
        double &tokens,
        std::chrono::steady_clock::time_point &last_refill,
        int limit_per_minute,
        std::chrono::steady_clock::time_point now) {
        if (limit_per_minute <= 0) return;
        if (last_refill.time_since_epoch().count() == 0) {
            tokens = limit_per_minute;
            last_refill = now;
            return;
        }
        const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - last_refill).count();
        if (elapsed <= 0) return;
        const double refill_amount = (static_cast<double>(limit_per_minute) * elapsed) / 60000.0;
        tokens = std::min(static_cast<double>(limit_per_minute), tokens + refill_amount);
        last_refill = now;
    }
};
858
-
859
// Tracks every request by id so handlers can await completion and the cancel
// endpoints can find them. Completion is signalled through each context's own
// mutex/condition variable. NOTE(review): nothing in this class ever erases
// entries from _requests — confirm pruning happens elsewhere, otherwise the
// map grows for the lifetime of the process.
class RequestRegistry {
public:
    // Register a new context populated from the authenticated principal and
    // the token estimate; created/enqueue timestamps are set to "now".
    std::shared_ptr<RequestContext> create(
        const std::string &request_id,
        const ApiKeyRecord &principal,
        const TokenEstimate &estimate,
        const std::string &request_body) {
        auto ctx = std::make_shared<RequestContext>();
        ctx->request_id = request_id;
        ctx->api_key_id = principal.key_id;
        ctx->role = principal.role;
        ctx->priority = role_to_priority(principal.role);
        ctx->estimate = estimate;
        ctx->request_body = request_body;
        ctx->created_at = std::chrono::steady_clock::now();
        ctx->enqueue_time = ctx->created_at;

        std::lock_guard<std::mutex> lock(_mu);
        _requests[request_id] = ctx;
        return ctx;
    }

    // Look up a context by id; nullptr when unknown.
    std::shared_ptr<RequestContext> find(const std::string &request_id) const {
        std::lock_guard<std::mutex> lock(_mu);
        const auto it = _requests.find(request_id);
        if (it == _requests.end()) return nullptr;
        return it->second;
    }

    // Overwrite a request's state; no-op for unknown ids.
    void mark_state(const std::string &request_id, RequestState state) {
        auto ctx = find(request_id);
        if (ctx) ctx->state.store(state);
    }

    // Publish the final state/result under the context's own mutex and wake
    // every thread blocked on ctx->cv.
    void complete(const std::shared_ptr<RequestContext> &ctx, RequestState state, RequestResult result) {
        {
            std::lock_guard<std::mutex> lock(ctx->mu);
            ctx->state.store(state);
            ctx->result = std::move(result);
            ctx->completed = true;
        }
        ctx->cv.notify_all();
    }

    // Cancel one request. QUEUED requests are completed immediately with 499;
    // RUNNING requests are only flagged/marked — the scheduler observes the
    // cancelled flag and produces the final completion.
    std::shared_ptr<RequestContext> cancel_request(const std::string &request_id) {
        auto ctx = find(request_id);
        if (!ctx) return nullptr;

        ctx->cancelled.store(true);
        const auto state = ctx->state.load();
        if (state == RequestState::QUEUED) {
            complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
        } else if (state == RequestState::RUNNING) {
            ctx->state.store(RequestState::CANCELLED);
        }
        return ctx;
    }

    // Cancel everything (used by /stop) and return all contexts. The QUEUED
    // branch has the same effect as complete(), performed inline while
    // iterating under _mu.
    std::vector<std::shared_ptr<RequestContext>> cancel_all() {
        std::vector<std::shared_ptr<RequestContext>> out;
        std::lock_guard<std::mutex> lock(_mu);
        out.reserve(_requests.size());
        for (auto &[_, ctx] : _requests) {
            ctx->cancelled.store(true);
            const auto state = ctx->state.load();
            if (state == RequestState::QUEUED) {
                {
                    std::lock_guard<std::mutex> ctx_lock(ctx->mu);
                    ctx->state.store(RequestState::CANCELLED);
                    ctx->result = {499, R"({"error":"Request cancelled"})"};
                    ctx->completed = true;
                }
                ctx->cv.notify_all();
            } else if (state == RequestState::RUNNING) {
                ctx->state.store(RequestState::CANCELLED);
            }
            out.push_back(ctx);
        }
        return out;
    }

private:
    mutable std::mutex _mu;  // guards _requests
    std::unordered_map<std::string, std::shared_ptr<RequestContext>> _requests;
};
944
-
945
// Bounded two-level queue feeding the scheduler thread. Admission is capped
// both by request count and by the sum of estimated tokens. Admin requests
// are preferred but limited to `admin_quota` consecutive pops so user
// requests cannot starve; within the chosen level, the request with the
// smallest estimated token total is served first (shortest-job-first).
class PrioritySchedulerQueue {
public:
    explicit PrioritySchedulerQueue(const QueueConfig &config)
        : _max_size(config.max_size),
          _max_tokens(config.max_tokens),
          _admin_quota(std::max(1, config.admin_quota)),
          _retry_after_sec(std::max(1, config.retry_after_sec)) {}

    // Non-blocking admission; false when either capacity bound would be
    // exceeded (caller responds 503/429 with Retry-After).
    bool try_push(const std::shared_ptr<RequestContext> &ctx) {
        std::lock_guard<std::mutex> lock(_mu);
        if (_current_size >= _max_size) return false;
        if (_current_tokens + ctx->estimate.estimated_total_tokens > _max_tokens) return false;

        if (ctx->priority == Priority::ADMIN) {
            _admin_queue.push_back(ctx);
        } else {
            _user_queue.push_back(ctx);
        }
        ++_current_size;
        _current_tokens += ctx->estimate.estimated_total_tokens;
        _cv.notify_one();
        return true;
    }

    // Block until a request is available; returns nullptr once stop() has
    // been called.
    std::shared_ptr<RequestContext> pop_next() {
        std::unique_lock<std::mutex> lock(_mu);
        _cv.wait(lock, [&]() { return _stopped || _current_size > 0; });
        if (_stopped) return nullptr;

        // Level selection: admin while under its consecutive-pop quota (or
        // when there are no user requests); otherwise user, which resets the
        // admin streak.
        std::deque<std::shared_ptr<RequestContext>> *selected_queue = nullptr;
        if (!_admin_queue.empty() && (_admin_streak < _admin_quota || _user_queue.empty())) {
            selected_queue = &_admin_queue;
            ++_admin_streak;
        } else if (!_user_queue.empty()) {
            selected_queue = &_user_queue;
            _admin_streak = 0;
        } else if (!_admin_queue.empty()) {
            selected_queue = &_admin_queue;
            _admin_streak = 1;
        }

        if (!selected_queue || selected_queue->empty()) return nullptr;

        // Shortest-job-first within the selected level.
        auto best_it = std::min_element(
            selected_queue->begin(),
            selected_queue->end(),
            [](const auto &a, const auto &b) {
                return a->estimate.estimated_total_tokens < b->estimate.estimated_total_tokens;
            });
        auto ctx = *best_it;
        selected_queue->erase(best_it);
        --_current_size;
        _current_tokens -= ctx->estimate.estimated_total_tokens;
        return ctx;
    }

    // Wake all waiters and make every future pop_next() return nullptr.
    void stop() {
        std::lock_guard<std::mutex> lock(_mu);
        _stopped = true;
        _cv.notify_all();
    }

    // Retry-After value (seconds) advertised when admission fails.
    int retry_after_sec() const {
        return _retry_after_sec;
    }

private:
    mutable std::mutex _mu;
    std::condition_variable _cv;
    std::deque<std::shared_ptr<RequestContext>> _admin_queue;
    std::deque<std::shared_ptr<RequestContext>> _user_queue;
    size_t _max_size;
    size_t _current_size = 0;   // total entries across both deques
    int _max_tokens;
    int _current_tokens = 0;    // sum of estimated tokens currently queued
    int _admin_quota;
    int _admin_streak = 0;      // consecutive admin pops since the last user pop
    int _retry_after_sec;
    bool _stopped = false;
};
1025
-
1026
// Single-threaded executor: drains the priority queue and forwards each
// request to the active llama-server worker, publishing the outcome through
// the registry. The worker thread lives for the Scheduler's lifetime and is
// stopped/joined by the destructor.
class Scheduler {
public:
    Scheduler(ModelManager &manager, RequestRegistry &registry, const QueueConfig &queue_config)
        : _manager(manager), _registry(registry), _queue(queue_config) {
        _worker = std::thread([this]() { worker_loop(); });
    }

    // Stop the queue (unblocking pop_next) and join the worker thread.
    ~Scheduler() {
        _queue.stop();
        if (_worker.joinable()) _worker.join();
    }

    // Non-blocking admission; false when the queue is at capacity.
    bool try_enqueue(const std::shared_ptr<RequestContext> &ctx) {
        return _queue.try_push(ctx);
    }

    // Retry-After hint (seconds) for rejected requests.
    int retry_after_sec() const {
        return _queue.retry_after_sec();
    }

private:
    ModelManager &_manager;
    RequestRegistry &_registry;
    PrioritySchedulerQueue _queue;
    std::thread _worker;

    // Runs until pop_next() returns nullptr (queue stopped). Cancellation is
    // checked both before forwarding and after the upstream call returns, so
    // a request cancelled mid-flight is reported to the client as 499 even
    // though the worker already processed it.
    void worker_loop() {
        for (;;) {
            auto ctx = _queue.pop_next();
            if (!ctx) return;

            if (ctx->cancelled.load()) {
                _registry.complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
                continue;
            }

            ctx->state.store(RequestState::RUNNING);
            ctx->start_time = std::chrono::steady_clock::now();
            auto worker = _manager.active_worker();
            if (!worker) {
                _registry.complete(ctx, RequestState::FAILED, {503, R"({"error":"No active model"})"});
                continue;
            }

            try {
                auto [status, body] = forward_chat(*worker, ctx->request_body);
                if (ctx->cancelled.load()) {
                    _registry.complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
                    continue;
                }
                _registry.complete(ctx, RequestState::DONE, {status, body});
            } catch (const std::exception &e) {
                // I/O failure talking to the worker is surfaced as a 500.
                _registry.complete(
                    ctx,
                    RequestState::FAILED,
                    {500, json({{"error", e.what()}}).dump()});
            }
        }
    }
};
1086
-
1087
- class ApiKeyAuth {
1088
- public:
1089
- explicit ApiKeyAuth(const ManagerConfig &config)
1090
- : _header_name(config.auth.header), _scheme(config.auth.scheme) {
1091
- for (const auto &record : config.api_keys) {
1092
- _records_by_secret.emplace(record.secret, record);
1093
- }
1094
- }
1095
-
1096
- bool enabled() const {
1097
- return !_records_by_secret.empty();
1098
- }
1099
-
1100
- template <typename Body, typename Allocator>
1101
- std::optional<ApiKeyRecord> authenticate(
1102
- const http::request<Body, http::basic_fields<Allocator>> &req,
1103
- std::string &error) const {
1104
- if (!enabled()) {
1105
- error.clear();
1106
- return ApiKeyRecord{"anonymous", "", Role::ADMIN, true};
1107
- }
1108
-
1109
- const auto token = extract_bearer_token(req, error);
1110
- if (!token) return std::nullopt;
1111
-
1112
- const auto it = _records_by_secret.find(*token);
1113
- if (it == _records_by_secret.end()) {
1114
- error = "Invalid API key";
1115
- return std::nullopt;
1116
- }
1117
- if (!it->second.enabled) {
1118
- error = "API key disabled";
1119
- return std::nullopt;
1120
- }
1121
- error.clear();
1122
- return it->second;
1123
- }
1124
-
1125
- private:
1126
- std::string _header_name;
1127
- std::string _scheme;
1128
- std::unordered_map<std::string, ApiKeyRecord> _records_by_secret;
1129
-
1130
- template <typename Body, typename Allocator>
1131
- std::optional<std::string> extract_bearer_token(
1132
- const http::request<Body, http::basic_fields<Allocator>> &req,
1133
- std::string &error) const {
1134
- const auto header_it = req.find(_header_name);
1135
- if (header_it == req.end()) {
1136
- error = "Missing authorization header";
1137
- return std::nullopt;
1138
- }
1139
-
1140
- const std::string value = trim_copy(header_it->value().to_string());
1141
- const std::string prefix = _scheme + " ";
1142
- if (value.size() <= prefix.size() || value.rfind(prefix, 0) != 0) {
1143
- error = "Invalid authorization scheme";
1144
- return std::nullopt;
1145
- }
1146
-
1147
- std::string token = trim_copy(value.substr(prefix.size()));
1148
- if (token.empty()) {
1149
- error = "Missing API key";
1150
- return std::nullopt;
1151
- }
1152
- return token;
1153
- }
1154
- };
1155
-
1156
- static void log_line(const std::string &line) {
1157
- std::cout << "[" << now_utc_iso() << "] " << line << std::endl;
1158
- }
1159
-
1160
// Cap `body` at `max_len` bytes for logging, appending a marker whenever
// anything was actually cut off.
static std::string truncate_body(const std::string &body, size_t max_len = 2000) {
    return body.size() <= max_len
        ? body
        : body.substr(0, max_len) + "...[truncated]";
}
1164
-
1165
// Parse "/requests/<id>/cancel" and return <id>, or nullopt when the path
// does not match or the id would be empty. The id may itself contain '/'.
static std::optional<std::string> extract_cancel_request_id(const std::string &path) {
    const std::string prefix = "/requests/";
    const std::string suffix = "/cancel";
    const size_t overhead = prefix.size() + suffix.size();
    if (path.size() <= overhead) return std::nullopt;
    const bool has_prefix = path.compare(0, prefix.size(), prefix) == 0;
    const bool has_suffix = path.compare(path.size() - suffix.size(), suffix.size(), suffix) == 0;
    if (!has_prefix || !has_suffix) return std::nullopt;
    std::string request_id = path.substr(prefix.size(), path.size() - overhead);
    if (request_id.empty()) return std::nullopt;
    return request_id;
}
1175
-
1176
- static std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body) {
1177
- asio::io_context ioc;
1178
- asio::ip::tcp::resolver resolver(ioc);
1179
- beast::tcp_stream stream(ioc);
1180
- auto const results = resolver.resolve("127.0.0.1", std::to_string(worker.port));
1181
- stream.connect(results);
1182
-
1183
- http::request<http::string_body> req{http::verb::post, "/v1/chat/completions", 11};
1184
- req.set(http::field::host, "127.0.0.1");
1185
- req.set(http::field::content_type, "application/json");
1186
- req.set(http::field::user_agent, "llm-manager");
1187
- req.body() = body;
1188
- req.prepare_payload();
1189
- http::write(stream, req);
1190
-
1191
- beast::flat_buffer buffer;
1192
- http::response<http::string_body> res;
1193
- http::read(stream, buffer, res);
1194
- beast::error_code ec;
1195
- stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
1196
- return {res.result_int(), res.body()};
1197
- }
1198
-
1199
// Result of proxying a GET to the worker: status and body plus the upstream
// Content-Type / Content-Encoding headers so the response can be relayed to
// the client without re-encoding.
struct ProxiedGetResult {
    int status = 500;
    std::string body;
    std::string content_type = "text/plain; charset=utf-8";
    // Empty when the worker sent no Content-Encoding header.
    std::string content_encoding;
};
1205
-
1206
- static ProxiedGetResult forward_get_to_worker(const WorkerInfo &worker,
1207
- const std::string &target) {
1208
- asio::io_context ioc;
1209
- asio::ip::tcp::resolver resolver(ioc);
1210
- beast::tcp_stream stream(ioc);
1211
- auto const results = resolver.resolve("127.0.0.1", std::to_string(worker.port));
1212
- stream.connect(results);
1213
-
1214
- http::request<http::string_body> req{http::verb::get, target, 11};
1215
- req.set(http::field::host, "127.0.0.1");
1216
- req.set(http::field::user_agent, "llm-manager");
1217
- req.set(http::field::accept_encoding, "gzip, identity");
1218
- http::write(stream, req);
1219
-
1220
- beast::flat_buffer buffer;
1221
- http::response<http::string_body> res;
1222
- http::read(stream, buffer, res);
1223
- beast::error_code ec;
1224
- stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
1225
- ProxiedGetResult out;
1226
- out.status = res.result_int();
1227
- out.body = res.body();
1228
- if (res.base().find(http::field::content_type) != res.base().end()) {
1229
- out.content_type = res.base()[http::field::content_type].to_string();
1230
- }
1231
- if (res.base().find(http::field::content_encoding) != res.base().end()) {
1232
- out.content_encoding = res.base()[http::field::content_encoding].to_string();
1233
- }
1234
- return out;
1235
- }
1236
-
1237
- template <typename Body, typename Allocator>
1238
- http::response<http::string_body> handle_request(
1239
- ModelManager &manager,
1240
- const ManagerConfig &config,
1241
- const ApiKeyAuth &auth,
1242
- RateLimiterStore &rate_limiter,
1243
- RequestRegistry &registry,
1244
- Scheduler &scheduler,
1245
- http::request<Body, http::basic_fields<Allocator>> &&req) {
1246
- const auto start = std::chrono::steady_clock::now();
1247
- const auto req_id_num = g_req_id.fetch_add(1);
1248
- const std::string request_id = std::to_string(req_id_num);
1249
- const std::string target = req.target().to_string();
1250
- const std::string method = req.method_string().to_string();
1251
- const std::string path = target.substr(0, target.find('?'));
1252
- auto authenticated = std::optional<ApiKeyRecord>{};
1253
-
1254
- log_line("request_id=" + request_id + " method=" + method + " path=" + target);
1255
- if constexpr (std::is_same_v<Body, http::string_body>) {
1256
- if (!req.body().empty()) {
1257
- log_line("request_id=" + request_id + " body=" + truncate_body(req.body()));
1258
- }
1259
- }
1260
-
1261
- auto json_response = [&](http::status status, const json &obj) {
1262
- json payload = obj;
1263
- payload["request_id"] = request_id;
1264
- http::response<http::string_body> res{status, req.version()};
1265
- res.set(http::field::content_type, "application/json");
1266
- res.set(http::field::server, "llm-manager");
1267
- res.set("X-Request-Id", request_id);
1268
- res.keep_alive(req.keep_alive());
1269
- res.body() = payload.dump();
1270
- res.prepare_payload();
1271
- auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
1272
- std::chrono::steady_clock::now() - start).count();
1273
- log_line("request_id=" + request_id + " status=" + std::to_string(res.result_int()) +
1274
- " elapsed_ms=" + std::to_string(elapsed_ms));
1275
- return res;
1276
- };
1277
-
1278
- auto json_response_with_retry_after = [&](http::status status, const json &obj, int retry_after_sec) {
1279
- auto res = json_response(status, obj);
1280
- res.set(http::field::retry_after, std::to_string(std::max(1, retry_after_sec)));
1281
- return res;
1282
- };
1283
-
1284
- auto ensure_authenticated = [&](Role minimum_role) -> std::optional<http::response<http::string_body>> {
1285
- std::string auth_error;
1286
- authenticated = auth.authenticate(req, auth_error);
1287
- if (!authenticated) {
1288
- return json_response(http::status::unauthorized, {{"error", auth_error}});
1289
- }
1290
- if (minimum_role == Role::ADMIN && authenticated->role != Role::ADMIN) {
1291
- return json_response(http::status::forbidden, {{"error", "Admin role required"}});
1292
- }
1293
- log_line("request_id=" + request_id +
1294
- " api_key_id=" + authenticated->key_id +
1295
- " role=" + role_to_string(authenticated->role));
1296
- return std::nullopt;
1297
- };
1298
-
1299
- try {
1300
- if (path == "/health" && req.method() == http::verb::get) {
1301
- return json_response(http::status::ok, manager.models_view());
1302
- }
1303
-
1304
- if (path == "/models" && req.method() == http::verb::get) {
1305
- return json_response(http::status::ok, manager.models_view());
1306
- }
1307
-
1308
- if (path == "/switch-model" && req.method() == http::verb::post) {
1309
- if (auto auth_res = ensure_authenticated(Role::ADMIN)) return *auth_res;
1310
- std::string body(req.body().data(), req.body().size());
1311
- json j = json::parse(body, nullptr, false);
1312
- if (j.is_discarded()) {
1313
- return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
1314
- }
1315
- std::string model;
1316
- if (j.contains("model_name")) model = j["model_name"].get<std::string>();
1317
- if (j.contains("model")) model = j["model"].get<std::string>();
1318
- if (model.empty()) {
1319
- return json_response(http::status::bad_request, {{"error", "Expected 'model' or 'model_name'"}});
1320
- }
1321
-
1322
- std::string err;
1323
- bool ok = manager.switch_model(model, err);
1324
- if (!ok) {
1325
- auto status = (err == "Switch already in progress") ? http::status::conflict : http::status::internal_server_error;
1326
- return json_response(status, {{"status", "error"}, {"error", err}});
1327
- }
1328
- auto state = manager.models_view();
1329
- state["message"] = "Switched model successfully";
1330
- return json_response(http::status::ok, state);
1331
- }
1332
-
1333
- if (path == "/stop" && req.method() == http::verb::post) {
1334
- if (auto auth_res = ensure_authenticated(Role::ADMIN)) return *auth_res;
1335
- registry.cancel_all();
1336
- std::string err;
1337
- bool ok = manager.restart_active(err);
1338
- if (!ok) {
1339
- http::status status = http::status::internal_server_error;
1340
- if (err == "Switch already in progress") {
1341
- status = http::status::conflict;
1342
- } else if (err == "No active model") {
1343
- status = http::status::service_unavailable;
1344
- }
1345
- return json_response(status, {{"status", "error"}, {"error", err}});
1346
- }
1347
- auto state = manager.models_view();
1348
- state["message"] = "Stopped in-flight prompts and restarted model";
1349
- return json_response(http::status::ok, state);
1350
- }
1351
-
1352
- if (req.method() == http::verb::post) {
1353
- if (auto cancel_id = extract_cancel_request_id(path)) {
1354
- if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
1355
- auto ctx = registry.find(*cancel_id);
1356
- if (!ctx) {
1357
- return json_response(http::status::not_found, {{"error", "Unknown request id"}});
1358
- }
1359
- if (authenticated->role != Role::ADMIN && authenticated->key_id != ctx->api_key_id) {
1360
- return json_response(http::status::forbidden, {{"error", "Cannot cancel another API key request"}});
1361
- }
1362
-
1363
- const auto previous_state = ctx->state.load();
1364
- registry.cancel_request(*cancel_id);
1365
- std::string restart_error;
1366
- bool restarted = true;
1367
- if (previous_state == RequestState::RUNNING) {
1368
- restarted = manager.restart_active(restart_error);
1369
- }
1370
-
1371
- json payload = {
1372
- {"cancelled_request_id", *cancel_id},
1373
- {"state", state_to_string(ctx->state.load())}
1374
- };
1375
- if (!restarted) {
1376
- payload["restart_error"] = restart_error;
1377
- }
1378
- return json_response(http::status::ok, payload);
1379
- }
1380
- }
1381
-
1382
- if (path == "/v1/chat/completions" && req.method() == http::verb::post) {
1383
- if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
1384
- json payload = json::parse(req.body(), nullptr, false);
1385
- if (payload.is_discarded()) {
1386
- return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
1387
- }
1388
- std::string token_error;
1389
- auto estimate = estimate_chat_tokens(payload, config.limits, token_error);
1390
- if (!estimate) {
1391
- return json_response(http::status::bad_request, {{"error", token_error}});
1392
- }
1393
- log_line("request_id=" + request_id +
1394
- " prompt_tokens=" + std::to_string(estimate->prompt_tokens) +
1395
- " max_tokens=" + std::to_string(estimate->requested_max_tokens) +
1396
- " estimated_total_tokens=" + std::to_string(estimate->estimated_total_tokens));
1397
-
1398
- auto rate_limit_decision = rate_limiter.allow(
1399
- authenticated->key_id,
1400
- estimate->estimated_total_tokens);
1401
- if (!rate_limit_decision.allowed) {
1402
- return json_response_with_retry_after(
1403
- http::status::too_many_requests,
1404
- {{"error", rate_limit_decision.error}},
1405
- rate_limit_decision.retry_after_sec);
1406
- }
1407
-
1408
- auto ctx = registry.create(request_id, *authenticated, *estimate, req.body());
1409
- if (!scheduler.try_enqueue(ctx)) {
1410
- ctx->cancelled.store(true);
1411
- registry.complete(ctx, RequestState::CANCELLED, {503, R"({"error":"Queue full"})"});
1412
- return json_response_with_retry_after(
1413
- http::status::service_unavailable,
1414
- {{"error", "Queue full"}},
1415
- scheduler.retry_after_sec());
1416
- }
1417
-
1418
- std::unique_lock<std::mutex> lock(ctx->mu);
1419
- const bool finished = ctx->cv.wait_for(
1420
- lock,
1421
- std::chrono::seconds(std::max(1, config.limits.request_timeout_sec)),
1422
- [&]() { return ctx->completed; });
1423
- if (!finished) {
1424
- lock.unlock();
1425
- registry.cancel_request(request_id);
1426
- std::string restart_error;
1427
- bool restarted = true;
1428
- if (ctx->state.load() == RequestState::RUNNING) {
1429
- restarted = manager.restart_active(restart_error);
1430
- }
1431
- json timeout_payload = {
1432
- {"error", "Request timed out"},
1433
- {"state", state_to_string(ctx->state.load())}
1434
- };
1435
- if (!restarted) timeout_payload["restart_error"] = restart_error;
1436
- return json_response(http::status::gateway_timeout, timeout_payload);
1437
- }
1438
-
1439
- const auto final_state = ctx->state.load();
1440
- RequestResult result = ctx->result;
1441
- lock.unlock();
1442
-
1443
- if (final_state == RequestState::CANCELLED) {
1444
- return json_response(http::status::ok, {{"status", "cancelled"}});
1445
- }
1446
-
1447
- http::response<http::string_body> res{
1448
- static_cast<http::status>(result.status), req.version()};
1449
- res.set(http::field::content_type, result.content_type);
1450
- res.set(http::field::server, "llm-manager");
1451
- res.set("X-Request-Id", request_id);
1452
- res.keep_alive(req.keep_alive());
1453
- res.body() = result.body;
1454
- res.prepare_payload();
1455
- auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
1456
- std::chrono::steady_clock::now() - start).count();
1457
- log_line("request_id=" + request_id +
1458
- " final_state=" + state_to_string(final_state) +
1459
- " upstream_status=" + std::to_string(result.status) +
1460
- " elapsed_ms=" + std::to_string(elapsed_ms));
1461
- return res;
1462
- }
1463
-
1464
- // Proxy GET requests not handled by manager endpoints to active llama-server.
1465
- // This enables llama.cpp UI/static routes (including "/").
1466
- if (req.method() == http::verb::get) {
1467
- auto worker = manager.active_worker();
1468
- if (!worker) {
1469
- return json_response(http::status::service_unavailable, {{"error", "No active model"}});
1470
- }
1471
- auto upstream = forward_get_to_worker(*worker, target);
1472
- http::response<http::string_body> res{
1473
- static_cast<http::status>(upstream.status), req.version()};
1474
- res.set(http::field::content_type, upstream.content_type);
1475
- if (!upstream.content_encoding.empty()) {
1476
- res.set(http::field::content_encoding, upstream.content_encoding);
1477
- }
1478
- res.set(http::field::server, "llm-manager");
1479
- res.set("X-Request-Id", request_id);
1480
- res.keep_alive(req.keep_alive());
1481
- res.body() = upstream.body;
1482
- res.prepare_payload();
1483
- auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
1484
- std::chrono::steady_clock::now() - start)
1485
- .count();
1486
- log_line("request_id=" + request_id +
1487
- " proxied_get model=" + worker->model +
1488
- " upstream_status=" + std::to_string(upstream.status) +
1489
- " elapsed_ms=" + std::to_string(elapsed_ms));
1490
- return res;
1491
- }
1492
-
1493
- return json_response(http::status::not_found, {{"error", "Not found"}});
1494
- } catch (const std::exception &e) {
1495
- return json_response(http::status::internal_server_error, {{"error", e.what()}});
1496
- }
1497
- }
1498
-
1499
- void do_session(
1500
- asio::ip::tcp::socket socket,
1501
- ModelManager &manager,
1502
- const ManagerConfig &config,
1503
- const ApiKeyAuth &auth,
1504
- RateLimiterStore &rate_limiter,
1505
- RequestRegistry &registry,
1506
- Scheduler &scheduler) {
1507
- try {
1508
- beast::flat_buffer buffer;
1509
- http::request<http::string_body> req;
1510
- http::read(socket, buffer, req);
1511
- auto res = handle_request(manager, config, auth, rate_limiter, registry, scheduler, std::move(req));
1512
- http::write(socket, res);
1513
- beast::error_code ec;
1514
- socket.shutdown(asio::ip::tcp::socket::shutdown_send, ec);
1515
- } catch (...) {
1516
- }
1517
- }
1518
 
1519
  int main() {
1520
  const ManagerConfig config = load_manager_config();
1521
  const auto &bind_host = config.server.host;
1522
  const int bind_port = config.server.port;
 
1523
  ModelManager manager(config);
1524
  ApiKeyAuth auth(config);
1525
  RateLimiterStore rate_limiter(config.rate_limit);
1526
  RequestRegistry registry;
1527
- Scheduler scheduler(manager, registry, config.queue);
 
1528
 
1529
  std::string init_error;
1530
  if (auth.enabled()) {
@@ -1532,6 +31,7 @@ int main() {
1532
  } else {
1533
  log_line("auth: disabled (no configured api keys)");
1534
  }
 
1535
  log_line("startup: loading default model");
1536
  if (!manager.initialize_default(init_error)) {
1537
  log_line("startup: default model failed: " + init_error);
@@ -1540,7 +40,9 @@ int main() {
1540
  }
1541
 
1542
  asio::io_context ioc{1};
1543
- asio::ip::tcp::acceptor acceptor{ioc, {asio::ip::make_address(bind_host), static_cast<unsigned short>(bind_port)}};
 
 
1544
  log_line("manager listening on " + bind_host + ":" + std::to_string(bind_port));
1545
 
1546
  for (;;) {
@@ -1554,6 +56,7 @@ int main() {
1554
  std::cref(auth),
1555
  std::ref(rate_limiter),
1556
  std::ref(registry),
 
1557
  std::ref(scheduler)).detach();
1558
  }
1559
  }
 
1
  #include <boost/asio.hpp>
 
 
 
 
2
 
3
+ #include <functional>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  #include <string>
5
  #include <thread>
6
+ #include <utility>
 
7
 
8
+ #include "config.h"
9
+ #include "http_helpers.h"
10
+ #include "model_manager.h"
11
+ #include "runtime_components.h"
12
+ #include "server.h"
13
 
14
  namespace asio = boost::asio;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  int main() {
17
  const ManagerConfig config = load_manager_config();
18
  const auto &bind_host = config.server.host;
19
  const int bind_port = config.server.port;
20
+
21
  ModelManager manager(config);
22
  ApiKeyAuth auth(config);
23
  RateLimiterStore rate_limiter(config.rate_limit);
24
  RequestRegistry registry;
25
+ MetricsRegistry metrics;
26
+ Scheduler scheduler(manager, registry, metrics, config.queue);
27
 
28
  std::string init_error;
29
  if (auth.enabled()) {
 
31
  } else {
32
  log_line("auth: disabled (no configured api keys)");
33
  }
34
+
35
  log_line("startup: loading default model");
36
  if (!manager.initialize_default(init_error)) {
37
  log_line("startup: default model failed: " + init_error);
 
40
  }
41
 
42
  asio::io_context ioc{1};
43
+ asio::ip::tcp::acceptor acceptor{
44
+ ioc,
45
+ {asio::ip::make_address(bind_host), static_cast<unsigned short>(bind_port)}};
46
  log_line("manager listening on " + bind_host + ":" + std::to_string(bind_port));
47
 
48
  for (;;) {
 
56
  std::cref(auth),
57
  std::ref(rate_limiter),
58
  std::ref(registry),
59
+ std::ref(metrics),
60
  std::ref(scheduler)).detach();
61
  }
62
  }
cpp/llm_manager_types.cpp ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "llm_manager_types.h"
2
+
3
+ std::string role_to_string(Role role) {
4
+ return role == Role::ADMIN ? "admin" : "user";
5
+ }
6
+
7
+ Priority role_to_priority(Role role) {
8
+ return role == Role::ADMIN ? Priority::ADMIN : Priority::USER;
9
+ }
10
+
11
+ std::string state_to_string(RequestState state) {
12
+ switch (state) {
13
+ case RequestState::QUEUED: return "queued";
14
+ case RequestState::RUNNING: return "running";
15
+ case RequestState::CANCELLED: return "cancelled";
16
+ case RequestState::FAILED: return "failed";
17
+ case RequestState::DONE: return "done";
18
+ }
19
+ return "unknown";
20
+ }
cpp/llm_manager_types.h ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include <nlohmann/json.hpp>

#include <sys/types.h>

using json = nlohmann::json;

// Listen address/port for the manager's public HTTP endpoint.
struct ServerConfig {
  std::string host = "0.0.0.0";
  int port = 7860;
};

// How llama-server worker processes are launched and supervised.
struct WorkerConfig {
  std::string default_model = "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m";  // loaded at startup
  std::string llama_server_bin = "/usr/local/bin/llama-server";
  std::string host = "127.0.0.1";      // address the manager dials for health checks / proxying
  std::string bind_host = "0.0.0.0";   // address the worker binds (--host)
  int base_port = 8080;                // first worker port; later workers count upward
  int switch_timeout_sec = 300;        // how long to wait for a new worker to become ready
};

// llama.cpp runtime knobs forwarded to llama-server.
struct LlamaConfig {
  int n_ctx = 8192;   // context window (-c)
  int threads = 4;    // CPU threads (-t)
  int ngl = 0;        // GPU layers (-ngl)
  int batch = 128;    // batch size (-b)
  int ubatch = 64;    // micro-batch size (--ubatch-size)
};

// Authorization tiers for API keys.
enum class Role {
  ADMIN,
  USER
};

// Where/how the API key is presented on incoming requests.
struct AuthConfig {
  std::string header = "Authorization";
  std::string scheme = "Bearer";
};

// Per-request token and timeout limits.
struct LimitsConfig {
  int default_max_tokens = 256;       // used when a payload omits max_tokens
  int max_tokens_per_request = 2048;  // cap on prompt + completion estimate
  int request_timeout_sec = 30;       // wall-clock wait before a request is abandoned
};

// Bounds on the shared scheduling queue.
struct QueueConfig {
  size_t max_size = 100;
  int max_tokens = 20000;
  int admin_quota = 3;
  int retry_after_sec = 5;   // Retry-After hint when the queue rejects work
};

// Per-API-key rate limits; values of 0 disable the corresponding limit.
struct RateLimitConfig {
  int requests_per_minute = 60;
  int estimated_tokens_per_minute = 6000;
};

struct SchedulerConfig {
  int max_concurrent = 1;   // requests executed against the worker at once
};

struct StreamingConfig {
  bool enabled = false;
};

// One configured API key record.
struct ApiKeyRecord {
  std::string key_id;
  std::string secret;
  Role role = Role::USER;
  bool enabled = true;
};

// Aggregate of every configuration section.
struct ManagerConfig {
  ServerConfig server;
  WorkerConfig worker;
  LlamaConfig llama;
  AuthConfig auth;
  LimitsConfig limits;
  QueueConfig queue;
  RateLimitConfig rate_limit;
  SchedulerConfig scheduler;
  StreamingConfig streaming;
  std::vector<ApiKeyRecord> api_keys;
};

// A running llama-server child process.
struct WorkerInfo {
  std::string model;
  int port = 0;
  pid_t pid = -1;
  std::string last_loaded;   // timestamp string set when the worker became active
};

// Queue priority buckets; lower value is served first.
enum class Priority {
  ADMIN = 0,
  USER = 1
};

// Lifecycle of a proxied inference request.
enum class RequestState {
  QUEUED,
  RUNNING,
  CANCELLED,
  FAILED,
  DONE
};

// Rough token accounting used for limit checks and rate limiting.
struct TokenEstimate {
  int prompt_tokens = 0;
  int requested_max_tokens = 0;
  int estimated_total_tokens = 0;
};

// Outcome of a rate-limiter check.
struct RateLimitDecision {
  bool allowed = true;
  int retry_after_sec = 0;
  std::string error;   // populated only when !allowed
};

// Point-in-time view of the scheduler queue.
struct QueueSnapshot {
  size_t total_size = 0;
  size_t admin_size = 0;
  size_t user_size = 0;
  int total_tokens = 0;
};

// Upstream response captured for a completed request.
struct RequestResult {
  int status = 500;
  std::string body;
  std::string content_type = "application/json";
};

// Shared state for one in-flight request. The HTTP handler waits on `cv`
// until the worker side sets `completed` and fills `result` (both guarded
// by `mu`).
struct RequestContext {
  std::string request_id;
  std::string api_key_id;
  Role role = Role::USER;
  Priority priority = Priority::USER;
  TokenEstimate estimate;
  std::string request_body;
  std::atomic<RequestState> state{RequestState::QUEUED};
  std::atomic<bool> cancelled{false};
  std::chrono::steady_clock::time_point created_at{std::chrono::steady_clock::now()};
  std::chrono::steady_clock::time_point enqueue_time{created_at};
  std::chrono::steady_clock::time_point start_time{};
  std::mutex mu;                // guards completed/result together with cv
  std::condition_variable cv;
  bool completed = false;
  RequestResult result;
};

// Response captured when proxying a GET to the worker (UI/static routes).
struct ProxiedGetResult {
  int status = 500;
  std::string body;
  std::string content_type = "text/plain; charset=utf-8";
  std::string content_encoding;   // forwarded when the worker compressed the body
};

std::string role_to_string(Role role);
Priority role_to_priority(Role role);
std::string state_to_string(RequestState state);
cpp/model_manager.cpp ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "model_manager.h"
2
+
3
+ #include <boost/asio.hpp>
4
+ #include <boost/beast/core.hpp>
5
+ #include <boost/beast/http.hpp>
6
+
7
+ #include "http_helpers.h"
8
+
9
+ #include <csignal>
10
+ #include <thread>
11
+ #include <unistd.h>
12
+ #include <sys/wait.h>
13
+
14
+ namespace asio = boost::asio;
15
+ namespace beast = boost::beast;
16
+ namespace http = beast::http;
17
+
18
// True when `pid` refers to a live process we may signal. Signal 0 probes
// for existence without delivering anything.
bool is_alive(pid_t pid) {
  if (pid > 0 && kill(pid, 0) == 0) {
    return true;
  }
  return false;
}
22
+
23
// Terminate a worker process: send SIGTERM, poll for exit for up to
// `wait_seconds`, then escalate to SIGKILL. The child is always reaped so
// no zombie is left behind.
void shutdown_worker(pid_t pid, int wait_seconds) {
  if (pid <= 0) {
    return;
  }
  kill(pid, SIGTERM);
  const auto give_up_at =
      std::chrono::steady_clock::now() + std::chrono::seconds(wait_seconds);
  while (std::chrono::steady_clock::now() < give_up_at) {
    int wait_status = 0;
    if (waitpid(pid, &wait_status, WNOHANG) == pid) {
      return;  // child exited and has been reaped
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(200));
  }
  // Still running after the grace period: force-kill and reap.
  kill(pid, SIGKILL);
  int wait_status = 0;
  waitpid(pid, &wait_status, 0);
}
37
+
38
// Copy worker/llama settings out of the immutable config. No process is
// spawned here (see initialize_default); port allocation starts at
// worker.base_port and counts upward.
ModelManager::ModelManager(const ManagerConfig &config)
    : default_model_(config.worker.default_model),
      llama_server_bin_(config.worker.llama_server_bin),
      worker_host_(config.worker.host),
      worker_bind_host_(config.worker.bind_host),
      base_port_(config.worker.base_port),
      switch_timeout_sec_(config.worker.switch_timeout_sec),
      n_ctx_(config.llama.n_ctx),
      n_threads_(config.llama.threads),
      n_gpu_layers_(config.llama.ngl),
      n_batch_(config.llama.batch),
      n_ubatch_(config.llama.ubatch),
      next_port_(base_port_) {}
51
+
52
// Spawn the configured default model at startup (blocking until ready).
// Returns false and fills `error` when the worker never became ready.
bool ModelManager::initialize_default(std::string &error) {
  return switch_model(default_model_, error);
}
55
+
56
+ bool ModelManager::switch_model(const std::string &model, std::string &error) {
57
+ {
58
+ std::lock_guard<std::mutex> lock(mu_);
59
+ if (switch_in_progress_) {
60
+ error = "Switch already in progress";
61
+ return false;
62
+ }
63
+ if (active_ && active_->model == model && is_alive(active_->pid)) {
64
+ return true;
65
+ }
66
+ switch_in_progress_ = true;
67
+ }
68
+
69
+ std::optional<WorkerInfo> old_worker;
70
+ {
71
+ std::lock_guard<std::mutex> lock(mu_);
72
+ if (active_) old_worker = active_;
73
+ }
74
+
75
+ int port = allocate_port();
76
+ pid_t pid = spawn_worker(model, port);
77
+ if (pid <= 0) {
78
+ finish_switch(false);
79
+ error = "Failed to start worker process";
80
+ return false;
81
+ }
82
+
83
+ if (!wait_until_ready(pid, port, switch_timeout_sec_)) {
84
+ shutdown_worker(pid);
85
+ finish_switch(false);
86
+ error = "New model did not become ready in time";
87
+ return false;
88
+ }
89
+
90
+ WorkerInfo new_worker{model, port, pid, now_utc_iso()};
91
+ {
92
+ std::lock_guard<std::mutex> lock(mu_);
93
+ active_ = new_worker;
94
+ switch_in_progress_ = false;
95
+ }
96
+
97
+ if (old_worker && old_worker->pid != pid) {
98
+ shutdown_worker(old_worker->pid);
99
+ }
100
+ return true;
101
+ }
102
+
103
// Kill the active worker and start a fresh one serving the same model.
// Used after cancellations/timeouts to drop any in-flight generation.
// Unlike switch_model, the old worker is stopped BEFORE the new one starts;
// on failure no active worker remains (active_ is cleared).
bool ModelManager::restart_active(std::string &error) {
  std::optional<WorkerInfo> old_worker;
  std::string model;
  {
    std::lock_guard<std::mutex> lock(mu_);
    if (switch_in_progress_) {
      error = "Switch already in progress";
      return false;
    }
    if (!active_ || !is_alive(active_->pid)) {
      error = "No active model";
      return false;
    }
    switch_in_progress_ = true;  // serializes against concurrent switch/restart
    old_worker = active_;
    model = active_->model;
  }

  // Stop the old worker first — only one worker runs at a time on restart.
  shutdown_worker(old_worker->pid);

  int port = allocate_port();
  pid_t pid = spawn_worker(model, port);
  if (pid <= 0) {
    std::lock_guard<std::mutex> lock(mu_);
    active_ = std::nullopt;      // old worker is already dead at this point
    switch_in_progress_ = false;
    error = "Failed to start worker process";
    return false;
  }

  if (!wait_until_ready(pid, port, switch_timeout_sec_)) {
    shutdown_worker(pid);
    std::lock_guard<std::mutex> lock(mu_);
    active_ = std::nullopt;
    switch_in_progress_ = false;
    error = "New model did not become ready in time";
    return false;
  }

  WorkerInfo new_worker{model, port, pid, now_utc_iso()};
  {
    std::lock_guard<std::mutex> lock(mu_);
    active_ = new_worker;
    switch_in_progress_ = false;
  }
  return true;
}
150
+
151
+ std::optional<WorkerInfo> ModelManager::active_worker() {
152
+ std::lock_guard<std::mutex> lock(mu_);
153
+ if (active_ && is_alive(active_->pid)) return active_;
154
+ return std::nullopt;
155
+ }
156
+
157
+ json ModelManager::models_view() {
158
+ std::lock_guard<std::mutex> lock(mu_);
159
+ json out;
160
+ out["status"] = (active_ && is_alive(active_->pid)) ? "ready" : "no_active_model";
161
+ out["switch_in_progress"] = switch_in_progress_;
162
+ if (active_ && is_alive(active_->pid)) {
163
+ out["current_model"] = active_->model;
164
+ out["last_loaded"] = active_->last_loaded;
165
+ out["active_pid"] = active_->pid;
166
+ out["active_port"] = active_->port;
167
+ } else {
168
+ out["current_model"] = nullptr;
169
+ out["last_loaded"] = nullptr;
170
+ out["active_pid"] = nullptr;
171
+ out["active_port"] = nullptr;
172
+ }
173
+ return out;
174
+ }
175
+
176
// Hand out the next worker port. Monotonically increasing from base_port_;
// ports are never reused within a process lifetime.
int ModelManager::allocate_port() {
  std::lock_guard<std::mutex> lock(mu_);
  return next_port_++;
}
180
+
181
+ void ModelManager::finish_switch(bool ok) {
182
+ std::lock_guard<std::mutex> lock(mu_);
183
+ if (!ok) switch_in_progress_ = false;
184
+ }
185
+
186
// Fork/exec a llama-server child serving `model` on `port`.
// Returns the child's pid, or -1 if fork failed. The child calls setsid()
// to detach into its own session before exec'ing.
pid_t ModelManager::spawn_worker(const std::string &model, int port) {
  pid_t pid = fork();
  if (pid < 0) return -1;
  if (pid == 0) {
    // Child process: build argv for llama-server and replace the image.
    setsid();
    std::string port_s = std::to_string(port);
    std::string n_ctx_s = std::to_string(n_ctx_);
    std::string threads_s = std::to_string(n_threads_);
    std::string ngl_s = std::to_string(n_gpu_layers_);
    std::string batch_s = std::to_string(n_batch_);
    std::string ubatch_s = std::to_string(n_ubatch_);

    std::vector<std::string> args = {
        llama_server_bin_,
        "-hf", model,
        "--host", worker_bind_host_,
        "--port", port_s,
        "-c", n_ctx_s,
        "-t", threads_s,
        "-ngl", ngl_s,
        "--cont-batching",
        "-b", batch_s,
        "--ubatch-size", ubatch_s
    };

    std::vector<char *> argv;
    argv.reserve(args.size() + 1);
    for (auto &s : args) argv.push_back(const_cast<char *>(s.c_str()));
    argv.push_back(nullptr);  // execvp requires a null-terminated vector
    execvp(argv[0], argv.data());
    _exit(127);  // exec failed; 127 mirrors the shell's "command not found"
  }
  return pid;
}
220
+
221
+ bool ModelManager::wait_until_ready(pid_t pid, int port, int timeout_sec) {
222
+ const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(timeout_sec);
223
+ while (std::chrono::steady_clock::now() < deadline) {
224
+ if (!is_alive(pid)) return false;
225
+ try {
226
+ auto [status, _] = http_get(port, "/");
227
+ if (status == 200) return true;
228
+ } catch (...) {
229
+ }
230
+ std::this_thread::sleep_for(std::chrono::milliseconds(800));
231
+ }
232
+ return false;
233
+ }
234
+
235
// Blocking HTTP/1.1 GET against the local worker at `port`.
// Returns {status_code, body}; throws on resolve/connect/IO errors (callers
// such as wait_until_ready catch and retry).
std::pair<int, std::string> ModelManager::http_get(int port, const std::string &target) {
  asio::io_context ioc;
  asio::ip::tcp::resolver resolver(ioc);
  beast::tcp_stream stream(ioc);
  auto const results = resolver.resolve(worker_host_, std::to_string(port));
  stream.connect(results);

  http::request<http::string_body> req{http::verb::get, target, 11};
  req.set(http::field::host, worker_host_);
  req.set(http::field::user_agent, "llm-manager");
  http::write(stream, req);

  beast::flat_buffer buffer;
  http::response<http::string_body> res;
  http::read(stream, buffer, res);
  beast::error_code ec;  // shutdown errors are ignored — response already read
  stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
  return {res.result_int(), res.body()};
}
cpp/model_manager.h ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once

#include <mutex>
#include <optional>
#include <string>
#include <utility>

#include "llm_manager_types.h"

// Supervises the single active llama-server worker process: spawning,
// readiness polling, model switching, restarts, and status reporting.
// Public methods are safe to call from multiple threads.
class ModelManager {
public:
  explicit ModelManager(const ManagerConfig &config);

  // Load the configured default model (blocking). False + `error` on failure.
  bool initialize_default(std::string &error);
  // Replace the active worker with one serving `model`; no-op if it already is.
  bool switch_model(const std::string &model, std::string &error);
  // Kill and respawn the current worker with the same model.
  bool restart_active(std::string &error);
  // Snapshot of the live worker, or nullopt when none is running.
  std::optional<WorkerInfo> active_worker();
  // JSON status document for /health and /models.
  json models_view();

private:
  std::mutex mu_;                    // guards active_/switch_in_progress_/next_port_
  std::optional<WorkerInfo> active_;
  bool switch_in_progress_ = false;  // serializes switch/restart operations

  // Immutable settings copied from ManagerConfig at construction.
  std::string default_model_;
  std::string llama_server_bin_;
  std::string worker_host_;
  std::string worker_bind_host_;
  int base_port_;
  int switch_timeout_sec_;
  int n_ctx_;
  int n_threads_;
  int n_gpu_layers_;
  int n_batch_;
  int n_ubatch_;
  int next_port_;                    // next port handed to a spawned worker

  int allocate_port();
  void finish_switch(bool ok);       // clears switch_in_progress_ on failure paths
  pid_t spawn_worker(const std::string &model, int port);
  bool wait_until_ready(pid_t pid, int port, int timeout_sec);
  std::pair<int, std::string> http_get(int port, const std::string &target);
};

// True when `pid` is a live process we can signal.
bool is_alive(pid_t pid);
// SIGTERM, then after `wait_seconds` SIGKILL, a worker; always reaps the child.
void shutdown_worker(pid_t pid, int wait_seconds = 15);
cpp/request_parsing.cpp ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "request_parsing.h"
2
+
3
+ #include <algorithm>
4
+ #include <sstream>
5
+
6
// Crude token estimate: roughly 4 characters per token, rounded up, with a
// minimum of 1 token for any non-empty text.
static int estimate_text_tokens_rough(const std::string &text) {
  if (text.empty()) {
    return 0;
  }
  const int approx = static_cast<int>((text.size() + 3) / 4);
  return approx < 1 ? 1 : approx;
}
10
+
11
+ static std::string flatten_json_content(const json &content) {
12
+ if (content.is_string()) {
13
+ return content.get<std::string>();
14
+ }
15
+ if (content.is_array()) {
16
+ std::ostringstream oss;
17
+ bool first = true;
18
+ for (const auto &item : content) {
19
+ std::string part;
20
+ if (item.is_string()) {
21
+ part = item.get<std::string>();
22
+ } else if (item.is_object() && item.contains("text") && item["text"].is_string()) {
23
+ part = item["text"].get<std::string>();
24
+ }
25
+ if (part.empty()) continue;
26
+ if (!first) oss << '\n';
27
+ oss << part;
28
+ first = false;
29
+ }
30
+ return oss.str();
31
+ }
32
+ return "";
33
+ }
34
+
35
// Estimate the token budget of a chat-completions payload without running a
// real tokenizer (uses the ~4-chars-per-token heuristic in
// estimate_text_tokens_rough). Validates the payload shape ('messages'
// array; optional integer 'max_tokens' > 0) and the configured per-request
// cap. On any failure sets `error` and returns nullopt; on success `error`
// is cleared.
std::optional<TokenEstimate> estimate_chat_tokens(
    const json &payload,
    const LimitsConfig &limits,
    std::string &error) {
  if (!payload.is_object()) {
    error = "Expected JSON object";
    return std::nullopt;
  }
  if (!payload.contains("messages") || !payload["messages"].is_array()) {
    error = "Expected 'messages' array";
    return std::nullopt;
  }

  TokenEstimate estimate;
  estimate.requested_max_tokens = limits.default_max_tokens;  // payload may override below
  if (payload.contains("max_tokens")) {
    if (!payload["max_tokens"].is_number_integer()) {
      error = "Expected integer 'max_tokens'";
      return std::nullopt;
    }
    estimate.requested_max_tokens = payload["max_tokens"].get<int>();
  }

  if (estimate.requested_max_tokens <= 0) {
    error = "'max_tokens' must be > 0";
    return std::nullopt;
  }

  // Sum rough token counts for each message's role and content.
  for (const auto &message : payload["messages"]) {
    if (!message.is_object()) continue;
    if (message.contains("role") && message["role"].is_string()) {
      estimate.prompt_tokens += estimate_text_tokens_rough(message["role"].get<std::string>());
    }
    if (message.contains("content")) {
      estimate.prompt_tokens += estimate_text_tokens_rough(flatten_json_content(message["content"]));
    }
    estimate.prompt_tokens += 4;  // per-message formatting overhead
  }

  estimate.estimated_total_tokens = estimate.prompt_tokens + estimate.requested_max_tokens;
  if (estimate.estimated_total_tokens > limits.max_tokens_per_request) {
    error = "Estimated request tokens exceed configured limit";
    return std::nullopt;
  }

  error.clear();
  return estimate;
}
83
+
84
+ bool request_stream_enabled(const json &payload) {
85
+ if (!payload.is_object() || !payload.contains("stream")) return false;
86
+ return payload["stream"].is_boolean() && payload["stream"].get<bool>();
87
+ }
cpp/request_parsing.h ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once

#include <optional>
#include <string>

#include "llm_manager_types.h"

// Estimate prompt/completion tokens for an OpenAI-style chat payload and
// validate them against `limits`. Returns nullopt and sets `error` when the
// payload is malformed or exceeds the configured budget.
std::optional<TokenEstimate> estimate_chat_tokens(
    const json &payload,
    const LimitsConfig &limits,
    std::string &error);

// True only when the payload explicitly sets "stream": true.
bool request_stream_enabled(const json &payload);
cpp/runtime_components.cpp ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include "runtime_components.h"

#include "config.h"
#include "http_helpers.h"
#include "model_manager.h"

#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstddef>
#include <sstream>
9
+
10
+ RateLimiterStore::RateLimiterStore(const RateLimitConfig &config)
11
+ : requests_per_minute_(std::max(0, config.requests_per_minute)),
12
+ estimated_tokens_per_minute_(std::max(0, config.estimated_tokens_per_minute)) {}
13
+
14
+ RateLimitDecision RateLimiterStore::allow(const std::string &api_key_id, int estimated_tokens) {
15
+ if (requests_per_minute_ <= 0 && estimated_tokens_per_minute_ <= 0) return {};
16
+
17
+ std::lock_guard<std::mutex> lock(mu_);
18
+ auto &bucket = buckets_[api_key_id];
19
+ const auto now = std::chrono::steady_clock::now();
20
+ refill(bucket.request_tokens, bucket.last_request_refill, requests_per_minute_, now);
21
+ refill(bucket.estimated_tokens, bucket.last_estimated_refill, estimated_tokens_per_minute_, now);
22
+
23
+ if (requests_per_minute_ > 0 && bucket.request_tokens < 1.0) {
24
+ return {false, 1, "Rate limit exceeded: requests"};
25
+ }
26
+ if (estimated_tokens_per_minute_ > 0 && bucket.estimated_tokens < estimated_tokens) {
27
+ return {false, 1, "Rate limit exceeded: estimated tokens"};
28
+ }
29
+
30
+ if (requests_per_minute_ > 0) bucket.request_tokens -= 1.0;
31
+ if (estimated_tokens_per_minute_ > 0) bucket.estimated_tokens -= estimated_tokens;
32
+ return {};
33
+ }
34
+
35
+ void RateLimiterStore::refill(
36
+ double &tokens,
37
+ std::chrono::steady_clock::time_point &last_refill,
38
+ int limit_per_minute,
39
+ std::chrono::steady_clock::time_point now) {
40
+ if (limit_per_minute <= 0) return;
41
+ if (last_refill.time_since_epoch().count() == 0) {
42
+ tokens = limit_per_minute;
43
+ last_refill = now;
44
+ return;
45
+ }
46
+ const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - last_refill).count();
47
+ if (elapsed <= 0) return;
48
+ const double refill_amount = (static_cast<double>(limit_per_minute) * elapsed) / 60000.0;
49
+ tokens = std::min(static_cast<double>(limit_per_minute), tokens + refill_amount);
50
+ last_refill = now;
51
+ }
52
+
53
+ std::shared_ptr<RequestContext> RequestRegistry::create(
54
+ const std::string &request_id,
55
+ const ApiKeyRecord &principal,
56
+ const TokenEstimate &estimate,
57
+ const std::string &request_body) {
58
+ auto ctx = std::make_shared<RequestContext>();
59
+ ctx->request_id = request_id;
60
+ ctx->api_key_id = principal.key_id;
61
+ ctx->role = principal.role;
62
+ ctx->priority = role_to_priority(principal.role);
63
+ ctx->estimate = estimate;
64
+ ctx->request_body = request_body;
65
+ ctx->created_at = std::chrono::steady_clock::now();
66
+ ctx->enqueue_time = ctx->created_at;
67
+
68
+ std::lock_guard<std::mutex> lock(mu_);
69
+ requests_[request_id] = ctx;
70
+ return ctx;
71
+ }
72
+
73
+ std::shared_ptr<RequestContext> RequestRegistry::find(const std::string &request_id) const {
74
+ std::lock_guard<std::mutex> lock(mu_);
75
+ const auto it = requests_.find(request_id);
76
+ if (it == requests_.end()) return nullptr;
77
+ return it->second;
78
+ }
79
+
80
+ void RequestRegistry::complete(const std::shared_ptr<RequestContext> &ctx, RequestState state, RequestResult result) {
81
+ {
82
+ std::lock_guard<std::mutex> lock(ctx->mu);
83
+ ctx->state.store(state);
84
+ ctx->result = std::move(result);
85
+ ctx->completed = true;
86
+ }
87
+ ctx->cv.notify_all();
88
+ }
89
+
90
+ std::shared_ptr<RequestContext> RequestRegistry::cancel_request(const std::string &request_id) {
91
+ auto ctx = find(request_id);
92
+ if (!ctx) return nullptr;
93
+
94
+ ctx->cancelled.store(true);
95
+ const auto state = ctx->state.load();
96
+ if (state == RequestState::QUEUED) {
97
+ complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
98
+ } else if (state == RequestState::RUNNING) {
99
+ ctx->state.store(RequestState::CANCELLED);
100
+ }
101
+ return ctx;
102
+ }
103
+
104
+ std::vector<std::shared_ptr<RequestContext>> RequestRegistry::cancel_all() {
105
+ std::vector<std::shared_ptr<RequestContext>> out;
106
+ std::lock_guard<std::mutex> lock(mu_);
107
+ out.reserve(requests_.size());
108
+ for (auto &[_, ctx] : requests_) {
109
+ ctx->cancelled.store(true);
110
+ const auto state = ctx->state.load();
111
+ if (state == RequestState::QUEUED) {
112
+ {
113
+ std::lock_guard<std::mutex> ctx_lock(ctx->mu);
114
+ ctx->state.store(RequestState::CANCELLED);
115
+ ctx->result = {499, R"({"error":"Request cancelled"})"};
116
+ ctx->completed = true;
117
+ }
118
+ ctx->cv.notify_all();
119
+ } else if (state == RequestState::RUNNING) {
120
+ ctx->state.store(RequestState::CANCELLED);
121
+ }
122
+ out.push_back(ctx);
123
+ }
124
+ return out;
125
+ }
126
+
127
+ void MetricsRegistry::inc_requests_total() { requests_total_.fetch_add(1); }
128
+ void MetricsRegistry::inc_requests_inflight() { requests_inflight_.fetch_add(1); }
129
+ void MetricsRegistry::dec_requests_inflight() { requests_inflight_.fetch_sub(1); }
130
+ void MetricsRegistry::inc_queue_rejected_total() { queue_rejected_total_.fetch_add(1); }
131
+ void MetricsRegistry::inc_rate_limited_total() { rate_limited_total_.fetch_add(1); }
132
+ void MetricsRegistry::add_cancellations_total(uint64_t delta) { cancellations_total_.fetch_add(delta); }
133
+ void MetricsRegistry::inc_switch_total() { switch_total_.fetch_add(1); }
134
+ void MetricsRegistry::inc_worker_restarts_total() { worker_restarts_total_.fetch_add(1); }
135
+
136
+ void MetricsRegistry::observe_request_latency_ms(int64_t value) {
137
+ request_latency_ms_total_.fetch_add(value);
138
+ request_latency_samples_.fetch_add(1);
139
+ }
140
+
141
+ void MetricsRegistry::observe_queue_wait_ms(int64_t value) {
142
+ queue_wait_ms_total_.fetch_add(value);
143
+ queue_wait_samples_.fetch_add(1);
144
+ }
145
+
146
+ std::string MetricsRegistry::render_prometheus(const QueueSnapshot &queue, ModelManager &manager) const {
147
+ std::ostringstream oss;
148
+ oss << "llm_manager_requests_total " << requests_total_.load() << '\n';
149
+ oss << "llm_manager_requests_inflight " << requests_inflight_.load() << '\n';
150
+ oss << "llm_manager_request_latency_ms_total " << request_latency_ms_total_.load() << '\n';
151
+ oss << "llm_manager_request_latency_ms_samples " << request_latency_samples_.load() << '\n';
152
+ oss << "llm_manager_queue_size " << queue.total_size << '\n';
153
+ oss << "llm_manager_queue_admin_size " << queue.admin_size << '\n';
154
+ oss << "llm_manager_queue_user_size " << queue.user_size << '\n';
155
+ oss << "llm_manager_queue_tokens " << queue.total_tokens << '\n';
156
+ oss << "llm_manager_queue_rejected_total " << queue_rejected_total_.load() << '\n';
157
+ oss << "llm_manager_rate_limited_total " << rate_limited_total_.load() << '\n';
158
+ oss << "llm_manager_queue_wait_time_ms_total " << queue_wait_ms_total_.load() << '\n';
159
+ oss << "llm_manager_queue_wait_time_ms_samples " << queue_wait_samples_.load() << '\n';
160
+ oss << "llm_manager_cancellations_total " << cancellations_total_.load() << '\n';
161
+ oss << "llm_manager_switch_total " << switch_total_.load() << '\n';
162
+ oss << "llm_manager_worker_restarts_total " << worker_restarts_total_.load() << '\n';
163
+ const auto active = manager.active_worker();
164
+ oss << "llm_manager_active_worker " << (active ? 1 : 0) << '\n';
165
+ return oss.str();
166
+ }
167
+
168
+ PrioritySchedulerQueue::PrioritySchedulerQueue(const QueueConfig &config)
169
+ : max_size_(config.max_size),
170
+ max_tokens_(config.max_tokens),
171
+ admin_quota_(std::max(1, config.admin_quota)),
172
+ retry_after_sec_(std::max(1, config.retry_after_sec)) {}
173
+
174
+ bool PrioritySchedulerQueue::try_push(const std::shared_ptr<RequestContext> &ctx) {
175
+ std::lock_guard<std::mutex> lock(mu_);
176
+ if (current_size_ >= max_size_) return false;
177
+ if (current_tokens_ + ctx->estimate.estimated_total_tokens > max_tokens_) return false;
178
+
179
+ if (ctx->priority == Priority::ADMIN) admin_queue_.push_back(ctx);
180
+ else user_queue_.push_back(ctx);
181
+
182
+ ++current_size_;
183
+ current_tokens_ += ctx->estimate.estimated_total_tokens;
184
+ cv_.notify_one();
185
+ return true;
186
+ }
187
+
188
+ std::shared_ptr<RequestContext> PrioritySchedulerQueue::pop_next() {
189
+ std::unique_lock<std::mutex> lock(mu_);
190
+ cv_.wait(lock, [&]() { return stopped_ || current_size_ > 0; });
191
+ if (stopped_) return nullptr;
192
+
193
+ std::deque<std::shared_ptr<RequestContext>> *selected_queue = nullptr;
194
+ if (!admin_queue_.empty() && (admin_streak_ < admin_quota_ || user_queue_.empty())) {
195
+ selected_queue = &admin_queue_;
196
+ ++admin_streak_;
197
+ } else if (!user_queue_.empty()) {
198
+ selected_queue = &user_queue_;
199
+ admin_streak_ = 0;
200
+ } else if (!admin_queue_.empty()) {
201
+ selected_queue = &admin_queue_;
202
+ admin_streak_ = 1;
203
+ }
204
+
205
+ if (!selected_queue || selected_queue->empty()) return nullptr;
206
+
207
+ auto best_it = std::min_element(
208
+ selected_queue->begin(),
209
+ selected_queue->end(),
210
+ [](const auto &a, const auto &b) {
211
+ return a->estimate.estimated_total_tokens < b->estimate.estimated_total_tokens;
212
+ });
213
+ auto ctx = *best_it;
214
+ selected_queue->erase(best_it);
215
+ --current_size_;
216
+ current_tokens_ -= ctx->estimate.estimated_total_tokens;
217
+ return ctx;
218
+ }
219
+
220
+ void PrioritySchedulerQueue::stop() {
221
+ std::lock_guard<std::mutex> lock(mu_);
222
+ stopped_ = true;
223
+ cv_.notify_all();
224
+ }
225
+
226
+ int PrioritySchedulerQueue::retry_after_sec() const {
227
+ return retry_after_sec_;
228
+ }
229
+
230
+ QueueSnapshot PrioritySchedulerQueue::snapshot() const {
231
+ std::lock_guard<std::mutex> lock(mu_);
232
+ return QueueSnapshot{current_size_, admin_queue_.size(), user_queue_.size(), current_tokens_};
233
+ }
234
+
235
+ Scheduler::Scheduler(
236
+ ModelManager &manager,
237
+ RequestRegistry &registry,
238
+ MetricsRegistry &metrics,
239
+ const QueueConfig &queue_config)
240
+ : manager_(manager), registry_(registry), metrics_(metrics), queue_(queue_config) {
241
+ worker_ = std::thread([this]() { worker_loop(); });
242
+ }
243
+
244
+ Scheduler::~Scheduler() {
245
+ queue_.stop();
246
+ if (worker_.joinable()) worker_.join();
247
+ }
248
+
249
+ bool Scheduler::try_enqueue(const std::shared_ptr<RequestContext> &ctx) {
250
+ return queue_.try_push(ctx);
251
+ }
252
+
253
+ int Scheduler::retry_after_sec() const {
254
+ return queue_.retry_after_sec();
255
+ }
256
+
257
+ QueueSnapshot Scheduler::snapshot() const {
258
+ return queue_.snapshot();
259
+ }
260
+
261
+ void Scheduler::worker_loop() {
262
+ for (;;) {
263
+ auto ctx = queue_.pop_next();
264
+ if (!ctx) return;
265
+
266
+ if (ctx->cancelled.load()) {
267
+ registry_.complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
268
+ continue;
269
+ }
270
+
271
+ ctx->state.store(RequestState::RUNNING);
272
+ ctx->start_time = std::chrono::steady_clock::now();
273
+ metrics_.observe_queue_wait_ms(
274
+ std::chrono::duration_cast<std::chrono::milliseconds>(ctx->start_time - ctx->enqueue_time).count());
275
+
276
+ auto worker = manager_.active_worker();
277
+ if (!worker) {
278
+ registry_.complete(ctx, RequestState::FAILED, {503, R"({"error":"No active model"})"});
279
+ continue;
280
+ }
281
+
282
+ try {
283
+ auto [status, body] = forward_chat(*worker, ctx->request_body);
284
+ if (ctx->cancelled.load()) {
285
+ registry_.complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
286
+ continue;
287
+ }
288
+ registry_.complete(ctx, RequestState::DONE, {status, body});
289
+ } catch (const std::exception &e) {
290
+ registry_.complete(ctx, RequestState::FAILED, {500, json({{"error", e.what()}}).dump()});
291
+ }
292
+ }
293
+ }
294
+
295
+ ApiKeyAuth::ApiKeyAuth(const ManagerConfig &config)
296
+ : header_name_(config.auth.header), scheme_(config.auth.scheme) {
297
+ for (const auto &record : config.api_keys) {
298
+ records_by_secret_.emplace(record.secret, record);
299
+ }
300
+ }
301
+
302
+ bool ApiKeyAuth::enabled() const {
303
+ return !records_by_secret_.empty();
304
+ }
305
+
306
+ std::optional<ApiKeyRecord> ApiKeyAuth::authenticate(
307
+ const http::request<http::string_body> &req,
308
+ std::string &error) const {
309
+ if (!enabled()) {
310
+ error.clear();
311
+ return ApiKeyRecord{"anonymous", "", Role::ADMIN, true};
312
+ }
313
+
314
+ const auto token = extract_bearer_token(req, error);
315
+ if (!token) return std::nullopt;
316
+
317
+ const auto it = records_by_secret_.find(*token);
318
+ if (it == records_by_secret_.end()) {
319
+ error = "Invalid API key";
320
+ return std::nullopt;
321
+ }
322
+ if (!it->second.enabled) {
323
+ error = "API key disabled";
324
+ return std::nullopt;
325
+ }
326
+ error.clear();
327
+ return it->second;
328
+ }
329
+
330
+ std::optional<std::string> ApiKeyAuth::extract_bearer_token(
331
+ const http::request<http::string_body> &req,
332
+ std::string &error) const {
333
+ const auto header_it = req.find(header_name_);
334
+ if (header_it == req.end()) {
335
+ error = "Missing authorization header";
336
+ return std::nullopt;
337
+ }
338
+
339
+ const std::string value = trim_copy(header_it->value().to_string());
340
+ const std::string prefix = scheme_ + " ";
341
+ if (value.size() <= prefix.size() || value.rfind(prefix, 0) != 0) {
342
+ error = "Invalid authorization scheme";
343
+ return std::nullopt;
344
+ }
345
+
346
+ std::string token = trim_copy(value.substr(prefix.size()));
347
+ if (token.empty()) {
348
+ error = "Missing API key";
349
+ return std::nullopt;
350
+ }
351
+ return token;
352
+ }
cpp/runtime_components.h ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <boost/beast/http.hpp>
4
+
5
+ #include <deque>
6
+ #include <memory>
7
+ #include <thread>
8
+ #include <unordered_map>
9
+
10
+ #include "llm_manager_types.h"
11
+
12
+ class ModelManager;
13
+
14
+ namespace http = boost::beast::http;
15
+
16
+ class RateLimiterStore {
17
+ public:
18
+ explicit RateLimiterStore(const RateLimitConfig &config);
19
+ RateLimitDecision allow(const std::string &api_key_id, int estimated_tokens);
20
+
21
+ private:
22
+ struct Bucket {
23
+ double request_tokens = 0.0;
24
+ double estimated_tokens = 0.0;
25
+ std::chrono::steady_clock::time_point last_request_refill{};
26
+ std::chrono::steady_clock::time_point last_estimated_refill{};
27
+ };
28
+
29
+ std::mutex mu_;
30
+ std::unordered_map<std::string, Bucket> buckets_;
31
+ int requests_per_minute_;
32
+ int estimated_tokens_per_minute_;
33
+
34
+ static void refill(
35
+ double &tokens,
36
+ std::chrono::steady_clock::time_point &last_refill,
37
+ int limit_per_minute,
38
+ std::chrono::steady_clock::time_point now);
39
+ };
40
+
41
+ class RequestRegistry {
42
+ public:
43
+ std::shared_ptr<RequestContext> create(
44
+ const std::string &request_id,
45
+ const ApiKeyRecord &principal,
46
+ const TokenEstimate &estimate,
47
+ const std::string &request_body);
48
+ std::shared_ptr<RequestContext> find(const std::string &request_id) const;
49
+ void complete(const std::shared_ptr<RequestContext> &ctx, RequestState state, RequestResult result);
50
+ std::shared_ptr<RequestContext> cancel_request(const std::string &request_id);
51
+ std::vector<std::shared_ptr<RequestContext>> cancel_all();
52
+
53
+ private:
54
+ mutable std::mutex mu_;
55
+ std::unordered_map<std::string, std::shared_ptr<RequestContext>> requests_;
56
+ };
57
+
58
+ class MetricsRegistry {
59
+ public:
60
+ void inc_requests_total();
61
+ void inc_requests_inflight();
62
+ void dec_requests_inflight();
63
+ void inc_queue_rejected_total();
64
+ void inc_rate_limited_total();
65
+ void add_cancellations_total(uint64_t delta = 1);
66
+ void inc_switch_total();
67
+ void inc_worker_restarts_total();
68
+ void observe_request_latency_ms(int64_t value);
69
+ void observe_queue_wait_ms(int64_t value);
70
+ std::string render_prometheus(const QueueSnapshot &queue, ModelManager &manager) const;
71
+
72
+ private:
73
+ std::atomic<uint64_t> requests_total_{0};
74
+ std::atomic<int64_t> requests_inflight_{0};
75
+ std::atomic<uint64_t> request_latency_ms_total_{0};
76
+ std::atomic<uint64_t> request_latency_samples_{0};
77
+ std::atomic<uint64_t> queue_rejected_total_{0};
78
+ std::atomic<uint64_t> rate_limited_total_{0};
79
+ std::atomic<uint64_t> queue_wait_ms_total_{0};
80
+ std::atomic<uint64_t> queue_wait_samples_{0};
81
+ std::atomic<uint64_t> cancellations_total_{0};
82
+ std::atomic<uint64_t> switch_total_{0};
83
+ std::atomic<uint64_t> worker_restarts_total_{0};
84
+ };
85
+
86
+ class PrioritySchedulerQueue {
87
+ public:
88
+ explicit PrioritySchedulerQueue(const QueueConfig &config);
89
+ bool try_push(const std::shared_ptr<RequestContext> &ctx);
90
+ std::shared_ptr<RequestContext> pop_next();
91
+ void stop();
92
+ int retry_after_sec() const;
93
+ QueueSnapshot snapshot() const;
94
+
95
+ private:
96
+ mutable std::mutex mu_;
97
+ std::condition_variable cv_;
98
+ std::deque<std::shared_ptr<RequestContext>> admin_queue_;
99
+ std::deque<std::shared_ptr<RequestContext>> user_queue_;
100
+ size_t max_size_;
101
+ size_t current_size_ = 0;
102
+ int max_tokens_;
103
+ int current_tokens_ = 0;
104
+ int admin_quota_;
105
+ int admin_streak_ = 0;
106
+ int retry_after_sec_;
107
+ bool stopped_ = false;
108
+ };
109
+
110
+ class Scheduler {
111
+ public:
112
+ Scheduler(
113
+ ModelManager &manager,
114
+ RequestRegistry &registry,
115
+ MetricsRegistry &metrics,
116
+ const QueueConfig &queue_config);
117
+ ~Scheduler();
118
+
119
+ bool try_enqueue(const std::shared_ptr<RequestContext> &ctx);
120
+ int retry_after_sec() const;
121
+ QueueSnapshot snapshot() const;
122
+
123
+ private:
124
+ ModelManager &manager_;
125
+ RequestRegistry &registry_;
126
+ MetricsRegistry &metrics_;
127
+ PrioritySchedulerQueue queue_;
128
+ std::thread worker_;
129
+
130
+ void worker_loop();
131
+ };
132
+
133
+ class ApiKeyAuth {
134
+ public:
135
+ explicit ApiKeyAuth(const ManagerConfig &config);
136
+ bool enabled() const;
137
+ std::optional<ApiKeyRecord> authenticate(
138
+ const http::request<http::string_body> &req,
139
+ std::string &error) const;
140
+
141
+ private:
142
+ std::string header_name_;
143
+ std::string scheme_;
144
+ std::unordered_map<std::string, ApiKeyRecord> records_by_secret_;
145
+
146
+ std::optional<std::string> extract_bearer_token(
147
+ const http::request<http::string_body> &req,
148
+ std::string &error) const;
149
+ };
cpp/server.cpp ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "server.h"
2
+
3
+ #include <boost/beast/core.hpp>
4
+
5
+ #include "http_helpers.h"
6
+ #include "model_manager.h"
7
+ #include "request_parsing.h"
8
+ #include "runtime_components.h"
9
+
10
+ #include <algorithm>
11
+ #include <atomic>
12
+
13
+ namespace beast = boost::beast;
14
+ namespace http = beast::http;
15
+
16
+ static std::atomic<uint64_t> g_req_id{1};
17
+
18
+ http::response<http::string_body> handle_request(
19
+ ModelManager &manager,
20
+ const ManagerConfig &config,
21
+ const ApiKeyAuth &auth,
22
+ RateLimiterStore &rate_limiter,
23
+ RequestRegistry &registry,
24
+ MetricsRegistry &metrics,
25
+ Scheduler &scheduler,
26
+ http::request<http::string_body> &&req) {
27
+ const auto start = std::chrono::steady_clock::now();
28
+ const auto req_id_num = g_req_id.fetch_add(1);
29
+ const std::string request_id = std::to_string(req_id_num);
30
+ const std::string target = req.target().to_string();
31
+ const std::string method = req.method_string().to_string();
32
+ const std::string path = target.substr(0, target.find('?'));
33
+ auto authenticated = std::optional<ApiKeyRecord>{};
34
+
35
+ metrics.inc_requests_total();
36
+ metrics.inc_requests_inflight();
37
+ struct InflightGuard {
38
+ MetricsRegistry &metrics;
39
+ ~InflightGuard() { metrics.dec_requests_inflight(); }
40
+ } inflight_guard{metrics};
41
+
42
+ log_line("request_id=" + request_id + " method=" + method + " path=" + target);
43
+ if (!req.body().empty()) {
44
+ log_line("request_id=" + request_id + " body=" + truncate_body(req.body()));
45
+ }
46
+
47
+ auto json_response = [&](http::status status, const json &obj) {
48
+ json payload = obj;
49
+ payload["request_id"] = request_id;
50
+ http::response<http::string_body> res{status, req.version()};
51
+ res.set(http::field::content_type, "application/json");
52
+ res.set(http::field::server, "llm-manager");
53
+ res.set("X-Request-Id", request_id);
54
+ res.keep_alive(req.keep_alive());
55
+ res.body() = payload.dump();
56
+ res.prepare_payload();
57
+ auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
58
+ std::chrono::steady_clock::now() - start).count();
59
+ metrics.observe_request_latency_ms(elapsed_ms);
60
+ log_line("request_id=" + request_id + " status=" + std::to_string(res.result_int()) +
61
+ " elapsed_ms=" + std::to_string(elapsed_ms));
62
+ return res;
63
+ };
64
+
65
+ auto json_response_with_retry_after = [&](http::status status, const json &obj, int retry_after_sec) {
66
+ auto res = json_response(status, obj);
67
+ res.set(http::field::retry_after, std::to_string(std::max(1, retry_after_sec)));
68
+ return res;
69
+ };
70
+
71
+ auto ensure_authenticated = [&](Role minimum_role) -> std::optional<http::response<http::string_body>> {
72
+ std::string auth_error;
73
+ authenticated = auth.authenticate(req, auth_error);
74
+ if (!authenticated) {
75
+ return json_response(http::status::unauthorized, {{"error", auth_error}});
76
+ }
77
+ if (minimum_role == Role::ADMIN && authenticated->role != Role::ADMIN) {
78
+ return json_response(http::status::forbidden, {{"error", "Admin role required"}});
79
+ }
80
+ log_line("request_id=" + request_id +
81
+ " api_key_id=" + authenticated->key_id +
82
+ " role=" + role_to_string(authenticated->role));
83
+ return std::nullopt;
84
+ };
85
+
86
+ try {
87
+ if (path == "/health" && req.method() == http::verb::get) {
88
+ return json_response(http::status::ok, manager.models_view());
89
+ }
90
+
91
+ if (path == "/models" && req.method() == http::verb::get) {
92
+ return json_response(http::status::ok, manager.models_view());
93
+ }
94
+
95
+ if (path == "/queue/metrics" && req.method() == http::verb::get) {
96
+ http::response<http::string_body> res{http::status::ok, req.version()};
97
+ res.set(http::field::content_type, "text/plain; version=0.0.4; charset=utf-8");
98
+ res.set(http::field::server, "llm-manager");
99
+ res.set("X-Request-Id", request_id);
100
+ res.keep_alive(req.keep_alive());
101
+ res.body() = metrics.render_prometheus(scheduler.snapshot(), manager);
102
+ res.prepare_payload();
103
+ auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
104
+ std::chrono::steady_clock::now() - start).count();
105
+ metrics.observe_request_latency_ms(elapsed_ms);
106
+ log_line("request_id=" + request_id + " status=" + std::to_string(res.result_int()) +
107
+ " elapsed_ms=" + std::to_string(elapsed_ms));
108
+ return res;
109
+ }
110
+
111
+ if (path == "/switch-model" && req.method() == http::verb::post) {
112
+ if (auto auth_res = ensure_authenticated(Role::ADMIN)) return *auth_res;
113
+ json j = json::parse(req.body(), nullptr, false);
114
+ if (j.is_discarded()) {
115
+ return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
116
+ }
117
+ std::string model;
118
+ if (j.contains("model_name")) model = j["model_name"].get<std::string>();
119
+ if (j.contains("model")) model = j["model"].get<std::string>();
120
+ if (model.empty()) {
121
+ return json_response(http::status::bad_request, {{"error", "Expected 'model' or 'model_name'"}});
122
+ }
123
+
124
+ std::string err;
125
+ bool ok = manager.switch_model(model, err);
126
+ if (!ok) {
127
+ auto status = (err == "Switch already in progress")
128
+ ? http::status::conflict
129
+ : http::status::internal_server_error;
130
+ return json_response(status, {{"status", "error"}, {"error", err}});
131
+ }
132
+ auto state = manager.models_view();
133
+ state["message"] = "Switched model successfully";
134
+ metrics.inc_switch_total();
135
+ return json_response(http::status::ok, state);
136
+ }
137
+
138
+ if (path == "/stop" && req.method() == http::verb::post) {
139
+ if (auto auth_res = ensure_authenticated(Role::ADMIN)) return *auth_res;
140
+ const auto cancelled = registry.cancel_all();
141
+ metrics.add_cancellations_total(cancelled.size());
142
+ std::string err;
143
+ bool ok = manager.restart_active(err);
144
+ if (!ok) {
145
+ http::status status = http::status::internal_server_error;
146
+ if (err == "Switch already in progress") status = http::status::conflict;
147
+ else if (err == "No active model") status = http::status::service_unavailable;
148
+ return json_response(status, {{"status", "error"}, {"error", err}});
149
+ }
150
+ auto state = manager.models_view();
151
+ state["message"] = "Stopped in-flight prompts and restarted model";
152
+ metrics.inc_worker_restarts_total();
153
+ return json_response(http::status::ok, state);
154
+ }
155
+
156
+ if (req.method() == http::verb::post) {
157
+ if (auto cancel_id = extract_cancel_request_id(path)) {
158
+ if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
159
+ auto ctx = registry.find(*cancel_id);
160
+ if (!ctx) {
161
+ return json_response(http::status::not_found, {{"error", "Unknown request id"}});
162
+ }
163
+ if (authenticated->role != Role::ADMIN && authenticated->key_id != ctx->api_key_id) {
164
+ return json_response(http::status::forbidden, {{"error", "Cannot cancel another API key request"}});
165
+ }
166
+
167
+ const auto previous_state = ctx->state.load();
168
+ registry.cancel_request(*cancel_id);
169
+ metrics.add_cancellations_total();
170
+ std::string restart_error;
171
+ bool restarted = true;
172
+ if (previous_state == RequestState::RUNNING) {
173
+ restarted = manager.restart_active(restart_error);
174
+ if (restarted) metrics.inc_worker_restarts_total();
175
+ }
176
+
177
+ json payload = {
178
+ {"cancelled_request_id", *cancel_id},
179
+ {"state", state_to_string(ctx->state.load())}
180
+ };
181
+ if (!restarted) payload["restart_error"] = restart_error;
182
+ return json_response(http::status::ok, payload);
183
+ }
184
+ }
185
+
186
+ if (path == "/v1/chat/completions" && req.method() == http::verb::post) {
187
+ if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
188
+ json payload = json::parse(req.body(), nullptr, false);
189
+ if (payload.is_discarded()) {
190
+ return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
191
+ }
192
+ if (request_stream_enabled(payload)) {
193
+ if (!config.streaming.enabled) {
194
+ return json_response(http::status::not_implemented, {{"error", "Streaming is disabled"}});
195
+ }
196
+ return json_response(http::status::not_implemented, {{"error", "Streaming relay is not implemented yet"}});
197
+ }
198
+
199
+ std::string token_error;
200
+ auto estimate = estimate_chat_tokens(payload, config.limits, token_error);
201
+ if (!estimate) {
202
+ return json_response(http::status::bad_request, {{"error", token_error}});
203
+ }
204
+ log_line("request_id=" + request_id +
205
+ " prompt_tokens=" + std::to_string(estimate->prompt_tokens) +
206
+ " max_tokens=" + std::to_string(estimate->requested_max_tokens) +
207
+ " estimated_total_tokens=" + std::to_string(estimate->estimated_total_tokens));
208
+
209
+ auto rate_limit_decision = rate_limiter.allow(authenticated->key_id, estimate->estimated_total_tokens);
210
+ if (!rate_limit_decision.allowed) {
211
+ metrics.inc_rate_limited_total();
212
+ return json_response_with_retry_after(
213
+ http::status::too_many_requests,
214
+ {{"error", rate_limit_decision.error}},
215
+ rate_limit_decision.retry_after_sec);
216
+ }
217
+
218
+ auto ctx = registry.create(request_id, *authenticated, *estimate, req.body());
219
+ if (!scheduler.try_enqueue(ctx)) {
220
+ ctx->cancelled.store(true);
221
+ registry.complete(ctx, RequestState::CANCELLED, {503, R"({"error":"Queue full"})"});
222
+ metrics.inc_queue_rejected_total();
223
+ return json_response_with_retry_after(
224
+ http::status::service_unavailable,
225
+ {{"error", "Queue full"}},
226
+ scheduler.retry_after_sec());
227
+ }
228
+
229
+ std::unique_lock<std::mutex> lock(ctx->mu);
230
+ const bool finished = ctx->cv.wait_for(
231
+ lock,
232
+ std::chrono::seconds(std::max(1, config.limits.request_timeout_sec)),
233
+ [&]() { return ctx->completed; });
234
+ if (!finished) {
235
+ lock.unlock();
236
+ registry.cancel_request(request_id);
237
+ metrics.add_cancellations_total();
238
+ std::string restart_error;
239
+ bool restarted = true;
240
+ if (ctx->state.load() == RequestState::RUNNING) {
241
+ restarted = manager.restart_active(restart_error);
242
+ if (restarted) metrics.inc_worker_restarts_total();
243
+ }
244
+ json timeout_payload = {
245
+ {"error", "Request timed out"},
246
+ {"state", state_to_string(ctx->state.load())}
247
+ };
248
+ if (!restarted) timeout_payload["restart_error"] = restart_error;
249
+ return json_response(http::status::gateway_timeout, timeout_payload);
250
+ }
251
+
252
+ const auto final_state = ctx->state.load();
253
+ RequestResult result = ctx->result;
254
+ lock.unlock();
255
+
256
+ if (final_state == RequestState::CANCELLED) {
257
+ return json_response(http::status::ok, {{"status", "cancelled"}});
258
+ }
259
+
260
+ http::response<http::string_body> res{
261
+ static_cast<http::status>(result.status), req.version()};
262
+ res.set(http::field::content_type, result.content_type);
263
+ res.set(http::field::server, "llm-manager");
264
+ res.set("X-Request-Id", request_id);
265
+ res.keep_alive(req.keep_alive());
266
+ res.body() = result.body;
267
+ res.prepare_payload();
268
+ auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
269
+ std::chrono::steady_clock::now() - start).count();
270
+ metrics.observe_request_latency_ms(elapsed_ms);
271
+ log_line("request_id=" + request_id +
272
+ " final_state=" + state_to_string(final_state) +
273
+ " upstream_status=" + std::to_string(result.status) +
274
+ " elapsed_ms=" + std::to_string(elapsed_ms));
275
+ return res;
276
+ }
277
+
278
+ if (req.method() == http::verb::get) {
279
+ auto worker = manager.active_worker();
280
+ if (!worker) {
281
+ return json_response(http::status::service_unavailable, {{"error", "No active model"}});
282
+ }
283
+ auto upstream = forward_get_to_worker(*worker, target);
284
+ http::response<http::string_body> res{
285
+ static_cast<http::status>(upstream.status), req.version()};
286
+ res.set(http::field::content_type, upstream.content_type);
287
+ if (!upstream.content_encoding.empty()) {
288
+ res.set(http::field::content_encoding, upstream.content_encoding);
289
+ }
290
+ res.set(http::field::server, "llm-manager");
291
+ res.set("X-Request-Id", request_id);
292
+ res.keep_alive(req.keep_alive());
293
+ res.body() = upstream.body;
294
+ res.prepare_payload();
295
+ auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
296
+ std::chrono::steady_clock::now() - start).count();
297
+ log_line("request_id=" + request_id +
298
+ " proxied_get model=" + worker->model +
299
+ " upstream_status=" + std::to_string(upstream.status) +
300
+ " elapsed_ms=" + std::to_string(elapsed_ms));
301
+ return res;
302
+ }
303
+
304
+ return json_response(http::status::not_found, {{"error", "Not found"}});
305
+ } catch (const std::exception &e) {
306
+ return json_response(http::status::internal_server_error, {{"error", e.what()}});
307
+ }
308
+ }
309
+
310
+ void do_session(
311
+ boost::asio::ip::tcp::socket socket,
312
+ ModelManager &manager,
313
+ const ManagerConfig &config,
314
+ const ApiKeyAuth &auth,
315
+ RateLimiterStore &rate_limiter,
316
+ RequestRegistry &registry,
317
+ MetricsRegistry &metrics,
318
+ Scheduler &scheduler) {
319
+ try {
320
+ beast::flat_buffer buffer;
321
+ http::request<http::string_body> req;
322
+ http::read(socket, buffer, req);
323
+ auto res = handle_request(manager, config, auth, rate_limiter, registry, metrics, scheduler, std::move(req));
324
+ http::write(socket, res);
325
+ beast::error_code ec;
326
+ socket.shutdown(boost::asio::ip::tcp::socket::shutdown_send, ec);
327
+ } catch (...) {
328
+ }
329
+ }
cpp/server.h ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <boost/asio.hpp>
4
+ #include <boost/beast/http.hpp>
5
+
6
+ #include "llm_manager_types.h"
7
+
8
+ class ModelManager;
9
+ class ApiKeyAuth;
10
+ class RateLimiterStore;
11
+ class RequestRegistry;
12
+ class MetricsRegistry;
13
+ class Scheduler;
14
+
15
+ namespace http = boost::beast::http;
16
+
17
+ http::response<http::string_body> handle_request(
18
+ ModelManager &manager,
19
+ const ManagerConfig &config,
20
+ const ApiKeyAuth &auth,
21
+ RateLimiterStore &rate_limiter,
22
+ RequestRegistry &registry,
23
+ MetricsRegistry &metrics,
24
+ Scheduler &scheduler,
25
+ http::request<http::string_body> &&req);
26
+
27
+ void do_session(
28
+ boost::asio::ip::tcp::socket socket,
29
+ ModelManager &manager,
30
+ const ManagerConfig &config,
31
+ const ApiKeyAuth &auth,
32
+ RateLimiterStore &rate_limiter,
33
+ RequestRegistry &registry,
34
+ MetricsRegistry &metrics,
35
+ Scheduler &scheduler);
app.py → python/app.py RENAMED
File without changes
logger.py → python/logger.py RENAMED
File without changes