Dmitry Beresnev committed on
Commit
d9ce859
·
1 Parent(s): 8ef326a

Add auth, token policy, queue scheduler, cancel flow, and related changes

Browse files
Files changed (3) hide show
  1. .gitignore +3 -0
  2. config.toml.example +52 -0
  3. cpp/llm_manager.cpp +1031 -35
.gitignore CHANGED
@@ -131,3 +131,6 @@ temp/
131
  tests/
132
  *.md
133
  docs/
 
 
 
 
131
  tests/
132
  *.md
133
  docs/
134
+
135
+ #
136
+ .clang-format
config.toml.example ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [server]
2
+ host = "0.0.0.0"
3
+ port = 7860
4
+
5
+ [worker]
6
+ default_model = "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m"
7
+ llama_server_bin = "/usr/local/bin/llama-server"
8
+ host = "127.0.0.1"
9
+ bind_host = "0.0.0.0"
10
+ base_port = 8080
11
+ switch_timeout_sec = 300
12
+
13
+ [llama]
14
+ n_ctx = 8192
15
+ threads = 4
16
+ ngl = 0
17
+ batch = 128
18
+ ubatch = 64
19
+
20
+ [auth]
21
+ header = "Authorization"
22
+ scheme = "Bearer"
23
+
24
+ [limits]
25
+ default_max_tokens = 256
26
+ max_tokens_per_request = 2048
27
+ request_timeout_sec = 30
28
+
29
+ [queue]
30
+ max_size = 100
31
+ max_tokens = 20000
32
+ admin_quota = 3
33
+ retry_after_sec = 5
34
+
35
+ [scheduler]
36
+ max_concurrent = 1
37
+
38
+ [rate_limit]
39
+ requests_per_minute = 60
40
+ estimated_tokens_per_minute = 6000
41
+
42
+ [[api_keys]]
43
+ key_id = "admin-main"
44
+ secret = "change-me-admin"
45
+ role = "admin"
46
+ enabled = true
47
+
48
+ [[api_keys]]
49
+ key_id = "user-main"
50
+ secret = "change-me-user"
51
+ role = "user"
52
+ enabled = true
cpp/llm_manager.cpp CHANGED
@@ -4,18 +4,25 @@
4
  #include <boost/beast/version.hpp>
5
  #include <nlohmann/json.hpp>
6
 
 
7
  #include <atomic>
8
  #include <chrono>
 
9
  #include <csignal>
10
  #include <cstdlib>
11
  #include <ctime>
 
 
 
12
  #include <iomanip>
13
  #include <iostream>
 
14
  #include <mutex>
15
  #include <optional>
16
  #include <sstream>
17
  #include <string>
18
  #include <thread>
 
19
  #include <vector>
20
 
21
  #include <sys/types.h>
@@ -27,6 +34,81 @@ namespace beast = boost::beast;
27
  namespace http = beast::http;
28
  using json = nlohmann::json;
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  struct WorkerInfo {
31
  std::string model;
32
  int port = 0;
@@ -34,6 +116,8 @@ struct WorkerInfo {
34
  std::string last_loaded;
35
  };
36
 
 
 
37
  static std::string now_utc_iso() {
38
  std::time_t t = std::time(nullptr);
39
  std::tm tm{};
@@ -58,6 +142,399 @@ static int get_env_int_or(const char *name, int fallback) {
58
  }
59
  }
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  static bool is_alive(pid_t pid) {
62
  if (pid <= 0) return false;
63
  return kill(pid, 0) == 0;
@@ -80,18 +557,18 @@ static void shutdown_worker(pid_t pid, int wait_seconds = 15) {
80
 
81
  class ModelManager {
82
  public:
83
- ModelManager()
84
- : _default_model(get_env_or("DEFAULT_MODEL", "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m")),
85
- _llama_server_bin(get_env_or("LLAMA_SERVER_BIN", "/usr/local/bin/llama-server")),
86
- _worker_host(get_env_or("WORKER_HOST", "127.0.0.1")),
87
- _worker_bind_host(get_env_or("WORKER_BIND_HOST", "0.0.0.0")),
88
- _base_port(get_env_int_or("WORKER_BASE_PORT", 8080)),
89
- _switch_timeout_sec(get_env_int_or("SWITCH_TIMEOUT_SEC", 300)),
90
- _n_ctx(get_env_int_or("MODEL_N_CTX", 8192)),
91
- _n_threads(get_env_int_or("MODEL_THREADS", 4)),
92
- _n_gpu_layers(get_env_int_or("MODEL_NGL", 0)),
93
- _n_batch(get_env_int_or("MODEL_BATCH", 128)),
94
- _n_ubatch(get_env_int_or("MODEL_UBATCH", 64)),
95
  _next_port(_base_port) {}
96
 
97
  bool initialize_default(std::string &error) {
@@ -318,6 +795,364 @@ private:
318
 
319
  static std::atomic<uint64_t> g_req_id{1};
320
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  static void log_line(const std::string &line) {
322
  std::cout << "[" << now_utc_iso() << "] " << line << std::endl;
323
  }
@@ -327,6 +1162,17 @@ static std::string truncate_body(const std::string &body, size_t max_len = 2000)
327
  return body.substr(0, max_len) + "...[truncated]";
328
  }
329
 
 
 
 
 
 
 
 
 
 
 
 
330
  static std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body) {
331
  asio::io_context ioc;
332
  asio::ip::tcp::resolver resolver(ioc);
@@ -391,34 +1237,65 @@ static ProxiedGetResult forward_get_to_worker(const WorkerInfo &worker,
391
  template <typename Body, typename Allocator>
392
  http::response<http::string_body> handle_request(
393
  ModelManager &manager,
 
 
 
 
 
394
  http::request<Body, http::basic_fields<Allocator>> &&req) {
395
  const auto start = std::chrono::steady_clock::now();
396
- const auto req_id = g_req_id.fetch_add(1);
 
397
  const std::string target = req.target().to_string();
398
  const std::string method = req.method_string().to_string();
399
  const std::string path = target.substr(0, target.find('?'));
 
400
 
401
- log_line("request_id=" + std::to_string(req_id) + " method=" + method + " path=" + target);
402
  if constexpr (std::is_same_v<Body, http::string_body>) {
403
  if (!req.body().empty()) {
404
- log_line("request_id=" + std::to_string(req_id) + " body=" + truncate_body(req.body()));
405
  }
406
  }
407
 
408
  auto json_response = [&](http::status status, const json &obj) {
 
 
409
  http::response<http::string_body> res{status, req.version()};
410
  res.set(http::field::content_type, "application/json");
411
  res.set(http::field::server, "llm-manager");
 
412
  res.keep_alive(req.keep_alive());
413
- res.body() = obj.dump();
414
  res.prepare_payload();
415
  auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
416
  std::chrono::steady_clock::now() - start).count();
417
- log_line("request_id=" + std::to_string(req_id) + " status=" + std::to_string(res.result_int()) +
418
  " elapsed_ms=" + std::to_string(elapsed_ms));
419
  return res;
420
  };
421
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
  try {
423
  if (path == "/health" && req.method() == http::verb::get) {
424
  return json_response(http::status::ok, manager.models_view());
@@ -429,6 +1306,7 @@ http::response<http::string_body> handle_request(
429
  }
430
 
431
  if (path == "/switch-model" && req.method() == http::verb::post) {
 
432
  std::string body(req.body().data(), req.body().size());
433
  json j = json::parse(body, nullptr, false);
434
  if (j.is_discarded()) {
@@ -453,6 +1331,8 @@ http::response<http::string_body> handle_request(
453
  }
454
 
455
  if (path == "/stop" && req.method() == http::verb::post) {
 
 
456
  std::string err;
457
  bool ok = manager.restart_active(err);
458
  if (!ok) {
@@ -469,24 +1349,114 @@ http::response<http::string_body> handle_request(
469
  return json_response(http::status::ok, state);
470
  }
471
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
472
  if (path == "/v1/chat/completions" && req.method() == http::verb::post) {
473
- auto worker = manager.active_worker();
474
- if (!worker) {
475
- return json_response(http::status::service_unavailable, {{"error", "No active model"}});
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
476
  }
477
- auto [upstream_status, upstream_body] = forward_chat(*worker, req.body());
478
- http::response<http::string_body> res{static_cast<http::status>(upstream_status), req.version()};
479
- res.set(http::field::content_type, "application/json");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
  res.set(http::field::server, "llm-manager");
 
481
  res.keep_alive(req.keep_alive());
482
- res.body() = upstream_body;
483
  res.prepare_payload();
484
  auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
485
  std::chrono::steady_clock::now() - start).count();
486
- log_line("request_id=" + std::to_string(req_id) + " model=" + worker->model +
487
- " active_pid=" + std::to_string(worker->pid) +
488
- " active_port=" + std::to_string(worker->port) +
489
- " upstream_status=" + std::to_string(upstream_status) +
490
  " elapsed_ms=" + std::to_string(elapsed_ms));
491
  return res;
492
  }
@@ -506,13 +1476,14 @@ http::response<http::string_body> handle_request(
506
  res.set(http::field::content_encoding, upstream.content_encoding);
507
  }
508
  res.set(http::field::server, "llm-manager");
 
509
  res.keep_alive(req.keep_alive());
510
  res.body() = upstream.body;
511
  res.prepare_payload();
512
  auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
513
  std::chrono::steady_clock::now() - start)
514
  .count();
515
- log_line("request_id=" + std::to_string(req_id) +
516
  " proxied_get model=" + worker->model +
517
  " upstream_status=" + std::to_string(upstream.status) +
518
  " elapsed_ms=" + std::to_string(elapsed_ms));
@@ -525,12 +1496,19 @@ http::response<http::string_body> handle_request(
525
  }
526
  }
527
 
528
- void do_session(asio::ip::tcp::socket socket, ModelManager &manager) {
 
 
 
 
 
 
 
529
  try {
530
  beast::flat_buffer buffer;
531
  http::request<http::string_body> req;
532
  http::read(socket, buffer, req);
533
- auto res = handle_request(manager, std::move(req));
534
  http::write(socket, res);
535
  beast::error_code ec;
536
  socket.shutdown(asio::ip::tcp::socket::shutdown_send, ec);
@@ -539,11 +1517,21 @@ void do_session(asio::ip::tcp::socket socket, ModelManager &manager) {
539
  }
540
 
541
  int main() {
542
- const auto bind_host = get_env_or("MANAGER_HOST", "0.0.0.0");
543
- const int bind_port = get_env_int_or("MANAGER_PORT", 7860);
544
- ModelManager manager;
 
 
 
 
 
545
 
546
  std::string init_error;
 
 
 
 
 
547
  log_line("startup: loading default model");
548
  if (!manager.initialize_default(init_error)) {
549
  log_line("startup: default model failed: " + init_error);
@@ -558,6 +1546,14 @@ int main() {
558
  for (;;) {
559
  asio::ip::tcp::socket socket{ioc};
560
  acceptor.accept(socket);
561
- std::thread(&do_session, std::move(socket), std::ref(manager)).detach();
 
 
 
 
 
 
 
 
562
  }
563
  }
 
4
  #include <boost/beast/version.hpp>
5
  #include <nlohmann/json.hpp>
6
 
7
+ #include <algorithm>
8
  #include <atomic>
9
  #include <chrono>
10
+ #include <condition_variable>
11
  #include <csignal>
12
  #include <cstdlib>
13
  #include <ctime>
14
+ #include <deque>
15
+ #include <filesystem>
16
+ #include <fstream>
17
  #include <iomanip>
18
  #include <iostream>
19
+ #include <memory>
20
  #include <mutex>
21
  #include <optional>
22
  #include <sstream>
23
  #include <string>
24
  #include <thread>
25
+ #include <unordered_map>
26
  #include <vector>
27
 
28
  #include <sys/types.h>
 
34
  namespace http = beast::http;
35
  using json = nlohmann::json;
36
 
37
// Forward declaration; log_line is defined later in this file but is needed
// by config loading below.
static void log_line(const std::string &line);

// [server] section: where the manager's own HTTP endpoint listens.
struct ServerConfig {
    std::string host = "0.0.0.0";
    int port = 7860;
};

// [worker] section: how llama-server worker processes are launched and reached.
struct WorkerConfig {
    std::string default_model = "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m";
    std::string llama_server_bin = "/usr/local/bin/llama-server";
    std::string host = "127.0.0.1";      // address the manager dials to reach a worker
    std::string bind_host = "0.0.0.0";   // address a worker binds to
    int base_port = 8080;                // first port assigned to workers
    int switch_timeout_sec = 300;        // seconds allowed for a model switch
};

// [llama] section: llama-server tuning knobs passed through on launch.
struct LlamaConfig {
    int n_ctx = 8192;
    int threads = 4;
    int ngl = 0;       // GPU layers; 0 presumably means CPU-only — confirm against launch args
    int batch = 128;
    int ubatch = 64;
};

// Access level attached to an API key.
enum class Role {
    ADMIN,
    USER
};

// [auth] section: which HTTP header and scheme carry credentials.
struct AuthConfig {
    std::string header = "Authorization";
    std::string scheme = "Bearer";
};

// [limits] section: per-request ceilings.
struct LimitsConfig {
    int default_max_tokens = 256;        // used when a request omits max_tokens (see estimate_chat_tokens)
    int max_tokens_per_request = 2048;   // cap on estimated prompt + completion tokens
    int request_timeout_sec = 30;
};

// [queue] section: admission control for queued requests.
struct QueueConfig {
    size_t max_size = 100;
    int max_tokens = 20000;
    int admin_quota = 3;       // NOTE(review): looks like reserved admin capacity — confirm against scheduler use
    int retry_after_sec = 5;   // Retry-After hint when the queue rejects a request
};

// [rate_limit] section: per-key limits (consumed by RateLimiterStore).
struct RateLimitConfig {
    int requests_per_minute = 60;
    int estimated_tokens_per_minute = 6000;
};

// [scheduler] section.
struct SchedulerConfig {
    int max_concurrent = 1;
};

// One [[api_keys]] entry from the config file.
struct ApiKeyRecord {
    std::string key_id;
    std::string secret;
    Role role = Role::USER;
    bool enabled = true;
};

// Aggregate of every config section; produced by load_manager_config().
struct ManagerConfig {
    ServerConfig server;
    WorkerConfig worker;
    LlamaConfig llama;
    AuthConfig auth;
    LimitsConfig limits;
    QueueConfig queue;
    RateLimitConfig rate_limit;
    SchedulerConfig scheduler;
    std::vector<ApiKeyRecord> api_keys;
};
111
+
112
  struct WorkerInfo {
113
  std::string model;
114
  int port = 0;
 
116
  std::string last_loaded;
117
  };
118
 
119
+ static std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body);
120
+
121
  static std::string now_utc_iso() {
122
  std::time_t t = std::time(nullptr);
123
  std::tm tm{};
 
142
  }
143
  }
144
 
145
// Return `value` with leading and trailing spaces, tabs, CR and LF removed.
// An all-whitespace (or empty) input yields "".
static std::string trim_copy(const std::string &value) {
    static const char *const kWhitespace = " \t\r\n";
    const auto begin = value.find_first_not_of(kWhitespace);
    if (begin == std::string::npos) {
        return "";
    }
    const auto end = value.find_last_not_of(kWhitespace);
    return value.substr(begin, end - begin + 1);
}
151
+
152
// If `value` is wrapped in a matching pair of single or double quotes,
// return the inner text; otherwise return `value` unchanged.
static std::string strip_quotes(const std::string &value) {
    if (value.size() < 2) {
        return value;
    }
    const char head = value.front();
    const bool quoted = (head == '"' || head == '\'') && value.back() == head;
    return quoted ? value.substr(1, value.size() - 2) : value;
}
162
+
163
// Parse a TOML boolean ("true"/"false", surrounding whitespace ignored);
// any other text yields `fallback`. Trimming is done inline so the function
// stands alone.
static bool parse_bool_or(const std::string &value, bool fallback) {
    const auto first = value.find_first_not_of(" \t\r\n");
    if (first == std::string::npos) {
        return fallback;   // empty / all-whitespace input
    }
    const auto last = value.find_last_not_of(" \t\r\n");
    const std::string token = value.substr(first, last - first + 1);
    if (token == "true") {
        return true;
    }
    return token == "false" ? false : fallback;
}
169
+
170
+ static Role parse_role_or(const std::string &value, Role fallback) {
171
+ const std::string normalized = trim_copy(value);
172
+ if (normalized == "admin" || normalized == "ADMIN") return Role::ADMIN;
173
+ if (normalized == "user" || normalized == "USER") return Role::USER;
174
+ return fallback;
175
+ }
176
+
177
+ static std::string role_to_string(Role role) {
178
+ return role == Role::ADMIN ? "admin" : "user";
179
+ }
180
+
181
// Scheduling priority derived from Role.
// NOTE(review): ADMIN=0 appears intended to sort ahead of USER=1 in the
// queue — confirm against the scheduler's comparator.
enum class Priority {
    ADMIN = 0,
    USER = 1
};
185
+
186
+ static Priority role_to_priority(Role role) {
187
+ return role == Role::ADMIN ? Priority::ADMIN : Priority::USER;
188
+ }
189
+
190
// Lifecycle of a queued chat request.
enum class RequestState {
    QUEUED,     // accepted, waiting to be scheduled
    RUNNING,    // currently executing against the worker
    CANCELLED,  // cancelled by the client (see RequestRegistry::cancel_request)
    FAILED,
    DONE
};
197
+
198
+ static std::string state_to_string(RequestState state) {
199
+ switch (state) {
200
+ case RequestState::QUEUED: return "queued";
201
+ case RequestState::RUNNING: return "running";
202
+ case RequestState::CANCELLED: return "cancelled";
203
+ case RequestState::FAILED: return "failed";
204
+ case RequestState::DONE: return "done";
205
+ }
206
+ return "unknown";
207
+ }
208
+
209
// Rough token accounting for one chat request (filled by estimate_chat_tokens).
struct TokenEstimate {
    int prompt_tokens = 0;
    int requested_max_tokens = 0;
    int estimated_total_tokens = 0;   // prompt_tokens + requested_max_tokens
};

// Outcome of a rate-limit check (see RateLimiterStore::allow).
struct RateLimitDecision {
    bool allowed = true;
    int retry_after_sec = 0;   // retry hint when !allowed
    std::string error;
};

// Final result stored on a RequestContext when it completes.
struct RequestResult {
    int status = 500;
    std::string body;
    std::string content_type = "application/json";
};

// Shared state for one in-flight request, handed around as a shared_ptr
// (see RequestRegistry). `mu`/`cv`/`completed` implement the wait-for-result
// handshake; `state` and `cancelled` are atomics so they can be flipped from
// another thread without taking `mu`.
struct RequestContext {
    std::string request_id;
    std::string api_key_id;
    Role role = Role::USER;
    Priority priority = Priority::USER;
    TokenEstimate estimate;
    std::string request_body;
    std::atomic<RequestState> state{RequestState::QUEUED};
    std::atomic<bool> cancelled{false};
    std::chrono::steady_clock::time_point created_at{std::chrono::steady_clock::now()};
    std::chrono::steady_clock::time_point enqueue_time{created_at};
    std::chrono::steady_clock::time_point start_time{};
    std::mutex mu;                 // guards `completed` and `result`
    std::condition_variable cv;    // signalled when `completed` flips true
    bool completed = false;
    RequestResult result;          // meaningful only once completed == true
};
244
+
245
// Crude token estimate: roughly 4 characters per token, rounded up, with a
// floor of 1 for any non-empty text. Empty text estimates to 0.
static int estimate_text_tokens_rough(const std::string &text) {
    if (text.empty()) {
        return 0;
    }
    const int approx = static_cast<int>((text.size() + 3) / 4);
    return approx < 1 ? 1 : approx;
}
249
+
250
+ static std::string flatten_json_content(const json &content) {
251
+ if (content.is_string()) {
252
+ return content.get<std::string>();
253
+ }
254
+ if (content.is_array()) {
255
+ std::ostringstream oss;
256
+ bool first = true;
257
+ for (const auto &item : content) {
258
+ std::string part;
259
+ if (item.is_string()) {
260
+ part = item.get<std::string>();
261
+ } else if (item.is_object() && item.contains("text") && item["text"].is_string()) {
262
+ part = item["text"].get<std::string>();
263
+ }
264
+ if (part.empty()) continue;
265
+ if (!first) oss << '\n';
266
+ oss << part;
267
+ first = false;
268
+ }
269
+ return oss.str();
270
+ }
271
+ return "";
272
+ }
273
+
274
// Validate a chat-completions payload and compute a rough token estimate.
// On success returns the estimate with `error` cleared; on any validation
// failure returns nullopt with a human-readable message in `error`.
// Checks, in order: payload is an object, has a 'messages' array,
// 'max_tokens' (if present) is a positive integer, and the estimated total
// does not exceed limits.max_tokens_per_request.
static std::optional<TokenEstimate> estimate_chat_tokens(
    const json &payload,
    const LimitsConfig &limits,
    std::string &error) {
    if (!payload.is_object()) {
        error = "Expected JSON object";
        return std::nullopt;
    }
    if (!payload.contains("messages") || !payload["messages"].is_array()) {
        error = "Expected 'messages' array";
        return std::nullopt;
    }

    TokenEstimate estimate;
    // Absent max_tokens falls back to the configured default.
    estimate.requested_max_tokens = limits.default_max_tokens;
    if (payload.contains("max_tokens")) {
        if (!payload["max_tokens"].is_number_integer()) {
            error = "Expected integer 'max_tokens'";
            return std::nullopt;
        }
        estimate.requested_max_tokens = payload["max_tokens"].get<int>();
    }

    if (estimate.requested_max_tokens <= 0) {
        error = "'max_tokens' must be > 0";
        return std::nullopt;
    }

    // Sum rough estimates over role + content of every message; the flat +4
    // per message approximates chat-format framing overhead.
    for (const auto &message : payload["messages"]) {
        if (!message.is_object()) continue;
        if (message.contains("role") && message["role"].is_string()) {
            estimate.prompt_tokens += estimate_text_tokens_rough(message["role"].get<std::string>());
        }
        if (message.contains("content")) {
            estimate.prompt_tokens += estimate_text_tokens_rough(
                flatten_json_content(message["content"]));
        }
        estimate.prompt_tokens += 4;
    }

    estimate.estimated_total_tokens = estimate.prompt_tokens + estimate.requested_max_tokens;
    if (estimate.estimated_total_tokens > limits.max_tokens_per_request) {
        error = "Estimated request tokens exceed configured limit";
        return std::nullopt;
    }

    error.clear();
    return estimate;
}
323
+
324
// Minimal TOML reader: returns section -> (key -> value) for flat
// `key = value` files. '#' starts a comment anywhere on a line — including
// inside quoted values — so this parser is intentionally simple. An
// unreadable file yields an empty map. Trimming/unquoting are done with
// local lambdas so the function is self-contained.
static std::unordered_map<std::string, std::unordered_map<std::string, std::string>>
parse_simple_toml(const std::string &path) {
    const auto trim = [](const std::string &s) -> std::string {
        const auto b = s.find_first_not_of(" \t\r\n");
        if (b == std::string::npos) return "";
        const auto e = s.find_last_not_of(" \t\r\n");
        return s.substr(b, e - b + 1);
    };
    const auto unquote = [](const std::string &s) -> std::string {
        if (s.size() >= 2 &&
            ((s.front() == '"' && s.back() == '"') ||
             (s.front() == '\'' && s.back() == '\''))) {
            return s.substr(1, s.size() - 2);
        }
        return s;
    };

    std::unordered_map<std::string, std::unordered_map<std::string, std::string>> result;
    std::ifstream file(path);
    if (!file.is_open()) return result;

    std::string section;
    for (std::string raw; std::getline(file, raw);) {
        const auto comment = raw.find('#');
        if (comment != std::string::npos) raw.erase(comment);
        const std::string text = trim(raw);
        if (text.empty()) continue;

        // [section] header
        if (text.front() == '[' && text.back() == ']') {
            section = trim(text.substr(1, text.size() - 2));
            continue;
        }

        // key = value (lines without '=' are silently skipped)
        const auto eq = text.find('=');
        if (eq == std::string::npos) continue;
        result[section][trim(text.substr(0, eq))] = unquote(trim(text.substr(eq + 1)));
    }
    return result;
}
352
+
353
// Second pass over the config file that extracts [[api_keys]] array-of-table
// entries, which parse_simple_toml cannot represent. Entries missing key_id
// or secret are dropped. Returns an empty list if the file cannot be read.
static std::vector<ApiKeyRecord> parse_api_keys_toml(const std::string &path) {
    std::vector<ApiKeyRecord> keys;
    std::ifstream input(path);
    if (!input.is_open()) return keys;

    std::string line;
    bool in_api_key = false;    // true while inside a [[api_keys]] table
    ApiKeyRecord current;
    bool has_any_field = false; // current entry has seen at least one key = value

    // Commit `current` if complete, then reset for the next entry.
    auto flush_current = [&]() {
        if (has_any_field && !current.key_id.empty() && !current.secret.empty()) {
            keys.push_back(current);
        }
        current = ApiKeyRecord{};
        has_any_field = false;
    };

    while (std::getline(input, line)) {
        // Strip comments ('#' to end of line; note this also fires inside
        // quoted values — same limitation as parse_simple_toml).
        auto hash = line.find('#');
        if (hash != std::string::npos) line = line.substr(0, hash);
        line = trim_copy(line);
        if (line.empty()) continue;

        if (line == "[[api_keys]]") {
            flush_current();
            in_api_key = true;
            continue;
        }

        if (!in_api_key) continue;

        // Any other section header terminates the current [[api_keys]] run.
        if (line.front() == '[' && line.back() == ']') {
            flush_current();
            in_api_key = false;
            continue;
        }

        const auto eq = line.find('=');
        if (eq == std::string::npos) continue;

        std::string key = trim_copy(line.substr(0, eq));
        std::string value = strip_quotes(trim_copy(line.substr(eq + 1)));
        has_any_field = true;

        // Unknown keys are ignored; role/enabled keep their current value on
        // unparseable input.
        if (key == "key_id") current.key_id = value;
        else if (key == "secret") current.secret = value;
        else if (key == "role") current.role = parse_role_or(value, current.role);
        else if (key == "enabled") current.enabled = parse_bool_or(value, current.enabled);
    }

    // Flush the trailing entry at EOF.
    flush_current();
    return keys;
}
407
+
408
// Look up data[section][key]; returns `fallback` when the section or key is
// absent, or when the stored value is an empty string.
static std::string get_toml_string_or(
    const std::unordered_map<std::string, std::unordered_map<std::string, std::string>> &data,
    const std::string &section,
    const std::string &key,
    const std::string &fallback) {
    const auto section_it = data.find(section);
    if (section_it == data.end()) return fallback;
    const auto &values = section_it->second;
    const auto value_it = values.find(key);
    const bool missing = (value_it == values.end()) || value_it->second.empty();
    return missing ? fallback : value_it->second;
}
419
+
420
// Integer variant of get_toml_string_or: returns `fallback` when the entry
// is missing, empty, or rejected by std::stoi. Note std::stoi accepts a
// numeric prefix ("12abc" parses as 12).
static int get_toml_int_or(
    const std::unordered_map<std::string, std::unordered_map<std::string, std::string>> &data,
    const std::string &section,
    const std::string &key,
    int fallback) {
    const auto section_it = data.find(section);
    if (section_it == data.end()) return fallback;
    const auto &values = section_it->second;
    const auto value_it = values.find(key);
    if (value_it == values.end() || value_it->second.empty()) return fallback;
    try {
        return std::stoi(value_it->second);
    } catch (...) {
        return fallback;   // non-numeric or out-of-range value
    }
}
435
+
436
// Build the effective ManagerConfig. Precedence per setting:
//   environment variable > config-file value > compiled-in default.
// The file path comes from MANAGER_CONFIG (default "config.toml"); a missing
// file is not an error — env vars and defaults still apply.
static ManagerConfig load_manager_config() {
    ManagerConfig cfg;

    const std::string config_path = get_env_or("MANAGER_CONFIG", "config.toml");
    std::unordered_map<std::string, std::unordered_map<std::string, std::string>> toml;
    if (std::filesystem::exists(config_path)) {
        toml = parse_simple_toml(config_path);
        log_line("config: loaded " + config_path);
    } else {
        log_line("config: using environment/defaults (file not found: " + config_path + ")");
    }

    // [server]
    cfg.server.host = get_env_or(
        "MANAGER_HOST",
        get_toml_string_or(toml, "server", "host", cfg.server.host));
    cfg.server.port = get_env_int_or(
        "MANAGER_PORT",
        get_toml_int_or(toml, "server", "port", cfg.server.port));

    // [worker]
    cfg.worker.default_model = get_env_or(
        "DEFAULT_MODEL",
        get_toml_string_or(toml, "worker", "default_model", cfg.worker.default_model));
    cfg.worker.llama_server_bin = get_env_or(
        "LLAMA_SERVER_BIN",
        get_toml_string_or(toml, "worker", "llama_server_bin", cfg.worker.llama_server_bin));
    cfg.worker.host = get_env_or(
        "WORKER_HOST",
        get_toml_string_or(toml, "worker", "host", cfg.worker.host));
    cfg.worker.bind_host = get_env_or(
        "WORKER_BIND_HOST",
        get_toml_string_or(toml, "worker", "bind_host", cfg.worker.bind_host));
    cfg.worker.base_port = get_env_int_or(
        "WORKER_BASE_PORT",
        get_toml_int_or(toml, "worker", "base_port", cfg.worker.base_port));
    cfg.worker.switch_timeout_sec = get_env_int_or(
        "SWITCH_TIMEOUT_SEC",
        get_toml_int_or(toml, "worker", "switch_timeout_sec", cfg.worker.switch_timeout_sec));

    // [llama]
    cfg.llama.n_ctx = get_env_int_or(
        "MODEL_N_CTX",
        get_toml_int_or(toml, "llama", "n_ctx", cfg.llama.n_ctx));
    cfg.llama.threads = get_env_int_or(
        "MODEL_THREADS",
        get_toml_int_or(toml, "llama", "threads", cfg.llama.threads));
    cfg.llama.ngl = get_env_int_or(
        "MODEL_NGL",
        get_toml_int_or(toml, "llama", "ngl", cfg.llama.ngl));
    cfg.llama.batch = get_env_int_or(
        "MODEL_BATCH",
        get_toml_int_or(toml, "llama", "batch", cfg.llama.batch));
    cfg.llama.ubatch = get_env_int_or(
        "MODEL_UBATCH",
        get_toml_int_or(toml, "llama", "ubatch", cfg.llama.ubatch));

    // [auth]
    cfg.auth.header = get_env_or(
        "AUTH_HEADER",
        get_toml_string_or(toml, "auth", "header", cfg.auth.header));
    cfg.auth.scheme = get_env_or(
        "AUTH_SCHEME",
        get_toml_string_or(toml, "auth", "scheme", cfg.auth.scheme));

    // [limits]
    cfg.limits.default_max_tokens = get_env_int_or(
        "DEFAULT_MAX_TOKENS",
        get_toml_int_or(toml, "limits", "default_max_tokens", cfg.limits.default_max_tokens));
    cfg.limits.max_tokens_per_request = get_env_int_or(
        "MAX_TOKENS_PER_REQUEST",
        get_toml_int_or(toml, "limits", "max_tokens_per_request", cfg.limits.max_tokens_per_request));
    cfg.limits.request_timeout_sec = get_env_int_or(
        "REQUEST_TIMEOUT_SEC",
        get_toml_int_or(toml, "limits", "request_timeout_sec", cfg.limits.request_timeout_sec));

    // [queue] — max_size is clamped to at least 1.
    cfg.queue.max_size = static_cast<size_t>(std::max(
        1,
        get_env_int_or("QUEUE_MAX_SIZE", get_toml_int_or(toml, "queue", "max_size", static_cast<int>(cfg.queue.max_size)))));
    cfg.queue.max_tokens = get_env_int_or(
        "QUEUE_MAX_TOKENS",
        get_toml_int_or(toml, "queue", "max_tokens", cfg.queue.max_tokens));
    cfg.queue.admin_quota = get_env_int_or(
        "QUEUE_ADMIN_QUOTA",
        get_toml_int_or(toml, "queue", "admin_quota", cfg.queue.admin_quota));
    cfg.queue.retry_after_sec = get_env_int_or(
        "QUEUE_RETRY_AFTER_SEC",
        get_toml_int_or(toml, "queue", "retry_after_sec", cfg.queue.retry_after_sec));

    // [rate_limit]
    cfg.rate_limit.requests_per_minute = get_env_int_or(
        "REQUESTS_PER_MINUTE",
        get_toml_int_or(toml, "rate_limit", "requests_per_minute", cfg.rate_limit.requests_per_minute));
    cfg.rate_limit.estimated_tokens_per_minute = get_env_int_or(
        "ESTIMATED_TOKENS_PER_MINUTE",
        get_toml_int_or(toml, "rate_limit", "estimated_tokens_per_minute", cfg.rate_limit.estimated_tokens_per_minute));

    // [scheduler]
    cfg.scheduler.max_concurrent = get_env_int_or(
        "SCHEDULER_MAX_CONCURRENT",
        get_toml_int_or(toml, "scheduler", "max_concurrent", cfg.scheduler.max_concurrent));

    // [[api_keys]] entries require a second, dedicated parse pass.
    if (!config_path.empty() && std::filesystem::exists(config_path)) {
        cfg.api_keys = parse_api_keys_toml(config_path);
    }

    return cfg;
}
537
+
538
  static bool is_alive(pid_t pid) {
539
  if (pid <= 0) return false;
540
  return kill(pid, 0) == 0;
 
557
 
558
  class ModelManager {
559
  public:
560
+ explicit ModelManager(const ManagerConfig &config)
561
+ : _default_model(config.worker.default_model),
562
+ _llama_server_bin(config.worker.llama_server_bin),
563
+ _worker_host(config.worker.host),
564
+ _worker_bind_host(config.worker.bind_host),
565
+ _base_port(config.worker.base_port),
566
+ _switch_timeout_sec(config.worker.switch_timeout_sec),
567
+ _n_ctx(config.llama.n_ctx),
568
+ _n_threads(config.llama.threads),
569
+ _n_gpu_layers(config.llama.ngl),
570
+ _n_batch(config.llama.batch),
571
+ _n_ubatch(config.llama.ubatch),
572
  _next_port(_base_port) {}
573
 
574
  bool initialize_default(std::string &error) {
 
795
 
796
  static std::atomic<uint64_t> g_req_id{1};
797
 
798
// Per-API-key token-bucket rate limiter over two dimensions: request count
// and estimated token volume, each refilled continuously at the configured
// per-minute rate. A limit <= 0 disables that dimension.
class RateLimiterStore {
public:
    explicit RateLimiterStore(const RateLimitConfig &config)
        : _requests_per_minute(std::max(0, config.requests_per_minute)),
          _estimated_tokens_per_minute(std::max(0, config.estimated_tokens_per_minute)) {}

    // Check-and-consume: on success debits the key's buckets and returns an
    // allowed decision; on denial returns a fixed 1-second retry hint.
    // Thread-safe (all bucket state is accessed under _mu).
    RateLimitDecision allow(const std::string &api_key_id, int estimated_tokens) {
        if (_requests_per_minute <= 0 && _estimated_tokens_per_minute <= 0) {
            // Limiting fully disabled: default RateLimitDecision is allowed.
            return {};
        }

        std::lock_guard<std::mutex> lock(_mu);
        auto &bucket = _buckets[api_key_id];   // creates an empty bucket on first use
        const auto now = std::chrono::steady_clock::now();
        refill(bucket.request_tokens, bucket.last_request_refill, _requests_per_minute, now);
        refill(bucket.estimated_tokens, bucket.last_estimated_refill, _estimated_tokens_per_minute, now);

        if (_requests_per_minute > 0 && bucket.request_tokens < 1.0) {
            return {false, 1, "Rate limit exceeded: requests"};
        }
        if (_estimated_tokens_per_minute > 0 && bucket.estimated_tokens < estimated_tokens) {
            return {false, 1, "Rate limit exceeded: estimated tokens"};
        }

        // Both checks passed: debit only the enabled dimensions.
        if (_requests_per_minute > 0) bucket.request_tokens -= 1.0;
        if (_estimated_tokens_per_minute > 0) bucket.estimated_tokens -= estimated_tokens;
        return {};
    }

private:
    // One pair of buckets per api_key_id.
    struct Bucket {
        double request_tokens = 0.0;
        double estimated_tokens = 0.0;
        std::chrono::steady_clock::time_point last_request_refill{};
        std::chrono::steady_clock::time_point last_estimated_refill{};
    };

    std::mutex _mu;   // guards _buckets and all Bucket fields
    std::unordered_map<std::string, Bucket> _buckets;
    int _requests_per_minute;
    int _estimated_tokens_per_minute;

    // Top up `tokens` proportionally to elapsed time, capped at the
    // per-minute limit. An epoch-zero `last_refill` marks a never-refilled
    // bucket, which starts full.
    static void refill(
        double &tokens,
        std::chrono::steady_clock::time_point &last_refill,
        int limit_per_minute,
        std::chrono::steady_clock::time_point now) {
        if (limit_per_minute <= 0) return;
        if (last_refill.time_since_epoch().count() == 0) {
            tokens = limit_per_minute;
            last_refill = now;
            return;
        }
        const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - last_refill).count();
        if (elapsed <= 0) return;
        const double refill_amount = (static_cast<double>(limit_per_minute) * elapsed) / 60000.0;
        tokens = std::min(static_cast<double>(limit_per_minute), tokens + refill_amount);
        last_refill = now;
    }
};
858
+
859
+ class RequestRegistry {
860
+ public:
861
+ std::shared_ptr<RequestContext> create(
862
+ const std::string &request_id,
863
+ const ApiKeyRecord &principal,
864
+ const TokenEstimate &estimate,
865
+ const std::string &request_body) {
866
+ auto ctx = std::make_shared<RequestContext>();
867
+ ctx->request_id = request_id;
868
+ ctx->api_key_id = principal.key_id;
869
+ ctx->role = principal.role;
870
+ ctx->priority = role_to_priority(principal.role);
871
+ ctx->estimate = estimate;
872
+ ctx->request_body = request_body;
873
+ ctx->created_at = std::chrono::steady_clock::now();
874
+ ctx->enqueue_time = ctx->created_at;
875
+
876
+ std::lock_guard<std::mutex> lock(_mu);
877
+ _requests[request_id] = ctx;
878
+ return ctx;
879
+ }
880
+
881
+ std::shared_ptr<RequestContext> find(const std::string &request_id) const {
882
+ std::lock_guard<std::mutex> lock(_mu);
883
+ const auto it = _requests.find(request_id);
884
+ if (it == _requests.end()) return nullptr;
885
+ return it->second;
886
+ }
887
+
888
+ void mark_state(const std::string &request_id, RequestState state) {
889
+ auto ctx = find(request_id);
890
+ if (ctx) ctx->state.store(state);
891
+ }
892
+
893
+ void complete(const std::shared_ptr<RequestContext> &ctx, RequestState state, RequestResult result) {
894
+ {
895
+ std::lock_guard<std::mutex> lock(ctx->mu);
896
+ ctx->state.store(state);
897
+ ctx->result = std::move(result);
898
+ ctx->completed = true;
899
+ }
900
+ ctx->cv.notify_all();
901
+ }
902
+
903
+ std::shared_ptr<RequestContext> cancel_request(const std::string &request_id) {
904
+ auto ctx = find(request_id);
905
+ if (!ctx) return nullptr;
906
+
907
+ ctx->cancelled.store(true);
908
+ const auto state = ctx->state.load();
909
+ if (state == RequestState::QUEUED) {
910
+ complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
911
+ } else if (state == RequestState::RUNNING) {
912
+ ctx->state.store(RequestState::CANCELLED);
913
+ }
914
+ return ctx;
915
+ }
916
+
917
+ std::vector<std::shared_ptr<RequestContext>> cancel_all() {
918
+ std::vector<std::shared_ptr<RequestContext>> out;
919
+ std::lock_guard<std::mutex> lock(_mu);
920
+ out.reserve(_requests.size());
921
+ for (auto &[_, ctx] : _requests) {
922
+ ctx->cancelled.store(true);
923
+ const auto state = ctx->state.load();
924
+ if (state == RequestState::QUEUED) {
925
+ {
926
+ std::lock_guard<std::mutex> ctx_lock(ctx->mu);
927
+ ctx->state.store(RequestState::CANCELLED);
928
+ ctx->result = {499, R"({"error":"Request cancelled"})"};
929
+ ctx->completed = true;
930
+ }
931
+ ctx->cv.notify_all();
932
+ } else if (state == RequestState::RUNNING) {
933
+ ctx->state.store(RequestState::CANCELLED);
934
+ }
935
+ out.push_back(ctx);
936
+ }
937
+ return out;
938
+ }
939
+
940
+ private:
941
+ mutable std::mutex _mu;
942
+ std::unordered_map<std::string, std::shared_ptr<RequestContext>> _requests;
943
+ };
944
+
945
+ class PrioritySchedulerQueue {
946
+ public:
947
+ explicit PrioritySchedulerQueue(const QueueConfig &config)
948
+ : _max_size(config.max_size),
949
+ _max_tokens(config.max_tokens),
950
+ _admin_quota(std::max(1, config.admin_quota)),
951
+ _retry_after_sec(std::max(1, config.retry_after_sec)) {}
952
+
953
+ bool try_push(const std::shared_ptr<RequestContext> &ctx) {
954
+ std::lock_guard<std::mutex> lock(_mu);
955
+ if (_current_size >= _max_size) return false;
956
+ if (_current_tokens + ctx->estimate.estimated_total_tokens > _max_tokens) return false;
957
+
958
+ if (ctx->priority == Priority::ADMIN) {
959
+ _admin_queue.push_back(ctx);
960
+ } else {
961
+ _user_queue.push_back(ctx);
962
+ }
963
+ ++_current_size;
964
+ _current_tokens += ctx->estimate.estimated_total_tokens;
965
+ _cv.notify_one();
966
+ return true;
967
+ }
968
+
969
+ std::shared_ptr<RequestContext> pop_next() {
970
+ std::unique_lock<std::mutex> lock(_mu);
971
+ _cv.wait(lock, [&]() { return _stopped || _current_size > 0; });
972
+ if (_stopped) return nullptr;
973
+
974
+ std::deque<std::shared_ptr<RequestContext>> *selected_queue = nullptr;
975
+ if (!_admin_queue.empty() && (_admin_streak < _admin_quota || _user_queue.empty())) {
976
+ selected_queue = &_admin_queue;
977
+ ++_admin_streak;
978
+ } else if (!_user_queue.empty()) {
979
+ selected_queue = &_user_queue;
980
+ _admin_streak = 0;
981
+ } else if (!_admin_queue.empty()) {
982
+ selected_queue = &_admin_queue;
983
+ _admin_streak = 1;
984
+ }
985
+
986
+ if (!selected_queue || selected_queue->empty()) return nullptr;
987
+
988
+ auto best_it = std::min_element(
989
+ selected_queue->begin(),
990
+ selected_queue->end(),
991
+ [](const auto &a, const auto &b) {
992
+ return a->estimate.estimated_total_tokens < b->estimate.estimated_total_tokens;
993
+ });
994
+ auto ctx = *best_it;
995
+ selected_queue->erase(best_it);
996
+ --_current_size;
997
+ _current_tokens -= ctx->estimate.estimated_total_tokens;
998
+ return ctx;
999
+ }
1000
+
1001
+ void stop() {
1002
+ std::lock_guard<std::mutex> lock(_mu);
1003
+ _stopped = true;
1004
+ _cv.notify_all();
1005
+ }
1006
+
1007
+ int retry_after_sec() const {
1008
+ return _retry_after_sec;
1009
+ }
1010
+
1011
+ private:
1012
+ mutable std::mutex _mu;
1013
+ std::condition_variable _cv;
1014
+ std::deque<std::shared_ptr<RequestContext>> _admin_queue;
1015
+ std::deque<std::shared_ptr<RequestContext>> _user_queue;
1016
+ size_t _max_size;
1017
+ size_t _current_size = 0;
1018
+ int _max_tokens;
1019
+ int _current_tokens = 0;
1020
+ int _admin_quota;
1021
+ int _admin_streak = 0;
1022
+ int _retry_after_sec;
1023
+ bool _stopped = false;
1024
+ };
1025
+
1026
+ class Scheduler {
1027
+ public:
1028
+ Scheduler(ModelManager &manager, RequestRegistry &registry, const QueueConfig &queue_config)
1029
+ : _manager(manager), _registry(registry), _queue(queue_config) {
1030
+ _worker = std::thread([this]() { worker_loop(); });
1031
+ }
1032
+
1033
+ ~Scheduler() {
1034
+ _queue.stop();
1035
+ if (_worker.joinable()) _worker.join();
1036
+ }
1037
+
1038
+ bool try_enqueue(const std::shared_ptr<RequestContext> &ctx) {
1039
+ return _queue.try_push(ctx);
1040
+ }
1041
+
1042
+ int retry_after_sec() const {
1043
+ return _queue.retry_after_sec();
1044
+ }
1045
+
1046
+ private:
1047
+ ModelManager &_manager;
1048
+ RequestRegistry &_registry;
1049
+ PrioritySchedulerQueue _queue;
1050
+ std::thread _worker;
1051
+
1052
+ void worker_loop() {
1053
+ for (;;) {
1054
+ auto ctx = _queue.pop_next();
1055
+ if (!ctx) return;
1056
+
1057
+ if (ctx->cancelled.load()) {
1058
+ _registry.complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
1059
+ continue;
1060
+ }
1061
+
1062
+ ctx->state.store(RequestState::RUNNING);
1063
+ ctx->start_time = std::chrono::steady_clock::now();
1064
+ auto worker = _manager.active_worker();
1065
+ if (!worker) {
1066
+ _registry.complete(ctx, RequestState::FAILED, {503, R"({"error":"No active model"})"});
1067
+ continue;
1068
+ }
1069
+
1070
+ try {
1071
+ auto [status, body] = forward_chat(*worker, ctx->request_body);
1072
+ if (ctx->cancelled.load()) {
1073
+ _registry.complete(ctx, RequestState::CANCELLED, {499, R"({"error":"Request cancelled"})"});
1074
+ continue;
1075
+ }
1076
+ _registry.complete(ctx, RequestState::DONE, {status, body});
1077
+ } catch (const std::exception &e) {
1078
+ _registry.complete(
1079
+ ctx,
1080
+ RequestState::FAILED,
1081
+ {500, json({{"error", e.what()}}).dump()});
1082
+ }
1083
+ }
1084
+ }
1085
+ };
1086
+
1087
+ class ApiKeyAuth {
1088
+ public:
1089
+ explicit ApiKeyAuth(const ManagerConfig &config)
1090
+ : _header_name(config.auth.header), _scheme(config.auth.scheme) {
1091
+ for (const auto &record : config.api_keys) {
1092
+ _records_by_secret.emplace(record.secret, record);
1093
+ }
1094
+ }
1095
+
1096
+ bool enabled() const {
1097
+ return !_records_by_secret.empty();
1098
+ }
1099
+
1100
+ template <typename Body, typename Allocator>
1101
+ std::optional<ApiKeyRecord> authenticate(
1102
+ const http::request<Body, http::basic_fields<Allocator>> &req,
1103
+ std::string &error) const {
1104
+ if (!enabled()) {
1105
+ error.clear();
1106
+ return ApiKeyRecord{"anonymous", "", Role::ADMIN, true};
1107
+ }
1108
+
1109
+ const auto token = extract_bearer_token(req, error);
1110
+ if (!token) return std::nullopt;
1111
+
1112
+ const auto it = _records_by_secret.find(*token);
1113
+ if (it == _records_by_secret.end()) {
1114
+ error = "Invalid API key";
1115
+ return std::nullopt;
1116
+ }
1117
+ if (!it->second.enabled) {
1118
+ error = "API key disabled";
1119
+ return std::nullopt;
1120
+ }
1121
+ error.clear();
1122
+ return it->second;
1123
+ }
1124
+
1125
+ private:
1126
+ std::string _header_name;
1127
+ std::string _scheme;
1128
+ std::unordered_map<std::string, ApiKeyRecord> _records_by_secret;
1129
+
1130
+ template <typename Body, typename Allocator>
1131
+ std::optional<std::string> extract_bearer_token(
1132
+ const http::request<Body, http::basic_fields<Allocator>> &req,
1133
+ std::string &error) const {
1134
+ const auto header_it = req.find(_header_name);
1135
+ if (header_it == req.end()) {
1136
+ error = "Missing authorization header";
1137
+ return std::nullopt;
1138
+ }
1139
+
1140
+ const std::string value = trim_copy(header_it->value().to_string());
1141
+ const std::string prefix = _scheme + " ";
1142
+ if (value.size() <= prefix.size() || value.rfind(prefix, 0) != 0) {
1143
+ error = "Invalid authorization scheme";
1144
+ return std::nullopt;
1145
+ }
1146
+
1147
+ std::string token = trim_copy(value.substr(prefix.size()));
1148
+ if (token.empty()) {
1149
+ error = "Missing API key";
1150
+ return std::nullopt;
1151
+ }
1152
+ return token;
1153
+ }
1154
+ };
1155
+
1156
  static void log_line(const std::string &line) {
1157
  std::cout << "[" << now_utc_iso() << "] " << line << std::endl;
1158
  }
 
1162
  return body.substr(0, max_len) + "...[truncated]";
1163
  }
1164
 
1165
+ static std::optional<std::string> extract_cancel_request_id(const std::string &path) {
1166
+ const std::string prefix = "/requests/";
1167
+ const std::string suffix = "/cancel";
1168
+ if (path.size() <= prefix.size() + suffix.size()) return std::nullopt;
1169
+ if (path.rfind(prefix, 0) != 0) return std::nullopt;
1170
+ if (path.substr(path.size() - suffix.size()) != suffix) return std::nullopt;
1171
+ const std::string request_id = path.substr(prefix.size(), path.size() - prefix.size() - suffix.size());
1172
+ if (request_id.empty()) return std::nullopt;
1173
+ return request_id;
1174
+ }
1175
+
1176
  static std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body) {
1177
  asio::io_context ioc;
1178
  asio::ip::tcp::resolver resolver(ioc);
 
1237
  template <typename Body, typename Allocator>
1238
  http::response<http::string_body> handle_request(
1239
  ModelManager &manager,
1240
+ const ManagerConfig &config,
1241
+ const ApiKeyAuth &auth,
1242
+ RateLimiterStore &rate_limiter,
1243
+ RequestRegistry &registry,
1244
+ Scheduler &scheduler,
1245
  http::request<Body, http::basic_fields<Allocator>> &&req) {
1246
  const auto start = std::chrono::steady_clock::now();
1247
+ const auto req_id_num = g_req_id.fetch_add(1);
1248
+ const std::string request_id = std::to_string(req_id_num);
1249
  const std::string target = req.target().to_string();
1250
  const std::string method = req.method_string().to_string();
1251
  const std::string path = target.substr(0, target.find('?'));
1252
+ auto authenticated = std::optional<ApiKeyRecord>{};
1253
 
1254
+ log_line("request_id=" + request_id + " method=" + method + " path=" + target);
1255
  if constexpr (std::is_same_v<Body, http::string_body>) {
1256
  if (!req.body().empty()) {
1257
+ log_line("request_id=" + request_id + " body=" + truncate_body(req.body()));
1258
  }
1259
  }
1260
 
1261
  auto json_response = [&](http::status status, const json &obj) {
1262
+ json payload = obj;
1263
+ payload["request_id"] = request_id;
1264
  http::response<http::string_body> res{status, req.version()};
1265
  res.set(http::field::content_type, "application/json");
1266
  res.set(http::field::server, "llm-manager");
1267
+ res.set("X-Request-Id", request_id);
1268
  res.keep_alive(req.keep_alive());
1269
+ res.body() = payload.dump();
1270
  res.prepare_payload();
1271
  auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
1272
  std::chrono::steady_clock::now() - start).count();
1273
+ log_line("request_id=" + request_id + " status=" + std::to_string(res.result_int()) +
1274
  " elapsed_ms=" + std::to_string(elapsed_ms));
1275
  return res;
1276
  };
1277
 
1278
+ auto json_response_with_retry_after = [&](http::status status, const json &obj, int retry_after_sec) {
1279
+ auto res = json_response(status, obj);
1280
+ res.set(http::field::retry_after, std::to_string(std::max(1, retry_after_sec)));
1281
+ return res;
1282
+ };
1283
+
1284
+ auto ensure_authenticated = [&](Role minimum_role) -> std::optional<http::response<http::string_body>> {
1285
+ std::string auth_error;
1286
+ authenticated = auth.authenticate(req, auth_error);
1287
+ if (!authenticated) {
1288
+ return json_response(http::status::unauthorized, {{"error", auth_error}});
1289
+ }
1290
+ if (minimum_role == Role::ADMIN && authenticated->role != Role::ADMIN) {
1291
+ return json_response(http::status::forbidden, {{"error", "Admin role required"}});
1292
+ }
1293
+ log_line("request_id=" + request_id +
1294
+ " api_key_id=" + authenticated->key_id +
1295
+ " role=" + role_to_string(authenticated->role));
1296
+ return std::nullopt;
1297
+ };
1298
+
1299
  try {
1300
  if (path == "/health" && req.method() == http::verb::get) {
1301
  return json_response(http::status::ok, manager.models_view());
 
1306
  }
1307
 
1308
  if (path == "/switch-model" && req.method() == http::verb::post) {
1309
+ if (auto auth_res = ensure_authenticated(Role::ADMIN)) return *auth_res;
1310
  std::string body(req.body().data(), req.body().size());
1311
  json j = json::parse(body, nullptr, false);
1312
  if (j.is_discarded()) {
 
1331
  }
1332
 
1333
  if (path == "/stop" && req.method() == http::verb::post) {
1334
+ if (auto auth_res = ensure_authenticated(Role::ADMIN)) return *auth_res;
1335
+ registry.cancel_all();
1336
  std::string err;
1337
  bool ok = manager.restart_active(err);
1338
  if (!ok) {
 
1349
  return json_response(http::status::ok, state);
1350
  }
1351
 
1352
+ if (req.method() == http::verb::post) {
1353
+ if (auto cancel_id = extract_cancel_request_id(path)) {
1354
+ if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
1355
+ auto ctx = registry.find(*cancel_id);
1356
+ if (!ctx) {
1357
+ return json_response(http::status::not_found, {{"error", "Unknown request id"}});
1358
+ }
1359
+ if (authenticated->role != Role::ADMIN && authenticated->key_id != ctx->api_key_id) {
1360
+ return json_response(http::status::forbidden, {{"error", "Cannot cancel another API key request"}});
1361
+ }
1362
+
1363
+ const auto previous_state = ctx->state.load();
1364
+ registry.cancel_request(*cancel_id);
1365
+ std::string restart_error;
1366
+ bool restarted = true;
1367
+ if (previous_state == RequestState::RUNNING) {
1368
+ restarted = manager.restart_active(restart_error);
1369
+ }
1370
+
1371
+ json payload = {
1372
+ {"cancelled_request_id", *cancel_id},
1373
+ {"state", state_to_string(ctx->state.load())}
1374
+ };
1375
+ if (!restarted) {
1376
+ payload["restart_error"] = restart_error;
1377
+ }
1378
+ return json_response(http::status::ok, payload);
1379
+ }
1380
+ }
1381
+
1382
  if (path == "/v1/chat/completions" && req.method() == http::verb::post) {
1383
+ if (auto auth_res = ensure_authenticated(Role::USER)) return *auth_res;
1384
+ json payload = json::parse(req.body(), nullptr, false);
1385
+ if (payload.is_discarded()) {
1386
+ return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
1387
+ }
1388
+ std::string token_error;
1389
+ auto estimate = estimate_chat_tokens(payload, config.limits, token_error);
1390
+ if (!estimate) {
1391
+ return json_response(http::status::bad_request, {{"error", token_error}});
1392
+ }
1393
+ log_line("request_id=" + request_id +
1394
+ " prompt_tokens=" + std::to_string(estimate->prompt_tokens) +
1395
+ " max_tokens=" + std::to_string(estimate->requested_max_tokens) +
1396
+ " estimated_total_tokens=" + std::to_string(estimate->estimated_total_tokens));
1397
+
1398
+ auto rate_limit_decision = rate_limiter.allow(
1399
+ authenticated->key_id,
1400
+ estimate->estimated_total_tokens);
1401
+ if (!rate_limit_decision.allowed) {
1402
+ return json_response_with_retry_after(
1403
+ http::status::too_many_requests,
1404
+ {{"error", rate_limit_decision.error}},
1405
+ rate_limit_decision.retry_after_sec);
1406
  }
1407
+
1408
+ auto ctx = registry.create(request_id, *authenticated, *estimate, req.body());
1409
+ if (!scheduler.try_enqueue(ctx)) {
1410
+ ctx->cancelled.store(true);
1411
+ registry.complete(ctx, RequestState::CANCELLED, {503, R"({"error":"Queue full"})"});
1412
+ return json_response_with_retry_after(
1413
+ http::status::service_unavailable,
1414
+ {{"error", "Queue full"}},
1415
+ scheduler.retry_after_sec());
1416
+ }
1417
+
1418
+ std::unique_lock<std::mutex> lock(ctx->mu);
1419
+ const bool finished = ctx->cv.wait_for(
1420
+ lock,
1421
+ std::chrono::seconds(std::max(1, config.limits.request_timeout_sec)),
1422
+ [&]() { return ctx->completed; });
1423
+ if (!finished) {
1424
+ lock.unlock();
1425
+ registry.cancel_request(request_id);
1426
+ std::string restart_error;
1427
+ bool restarted = true;
1428
+ if (ctx->state.load() == RequestState::RUNNING) {
1429
+ restarted = manager.restart_active(restart_error);
1430
+ }
1431
+ json timeout_payload = {
1432
+ {"error", "Request timed out"},
1433
+ {"state", state_to_string(ctx->state.load())}
1434
+ };
1435
+ if (!restarted) timeout_payload["restart_error"] = restart_error;
1436
+ return json_response(http::status::gateway_timeout, timeout_payload);
1437
+ }
1438
+
1439
+ const auto final_state = ctx->state.load();
1440
+ RequestResult result = ctx->result;
1441
+ lock.unlock();
1442
+
1443
+ if (final_state == RequestState::CANCELLED) {
1444
+ return json_response(http::status::ok, {{"status", "cancelled"}});
1445
+ }
1446
+
1447
+ http::response<http::string_body> res{
1448
+ static_cast<http::status>(result.status), req.version()};
1449
+ res.set(http::field::content_type, result.content_type);
1450
  res.set(http::field::server, "llm-manager");
1451
+ res.set("X-Request-Id", request_id);
1452
  res.keep_alive(req.keep_alive());
1453
+ res.body() = result.body;
1454
  res.prepare_payload();
1455
  auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
1456
  std::chrono::steady_clock::now() - start).count();
1457
+ log_line("request_id=" + request_id +
1458
+ " final_state=" + state_to_string(final_state) +
1459
+ " upstream_status=" + std::to_string(result.status) +
 
1460
  " elapsed_ms=" + std::to_string(elapsed_ms));
1461
  return res;
1462
  }
 
1476
  res.set(http::field::content_encoding, upstream.content_encoding);
1477
  }
1478
  res.set(http::field::server, "llm-manager");
1479
+ res.set("X-Request-Id", request_id);
1480
  res.keep_alive(req.keep_alive());
1481
  res.body() = upstream.body;
1482
  res.prepare_payload();
1483
  auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
1484
  std::chrono::steady_clock::now() - start)
1485
  .count();
1486
+ log_line("request_id=" + request_id +
1487
  " proxied_get model=" + worker->model +
1488
  " upstream_status=" + std::to_string(upstream.status) +
1489
  " elapsed_ms=" + std::to_string(elapsed_ms));
 
1496
  }
1497
  }
1498
 
1499
+ void do_session(
1500
+ asio::ip::tcp::socket socket,
1501
+ ModelManager &manager,
1502
+ const ManagerConfig &config,
1503
+ const ApiKeyAuth &auth,
1504
+ RateLimiterStore &rate_limiter,
1505
+ RequestRegistry &registry,
1506
+ Scheduler &scheduler) {
1507
  try {
1508
  beast::flat_buffer buffer;
1509
  http::request<http::string_body> req;
1510
  http::read(socket, buffer, req);
1511
+ auto res = handle_request(manager, config, auth, rate_limiter, registry, scheduler, std::move(req));
1512
  http::write(socket, res);
1513
  beast::error_code ec;
1514
  socket.shutdown(asio::ip::tcp::socket::shutdown_send, ec);
 
1517
  }
1518
 
1519
  int main() {
1520
+ const ManagerConfig config = load_manager_config();
1521
+ const auto &bind_host = config.server.host;
1522
+ const int bind_port = config.server.port;
1523
+ ModelManager manager(config);
1524
+ ApiKeyAuth auth(config);
1525
+ RateLimiterStore rate_limiter(config.rate_limit);
1526
+ RequestRegistry registry;
1527
+ Scheduler scheduler(manager, registry, config.queue);
1528
 
1529
  std::string init_error;
1530
+ if (auth.enabled()) {
1531
+ log_line("auth: enabled api_keys=" + std::to_string(config.api_keys.size()));
1532
+ } else {
1533
+ log_line("auth: disabled (no configured api keys)");
1534
+ }
1535
  log_line("startup: loading default model");
1536
  if (!manager.initialize_default(init_error)) {
1537
  log_line("startup: default model failed: " + init_error);
 
1546
  for (;;) {
1547
  asio::ip::tcp::socket socket{ioc};
1548
  acceptor.accept(socket);
1549
+ std::thread(
1550
+ &do_session,
1551
+ std::move(socket),
1552
+ std::ref(manager),
1553
+ std::cref(config),
1554
+ std::cref(auth),
1555
+ std::ref(rate_limiter),
1556
+ std::ref(registry),
1557
+ std::ref(scheduler)).detach();
1558
  }
1559
  }