Dmitry Beresnev committed on
Commit
fc0860f
·
1 Parent(s): f41621b

add cpp server

Browse files
Files changed (3) hide show
  1. .clang-format +4 -0
  2. Dockerfile +26 -2
  3. cpp/llm_manager.cpp +432 -0
.clang-format ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ BasedOnStyle: Google
2
+ Standard: Cpp17
3
+ IndentWidth: 2
4
+ ColumnLimit: 100
Dockerfile CHANGED
@@ -9,6 +9,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
9
  ca-certificates \
10
  libcurl4-openssl-dev \
11
  libssl-dev \
 
 
12
  pkg-config \
13
  && rm -rf /var/lib/apt/lists/*
14
 
@@ -38,6 +40,11 @@ RUN git clone --depth 1 --branch ${LLAMA_CPP_REF} https://github.com/ggerganov/l
38
  echo "=== Binary dependencies ===" && \
39
  ldd build/bin/llama-server || true
40
 
 
 
 
 
 
41
  # Runtime stage
42
  FROM debian:bookworm-slim
43
 
@@ -53,6 +60,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
53
  # Copy llama-server binary and all shared libraries from builder
54
  COPY --from=builder /build/llama.cpp/build/bin/llama-server /usr/local/bin/llama-server
55
  COPY --from=builder /build/llama.cpp/build/bin/*.so.* /usr/local/lib/
 
56
 
57
  # Update library cache
58
  RUN ldconfig
@@ -99,9 +107,25 @@ EXPOSE 7860
99
  #
100
  # Example Mixtral-8x7B-Instruct (known loader incompatibilities on newer llama.cpp + too large for 10GB RAM):
101
  # CMD ["llama-server", "-hf", "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF:mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf", "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "64", "--ubatch-size", "32"]
 
 
102
  #
103
- # Active: QuantFactory Qwen2.5-7B-Instruct Q4_K_M (8k target)
104
- CMD ["llama-server", "-hf", "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m", "--host", "0.0.0.0", "--port", "7860", "-c", "8192", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "128", "--ubatch-size", "64"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  #
106
  # Example Qwen2.5-Coder 7B Instruct (32k context):
107
  # CMD ["llama-server", "-hf", "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Qwen2.5-Coder-7B-Instruct.Q4_K_M.gguf",
 
9
  ca-certificates \
10
  libcurl4-openssl-dev \
11
  libssl-dev \
12
+ libboost-dev \
13
+ nlohmann-json3-dev \
14
  pkg-config \
15
  && rm -rf /var/lib/apt/lists/*
16
 
 
40
  echo "=== Binary dependencies ===" && \
41
  ldd build/bin/llama-server || true
42
 
43
+ # Build C++ manager (Boost.Beast + JSON)
44
+ COPY cpp/llm_manager.cpp /build/llm_manager.cpp
45
+ RUN g++ -std=c++17 -O2 -DBOOST_ERROR_CODE_HEADER_ONLY -pthread \
46
+ /build/llm_manager.cpp -o /build/llm-manager
47
+
48
  # Runtime stage
49
  FROM debian:bookworm-slim
50
 
 
60
  # Copy llama-server binary and all shared libraries from builder
61
  COPY --from=builder /build/llama.cpp/build/bin/llama-server /usr/local/bin/llama-server
62
  COPY --from=builder /build/llama.cpp/build/bin/*.so.* /usr/local/lib/
63
+ COPY --from=builder /build/llm-manager /usr/local/bin/llm-manager
64
 
65
  # Update library cache
66
  RUN ldconfig
 
107
  #
108
  # Example Mixtral-8x7B-Instruct (known loader incompatibilities on newer llama.cpp + too large for 10GB RAM):
109
  # CMD ["llama-server", "-hf", "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF:mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf", "--host", "0.0.0.0", "--port", "7860", "-c", "4096", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "64", "--ubatch-size", "32"]
110
+
111
+ # CMD ["llama-server", "-hf", "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m", "--host", "0.0.0.0", "--port", "7860", "-c", "8192", "-t", "4", "-ngl", "0", "--cont-batching", "-b", "128", "--ubatch-size", "64"]
112
  #
113
+ # Active manager process:
114
+ # - loads default model at startup
115
+ # - supports /switch-model runtime model change
116
+ # - proxies /v1/chat/completions to active worker
117
+ ENV DEFAULT_MODEL=QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m \
118
+ MANAGER_HOST=0.0.0.0 \
119
+ MANAGER_PORT=7860 \
120
+ WORKER_BASE_PORT=8080 \
121
+ SWITCH_TIMEOUT_SEC=300 \
122
+ MODEL_N_CTX=8192 \
123
+ MODEL_THREADS=4 \
124
+ MODEL_NGL=0 \
125
+ MODEL_BATCH=128 \
126
+ MODEL_UBATCH=64
127
+
128
+ CMD ["llm-manager"]
129
  #
130
  # Example Qwen2.5-Coder 7B Instruct (32k context):
131
  # CMD ["llama-server", "-hf", "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Qwen2.5-Coder-7B-Instruct.Q4_K_M.gguf",
cpp/llm_manager.cpp ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <boost/asio.hpp>
2
+ #include <boost/beast/core.hpp>
3
+ #include <boost/beast/http.hpp>
4
+ #include <boost/beast/version.hpp>
5
+ #include <nlohmann/json.hpp>
6
+
7
+ #include <atomic>
8
+ #include <chrono>
9
+ #include <csignal>
10
+ #include <cstdlib>
11
+ #include <ctime>
12
+ #include <iomanip>
13
+ #include <iostream>
14
+ #include <mutex>
15
+ #include <optional>
16
+ #include <sstream>
17
+ #include <string>
18
+ #include <thread>
19
+ #include <vector>
20
+
21
+ #include <sys/types.h>
22
+ #include <sys/wait.h>
23
+ #include <unistd.h>
24
+
25
+ namespace asio = boost::asio;
26
+ namespace beast = boost::beast;
27
+ namespace http = beast::http;
28
+ using json = nlohmann::json;
29
+
30
// Snapshot of one spawned llama-server worker process.
// Copied by value under the manager's mutex, so readers never hold a
// reference into mutable manager state.
struct WorkerInfo {
  std::string model;        // model spec passed to llama-server via -hf
  int port = 0;             // TCP port the worker listens on
  pid_t pid = -1;           // worker process id; -1 means not started
  std::string last_loaded;  // UTC ISO-8601 timestamp set when the worker became ready
};
36
+
37
// Current UTC wall-clock time formatted as ISO-8601 ("2024-01-02T03:04:05Z").
// Uses the reentrant gmtime_r so it is safe to call from multiple threads.
static std::string now_utc_iso() {
  const std::time_t now = std::time(nullptr);
  std::tm parts{};
  gmtime_r(&now, &parts);
  char buf[32];
  std::strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%SZ", &parts);
  return buf;
}
45
+
46
// Returns the value of environment variable `name`, or `fallback` when the
// variable is unset or set to the empty string.
static std::string get_env_or(const char *name, const std::string &fallback) {
  if (const char *raw = std::getenv(name); raw && *raw) {
    return std::string(raw);
  }
  return fallback;
}
50
+
51
// Returns environment variable `name` parsed as an int, or `fallback` when
// the variable is unset, empty, or not parseable by std::stoi (out of range
// or not starting with a number).
static int get_env_int_or(const char *name, int fallback) {
  if (const char *raw = std::getenv(name); raw && *raw) {
    try {
      return std::stoi(raw);
    } catch (...) {
      // fall through to the fallback on any parse/range failure
    }
  }
  return fallback;
}
60
+
61
// True when `pid` names a process we can signal (kill with signal 0 probes
// existence without delivering anything). Non-positive pids are never alive.
// NOTE(review): kill(pid, 0) also succeeds for zombie children, so a crashed
// but unreaped worker still reports as alive — confirm callers tolerate this.
static bool is_alive(pid_t pid) {
  return pid > 0 && kill(pid, 0) == 0;
}
65
+
66
// Gracefully stops a worker: SIGTERM, then poll with waitpid(WNOHANG) for up
// to `wait_seconds`; if it still has not exited, SIGKILL and reap (blocking).
// Reaping via waitpid prevents the child from lingering as a zombie.
static void shutdown_worker(pid_t pid, int wait_seconds = 15) {
  if (pid <= 0) return;
  kill(pid, SIGTERM);
  using clock = std::chrono::steady_clock;
  for (const auto end = clock::now() + std::chrono::seconds(wait_seconds); clock::now() < end;) {
    int status = 0;
    if (waitpid(pid, &status, WNOHANG) == pid) return;  // exited and reaped
    std::this_thread::sleep_for(std::chrono::milliseconds(200));
  }
  // Grace period elapsed: force-kill and wait synchronously for the reap.
  kill(pid, SIGKILL);
  int status = 0;
  waitpid(pid, &status, 0);
}
80
+
81
// Owns the single active llama-server worker process and serializes model
// switches. All configuration comes from environment variables read once in
// the constructor. Thread-safety: every access to _active, _switch_in_progress
// and _next_port is guarded by _mu; long-running work (spawn, readiness poll)
// deliberately happens with the lock released so /health and /models stay
// responsive during a switch.
class ModelManager {
 public:
  // Reads all tunables from the environment (see Dockerfile ENV block for the
  // deployed defaults). Worker ports are allocated upward from _base_port.
  ModelManager()
      : _default_model(get_env_or("DEFAULT_MODEL", "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m")),
        _llama_server_bin(get_env_or("LLAMA_SERVER_BIN", "/usr/local/bin/llama-server")),
        _worker_host(get_env_or("WORKER_HOST", "127.0.0.1")),
        _worker_bind_host(get_env_or("WORKER_BIND_HOST", "0.0.0.0")),
        _base_port(get_env_int_or("WORKER_BASE_PORT", 8080)),
        _switch_timeout_sec(get_env_int_or("SWITCH_TIMEOUT_SEC", 300)),
        _n_ctx(get_env_int_or("MODEL_N_CTX", 8192)),
        _n_threads(get_env_int_or("MODEL_THREADS", 4)),
        _n_gpu_layers(get_env_int_or("MODEL_NGL", 0)),
        _n_batch(get_env_int_or("MODEL_BATCH", 128)),
        _n_ubatch(get_env_int_or("MODEL_UBATCH", 64)),
        _next_port(_base_port) {}

  // Loads DEFAULT_MODEL; returns false and fills `error` on failure.
  bool initialize_default(std::string &error) {
    return switch_model(_default_model, error);
  }

  // Switches the active worker to `model` using a make-before-break strategy:
  // the new worker is spawned and must become ready BEFORE the old one is torn
  // down, so in-flight traffic keeps a target until the swap. Returns false
  // with `error` set on any failure; the previous worker is kept in that case.
  // Only one switch may run at a time (guarded by _switch_in_progress).
  bool switch_model(const std::string &model, std::string &error) {
    {
      std::lock_guard<std::mutex> lock(_mu);
      if (_switch_in_progress) {
        error = "Switch already in progress";
        return false;
      }
      // No-op if the requested model is already active and its process lives.
      // NOTE(review): is_alive is kill(pid, 0), which also succeeds for a
      // zombie child — a crashed worker that was never reaped still looks
      // alive here; confirm whether that case matters in practice.
      if (_active && _active->model == model && is_alive(_active->pid)) {
        return true;
      }
      _switch_in_progress = true;
    }

    // Snapshot the worker to retire. The lock is released and reacquired
    // between the guard above and this snapshot; benign, because
    // _switch_in_progress already excludes concurrent switchers.
    std::optional<WorkerInfo> old_worker;
    {
      std::lock_guard<std::mutex> lock(_mu);
      if (_active) old_worker = _active;
    }

    int port = allocate_port();
    pid_t pid = spawn_worker(model, port);
    if (pid <= 0) {
      finish_switch(false);
      error = "Failed to start worker process";
      return false;
    }

    // Block (without holding _mu) until the worker answers HTTP or the
    // timeout expires; on timeout the half-started worker is killed.
    if (!wait_until_ready(pid, port, _switch_timeout_sec)) {
      shutdown_worker(pid);
      finish_switch(false);
      error = "New model did not become ready in time";
      return false;
    }

    // Publish the new worker and clear the in-progress flag atomically.
    WorkerInfo new_worker{model, port, pid, now_utc_iso()};
    {
      std::lock_guard<std::mutex> lock(_mu);
      _active = new_worker;
      _switch_in_progress = false;
    }

    // Retire the previous worker only after the new one is live.
    if (old_worker && old_worker->pid != pid) {
      shutdown_worker(old_worker->pid);
    }
    return true;
  }

  // Copy of the active worker, or nullopt when none is running.
  std::optional<WorkerInfo> active_worker() {
    std::lock_guard<std::mutex> lock(_mu);
    if (_active && is_alive(_active->pid)) return _active;
    return std::nullopt;
  }

  // JSON status document served by /health and /models.
  json models_view() {
    std::lock_guard<std::mutex> lock(_mu);
    json out;
    out["status"] = (_active && is_alive(_active->pid)) ? "ready" : "no_active_model";
    out["switch_in_progress"] = _switch_in_progress;
    if (_active && is_alive(_active->pid)) {
      out["current_model"] = _active->model;
      out["last_loaded"] = _active->last_loaded;
      out["active_pid"] = _active->pid;
      out["active_port"] = _active->port;
    } else {
      out["current_model"] = nullptr;
      out["last_loaded"] = nullptr;
      out["active_pid"] = nullptr;
      out["active_port"] = nullptr;
    }
    return out;
  }

 private:
  std::mutex _mu;                      // guards _active, _switch_in_progress, _next_port
  std::optional<WorkerInfo> _active;   // currently serving worker, if any
  bool _switch_in_progress = false;    // excludes concurrent switch_model calls

  std::string _default_model;
  std::string _llama_server_bin;
  std::string _worker_host;            // host used to probe/reach workers
  std::string _worker_bind_host;       // host workers bind their listener to
  int _base_port;
  int _switch_timeout_sec;
  int _n_ctx;
  int _n_threads;
  int _n_gpu_layers;
  int _n_batch;
  int _n_ubatch;
  int _next_port;                      // next port to hand out; never reused

  // Monotonically increasing port allocator.
  // NOTE(review): ports are never recycled, so a very long-lived process with
  // many switches would eventually walk past 65535 — confirm acceptable.
  int allocate_port() {
    std::lock_guard<std::mutex> lock(_mu);
    return _next_port++;
  }

  // Clears the in-progress flag on the failure paths (ok == false). The
  // success path clears it inline while also publishing the new worker.
  void finish_switch(bool ok) {
    std::lock_guard<std::mutex> lock(_mu);
    if (!ok) _switch_in_progress = false;
  }

  // fork/exec a llama-server child for `model` on `port`. Returns the child
  // pid, or -1 if fork failed. The child detaches into its own session
  // (setsid) and exits with 127 if execvp fails.
  pid_t spawn_worker(const std::string &model, int port) {
    pid_t pid = fork();
    if (pid < 0) return -1;
    if (pid == 0) {
      setsid();
      std::string port_s = std::to_string(port);
      std::string n_ctx_s = std::to_string(_n_ctx);
      std::string threads_s = std::to_string(_n_threads);
      std::string ngl_s = std::to_string(_n_gpu_layers);
      std::string batch_s = std::to_string(_n_batch);
      std::string ubatch_s = std::to_string(_n_ubatch);

      std::vector<std::string> args = {
          _llama_server_bin,
          "-hf", model,
          "--host", _worker_bind_host,
          "--port", port_s,
          "-c", n_ctx_s,
          "-t", threads_s,
          "-ngl", ngl_s,
          "--cont-batching",
          "-b", batch_s,
          "--ubatch-size", ubatch_s
      };

      // execvp wants a NULL-terminated char* array; the strings in `args`
      // stay alive until exec, so the const_cast view is safe here.
      std::vector<char *> argv;
      argv.reserve(args.size() + 1);
      for (auto &s : args) argv.push_back(const_cast<char *>(s.c_str()));
      argv.push_back(nullptr);
      execvp(argv[0], argv.data());
      _exit(127);
    }
    return pid;
  }

  // Polls the worker with GET / every 800ms until it returns 200, the child
  // dies, or `timeout_sec` elapses. Connection errors are expected while the
  // model is still loading, hence the swallowed exceptions.
  // NOTE(review): assumes llama-server answers 200 on "/" once ready — it
  // also exposes a dedicated /health endpoint; confirm "/" is the right probe.
  bool wait_until_ready(pid_t pid, int port, int timeout_sec) {
    const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(timeout_sec);
    while (std::chrono::steady_clock::now() < deadline) {
      if (!is_alive(pid)) return false;
      try {
        auto [status, _] = http_get(port, "/");
        if (status == 200) return true;
      } catch (...) {
      }
      std::this_thread::sleep_for(std::chrono::milliseconds(800));
    }
    return false;
  }

  // Minimal synchronous HTTP/1.1 GET against _worker_host:`port`. Returns
  // {status code, body}; throws boost::system::system_error on I/O failure.
  std::pair<int, std::string> http_get(int port, const std::string &target) {
    asio::io_context ioc;
    asio::ip::tcp::resolver resolver(ioc);
    beast::tcp_stream stream(ioc);
    auto const results = resolver.resolve(_worker_host, std::to_string(port));
    stream.connect(results);

    http::request<http::string_body> req{http::verb::get, target, 11};
    req.set(http::field::host, _worker_host);
    req.set(http::field::user_agent, "llm-manager");
    http::write(stream, req);

    beast::flat_buffer buffer;
    http::response<http::string_body> res;
    http::read(stream, buffer, res);
    beast::error_code ec;
    stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
    return {res.result_int(), res.body()};
  }
};
270
+
271
+ static std::atomic<uint64_t> g_req_id{1};
272
+
273
+ static void log_line(const std::string &line) {
274
+ std::cout << "[" << now_utc_iso() << "] " << line << std::endl;
275
+ }
276
+
277
// Caps `body` at `max_len` characters for logging, appending a marker when
// anything was cut; bodies at or under the limit pass through unchanged.
static std::string truncate_body(const std::string &body, size_t max_len = 2000) {
  return body.size() <= max_len ? body : body.substr(0, max_len) + "...[truncated]";
}
281
+
282
+ static std::pair<int, std::string> forward_chat(const WorkerInfo &worker, const std::string &body) {
283
+ asio::io_context ioc;
284
+ asio::ip::tcp::resolver resolver(ioc);
285
+ beast::tcp_stream stream(ioc);
286
+ auto const results = resolver.resolve("127.0.0.1", std::to_string(worker.port));
287
+ stream.connect(results);
288
+
289
+ http::request<http::string_body> req{http::verb::post, "/v1/chat/completions", 11};
290
+ req.set(http::field::host, "127.0.0.1");
291
+ req.set(http::field::content_type, "application/json");
292
+ req.set(http::field::user_agent, "llm-manager");
293
+ req.body() = body;
294
+ req.prepare_payload();
295
+ http::write(stream, req);
296
+
297
+ beast::flat_buffer buffer;
298
+ http::response<http::string_body> res;
299
+ http::read(stream, buffer, res);
300
+ beast::error_code ec;
301
+ stream.socket().shutdown(asio::ip::tcp::socket::shutdown_both, ec);
302
+ return {res.result_int(), res.body()};
303
+ }
304
+
305
// Routes one HTTP request and produces the full response. Endpoints:
//   GET  /health, /models        -> manager status JSON
//   POST /switch-model           -> blocking runtime model switch
//   POST /v1/chat/completions    -> proxied to the active worker
// Anything else -> 404. Any exception escaping a handler -> 500 with the
// exception message in the body.
template <typename Body, typename Allocator>
http::response<http::string_body> handle_request(
    ModelManager &manager,
    http::request<Body, http::basic_fields<Allocator>> &&req) {
  const auto start = std::chrono::steady_clock::now();
  const auto req_id = g_req_id.fetch_add(1);
  const std::string target = req.target().to_string();
  const std::string method = req.method_string().to_string();
  // Strip the query string for routing; substr(0, npos) is the whole string
  // when no '?' is present.
  const std::string path = target.substr(0, target.find('?'));

  log_line("request_id=" + std::to_string(req_id) + " method=" + method + " path=" + target);
  // NOTE(review): request bodies (which may contain user prompts) are logged
  // truncated to 2000 chars — confirm this is acceptable for privacy.
  if constexpr (std::is_same_v<Body, http::string_body>) {
    if (!req.body().empty()) {
      log_line("request_id=" + std::to_string(req_id) + " body=" + truncate_body(req.body()));
    }
  }

  // Helper that serializes `obj`, stamps common headers, and logs the
  // status/elapsed time for the request.
  auto json_response = [&](http::status status, const json &obj) {
    http::response<http::string_body> res{status, req.version()};
    res.set(http::field::content_type, "application/json");
    res.set(http::field::server, "llm-manager");
    res.keep_alive(req.keep_alive());
    res.body() = obj.dump();
    res.prepare_payload();
    auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
        std::chrono::steady_clock::now() - start).count();
    log_line("request_id=" + std::to_string(req_id) + " status=" + std::to_string(res.result_int()) +
             " elapsed_ms=" + std::to_string(elapsed_ms));
    return res;
  };

  try {
    if (path == "/health" && req.method() == http::verb::get) {
      return json_response(http::status::ok, manager.models_view());
    }

    if (path == "/models" && req.method() == http::verb::get) {
      return json_response(http::status::ok, manager.models_view());
    }

    if (path == "/switch-model" && req.method() == http::verb::post) {
      std::string body(req.body().data(), req.body().size());
      // Non-throwing parse; a discarded result signals invalid JSON.
      json j = json::parse(body, nullptr, false);
      if (j.is_discarded()) {
        return json_response(http::status::bad_request, {{"error", "Invalid JSON"}});
      }
      // Accept either key; "model" deliberately wins when both are present
      // because it is assigned second.
      std::string model;
      if (j.contains("model_name")) model = j["model_name"].get<std::string>();
      if (j.contains("model")) model = j["model"].get<std::string>();
      if (model.empty()) {
        return json_response(http::status::bad_request, {{"error", "Expected 'model' or 'model_name'"}});
      }

      std::string err;
      // Blocks for up to SWITCH_TIMEOUT_SEC while the new worker loads.
      bool ok = manager.switch_model(model, err);
      if (!ok) {
        // 409 for the "another switch is running" case, 500 otherwise.
        auto status = (err == "Switch already in progress") ? http::status::conflict : http::status::internal_server_error;
        return json_response(status, {{"status", "error"}, {"error", err}});
      }
      auto state = manager.models_view();
      state["message"] = "Switched model successfully";
      return json_response(http::status::ok, state);
    }

    if (path == "/v1/chat/completions" && req.method() == http::verb::post) {
      auto worker = manager.active_worker();
      if (!worker) {
        return json_response(http::status::service_unavailable, {{"error", "No active model"}});
      }
      // Proxy the raw body; the upstream status code and body are relayed
      // back verbatim to the client.
      auto [upstream_status, upstream_body] = forward_chat(*worker, req.body());
      http::response<http::string_body> res{static_cast<http::status>(upstream_status), req.version()};
      res.set(http::field::content_type, "application/json");
      res.set(http::field::server, "llm-manager");
      res.keep_alive(req.keep_alive());
      res.body() = upstream_body;
      res.prepare_payload();
      auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
          std::chrono::steady_clock::now() - start).count();
      log_line("request_id=" + std::to_string(req_id) + " model=" + worker->model +
               " active_pid=" + std::to_string(worker->pid) +
               " active_port=" + std::to_string(worker->port) +
               " upstream_status=" + std::to_string(upstream_status) +
               " elapsed_ms=" + std::to_string(elapsed_ms));
      return res;
    }

    return json_response(http::status::not_found, {{"error", "Not found"}});
  } catch (const std::exception &e) {
    return json_response(http::status::internal_server_error, {{"error", e.what()}});
  }
}
396
+
397
+ void do_session(asio::ip::tcp::socket socket, ModelManager &manager) {
398
+ try {
399
+ beast::flat_buffer buffer;
400
+ http::request<http::string_body> req;
401
+ http::read(socket, buffer, req);
402
+ auto res = handle_request(manager, std::move(req));
403
+ http::write(socket, res);
404
+ beast::error_code ec;
405
+ socket.shutdown(asio::ip::tcp::socket::shutdown_send, ec);
406
+ } catch (...) {
407
+ }
408
+ }
409
+
410
// Entry point: load the default model, then accept HTTP connections forever,
// one detached thread per connection.
int main() {
  const auto bind_host = get_env_or("MANAGER_HOST", "0.0.0.0");
  const int bind_port = get_env_int_or("MANAGER_PORT", 7860);
  ModelManager manager;

  // Startup deliberately continues even if the default model fails to load:
  // the HTTP server still comes up so /switch-model can recover the service.
  std::string init_error;
  log_line("startup: loading default model");
  if (!manager.initialize_default(init_error)) {
    log_line("startup: default model failed: " + init_error);
  } else {
    log_line("startup: default model ready");
  }

  asio::io_context ioc{1};
  // Throws (terminating the process) if the address is invalid or the port
  // is already bound — acceptable fail-fast behavior at startup.
  asio::ip::tcp::acceptor acceptor{ioc, {asio::ip::make_address(bind_host), static_cast<unsigned short>(bind_port)}};
  log_line("manager listening on " + bind_host + ":" + std::to_string(bind_port));

  // NOTE(review): one detached thread per connection with no cap — fine for
  // low-traffic use, but a connection flood spawns unbounded threads, and an
  // accept() exception would escape main; confirm this is acceptable.
  for (;;) {
    asio::ip::tcp::socket socket{ioc};
    acceptor.accept(socket);
    std::thread(&do_session, std::move(socket), std::ref(manager)).detach();
  }
}