Spaces:
Building
Building
Dmitry Beresnev
Refactors the C++ LLM manager into modular components, moves the Python modules under python/, and keeps the current control-plane behavior intact. The C++ server now has clearer separation for config, model lifecycle, runtime services, request parsing, HTTP helpers, and server routing, while the Docker build/runtime paths were updated to compile multiple C++ files and load Python code from the new package folder.
// Very rough token-count heuristic: roughly four characters per token,
// rounded up, with a floor of one token for any non-empty text.
// Empty text estimates to zero tokens.
static int estimate_text_tokens_rough(const std::string &text) {
    const std::size_t len = text.size();
    if (len == 0) {
        return 0;
    }
    // Ceiling division by 4; always >= 1 for non-empty input.
    const std::size_t approx = len / 4 + (len % 4 != 0 ? 1 : 0);
    return approx < 1 ? 1 : static_cast<int>(approx);
}
| static std::string flatten_json_content(const json &content) { | |
| if (content.is_string()) { | |
| return content.get<std::string>(); | |
| } | |
| if (content.is_array()) { | |
| std::ostringstream oss; | |
| bool first = true; | |
| for (const auto &item : content) { | |
| std::string part; | |
| if (item.is_string()) { | |
| part = item.get<std::string>(); | |
| } else if (item.is_object() && item.contains("text") && item["text"].is_string()) { | |
| part = item["text"].get<std::string>(); | |
| } | |
| if (part.empty()) continue; | |
| if (!first) oss << '\n'; | |
| oss << part; | |
| first = false; | |
| } | |
| return oss.str(); | |
| } | |
| return ""; | |
| } | |
| std::optional<TokenEstimate> estimate_chat_tokens( | |
| const json &payload, | |
| const LimitsConfig &limits, | |
| std::string &error) { | |
| if (!payload.is_object()) { | |
| error = "Expected JSON object"; | |
| return std::nullopt; | |
| } | |
| if (!payload.contains("messages") || !payload["messages"].is_array()) { | |
| error = "Expected 'messages' array"; | |
| return std::nullopt; | |
| } | |
| TokenEstimate estimate; | |
| estimate.requested_max_tokens = limits.default_max_tokens; | |
| if (payload.contains("max_tokens")) { | |
| if (!payload["max_tokens"].is_number_integer()) { | |
| error = "Expected integer 'max_tokens'"; | |
| return std::nullopt; | |
| } | |
| estimate.requested_max_tokens = payload["max_tokens"].get<int>(); | |
| } | |
| if (estimate.requested_max_tokens <= 0) { | |
| error = "'max_tokens' must be > 0"; | |
| return std::nullopt; | |
| } | |
| for (const auto &message : payload["messages"]) { | |
| if (!message.is_object()) continue; | |
| if (message.contains("role") && message["role"].is_string()) { | |
| estimate.prompt_tokens += estimate_text_tokens_rough(message["role"].get<std::string>()); | |
| } | |
| if (message.contains("content")) { | |
| estimate.prompt_tokens += estimate_text_tokens_rough(flatten_json_content(message["content"])); | |
| } | |
| estimate.prompt_tokens += 4; | |
| } | |
| estimate.estimated_total_tokens = estimate.prompt_tokens + estimate.requested_max_tokens; | |
| if (estimate.estimated_total_tokens > limits.max_tokens_per_request) { | |
| error = "Estimated request tokens exceed configured limit"; | |
| return std::nullopt; | |
| } | |
| error.clear(); | |
| return estimate; | |
| } | |
| bool request_stream_enabled(const json &payload) { | |
| if (!payload.is_object() || !payload.contains("stream")) return false; | |
| return payload["stream"].is_boolean() && payload["stream"].get<bool>(); | |
| } | |