Spaces:
Build error
Build error
Dmitry Beresnev
Refactors the C++ LLM manager into modular components, moves Python modules under python/, and keeps the current control-plane behavior intact. The C++ server now has clearer separation for config, model lifecycle, runtime services, request parsing, HTTP helpers, and server routing, while Docker build/runtime paths were updated to compile multiple C++ files and load Python code from the new package folder.
332826f

[server]
host = "0.0.0.0"
port = 7860

[worker]
default_model = "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m"
llama_server_bin = "/usr/local/bin/llama-server"
host = "127.0.0.1"
bind_host = "0.0.0.0"
base_port = 8080
switch_timeout_sec = 300

[llama]
n_ctx = 8192
threads = 4
ngl = 0
batch = 128
ubatch = 64

[auth]
header = "Authorization"
scheme = "Bearer"

[limits]
default_max_tokens = 256
max_tokens_per_request = 2048
request_timeout_sec = 30

[queue]
max_size = 100
max_tokens = 20000
admin_quota = 3
retry_after_sec = 5

[scheduler]
max_concurrent = 1

[streaming]
enabled = false

[rate_limit]
requests_per_minute = 60
estimated_tokens_per_minute = 6000

[[api_keys]]
key_id = "admin-main"
secret = "change-me-admin"
role = "admin"
enabled = true

[[api_keys]]
key_id = "user-main"
secret = "change-me-user"
role = "user"
enabled = true