AGI / config.toml.example
Dmitry Beresnev
Refactors the C++ LLM manager into modular components, moves Python modules under python/, and keeps the current control-plane behavior intact. The C++ server now has clearer separation for config, model lifecycle, runtime services, request parsing, HTTP helpers, and server routing, while Docker build/runtime paths were updated to compile multiple C++ files and load Python code from the new package folder.
332826f
raw
history blame contribute delete
847 Bytes
# Public-facing HTTP endpoint of the control-plane server.
[server]
host = "0.0.0.0"  # listen on all interfaces
port = 7860
# Backend worker processes the control plane manages.
[worker]
# Model loaded at startup, in "repo:quant" form — presumably resolved to a
# GGUF file from the named Hugging Face repo; verify against the worker loader.
default_model = "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m"
# Absolute path to the llama.cpp `llama-server` binary workers execute.
llama_server_bin = "/usr/local/bin/llama-server"
# NOTE(review): `host` vs `bind_host` — presumably `host` is the address the
# manager uses to reach a worker and `bind_host` is the interface the worker
# listens on; confirm in the C++ config/spawn code.
host = "127.0.0.1"
bind_host = "0.0.0.0"
# First port assigned to a worker; presumably incremented per worker instance.
base_port = 8080
# Seconds allowed for a model switch to complete — TODO confirm exact semantics.
switch_timeout_sec = 300
# Inference parameters forwarded to llama-server. Key names presumably mirror
# llama.cpp options (--ctx-size, --threads, --n-gpu-layers, --batch-size,
# --ubatch-size) — confirm the exact flag mapping in the worker spawner.
[llama]
n_ctx = 8192   # context window, in tokens
threads = 4
ngl = 0        # presumably GPU-offloaded layer count; 0 keeps inference CPU-only
batch = 128
ubatch = 64
# Credential presentation: clients send "Authorization: Bearer <secret>".
[auth]
header = "Authorization"
scheme = "Bearer"
# Per-request limits.
[limits]
default_max_tokens = 256        # applied when a request omits max_tokens — presumably; verify
max_tokens_per_request = 2048   # hard cap on any single request
request_timeout_sec = 30
# Admission queue in front of the scheduler.
[queue]
max_size = 100        # maximum queued requests
# NOTE(review): presumably a cap on total estimated tokens across queued
# requests — confirm against the queue implementation.
max_tokens = 20000
# NOTE(review): looks like slots reserved for admin-role keys; verify.
admin_quota = 3
retry_after_sec = 5   # Retry-After hint returned when the queue rejects — presumably
[scheduler]
# Requests executed simultaneously; 1 serializes all inference.
max_concurrent = 1
[streaming]
# Whether streamed (incremental) responses are offered — TODO confirm
# which endpoints honor this flag.
enabled = false
# Per-minute throttling. NOTE(review): presumably enforced per API key —
# confirm the scoping in the rate-limiter code.
[rate_limit]
requests_per_minute = 60
estimated_tokens_per_minute = 6000
# Example admin credential. Replace the placeholder secret before deploying.
[[api_keys]]
key_id = "admin-main"
secret = "change-me-admin"
role = "admin"
enabled = true
# Example user credential. Replace the placeholder secret before deploying.
[[api_keys]]
key_id = "user-main"
secret = "change-me-user"
role = "user"
enabled = true