[server]
host = "0.0.0.0"
port = 7860

[worker]
default_model = "QuantFactory/Qwen2.5-7B-Instruct-GGUF:q4_k_m"
llama_server_bin = "/usr/local/bin/llama-server"
# host: where this process reaches the llama-server backend;
# bind_host: the interface the backend listens on.
host = "127.0.0.1"
bind_host = "0.0.0.0"
base_port = 8080
switch_timeout_sec = 300

[llama]
n_ctx = 8192
threads = 4
# NOTE(review): presumably the number of layers offloaded to the GPU
# (0 = CPU-only) — confirm against the llama-server CLI.
ngl = 0
batch = 128
ubatch = 64

[auth]
header = "Authorization"
scheme = "Bearer"

[limits]
default_max_tokens = 256
max_tokens_per_request = 2048
request_timeout_sec = 30

[queue]
max_size = 100
max_tokens = 20000
admin_quota = 3
retry_after_sec = 5

[scheduler]
max_concurrent = 1

[streaming]
enabled = false

[rate_limit]
requests_per_minute = 60
estimated_tokens_per_minute = 6000

# SECURITY: the secrets below are placeholders — replace them before deploying.
[[api_keys]]
key_id = "admin-main"
secret = "change-me-admin"
role = "admin"
enabled = true

[[api_keys]]
key_id = "user-main"
secret = "change-me-user"
role = "user"
enabled = true