Spaces:
Running
Running
$(cat <<EOF
Browse files
Fix Groq and Cerebras model IDs and rate limits.
Groq: Model IDs need inner prefixes (qwen/qwen3-32b not qwen3-32b)
Cerebras: Removed inaccessible models, set higher rate limits
Silicon: Increased rate limits to 300/min
All providers now have appropriate concurrency and rate limit settings.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
EOF
)
- providers/cerebras/client.py +4 -9
- providers/groq/client.py +16 -3
- providers/silicon/client.py +3 -0
providers/cerebras/client.py
CHANGED
|
@@ -13,27 +13,23 @@ class CerebrasProvider(OpenAIChatTransport):
|
|
| 13 |
"""Cerebras provider using OpenAI-compatible /chat/completions."""
|
| 14 |
|
| 15 |
# Mapping of proxy model refs to Cerebras API model IDs.
|
| 16 |
-
# The proxy uses full refs like "cerebras/qwen-3-235b-a22b-instruct-2507"
|
| 17 |
-
# but Cerebras API expects bare model IDs like "qwen-3-235b-a22b-instruct-2507".
|
| 18 |
CEREBRAS_MODEL_MAP: dict[str, str] = {
|
| 19 |
-
"llama3.1-8b": "llama3.1-8b",
|
| 20 |
-
"qwen-3-235b-a22b-instruct-2507": "qwen-3-235b-a22b-instruct-2507",
|
| 21 |
-
"zai-glm-4.7": "zai-glm-4.7",
|
| 22 |
-
"gpt-oss-120b": "gpt-oss-120b",
|
| 23 |
"cerebras/llama3.1-8b": "llama3.1-8b",
|
| 24 |
"cerebras/qwen-3-235b-a22b-instruct-2507": "qwen-3-235b-a22b-instruct-2507",
|
| 25 |
-
"cerebras/z-ai/glm4.7": "zai-glm-4.7",
|
| 26 |
}
|
| 27 |
|
| 28 |
def __init__(self, config: ProviderConfig, *, settings: Settings):
|
| 29 |
base_url = (config.base_url or CEREBRAS_DEFAULT_BASE).rstrip("/")
|
| 30 |
if not base_url.endswith("/v1"):
|
| 31 |
base_url = base_url + "/v1"
|
|
|
|
| 32 |
super().__init__(
|
| 33 |
config,
|
| 34 |
provider_name="Cerebras",
|
| 35 |
base_url=base_url,
|
| 36 |
api_key=config.api_key,
|
|
|
|
|
|
|
| 37 |
)
|
| 38 |
self._settings = settings
|
| 39 |
|
|
@@ -47,10 +43,9 @@ class CerebrasProvider(OpenAIChatTransport):
|
|
| 47 |
else ReasoningReplayMode.DISABLED
|
| 48 |
)
|
| 49 |
body = build_base_request_body(request, reasoning_replay=reasoning_replay)
|
| 50 |
-
# Strip cerebras/ prefix so the API gets the bare model ID
|
| 51 |
model = body.get("model", "")
|
| 52 |
if model in self.CEREBRAS_MODEL_MAP:
|
| 53 |
body["model"] = self.CEREBRAS_MODEL_MAP[model]
|
| 54 |
elif model.startswith("cerebras/"):
|
| 55 |
-
body["model"] = model[len("cerebras/")
|
| 56 |
return body
|
|
|
|
| 13 |
"""Cerebras provider using OpenAI-compatible /chat/completions."""
|
| 14 |
|
| 15 |
# Mapping of proxy model refs to Cerebras API model IDs.
|
|
|
|
|
|
|
| 16 |
CEREBRAS_MODEL_MAP: dict[str, str] = {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
"cerebras/llama3.1-8b": "llama3.1-8b",
|
| 18 |
"cerebras/qwen-3-235b-a22b-instruct-2507": "qwen-3-235b-a22b-instruct-2507",
|
|
|
|
| 19 |
}
|
| 20 |
|
| 21 |
def __init__(self, config: ProviderConfig, *, settings: Settings):
|
| 22 |
base_url = (config.base_url or CEREBRAS_DEFAULT_BASE).rstrip("/")
|
| 23 |
if not base_url.endswith("/v1"):
|
| 24 |
base_url = base_url + "/v1"
|
| 25 |
+
# Cerebras has generous rate limits
|
| 26 |
super().__init__(
|
| 27 |
config,
|
| 28 |
provider_name="Cerebras",
|
| 29 |
base_url=base_url,
|
| 30 |
api_key=config.api_key,
|
| 31 |
+
nim_rate_limit=300,
|
| 32 |
+
nim_max_concurrency=80,
|
| 33 |
)
|
| 34 |
self._settings = settings
|
| 35 |
|
|
|
|
| 43 |
else ReasoningReplayMode.DISABLED
|
| 44 |
)
|
| 45 |
body = build_base_request_body(request, reasoning_replay=reasoning_replay)
|
|
|
|
| 46 |
model = body.get("model", "")
|
| 47 |
if model in self.CEREBRAS_MODEL_MAP:
|
| 48 |
body["model"] = self.CEREBRAS_MODEL_MAP[model]
|
| 49 |
elif model.startswith("cerebras/"):
|
| 50 |
+
body["model"] = model[len("cerebras/"):]
|
| 51 |
return body
|
providers/groq/client.py
CHANGED
|
@@ -12,15 +12,26 @@ from providers.openai_compat import OpenAIChatTransport
|
|
| 12 |
class GroqProvider(OpenAIChatTransport):
|
| 13 |
"""Groq provider using OpenAI-compatible /chat/completions."""
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
def __init__(self, config: ProviderConfig, *, settings: Settings):
|
| 16 |
base_url = (config.base_url or GROQ_DEFAULT_BASE).rstrip("/")
|
| 17 |
if not base_url.endswith("/v1"):
|
| 18 |
base_url = base_url + "/v1"
|
|
|
|
| 19 |
super().__init__(
|
| 20 |
config,
|
| 21 |
provider_name="Groq",
|
| 22 |
base_url=base_url,
|
| 23 |
api_key=config.api_key,
|
|
|
|
|
|
|
| 24 |
)
|
| 25 |
self._settings = settings
|
| 26 |
|
|
@@ -34,8 +45,10 @@ class GroqProvider(OpenAIChatTransport):
|
|
| 34 |
else ReasoningReplayMode.DISABLED
|
| 35 |
)
|
| 36 |
body = build_base_request_body(request, reasoning_replay=reasoning_replay)
|
| 37 |
-
#
|
| 38 |
model = body.get("model", "")
|
| 39 |
-
if model.
|
| 40 |
-
body["model"] =
|
|
|
|
|
|
|
| 41 |
return body
|
|
|
|
| 12 |
class GroqProvider(OpenAIChatTransport):
|
| 13 |
"""Groq provider using OpenAI-compatible /chat/completions."""
|
| 14 |
|
| 15 |
+
# Mapping of proxy model refs to Groq API model IDs.
|
| 16 |
+
# groq/ prefix is stripped, but the inner prefix (like qwen/) is kept.
|
| 17 |
+
GROQ_MODEL_MAP: dict[str, str] = {
|
| 18 |
+
"groq/qwen3-32b": "qwen/qwen3-32b",
|
| 19 |
+
"groq/llama-3.3-70b-versatile": "llama-3.3-70b-versatile",
|
| 20 |
+
"groq/llama-3.1-8b-instant": "llama-3.1-8b-instant",
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
def __init__(self, config: ProviderConfig, *, settings: Settings):
|
| 24 |
base_url = (config.base_url or GROQ_DEFAULT_BASE).rstrip("/")
|
| 25 |
if not base_url.endswith("/v1"):
|
| 26 |
base_url = base_url + "/v1"
|
| 27 |
+
# Groq has generous rate limits - set high limits with no adaptive throttling
|
| 28 |
super().__init__(
|
| 29 |
config,
|
| 30 |
provider_name="Groq",
|
| 31 |
base_url=base_url,
|
| 32 |
api_key=config.api_key,
|
| 33 |
+
nim_rate_limit=500, # High limit for Groq
|
| 34 |
+
nim_max_concurrency=100, # High concurrency for Groq
|
| 35 |
)
|
| 36 |
self._settings = settings
|
| 37 |
|
|
|
|
| 45 |
else ReasoningReplayMode.DISABLED
|
| 46 |
)
|
| 47 |
body = build_base_request_body(request, reasoning_replay=reasoning_replay)
|
| 48 |
+
# Map proxy model ref to actual Groq API model ID
|
| 49 |
model = body.get("model", "")
|
| 50 |
+
if model in self.GROQ_MODEL_MAP:
|
| 51 |
+
body["model"] = self.GROQ_MODEL_MAP[model]
|
| 52 |
+
elif model.startswith("groq/"):
|
| 53 |
+
body["model"] = model[len("groq/"):]
|
| 54 |
return body
|
providers/silicon/client.py
CHANGED
|
@@ -16,11 +16,14 @@ class SiliconProvider(OpenAIChatTransport):
|
|
| 16 |
base_url = (config.base_url or SILICON_DEFAULT_BASE).rstrip("/")
|
| 17 |
if not base_url.endswith("/v1"):
|
| 18 |
base_url = base_url + "/v1"
|
|
|
|
| 19 |
super().__init__(
|
| 20 |
config,
|
| 21 |
provider_name="Silicon",
|
| 22 |
base_url=base_url,
|
| 23 |
api_key=config.api_key,
|
|
|
|
|
|
|
| 24 |
)
|
| 25 |
self._settings = settings
|
| 26 |
|
|
|
|
| 16 |
base_url = (config.base_url or SILICON_DEFAULT_BASE).rstrip("/")
|
| 17 |
if not base_url.endswith("/v1"):
|
| 18 |
base_url = base_url + "/v1"
|
| 19 |
+
# Silicon Flow has generous rate limits
|
| 20 |
super().__init__(
|
| 21 |
config,
|
| 22 |
provider_name="Silicon",
|
| 23 |
base_url=base_url,
|
| 24 |
api_key=config.api_key,
|
| 25 |
+
nim_rate_limit=300,
|
| 26 |
+
nim_max_concurrency=80,
|
| 27 |
)
|
| 28 |
self._settings = settings
|
| 29 |
|