poolside-laguna-hackathon
/

lean-laguna

Text Generation

Mixture of Experts

speculative-decoding

Model card · Files and versions

art87able committed about 2 hours ago

Commit

ee4e9e6

·

verified ·

1 Parent(s): a5214ff

Upload folder using huggingface_hub

Files changed (1) hide show

configs/endpoints.toml +7 -5

configs/endpoints.toml CHANGED Viewed

@@ -56,14 +56,16 @@ key         = "EMPTY"
 type        = "openai_chat_completions"
 # ---------------------------------------------------------------------------
-# prime (HOSTED FALLBACK) — Prime Intellect managed inference. Use if the local
-# vLLM is down or while waiting on venue compute. Costs PI credits per token (the
-# $50 pool covers Prime Inference + Sandboxes + On-Demand GPUs). PRIME_API_KEY is
-# read from the environment — never hard-code a key here.
 # ---------------------------------------------------------------------------
 [[endpoint]]
 endpoint_id = "prime"
-model       = "poolside/Laguna-XS.2"
 url         = "https://api.pinference.ai/api/v1"
 key         = "PRIME_API_KEY"
 type        = "openai_chat_completions"

 type        = "openai_chat_completions"
 # ---------------------------------------------------------------------------
+# prime (HOSTED) — Prime Intellect managed inference (free for Laguna during the
+# hackathon). PRIME_API_KEY is read from the environment — never hard-code a key.
+# NOTE: the pinference model id is LOWERCASE `poolside/laguna-xs.2` (the mixed-case
+# `poolside/Laguna-XS.2` returns 404 model_not_found on pinference). To disable
+# Laguna's reasoning/thinking on this endpoint, pass `reasoning_effort: "none"`
+# (the raw chat_template_kwargs.enable_thinking flag is ignored here).
 # ---------------------------------------------------------------------------
 [[endpoint]]
 endpoint_id = "prime"
+model       = "poolside/laguna-xs.2"
 url         = "https://api.pinference.ai/api/v1"
 key         = "PRIME_API_KEY"
 type        = "openai_chat_completions"