Spaces:
Sleeping
Sleeping
minor-bugfix
#2
by Smestern - opened
- src/labrats/live.py +14 -2
- src/labrats/models/hf_provider.py +10 -2
src/labrats/live.py
CHANGED
|
@@ -264,12 +264,24 @@ class LiveRunner:
|
|
| 264 |
|
| 265 |
store = None
|
| 266 |
if config.memory:
|
| 267 |
-
from .memory import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
|
| 269 |
# reset=True so each episode starts from a clean store rather
|
| 270 |
# than retrieving stale notes from a prior run's persist dir.
|
| 271 |
store = ChromaMemoryStore(
|
| 272 |
-
|
| 273 |
)
|
| 274 |
|
| 275 |
dialogue_enabled = config.dialogue and n_agents > 1
|
|
|
|
| 264 |
|
| 265 |
store = None
|
| 266 |
if config.memory:
|
| 267 |
+
from .memory import ChromaMemoryStore
|
| 268 |
+
|
| 269 |
+
# Pick the embedder by backend. The hf backend embeds remotely via
|
| 270 |
+
# HF Inference (no torch/sentence-transformers — fits the free CPU
|
| 271 |
+
# Space). The local backend uses the in-process BGE model.
|
| 272 |
+
if config.backend == "local":
|
| 273 |
+
from .memory import BGEEmbedder
|
| 274 |
+
|
| 275 |
+
embedder = BGEEmbedder()
|
| 276 |
+
else:
|
| 277 |
+
from .memory import HFInferenceEmbedder
|
| 278 |
+
|
| 279 |
+
embedder = HFInferenceEmbedder()
|
| 280 |
|
| 281 |
# reset=True so each episode starts from a clean store rather
|
| 282 |
# than retrieving stale notes from a prior run's persist dir.
|
| 283 |
store = ChromaMemoryStore(
|
| 284 |
+
embedder, persist_dir=run_dir / "chroma", reset=True
|
| 285 |
)
|
| 286 |
|
| 287 |
dialogue_enabled = config.dialogue and n_agents > 1
|
src/labrats/models/hf_provider.py
CHANGED
|
@@ -20,7 +20,11 @@ from huggingface_hub import InferenceClient
|
|
| 20 |
from .client import Message
|
| 21 |
|
| 22 |
DEFAULT_MODEL = "Qwen/Qwen2.5-7B-Instruct"
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
_TRANSIENT_HINTS = ("timeout", "rate limit", "429", "503", "502", "temporarily")
|
| 25 |
# Substrings that indicate the provider rejected our response_format/grammar.
|
| 26 |
# Seen on auto-routed Together/Fireworks/etc. when the JSON schema uses
|
|
@@ -28,6 +32,7 @@ _TRANSIENT_HINTS = ("timeout", "rate limit", "429", "503", "502", "temporarily")
|
|
| 28 |
_RESPONSE_FORMAT_REJECT_HINTS = (
|
| 29 |
"grammar is not valid",
|
| 30 |
"failed to compile grammar",
|
|
|
|
| 31 |
"response_format",
|
| 32 |
"json_schema",
|
| 33 |
)
|
|
@@ -108,6 +113,9 @@ def _is_transient(exc: BaseException) -> bool:
|
|
| 108 |
|
| 109 |
def _is_response_format_rejection(exc: BaseException) -> bool:
|
| 110 |
msg = str(exc).lower()
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
| 112 |
return False
|
| 113 |
return any(h in msg for h in _RESPONSE_FORMAT_REJECT_HINTS)
|
|
|
|
| 20 |
from .client import Message
|
| 21 |
|
| 22 |
DEFAULT_MODEL = "Qwen/Qwen2.5-7B-Instruct"
|
| 23 |
+
# "auto" lets HF route to a provider that actually serves the chosen model.
|
| 24 |
+
# The legacy "hf-inference" provider does not serve many 32B models (e.g.
|
| 25 |
+
# Qwen2.5-Coder-32B), which would make every live call 400 and the agent
|
| 26 |
+
# blind-fall-back to MOVE — looking like random stepping.
|
| 27 |
+
DEFAULT_PROVIDER = "auto"
|
| 28 |
_TRANSIENT_HINTS = ("timeout", "rate limit", "429", "503", "502", "temporarily")
|
| 29 |
# Substrings that indicate the provider rejected our response_format/grammar.
|
| 30 |
# Seen on auto-routed Together/Fireworks/etc. when the JSON schema uses
|
|
|
|
| 32 |
_RESPONSE_FORMAT_REJECT_HINTS = (
|
| 33 |
"grammar is not valid",
|
| 34 |
"failed to compile grammar",
|
| 35 |
+
"does not support response format",
|
| 36 |
"response_format",
|
| 37 |
"json_schema",
|
| 38 |
)
|
|
|
|
| 113 |
|
| 114 |
def _is_response_format_rejection(exc: BaseException) -> bool:
|
| 115 |
msg = str(exc).lower()
|
| 116 |
+
# Providers signal "I can't honor this response_format" with different
|
| 117 |
+
# HTTP codes: 422 (hf-inference/Together grammar compiler) or 400
|
| 118 |
+
# (Groq "does not support response format json_schema"). Accept both.
|
| 119 |
+
if not any(code in msg for code in ("422", "unprocessable", "400", "bad request")):
|
| 120 |
return False
|
| 121 |
return any(h in msg for h in _RESPONSE_FORMAT_REJECT_HINTS)
|