minor-bugfix

#2
by Smestern - opened
src/labrats/live.py CHANGED
@@ -264,12 +264,24 @@ class LiveRunner:
264
 
265
  store = None
266
  if config.memory:
267
- from .memory import BGEEmbedder, ChromaMemoryStore
 
 
 
 
 
 
 
 
 
 
 
 
268
 
269
  # reset=True so each episode starts from a clean store rather
270
  # than retrieving stale notes from a prior run's persist dir.
271
  store = ChromaMemoryStore(
272
- BGEEmbedder(), persist_dir=run_dir / "chroma", reset=True
273
  )
274
 
275
  dialogue_enabled = config.dialogue and n_agents > 1
 
264
 
265
  store = None
266
  if config.memory:
267
+ from .memory import ChromaMemoryStore
268
+
269
+ # Pick the embedder by backend. The hf backend embeds remotely via
270
+ # HF Inference (no torch/sentence-transformers — fits the free CPU
271
+ # Space). The local backend uses the in-process BGE model.
272
+ if config.backend == "local":
273
+ from .memory import BGEEmbedder
274
+
275
+ embedder = BGEEmbedder()
276
+ else:
277
+ from .memory import HFInferenceEmbedder
278
+
279
+ embedder = HFInferenceEmbedder()
280
 
281
  # reset=True so each episode starts from a clean store rather
282
  # than retrieving stale notes from a prior run's persist dir.
283
  store = ChromaMemoryStore(
284
+ embedder, persist_dir=run_dir / "chroma", reset=True
285
  )
286
 
287
  dialogue_enabled = config.dialogue and n_agents > 1
src/labrats/models/hf_provider.py CHANGED
@@ -20,7 +20,11 @@ from huggingface_hub import InferenceClient
20
  from .client import Message
21
 
22
  DEFAULT_MODEL = "Qwen/Qwen2.5-7B-Instruct"
23
- DEFAULT_PROVIDER = "hf-inference"
 
 
 
 
24
  _TRANSIENT_HINTS = ("timeout", "rate limit", "429", "503", "502", "temporarily")
25
  # Substrings that indicate the provider rejected our response_format/grammar.
26
  # Seen on auto-routed Together/Fireworks/etc. when the JSON schema uses
@@ -28,6 +32,7 @@ _TRANSIENT_HINTS = ("timeout", "rate limit", "429", "503", "502", "temporarily")
28
  _RESPONSE_FORMAT_REJECT_HINTS = (
29
  "grammar is not valid",
30
  "failed to compile grammar",
 
31
  "response_format",
32
  "json_schema",
33
  )
@@ -108,6 +113,9 @@ def _is_transient(exc: BaseException) -> bool:
108
 
109
  def _is_response_format_rejection(exc: BaseException) -> bool:
110
  msg = str(exc).lower()
111
- if "422" not in msg and "unprocessable" not in msg:
 
 
 
112
  return False
113
  return any(h in msg for h in _RESPONSE_FORMAT_REJECT_HINTS)
 
20
  from .client import Message
21
 
22
  DEFAULT_MODEL = "Qwen/Qwen2.5-7B-Instruct"
23
+ # "auto" lets HF route to a provider that actually serves the chosen model.
24
+ # The legacy "hf-inference" provider does not serve many 32B models (e.g.
25
+ # Qwen2.5-Coder-32B), so every live call would fail with HTTP 400 and the
26
+ # agent would blindly fall back to MOVE, which looks like random stepping.
27
+ DEFAULT_PROVIDER = "auto"
28
  _TRANSIENT_HINTS = ("timeout", "rate limit", "429", "503", "502", "temporarily")
29
  # Substrings that indicate the provider rejected our response_format/grammar.
30
  # Seen on auto-routed Together/Fireworks/etc. when the JSON schema uses
 
32
  _RESPONSE_FORMAT_REJECT_HINTS = (
33
  "grammar is not valid",
34
  "failed to compile grammar",
35
+ "does not support response format",
36
  "response_format",
37
  "json_schema",
38
  )
 
113
 
114
  def _is_response_format_rejection(exc: BaseException) -> bool:
115
  msg = str(exc).lower()
116
+ # Providers signal "I can't honor this response_format" with different
117
+ # HTTP codes: 422 (hf-inference/Together grammar compiler) or 400
118
+ # (Groq "does not support response format json_schema"). Accept both.
119
+ if not any(code in msg for code in ("422", "unprocessable", "400", "bad request")):
120
  return False
121
  return any(h in msg for h in _RESPONSE_FORMAT_REJECT_HINTS)