Spaces:

build-small-hackathon
/

labrats

Sleeping

minor-bugfix

by Smestern - opened 18 days ago

←

This PR is in draft mode

Files changed (2) hide show

src/labrats/live.py CHANGED Viewed

@@ -264,12 +264,24 @@ class LiveRunner:
         store = None
         if config.memory:
-            from .memory import BGEEmbedder, ChromaMemoryStore
             # reset=True so each episode starts from a clean store rather
             # than retrieving stale notes from a prior run's persist dir.
             store = ChromaMemoryStore(
-                BGEEmbedder(), persist_dir=run_dir / "chroma", reset=True
             )
         dialogue_enabled = config.dialogue and n_agents > 1

         store = None
         if config.memory:
+            from .memory import ChromaMemoryStore
+            # Pick the embedder by backend. The hf backend embeds remotely via
+            # HF Inference (no torch/sentence-transformers — fits the free CPU
+            # Space). The local backend uses the in-process BGE model.
+            if config.backend == "local":
+                from .memory import BGEEmbedder
+                embedder = BGEEmbedder()
+            else:
+                from .memory import HFInferenceEmbedder
+                embedder = HFInferenceEmbedder()
             # reset=True so each episode starts from a clean store rather
             # than retrieving stale notes from a prior run's persist dir.
             store = ChromaMemoryStore(
+                embedder, persist_dir=run_dir / "chroma", reset=True
             )
         dialogue_enabled = config.dialogue and n_agents > 1

src/labrats/models/hf_provider.py CHANGED Viewed

@@ -20,7 +20,11 @@ from huggingface_hub import InferenceClient
 from .client import Message
 DEFAULT_MODEL = "Qwen/Qwen2.5-7B-Instruct"
-DEFAULT_PROVIDER = "hf-inference"
 _TRANSIENT_HINTS = ("timeout", "rate limit", "429", "503", "502", "temporarily")
 # Substrings that indicate the provider rejected our response_format/grammar.
 # Seen on auto-routed Together/Fireworks/etc. when the JSON schema uses
@@ -28,6 +32,7 @@ _TRANSIENT_HINTS = ("timeout", "rate limit", "429", "503", "502", "temporarily")
 _RESPONSE_FORMAT_REJECT_HINTS = (
     "grammar is not valid",
     "failed to compile grammar",
     "response_format",
     "json_schema",
 )
@@ -108,6 +113,9 @@ def _is_transient(exc: BaseException) -> bool:
 def _is_response_format_rejection(exc: BaseException) -> bool:
     msg = str(exc).lower()
-    if "422" not in msg and "unprocessable" not in msg:
         return False
     return any(h in msg for h in _RESPONSE_FORMAT_REJECT_HINTS)

 from .client import Message
 DEFAULT_MODEL = "Qwen/Qwen2.5-7B-Instruct"
+# "auto" lets HF route to a provider that actually serves the chosen model.
+# The legacy "hf-inference" provider does not serve many 32B models (e.g.
+# Qwen2.5-Coder-32B), so every live call would fail with HTTP 400 and the
+# agent would blindly fall back to MOVE, which looks like random stepping.
+DEFAULT_PROVIDER = "auto"
 _TRANSIENT_HINTS = ("timeout", "rate limit", "429", "503", "502", "temporarily")
 # Substrings that indicate the provider rejected our response_format/grammar.
 # Seen on auto-routed Together/Fireworks/etc. when the JSON schema uses
 _RESPONSE_FORMAT_REJECT_HINTS = (
     "grammar is not valid",
     "failed to compile grammar",
+    "does not support response format",
     "response_format",
     "json_schema",
 )
 def _is_response_format_rejection(exc: BaseException) -> bool:
     msg = str(exc).lower()
+    # Providers signal "I can't honor this response_format" with different
+    # HTTP codes: 422 (hf-inference/Together grammar compiler) or 400
+    # (Groq "does not support response format json_schema"). Accept both.
+    if not any(code in msg for code in ("422", "unprocessable", "400", "bad request")):
         return False
     return any(h in msg for h in _RESPONSE_FORMAT_REJECT_HINTS)