Spaces:

Chris4K
/

agent-nexus

Running

App Files Community

Chris4K committed about 5 hours ago

Commit

fdd48fc

verified ·

1 Parent(s): c17cc08

Update main.py

Browse files

Files changed (1) hide show

main.py +31 -11

main.py CHANGED Viewed

@@ -253,7 +253,10 @@ async def call_ki_fusion(messages, model, max_tokens=1024, temperature=0.7, stre
     if KF_KEY: headers["Authorization"] = f"Bearer {KF_KEY}"
     payload = {"model": model, "messages": messages,
                 "max_tokens": max_tokens, "temperature": temperature, "stream": stream}
-    async with httpx.AsyncClient(timeout=60) as client:
         if stream:
             async with client.stream("POST", f"{KF_BASE}/chat/completions",
                                       headers=headers, json=payload) as resp:
@@ -287,10 +290,22 @@ async def call_hf_api(messages, model, max_tokens=1024, temperature=0.7, stream=
 async def call_local_cpu(messages, model, max_tokens=512, temperature=0.7, stream=False):
     loop = asyncio.get_event_loop()
     def _run():
         pipe = get_local_pipe()
         if not pipe:
-            raise Exception("Local model not available")
         # Build prompt from messages
         chat_messages = [{"role": m.get("role","user"),
                           "content": m.get("content","") if isinstance(m.get("content"), str) else ""}
@@ -339,10 +354,14 @@ async def route_inference(messages: list, max_tokens: int = 1024, temperature: f
     tried = []
     providers_to_try = [provider]
-    # Build fallback chain
-    for fb in ["ki_fusion","hf_api","local_cpu"]:
         if fb not in providers_to_try and provider_health.get(fb, True):
             providers_to_try.append(fb)
     last_err = None
     for p in providers_to_try:
@@ -357,11 +376,9 @@ async def route_inference(messages: list, max_tokens: int = 1024, temperature: f
                 reason += f" | fallback to {p}"
             if stream:
-                # Streaming: yield raw SSE bytes
                 async def _stream_gen():
                     async for chunk in caller(messages, fb_model, max_tokens, temperature, stream=True):
                         yield chunk
-                # Return a special marker with the generator
                 ms = int((time.time()-t0)*1000)
                 record(p, task, cost_mode, True, ms, 0, fb_model, reason)
                 return {
@@ -379,7 +396,6 @@ async def route_inference(messages: list, max_tokens: int = 1024, temperature: f
             ms = int((time.time()-t0)*1000)
             if isinstance(result, dict):
                 tokens = result.get("usage",{}).get("total_tokens", 0)
-                # Inject routing metadata into response
                 result.setdefault("_nexus", {})
                 result["_nexus"] = {"provider":p,"model":fb_model,"task":task,
                                      "complexity":complexity,"reason":reason,
@@ -389,10 +405,14 @@ async def route_inference(messages: list, max_tokens: int = 1024, temperature: f
         except Exception as e:
             last_err = str(e)
-            log.warning(f"Provider {p} failed: {e}")
-            provider_health[p] = False
             ok = False
-            asyncio.get_event_loop().call_later(60, lambda pr=p: provider_health.update({pr: True}))
     ms = int((time.time()-t0)*1000)
     record(tried[-1] if tried else "none", task, cost_mode, False, ms, 0, model, reason)
@@ -1149,4 +1169,4 @@ loadLog();
 setInterval(function(){loadLog();},8000);
 </script>
 </body>
-</html>"""

     if KF_KEY: headers["Authorization"] = f"Bearer {KF_KEY}"
     payload = {"model": model, "messages": messages,
                 "max_tokens": max_tokens, "temperature": temperature, "stream": stream}
+    # Fast-fail connect: 6s tells us immediately if your server is off.
+    # Read stays at 90s to handle long inference when server IS on.
+    timeout = httpx.Timeout(connect=6.0, read=90.0, write=10.0, pool=5.0)
+    async with httpx.AsyncClient(timeout=timeout) as client:
         if stream:
             async with client.stream("POST", f"{KF_BASE}/chat/completions",
                                       headers=headers, json=payload) as resp:
 async def call_local_cpu(messages, model, max_tokens=512, temperature=0.7, stream=False):
     loop = asyncio.get_event_loop()
+    # Bug fix: if model is still loading (_local_loading=True), wait up to 90s
+    # instead of failing immediately. This is the guaranteed last-resort provider.
+    waited = 0
+    while _local_loading and waited < 90:
+        log.info(f"[local_cpu] Model still loading, waiting… ({waited}s)")
+        await asyncio.sleep(3)
+        waited += 3
+    # If not loaded yet, trigger a load attempt now (runs in an executor thread; awaited here)
+    if not _local_pipe and not _local_loading:
+        log.info("[local_cpu] Triggering model load now (first request)")
+        await loop.run_in_executor(None, get_local_pipe)
     def _run():
         pipe = get_local_pipe()
         if not pipe:
+            raise Exception("Local model not available — transformers load failed. Check logs for OOM or missing dependencies.")
         # Build prompt from messages
         chat_messages = [{"role": m.get("role","user"),
                           "content": m.get("content","") if isinstance(m.get("content"), str) else ""}
     tried = []
     providers_to_try = [provider]
+    # Build fallback chain: ki_fusion → hf_api can be skipped if health=False,
+    # but local_cpu is ALWAYS added last — it's the guaranteed offline fallback.
+    for fb in ["ki_fusion", "hf_api"]:
         if fb not in providers_to_try and provider_health.get(fb, True):
             providers_to_try.append(fb)
+    # local_cpu: always last, always tried — never skip it
+    if "local_cpu" not in providers_to_try:
+        providers_to_try.append("local_cpu")
     last_err = None
     for p in providers_to_try:
                 reason += f" | fallback to {p}"
             if stream:
                 async def _stream_gen():
                     async for chunk in caller(messages, fb_model, max_tokens, temperature, stream=True):
                         yield chunk
                 ms = int((time.time()-t0)*1000)
                 record(p, task, cost_mode, True, ms, 0, fb_model, reason)
                 return {
             ms = int((time.time()-t0)*1000)
             if isinstance(result, dict):
                 tokens = result.get("usage",{}).get("total_tokens", 0)
                 result.setdefault("_nexus", {})
                 result["_nexus"] = {"provider":p,"model":fb_model,"task":task,
                                      "complexity":complexity,"reason":reason,
         except Exception as e:
             last_err = str(e)
+            # Log full error so it appears in HF logs — key diagnostic info
+            log.error(f"[NEXUS] Provider '{p}' FAILED: {last_err}")
+            # Don't mark local_cpu unhealthy, even temporarily — it's the guaranteed fallback.
+            # Disabling it means ALL subsequent requests fail until the 60s restore.
+            if p != "local_cpu":
+                provider_health[p] = False
+                asyncio.get_event_loop().call_later(60, lambda pr=p: provider_health.update({pr: True}))
             ok = False
     ms = int((time.time()-t0)*1000)
     record(tried[-1] if tried else "none", task, cost_mode, False, ms, 0, model, reason)
 setInterval(function(){loadLog();},8000);
 </script>
 </body>
+</html>"""