Adi362 committed on
Commit
7ef84f5
·
verified ·
1 Parent(s): 491b51c

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +64 -6
main.py CHANGED
@@ -6,6 +6,8 @@ from pydantic import BaseModel
6
  import httpx
7
  from duckduckgo_search import DDGS
8
  from dotenv import load_dotenv
 
 
9
 
10
  load_dotenv()
11
 
@@ -26,8 +28,24 @@ class ChatRequest(BaseModel):
26
 
27
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
28
 
29
- # Fallback check - if we were actually deploying on HF with a local GGUF,
30
- # we would load llama-cpp-python here. For this stage, we'll setup the Groq primary pipeline.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  async def evaluate_needs_search(query: str) -> bool:
33
  """Uses a fast, small model to determine if the query requires real-time data."""
@@ -82,8 +100,8 @@ def perform_search(query: str, max_results: int = 3) -> str:
82
 
83
  @app.post("/chat/completions")
84
  async def situation_aware_chat(request: ChatRequest):
85
- if not GROQ_API_KEY:
86
- raise HTTPException(status_code=500, detail="GROQ_API_KEY is not set in the environment.")
87
 
88
  # 1. Evaluate if search is needed
89
  user_query = request.message
@@ -113,6 +131,9 @@ async def situation_aware_chat(request: ChatRequest):
113
 
114
  # 3. Call Primary LLM
115
  try:
 
 
 
116
  async with httpx.AsyncClient() as client:
117
  response = await client.post(
118
  "https://api.groq.com/openai/v1/chat/completions",
@@ -130,7 +151,44 @@ async def situation_aware_chat(request: ChatRequest):
130
  return result
131
  except Exception as e:
132
  print(f"Primary LLM Error: {e}")
133
- # Here we would fallback to `llama-cpp-python` local inference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  raise HTTPException(status_code=503, detail="Primary AI service is currently unavailable.")
135
 
136
  @app.get("/health")
@@ -138,4 +196,4 @@ def health_check():
138
  return {"status": "ok", "service": "edyx-situation-aware-pipeline"}
139
 
140
  if __name__ == "__main__":
141
- uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
 
6
  import httpx
7
  from duckduckgo_search import DDGS
8
  from dotenv import load_dotenv
9
+ from huggingface_hub import hf_hub_download
10
+ from llama_cpp import Llama
11
 
12
  load_dotenv()
13
 
 
28
 
29
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
30
 
31
+ # --- Local Fallback LLM Setup ---
32
+ fallback_llm = None
33
+ try:
34
+ print("Initializing Local Fallback Model (Qwen 0.5B GGUF)...")
35
+ model_path = hf_hub_download(
36
+ repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
37
+ filename="qwen1_5-0_5b-chat-q4_k_m.gguf",
38
+ cache_dir="./models"
39
+ )
40
+ fallback_llm = Llama(
41
+ model_path=model_path,
42
+ n_ctx=2048,
43
+ n_gpu_layers=0, # CPU only on basic spaces
44
+ verbose=False
45
+ )
46
+ print("Local Fallback Model ready.")
47
+ except Exception as e:
48
+ print(f"Failed to initialize local fallback LLM: {e}")
49
 
50
  async def evaluate_needs_search(query: str) -> bool:
51
  """Uses a fast, small model to determine if the query requires real-time data."""
 
100
 
101
  @app.post("/chat/completions")
102
  async def situation_aware_chat(request: ChatRequest):
103
+ if not GROQ_API_KEY and not fallback_llm:
104
+ raise HTTPException(status_code=500, detail="No AI service is currently available.")
105
 
106
  # 1. Evaluate if search is needed
107
  user_query = request.message
 
131
 
132
  # 3. Call Primary LLM
133
  try:
134
+ if not GROQ_API_KEY:
135
+ raise Exception("GROQ API Key missing, forcing fallback.")
136
+
137
  async with httpx.AsyncClient() as client:
138
  response = await client.post(
139
  "https://api.groq.com/openai/v1/chat/completions",
 
151
  return result
152
  except Exception as e:
153
  print(f"Primary LLM Error: {e}")
154
+
155
+ # 4. Execute Local Fallback
156
+ if fallback_llm:
157
+ print("Primary API failed. Firing local fallback inference...")
158
+ try:
159
+ # Format for huggingface chat template (basic approximation)
160
+ prompt_text = "\n".join([f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>" for m in final_messages])
161
+ prompt_text += "\n<|im_start|>assistant\n"
162
+
163
+ output = fallback_llm(
164
+ prompt_text,
165
+ max_tokens=1024,
166
+ temperature=0.7,
167
+ stop=["<|im_end|>", "<|im_start|>"]
168
+ )
169
+
170
+ # Format to match OpenAI API Spec
171
+ return {
172
+ "id": output.get("id", "fallback_id"),
173
+ "object": "chat.completion",
174
+ "created": output.get("created", 0),
175
+ "model": "qwen-0.5b-local-fallback",
176
+ "choices": [
177
+ {
178
+ "index": 0,
179
+ "message": {
180
+ "role": "assistant",
181
+ "content": output["choices"][0]["text"].strip()
182
+ },
183
+ "finish_reason": "stop"
184
+ }
185
+ ],
186
+ "usage": output.get("usage", {})
187
+ }
188
+ except Exception as fallback_e:
189
+ print(f"Fallback LLM Error: {fallback_e}")
190
+ raise HTTPException(status_code=503, detail="Primary AI and Local Fallback are currently unavailable.")
191
+
192
  raise HTTPException(status_code=503, detail="Primary AI service is currently unavailable.")
193
 
194
  @app.get("/health")
 
196
  return {"status": "ok", "service": "edyx-situation-aware-pipeline"}
197
 
198
  if __name__ == "__main__":
199
+ uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)