Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
|
@@ -6,6 +6,8 @@ from pydantic import BaseModel
|
|
| 6 |
import httpx
|
| 7 |
from duckduckgo_search import DDGS
|
| 8 |
from dotenv import load_dotenv
|
|
|
|
|
|
|
| 9 |
|
| 10 |
load_dotenv()
|
| 11 |
|
|
@@ -26,8 +28,24 @@ class ChatRequest(BaseModel):
|
|
| 26 |
|
| 27 |
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
|
| 28 |
|
| 29 |
-
#
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
async def evaluate_needs_search(query: str) -> bool:
|
| 33 |
"""Uses a fast, small model to determine if the query requires real-time data."""
|
|
@@ -82,8 +100,8 @@ def perform_search(query: str, max_results: int = 3) -> str:
|
|
| 82 |
|
| 83 |
@app.post("/chat/completions")
|
| 84 |
async def situation_aware_chat(request: ChatRequest):
|
| 85 |
-
if not GROQ_API_KEY:
|
| 86 |
-
raise HTTPException(status_code=500, detail="
|
| 87 |
|
| 88 |
# 1. Evaluate if search is needed
|
| 89 |
user_query = request.message
|
|
@@ -113,6 +131,9 @@ async def situation_aware_chat(request: ChatRequest):
|
|
| 113 |
|
| 114 |
# 3. Call Primary LLM
|
| 115 |
try:
|
|
|
|
|
|
|
|
|
|
| 116 |
async with httpx.AsyncClient() as client:
|
| 117 |
response = await client.post(
|
| 118 |
"https://api.groq.com/openai/v1/chat/completions",
|
|
@@ -130,7 +151,44 @@ async def situation_aware_chat(request: ChatRequest):
|
|
| 130 |
return result
|
| 131 |
except Exception as e:
|
| 132 |
print(f"Primary LLM Error: {e}")
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
raise HTTPException(status_code=503, detail="Primary AI service is currently unavailable.")
|
| 135 |
|
| 136 |
@app.get("/health")
|
|
@@ -138,4 +196,4 @@ def health_check():
|
|
| 138 |
return {"status": "ok", "service": "edyx-situation-aware-pipeline"}
|
| 139 |
|
| 140 |
if __name__ == "__main__":
|
| 141 |
-
uvicorn.run("main:app", host="0.0.0.0", port=
|
|
|
|
| 6 |
import httpx
|
| 7 |
from duckduckgo_search import DDGS
|
| 8 |
from dotenv import load_dotenv
|
| 9 |
+
from huggingface_hub import hf_hub_download
|
| 10 |
+
from llama_cpp import Llama
|
| 11 |
|
| 12 |
load_dotenv()
|
| 13 |
|
|
|
|
| 28 |
|
| 29 |
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
|
| 30 |
|
| 31 |
+
# --- Local Fallback LLM Setup ---
# Best-effort initialisation performed at import time: `fallback_llm` is
# pre-set to None and keeps that value if anything inside the `try` raises;
# the failure is only printed, never propagated.
fallback_llm = None
try:
    print("Initializing Local Fallback Model (Qwen 0.5B GGUF)...")

    # The value returned by hf_hub_download is passed to Llama as the model path.
    model_path = hf_hub_download(
        repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
        filename="qwen1_5-0_5b-chat-q4_k_m.gguf",
        cache_dir="./models",
    )

    # n_gpu_layers=0: CPU only on basic spaces.
    fallback_llm = Llama(model_path=model_path, n_ctx=2048, n_gpu_layers=0, verbose=False)

    print("Local Fallback Model ready.")
except Exception as e:
    print(f"Failed to initialize local fallback LLM: {e}")
|
| 49 |
|
| 50 |
async def evaluate_needs_search(query: str) -> bool:
|
| 51 |
"""Uses a fast, small model to determine if the query requires real-time data."""
|
|
|
|
| 100 |
|
| 101 |
@app.post("/chat/completions")
|
| 102 |
async def situation_aware_chat(request: ChatRequest):
|
| 103 |
+
if not GROQ_API_KEY and not fallback_llm:
|
| 104 |
+
raise HTTPException(status_code=500, detail="No AI service is currently available.")
|
| 105 |
|
| 106 |
# 1. Evaluate if search is needed
|
| 107 |
user_query = request.message
|
|
|
|
| 131 |
|
| 132 |
# 3. Call Primary LLM
|
| 133 |
try:
|
| 134 |
+
if not GROQ_API_KEY:
|
| 135 |
+
raise Exception("GROQ API Key missing, forcing fallback.")
|
| 136 |
+
|
| 137 |
async with httpx.AsyncClient() as client:
|
| 138 |
response = await client.post(
|
| 139 |
"https://api.groq.com/openai/v1/chat/completions",
|
|
|
|
| 151 |
return result
|
| 152 |
except Exception as e:
|
| 153 |
print(f"Primary LLM Error: {e}")
|
| 154 |
+
|
| 155 |
+
# 4. Execute Local Fallback
|
| 156 |
+
if fallback_llm:
|
| 157 |
+
print("Primary API failed. Firing local fallback inference...")
|
| 158 |
+
try:
|
| 159 |
+
# Format for huggingface chat template (basic approximation)
|
| 160 |
+
prompt_text = "\n".join([f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>" for m in final_messages])
|
| 161 |
+
prompt_text += "\n<|im_start|>assistant\n"
|
| 162 |
+
|
| 163 |
+
output = fallback_llm(
|
| 164 |
+
prompt_text,
|
| 165 |
+
max_tokens=1024,
|
| 166 |
+
temperature=0.7,
|
| 167 |
+
stop=["<|im_end|>", "<|im_start|>"]
|
| 168 |
+
)
|
| 169 |
+
|
| 170 |
+
# Format to match OpenAI API Spec
|
| 171 |
+
return {
|
| 172 |
+
"id": output.get("id", "fallback_id"),
|
| 173 |
+
"object": "chat.completion",
|
| 174 |
+
"created": output.get("created", 0),
|
| 175 |
+
"model": "qwen-0.5b-local-fallback",
|
| 176 |
+
"choices": [
|
| 177 |
+
{
|
| 178 |
+
"index": 0,
|
| 179 |
+
"message": {
|
| 180 |
+
"role": "assistant",
|
| 181 |
+
"content": output["choices"][0]["text"].strip()
|
| 182 |
+
},
|
| 183 |
+
"finish_reason": "stop"
|
| 184 |
+
}
|
| 185 |
+
],
|
| 186 |
+
"usage": output.get("usage", {})
|
| 187 |
+
}
|
| 188 |
+
except Exception as fallback_e:
|
| 189 |
+
print(f"Fallback LLM Error: {fallback_e}")
|
| 190 |
+
raise HTTPException(status_code=503, detail="Primary AI and Local Fallback are currently unavailable.")
|
| 191 |
+
|
| 192 |
raise HTTPException(status_code=503, detail="Primary AI service is currently unavailable.")
|
| 193 |
|
| 194 |
@app.get("/health")
|
|
|
|
| 196 |
return {"status": "ok", "service": "edyx-situation-aware-pipeline"}
|
| 197 |
|
| 198 |
if __name__ == "__main__":
    # Script entry point: serve the `app` object from module `main` via uvicorn.
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=7860,
        reload=True,
    )
|