Adi362 committed on
Commit
7ef84f5
·
verified ·
1 Parent(s): 491b51c

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +64 -6
main.py CHANGED
@@ -6,6 +6,8 @@ from pydantic import BaseModel
6
  import httpx
7
  from duckduckgo_search import DDGS
8
  from dotenv import load_dotenv
 
 
9
 
10
  load_dotenv()
11
 
@@ -26,8 +28,24 @@ class ChatRequest(BaseModel):
26
 
27
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
28
 
29
- # Fallback check - if we were actually deploying on HF with a local GGUF,
30
- # we would load llama-cpp-python here. For this stage, we'll setup the Groq primary pipeline.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  async def evaluate_needs_search(query: str) -> bool:
33
  """Uses a fast, small model to determine if the query requires real-time data."""
@@ -82,8 +100,8 @@ def perform_search(query: str, max_results: int = 3) -> str:
82
 
83
  @app.post("/chat/completions")
84
  async def situation_aware_chat(request: ChatRequest):
85
- if not GROQ_API_KEY:
86
- raise HTTPException(status_code=500, detail="GROQ_API_KEY is not set in the environment.")
87
 
88
  # 1. Evaluate if search is needed
89
  user_query = request.message
@@ -113,6 +131,9 @@ async def situation_aware_chat(request: ChatRequest):
113
 
114
  # 3. Call Primary LLM
115
  try:
 
 
 
116
  async with httpx.AsyncClient() as client:
117
  response = await client.post(
118
  "https://api.groq.com/openai/v1/chat/completions",
@@ -130,7 +151,44 @@ async def situation_aware_chat(request: ChatRequest):
130
  return result
131
  except Exception as e:
132
  print(f"Primary LLM Error: {e}")
133
- # Here we would fallback to `llama-cpp-python` local inference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  raise HTTPException(status_code=503, detail="Primary AI service is currently unavailable.")
135
 
136
  @app.get("/health")
@@ -138,4 +196,4 @@ def health_check():
138
  return {"status": "ok", "service": "edyx-situation-aware-pipeline"}
139
 
140
  if __name__ == "__main__":
141
- uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
 
6
  import httpx
7
  from duckduckgo_search import DDGS
8
  from dotenv import load_dotenv
9
+ from huggingface_hub import hf_hub_download
10
+ from llama_cpp import Llama
11
 
12
  load_dotenv()
13
 
 
28
 
29
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
30
 
31
+ # --- Local Fallback LLM Setup ---
32
+ fallback_llm = None
33
+ try:
34
+ print("Initializing Local Fallback Model (Qwen 0.5B GGUF)...")
35
+ model_path = hf_hub_download(
36
+ repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
37
+ filename="qwen1_5-0_5b-chat-q4_k_m.gguf",
38
+ cache_dir="./models"
39
+ )
40
+ fallback_llm = Llama(
41
+ model_path=model_path,
42
+ n_ctx=2048,
43
+ n_gpu_layers=0, # CPU only on basic spaces
44
+ verbose=False
45
+ )
46
+ print("Local Fallback Model ready.")
47
+ except Exception as e:
48
+ print(f"Failed to initialize local fallback LLM: {e}")
49
 
50
  async def evaluate_needs_search(query: str) -> bool:
51
  """Uses a fast, small model to determine if the query requires real-time data."""
 
100
 
101
  @app.post("/chat/completions")
102
  async def situation_aware_chat(request: ChatRequest):
103
+ if not GROQ_API_KEY and not fallback_llm:
104
+ raise HTTPException(status_code=500, detail="No AI service is currently available.")
105
 
106
  # 1. Evaluate if search is needed
107
  user_query = request.message
 
131
 
132
  # 3. Call Primary LLM
133
  try:
134
+ if not GROQ_API_KEY:
135
+ raise Exception("GROQ API Key missing, forcing fallback.")
136
+
137
  async with httpx.AsyncClient() as client:
138
  response = await client.post(
139
  "https://api.groq.com/openai/v1/chat/completions",
 
151
  return result
152
  except Exception as e:
153
  print(f"Primary LLM Error: {e}")
154
+
155
+ # 4. Execute Local Fallback
156
+ if fallback_llm:
157
+ print("Primary API failed. Firing local fallback inference...")
158
+ try:
159
+ # Format for huggingface chat template (basic approximation)
160
+ prompt_text = "\n".join([f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>" for m in final_messages])
161
+ prompt_text += "\n<|im_start|>assistant\n"
162
+
163
+ output = fallback_llm(
164
+ prompt_text,
165
+ max_tokens=1024,
166
+ temperature=0.7,
167
+ stop=["<|im_end|>", "<|im_start|>"]
168
+ )
169
+
170
+ # Format to match OpenAI API Spec
171
+ return {
172
+ "id": output.get("id", "fallback_id"),
173
+ "object": "chat.completion",
174
+ "created": output.get("created", 0),
175
+ "model": "qwen-0.5b-local-fallback",
176
+ "choices": [
177
+ {
178
+ "index": 0,
179
+ "message": {
180
+ "role": "assistant",
181
+ "content": output["choices"][0]["text"].strip()
182
+ },
183
+ "finish_reason": "stop"
184
+ }
185
+ ],
186
+ "usage": output.get("usage", {})
187
+ }
188
+ except Exception as fallback_e:
189
+ print(f"Fallback LLM Error: {fallback_e}")
190
+ raise HTTPException(status_code=503, detail="Primary AI and Local Fallback are currently unavailable.")
191
+
192
  raise HTTPException(status_code=503, detail="Primary AI service is currently unavailable.")
193
 
194
  @app.get("/health")
 
196
  return {"status": "ok", "service": "edyx-situation-aware-pipeline"}
197
 
198
  if __name__ == "__main__":
199
+ uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)