MohitGupta41 committed on
Commit
d4b40f7
·
1 Parent(s): 19111af

Add application file

Browse files
Files changed (5) hide show
  1. .env +7 -2
  2. Dockerfile +9 -19
  3. app.py +376 -125
  4. requirements.txt +2 -1
  5. start.sh +0 -42
.env CHANGED
@@ -1,2 +1,7 @@
1
- # MODEL_NAME=llama3:8b
2
- MODEL_NAME=mistral:instruct
 
 
 
 
 
 
1
+ # --- Optional fallbacks (only if you DON'T send keys from the client) ---
2
+ GEMINI_API_KEY=your_gemini_key_here
3
+ HF_API_KEY=hf_your_hf_key_here
4
+
5
+ # --- Optional default models (used if the request doesn't specify `model`) ---
6
+ DEFAULT_GEMINI_MODEL=gemini-1.5-flash
7
+ DEFAULT_HF_MODEL=google/gemma-3-27b-it
Dockerfile CHANGED
@@ -1,26 +1,16 @@
1
  FROM python:3.11-slim
2
 
3
- RUN apt-get update && apt-get install -y curl ca-certificates && rm -rf /var/lib/apt/lists/*
 
4
 
5
- # Install Ollama
6
- RUN curl -fsSL https://ollama.com/install.sh | sh
7
 
8
- # Non-root user
9
- RUN useradd -m -u 1000 appuser
10
- USER appuser
11
-
12
- # ✅ Use absolute paths here (do NOT use $HOME interpolation)
13
- ENV HOME=/home/appuser
14
- ENV PATH="/home/appuser/.local/bin:/usr/local/bin:/usr/local/sbin:/usr/sbin:/usr/bin:/sbin:/bin"
15
- ENV OLLAMA_MODELS="/home/appuser/.ollama"
16
- ENV PYTHONDONTWRITEBYTECODE=1
17
- ENV PYTHONUNBUFFERED=1
18
-
19
- WORKDIR /home/appuser/app
20
- COPY --chown=appuser requirements.txt .
21
  RUN pip install --no-cache-dir -r requirements.txt
22
- COPY --chown=appuser . .
 
 
23
 
24
  EXPOSE 7860
25
- RUN chmod +x start.sh
26
- CMD ["./start.sh"]
 
1
  FROM python:3.11-slim
2
 
3
+ # System deps (certs only)
4
+ RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates && rm -rf /var/lib/apt/lists/*
5
 
6
+ WORKDIR /app
 
7
 
8
+ # Python deps
9
+ COPY requirements.txt .
 
 
 
 
 
 
 
 
 
 
 
10
  RUN pip install --no-cache-dir -r requirements.txt
11
+
12
+ # App code
13
+ COPY . .
14
 
15
  EXPOSE 7860
16
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
app.py CHANGED
@@ -1,22 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
- import time
3
  import logging
4
- from typing import Optional
5
 
6
- from fastapi import FastAPI, HTTPException
7
  from fastapi.responses import JSONResponse
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from pydantic import BaseModel, Field, ConfigDict
10
- import ollama
11
-
12
- # --- Config ---
13
- # MODEL_NAME = os.getenv("MODEL_NAME", "llama3.2:3b-instruct-q4_K_M") # small & CPU-friendly
14
- # MODEL_NAME = os.getenv("MODEL_NAME", "mistral:instruct") # small & CPU-friendly
15
- MODEL_NAME = os.getenv("MODEL_NAME", "smallthinker:latest") # small & CPU-friendly
16
- PROFILE_MD_PATH = os.path.join("Data", "profile_data.md")
17
 
 
18
  logging.basicConfig(level=logging.INFO)
19
- logger = logging.getLogger(__name__)
 
 
20
 
21
  def load_profile_md() -> str:
22
  if os.path.exists(PROFILE_MD_PATH):
@@ -24,143 +190,228 @@ def load_profile_md() -> str:
24
  return f.read()
25
  return ""
26
 
27
- def load_profile_text():
28
- with open("Data/profile_data.txt", "r", encoding="utf-8") as f:
29
- return f.read()
30
-
31
  PROFILE_MD = load_profile_md()
32
- # PROFILE_MD = load_profile_text()
33
- # print(PROFILE_MD)
34
-
35
- SYSTEM_PROMPT = f"""You are Mohit Gupta's AI voice twin, built to assist in interviews and Q&A sessions.
36
- Your job is to answer truthfully, factually, and in a friendly but professional tone using the context provided.
37
 
38
- The context is formatted in Markdown with sections (e.g., # About Me, ## Projects, ### Features).
39
- Use these sections to give structured and relevant answers.
40
- Do not invent details not present in the context. If asked about something outside this context, politely clarify.
 
 
41
 
42
  Guidelines:
43
- - Answer concisely but include specific details when relevant (projects, metrics, tech stack).
44
- - If multiple related sections exist, combine their info naturally.
45
- - Do not repeat the entire context; summarize what is relevant to the question.
46
- - Maintain first-person voice (“I have worked on…”) as you are representing Mohit Gupta.
47
 
48
- Context about Mohit (Markdown format):
49
  {PROFILE_MD}
50
- """
51
 
52
- app = FastAPI(title="Voice Agent API", version="0.1.0")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  app.add_middleware(
54
  CORSMiddleware,
55
- allow_origins=["*"], # tighten for prod
56
  allow_credentials=True,
57
  allow_methods=["*"],
58
  allow_headers=["*"],
59
  )
60
 
61
  class ChatIn(BaseModel):
62
- question: str = Field(..., examples=["Give me a one-line intro about me."])
63
- session_id: str | None = Field(None, examples=["abc123"])
 
 
 
 
 
 
 
 
 
 
 
64
  model_config = ConfigDict(json_schema_extra={
65
- "examples": [
66
- {"question": "Summarize your projects briefly.", "session_id": "demo-1"}
67
- ]
 
 
 
68
  })
69
 
70
  class ChatOut(BaseModel):
71
  answer: str
72
 
73
- def _ollama_ok(timeout=15):
74
- """Wait until ollama serve is ready."""
75
- t0 = time.time()
76
- while time.time() - t0 < timeout:
77
- try:
78
- _ = ollama.list() # hits http://127.0.0.1:11434 by default
79
- return True
80
- except Exception:
81
- time.sleep(0.5)
82
- return False
83
-
84
- @app.on_event("startup")
85
- async def on_start():
86
- logger.info(f"Starting API with model: {MODEL_NAME}")
87
- if not _ollama_ok():
88
- logger.warning("Ollama not ready after wait; requests may fail.")
89
-
90
  @app.get("/")
91
  def root():
92
- return JSONResponse({"ok": True, "message": "Voice Agent API"})
93
 
94
  @app.get("/api/health")
95
  def health():
96
- try:
97
- models = [m["name"] for m in ollama.list().get("models", [])]
98
- print(ollama.list())
99
- return {"ok": True, "model": MODEL_NAME, "available_models": models}
100
- except Exception as e:
101
- return {"ok": False, "error": str(e)}
102
-
103
- @app.post("/api/chat", response_model=ChatOut,
104
- tags=["Chat"], summary="Ask the agent",
105
- description="Send a question; returns a concise first-person answer.")
106
- def chat(payload: ChatIn):
107
- try:
108
- # res = ollama.chat(
109
- # model=MODEL_NAME,
110
- # messages=[
111
- # {"role": "system", "content": SYSTEM_PROMPT},
112
- # {"role": "user", "content": payload.question},
113
- # ],
114
- # )
115
- def build_prompt(question: str) -> str:
116
- # return f"""
117
- # You are Mohit Gupta's AI voice twin...
118
-
119
- # ### Guidelines
120
- # - Answer concisely...
121
- # - First-person voice...
122
-
123
- # ### Context (use this only; do not invent):
124
- return f"""
125
- You are Mohit Gupta's AI voice twin, built to assist in interviews and Q&A sessions.
126
- Your job is to answer truthfully, factually, and in a friendly but professional tone using the context provided.
127
-
128
- The context is formatted in Markdown with sections (e.g., # About Me, ## Projects, ### Features).
129
- Use these sections to give structured and relevant answers.
130
- Do not invent details not present in the context. If asked about something outside this context, politely clarify.
131
-
132
- Guidelines:
133
- - Answer concisely but include specific details when relevant (projects, metrics, tech stack).
134
- - If multiple related sections exist, combine their info naturally.
135
- - Do not repeat the entire context; summarize what is relevant to the question.
136
- - Maintain first-person voice (“I have worked on…”) as you are representing Mohit Gupta.
137
-
138
- Context about Mohit (Markdown format):
139
- {PROFILE_MD}
140
-
141
- ### Task
142
- Answer the user question using ONLY the context above.
143
-
144
- ### Question
145
- {question}
146
- """
147
-
148
- res = ollama.chat(
149
- model=MODEL_NAME,
150
- messages=[{"role": "user", "content": build_prompt(payload.question)}],
151
- options={"num_ctx": 7000} # give yourself room
152
- )
153
-
154
- print(SYSTEM_PROMPT)
155
- print('*'*50)
156
- print(res)
157
- print('*'*50)
158
- print(payload)
159
- text = res.get("message", {}).get("content", "").strip()
160
- return ChatOut(answer=text or "Sorry, I didn’t catch that.")
161
- except Exception as e:
162
- # Show a useful error if the model is missing
163
- if "model" in str(e).lower() and "not found" in str(e).lower():
164
- raise HTTPException(500, f"Model '{MODEL_NAME}' not found in Ollama. "
165
- f"Make sure it’s pulled at start. Error: {e}")
166
- raise
 
1
+ # import os
2
+ # import time
3
+ # import logging
4
+ # from typing import Optional
5
+
6
+ # from fastapi import FastAPI, HTTPException
7
+ # from fastapi.responses import JSONResponse
8
+ # from fastapi.middleware.cors import CORSMiddleware
9
+ # from pydantic import BaseModel, Field, ConfigDict
10
+ # import ollama
11
+
12
+ # # --- Config ---
13
+ # # MODEL_NAME = os.getenv("MODEL_NAME", "llama3.2:3b-instruct-q4_K_M") # small & CPU-friendly
14
+ # # MODEL_NAME = os.getenv("MODEL_NAME", "mistral:instruct") # small & CPU-friendly
15
+ # MODEL_NAME = os.getenv("MODEL_NAME", "smallthinker:latest") # small & CPU-friendly
16
+ # PROFILE_MD_PATH = os.path.join("Data", "profile_data.md")
17
+
18
+ # logging.basicConfig(level=logging.INFO)
19
+ # logger = logging.getLogger(__name__)
20
+
21
+ # def load_profile_md() -> str:
22
+ # if os.path.exists(PROFILE_MD_PATH):
23
+ # with open(PROFILE_MD_PATH, "r", encoding="utf-8") as f:
24
+ # return f.read()
25
+ # return ""
26
+
27
+ # def load_profile_text():
28
+ # with open("Data/profile_data.txt", "r", encoding="utf-8") as f:
29
+ # return f.read()
30
+
31
+ # PROFILE_MD = load_profile_md()
32
+ # # PROFILE_MD = load_profile_text()
33
+ # # print(PROFILE_MD)
34
+
35
+ # SYSTEM_PROMPT = f"""You are Mohit Gupta's AI voice twin, built to assist in interviews and Q&A sessions.
36
+ # Your job is to answer truthfully, factually, and in a friendly but professional tone using the context provided.
37
+
38
+ # The context is formatted in Markdown with sections (e.g., # About Me, ## Projects, ### Features).
39
+ # Use these sections to give structured and relevant answers.
40
+ # Do not invent details not present in the context. If asked about something outside this context, politely clarify.
41
+
42
+ # Guidelines:
43
+ # - Answer concisely but include specific details when relevant (projects, metrics, tech stack).
44
+ # - If multiple related sections exist, combine their info naturally.
45
+ # - Do not repeat the entire context; summarize what is relevant to the question.
46
+ # - Maintain first-person voice (“I have worked on…”) as you are representing Mohit Gupta.
47
+
48
+ # Context about Mohit (Markdown format):
49
+ # {PROFILE_MD}
50
+ # """
51
+
52
+ # app = FastAPI(title="Voice Agent API", version="0.1.0")
53
+ # app.add_middleware(
54
+ # CORSMiddleware,
55
+ # allow_origins=["*"], # tighten for prod
56
+ # allow_credentials=True,
57
+ # allow_methods=["*"],
58
+ # allow_headers=["*"],
59
+ # )
60
+
61
+ # class ChatIn(BaseModel):
62
+ # question: str = Field(..., examples=["Give me a one-line intro about me."])
63
+ # session_id: str | None = Field(None, examples=["abc123"])
64
+ # model_config = ConfigDict(json_schema_extra={
65
+ # "examples": [
66
+ # {"question": "Summarize your projects briefly.", "session_id": "demo-1"}
67
+ # ]
68
+ # })
69
+
70
+ # class ChatOut(BaseModel):
71
+ # answer: str
72
+
73
+ # def _ollama_ok(timeout=15):
74
+ # """Wait until ollama serve is ready."""
75
+ # t0 = time.time()
76
+ # while time.time() - t0 < timeout:
77
+ # try:
78
+ # _ = ollama.list() # hits http://127.0.0.1:11434 by default
79
+ # return True
80
+ # except Exception:
81
+ # time.sleep(0.5)
82
+ # return False
83
+
84
+ # @app.on_event("startup")
85
+ # async def on_start():
86
+ # logger.info(f"Starting API with model: {MODEL_NAME}")
87
+ # if not _ollama_ok():
88
+ # logger.warning("Ollama not ready after wait; requests may fail.")
89
+
90
+ # @app.get("/")
91
+ # def root():
92
+ # return JSONResponse({"ok": True, "message": "Voice Agent API"})
93
+
94
+ # @app.get("/api/health")
95
+ # def health():
96
+ # try:
97
+ # models = [m["name"] for m in ollama.list().get("models", [])]
98
+ # print(ollama.list())
99
+ # return {"ok": True, "model": MODEL_NAME, "available_models": models}
100
+ # except Exception as e:
101
+ # return {"ok": False, "error": str(e)}
102
+
103
+ # @app.post("/api/chat", response_model=ChatOut,
104
+ # tags=["Chat"], summary="Ask the agent",
105
+ # description="Send a question; returns a concise first-person answer.")
106
+ # def chat(payload: ChatIn):
107
+ # try:
108
+ # # res = ollama.chat(
109
+ # # model=MODEL_NAME,
110
+ # # messages=[
111
+ # # {"role": "system", "content": SYSTEM_PROMPT},
112
+ # # {"role": "user", "content": payload.question},
113
+ # # ],
114
+ # # )
115
+ # def build_prompt(question: str) -> str:
116
+ # # return f"""
117
+ # # You are Mohit Gupta's AI voice twin...
118
+
119
+ # # ### Guidelines
120
+ # # - Answer concisely...
121
+ # # - First-person voice...
122
+
123
+ # # ### Context (use this only; do not invent):
124
+ # return f"""
125
+ # You are Mohit Gupta's AI voice twin, built to assist in interviews and Q&A sessions.
126
+ # Your job is to answer truthfully, factually, and in a friendly but professional tone using the context provided.
127
+
128
+ # The context is formatted in Markdown with sections (e.g., # About Me, ## Projects, ### Features).
129
+ # Use these sections to give structured and relevant answers.
130
+ # Do not invent details not present in the context. If asked about something outside this context, politely clarify.
131
+
132
+ # Guidelines:
133
+ # - Answer concisely but include specific details when relevant (projects, metrics, tech stack).
134
+ # - If multiple related sections exist, combine their info naturally.
135
+ # - Do not repeat the entire context; summarize what is relevant to the question.
136
+ # - Maintain first-person voice (“I have worked on…”) as you are representing Mohit Gupta.
137
+
138
+ # Context about Mohit (Markdown format):
139
+ # {PROFILE_MD}
140
+
141
+ # ### Task
142
+ # Answer the user question using ONLY the context above.
143
+
144
+ # ### Question
145
+ # {question}
146
+ # """
147
+
148
+ # res = ollama.chat(
149
+ # model=MODEL_NAME,
150
+ # messages=[{"role": "user", "content": build_prompt(payload.question)}],
151
+ # options={"num_ctx": 7000} # give yourself room
152
+ # )
153
+
154
+ # print(SYSTEM_PROMPT)
155
+ # print('*'*50)
156
+ # print(res)
157
+ # print('*'*50)
158
+ # print(payload)
159
+ # text = res.get("message", {}).get("content", "").strip()
160
+ # return ChatOut(answer=text or "Sorry, I didn’t catch that.")
161
+ # except Exception as e:
162
+ # # Show a useful error if the model is missing
163
+ # if "model" in str(e).lower() and "not found" in str(e).lower():
164
+ # raise HTTPException(500, f"Model '{MODEL_NAME}' not found in Ollama. "
165
+ # f"Make sure it’s pulled at start. Error: {e}")
166
+ # raise
167
+
168
+
169
+
170
+ # app.py
171
  import os
 
172
  import logging
173
+ from typing import Optional, Literal, Dict, Any
174
 
175
+ from fastapi import FastAPI, HTTPException, Header
176
  from fastapi.responses import JSONResponse
177
  from fastapi.middleware.cors import CORSMiddleware
178
  from pydantic import BaseModel, Field, ConfigDict
179
+ import httpx
 
 
 
 
 
 
180
 
181
# ---------- Config ----------
# Module-level setup: logging and the path to the Markdown profile that
# grounds every answer the agent gives.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("voice-agent")

# Profile lives alongside the app code; loaded once at import time below.
PROFILE_MD_PATH = os.path.join("Data", "profile_data.md")
186
 
187
  def load_profile_md() -> str:
188
  if os.path.exists(PROFILE_MD_PATH):
 
190
  return f.read()
191
  return ""
192
 
 
 
 
 
193
# Read once at startup; restart the server to pick up profile edits.
PROFILE_MD = load_profile_md()
 
 
 
 
 
194
 
195
def build_prompt(question: str) -> str:
    """Single-message prompt so it works reliably across providers.

    Inlines the full Markdown profile (module-level PROFILE_MD) plus the
    user question into one string; no separate system message is used, so
    the same prompt works for both Gemini and Hugging Face models.
    """
    # NOTE: the template is flush-left on purpose — the model receives these
    # exact bytes; .strip() only trims the leading/trailing blank lines.
    return f"""
You are Mohit Gupta's AI voice twin, built to assist in interviews and Q&A sessions.
Answer truthfully, factually, and in a friendly but professional tone using ONLY the context provided.

Guidelines:
- Be concise but include specifics when relevant (projects, metrics, tech).
- Combine related details naturally.
- Do NOT invent facts outside the context.
- Speak in first person (“I have worked on…”).

### Context (Markdown)
{PROFILE_MD}

### Task
Answer the question using ONLY the context above.

### Question
{question}

### Answer
""".strip()
218
+
219
+ # ---------- Provider Clients ----------
220
+ # We prefer Gemini by default. If user chooses Hugging Face, we call HF Inference API for the specified model.
221
+
222
async def call_gemini(
    api_key: str,
    model: str,
    prompt: str,
    generation_config: Optional[Dict[str, Any]] = None
) -> str:
    """Generate text with Google Gemini.

    Prefers the official ``google-generativeai`` SDK when installed; falls
    back to the public REST endpoint otherwise. The API key is never logged.

    Args:
        api_key: Gemini API key (per-request; not persisted).
        model: Model name, e.g. ``gemini-1.5-flash``.
        prompt: Fully-built single-message prompt.
        generation_config: Optional overrides; defaults to a low-temperature,
            512-token config.

    Returns:
        The generated text, stripped of surrounding whitespace.

    Raises:
        HTTPException: 502 on provider/transport failures, or the upstream
            HTTP status when Gemini returns a non-200 response.
    """
    generation_config = generation_config or {"temperature": 0.2, "max_output_tokens": 512}

    try:
        # Prefer python SDK (google-generativeai) when available.
        import google.generativeai as genai  # type: ignore
    except ModuleNotFoundError:
        genai = None

    if genai is not None:
        try:
            genai.configure(api_key=api_key)
            gm = genai.GenerativeModel(model)
            resp = gm.generate_content(prompt, generation_config=generation_config)
            # SDK exposes .text on success; it may be empty on safety blocks.
            text = getattr(resp, "text", None) or ""
            if not text:
                raise HTTPException(502, "Gemini returned empty response.")
            return text.strip()
        except HTTPException:
            raise
        except Exception as e:
            # BUGFIX: SDK failures (bad key, quota, network) previously escaped
            # as raw exceptions -> opaque 500s. Surface them as a clean 502.
            raise HTTPException(502, f"Gemini SDK error: {e}") from e

    # REST fallback (model naming differs, e.g. "models/gemini-1.5-flash");
    # try both forms automatically.
    model_names = [model, f"models/{model}"]
    last_err: Optional[Exception] = None
    for m in model_names:
        url = f"https://generativelanguage.googleapis.com/v1beta/{m}:generateContent"
        payload = {
            "contents": [{"parts": [{"text": prompt}]}],
            "generationConfig": generation_config,
        }
        headers = {"x-goog-api-key": api_key}
        try:
            async with httpx.AsyncClient(timeout=60) as client:
                r = await client.post(url, json=payload, headers=headers)
            if r.status_code == 200:
                data = r.json()
                # Extract the first candidate's concatenated text parts.
                candidates = (data.get("candidates") or [])
                if not candidates:
                    raise HTTPException(502, f"Gemini returned no candidates: {data}")
                parts = candidates[0].get("content", {}).get("parts", [])
                text = "".join(p.get("text", "") for p in parts).strip()
                if not text:
                    raise HTTPException(502, "Gemini returned empty text.")
                return text
            last_err = HTTPException(r.status_code, f"Gemini error: {r.text}")
        except Exception as e:
            # Remember the failure and try the alternate model-name form.
            last_err = e

    # All attempts failed — surface the most recent failure.
    if isinstance(last_err, HTTPException):
        raise last_err
    if last_err is not None:
        # BUGFIX: transport errors (timeouts, DNS) used to be re-raised raw,
        # producing unhandled 500s instead of a descriptive gateway error.
        raise HTTPException(502, f"Gemini request failed: {last_err}") from last_err
    raise HTTPException(502, "Gemini request failed")
278
+
279
async def call_huggingface_inference(
    hf_api_key: str,
    model: str,
    prompt: str,
    parameters: Optional[Dict[str, Any]] = None
) -> str:
    """Run a text-generation request against the Hugging Face Inference API.

    Intended for generation models such as ``google/gemma-3-27b-it``.
    Raises ``HTTPException`` mirroring the upstream status on failure.
    """
    parameters = parameters or {
        "max_new_tokens": 512,
        "temperature": 0.2,
        "return_full_text": False,
        "repetition_penalty": 1.1,
    }

    endpoint = f"https://api-inference.huggingface.co/models/{model}"
    auth_headers = {"Authorization": f"Bearer {hf_api_key}"}
    request_body = {"inputs": prompt, "parameters": parameters}

    async with httpx.AsyncClient(timeout=120) as client:
        resp = await client.post(endpoint, headers=auth_headers, json=request_body)

    if resp.status_code == 503:
        # Cold model: the Inference API spins it up on first request.
        raise HTTPException(503, "Hugging Face model is loading. Please retry.")
    if resp.status_code != 200:
        raise HTTPException(resp.status_code, f"Hugging Face error: {resp.text}")

    data = resp.json()
    # Response shape varies by pipeline: list of dicts, bare dict, or string.
    if isinstance(data, list) and data and "generated_text" in data[0]:
        return data[0]["generated_text"].strip()
    if isinstance(data, dict) and "generated_text" in data:
        return data["generated_text"].strip()
    if isinstance(data, str):
        return data.strip()
    raise HTTPException(502, f"Unexpected HF response format: {data}")
319
+
320
# ---------- FastAPI ----------
app = FastAPI(title="Voice Agent API", version="0.2.0")
# CORS is wide open so any frontend can call the API during development.
# NOTE(review): wildcard origins combined with allow_credentials=True is
# rejected by browsers for credentialed requests — pin origins for prod.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # tighten for prod
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
329
 
330
class ChatIn(BaseModel):
    """Request body for /api/chat (also reused by /api/debug/prompt)."""
    question: str = Field(..., examples=["Summarize my projects briefly."])
    session_id: Optional[str] = Field(None, examples=["demo-1"])
    # Which provider to use — default Gemini
    provider: Optional[Literal["gemini", "huggingface"]] = "gemini"
    # Optional: model override per provider
    model: Optional[str] = Field(
        None,
        examples=["gemini-1.5-flash", "google/gemma-3-27b-it"]
    )
    # Per-request API keys (frontend supplies these)
    # NOTE(review): keys in the JSON body can end up in request/debug logs —
    # prefer the header variants; confirm logging config before shipping.
    gemini_api_key: Optional[str] = None
    hf_api_key: Optional[str] = None

    model_config = ConfigDict(json_schema_extra={
        "examples": [{
            "question": "Give me a one-line intro about me.",
            "provider": "gemini",
            "model": "gemini-1.5-flash",
            "gemini_api_key": "YOUR_GEMINI_KEY"
        }]
    })
352
 
353
class ChatOut(BaseModel):
    """Response body: the model's answer text."""
    answer: str
355
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
  @app.get("/")
357
  def root():
358
+ return JSONResponse({"ok": True, "message": "Voice Agent API (Gemini / Hugging Face)"})
359
 
360
  @app.get("/api/health")
361
  def health():
362
+ # No external calls here — just server status & profile presence.
363
+ return {
364
+ "ok": True,
365
+ "profile_loaded": bool(PROFILE_MD),
366
+ "default_context_chars": len(PROFILE_MD),
367
+ "providers": {
368
+ "gemini": "supported",
369
+ "huggingface": "supported"
370
+ }
371
+ }
372
+
373
+ @app.post("/api/chat", response_model=ChatOut, tags=["Chat"], summary="Ask the agent")
374
+ async def chat(
375
+ payload: ChatIn,
376
+ # optional: accept keys via headers (frontend can send them this way instead of JSON)
377
+ x_gemini_api_key: Optional[str] = Header(None),
378
+ x_hf_api_key: Optional[str] = Header(None),
379
+ authorization: Optional[str] = Header(None), # e.g. "Bearer hf_xxx"
380
+ ):
381
+ question = payload.question.strip()
382
+ if not question:
383
+ raise HTTPException(400, "Question is required.")
384
+
385
+ prompt = build_prompt(question)
386
+
387
+ provider = payload.provider or "gemini"
388
+ if provider == "gemini":
389
+ model = payload.model or os.getenv("DEFAULT_GEMINI_MODEL", "gemini-1.5-flash")
390
+ # choose key from body > header > env
391
+ gemini_key = payload.gemini_api_key or x_gemini_api_key or os.getenv("GEMINI_API_KEY")
392
+ if not gemini_key:
393
+ raise HTTPException(400, "Gemini API key is required (send gemini_api_key or X-Gemini-Api-Key).")
394
+ text = await call_gemini(gemini_key, model, prompt)
395
+ return ChatOut(answer=text or "Sorry, I didn't catch that.")
396
+
397
+ elif provider == "huggingface":
398
+ model = payload.model or os.getenv("DEFAULT_HF_MODEL", "google/gemma-3-27b-it")
399
+ # choose key from body > header (X-Hf-Api-Key) > Authorization Bearer > env
400
+ hf_key = payload.hf_api_key or x_hf_api_key
401
+ if not hf_key and authorization and authorization.lower().startswith("bearer "):
402
+ hf_key = authorization.split(" ", 1)[1].strip()
403
+ if not hf_key:
404
+ hf_key = os.getenv("HF_API_KEY")
405
+ if not hf_key:
406
+ raise HTTPException(400, "Hugging Face API key is required (send hf_api_key, X-Hf-Api-Key, or Authorization: Bearer).")
407
+ text = await call_huggingface_inference(hf_key, model, prompt)
408
+ return ChatOut(answer=text or "Sorry, I didn't catch that.")
409
+
410
+ else:
411
+ raise HTTPException(400, f"Unknown provider: {provider}")
412
+
413
# Optional: peek at the exact prompt we send (for debugging)
@app.post("/api/debug/prompt")
def debug_prompt(payload: ChatIn):
    """Return the length and a 1200-char preview of the prompt built for *payload*."""
    prompt = build_prompt(payload.question or "")
    preview = prompt[:1200]
    if len(prompt) > 1200:
        preview += "…[truncated]"
    return {"length": len(prompt), "preview": preview}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  fastapi
2
  uvicorn
3
- ollama
4
  pydantic
 
 
1
  fastapi
2
  uvicorn
3
+ httpx
4
  pydantic
5
+ google-generativeai
start.sh DELETED
@@ -1,42 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -euo pipefail
3
-
4
- echo "HOME=${HOME}"
5
- echo "PATH=${PATH}"
6
- echo "OLLAMA_MODELS=${OLLAMA_MODELS:-<not set>}"
7
-
8
- # ✅ Force a safe, writable models dir if it's wrong or unset
9
- if [ -z "${OLLAMA_MODELS:-}" ] || [ "${OLLAMA_MODELS}" = "/.ollama" ]; then
10
- export OLLAMA_MODELS="/home/appuser/.ollama"
11
- fi
12
-
13
- mkdir -p "${OLLAMA_MODELS}"
14
- echo "Using OLLAMA_MODELS=${OLLAMA_MODELS}"
15
- ls -ld "${OLLAMA_MODELS}"
16
-
17
- echo "Starting ollama serve..."
18
- ollama serve &
19
-
20
- echo -n "Waiting for Ollama"
21
- for i in $(seq 1 60); do
22
- if curl -s http://127.0.0.1:11434/api/tags >/dev/null; then
23
- echo " - ready"
24
- break
25
- fi
26
- echo -n "."
27
- sleep 1
28
- if [ "$i" -eq 60 ]; then
29
- echo "Failed to start Ollama in time"; exit 1
30
- fi
31
- done
32
-
33
- # MODEL_TAG="${MODEL_NAME:-llama3.2:3b-instruct-q4_K_M}"
34
- # MODEL_TAG="${MODEL_NAME:-mistral:instruct}"
35
- MODEL_TAG="${MODEL_NAME:-smallthinker:latest}"
36
- if ! ollama list | grep -q "$MODEL_TAG"; then
37
- echo "Pulling model: $MODEL_TAG"
38
- ollama pull "$MODEL_TAG"
39
- fi
40
-
41
- echo "Starting FastAPI on :7860"
42
- exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1