Kalpokoch committed
Commit 4dd3f6e · verified · 1 Parent(s): 8193d78

Update app/app.py

Files changed (1)
  1. app/app.py +87 -38
app/app.py CHANGED
@@ -72,17 +72,18 @@ except Exception as e:
     db_ready = False
 
 # -----------------------------
-# ✅ Load TinyLlama GGUF Model
+# ✅ Load TinyLlama GGUF Model with Improved Settings
 # -----------------------------
 logger.info(f"Loading GGUF model from: {MODEL_PATH}")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
         n_ctx=2048,
-        n_threads=1,
-        n_batch=512,
+        n_threads=2,   # Increased threads for better performance
+        n_batch=256,   # Reduced batch size for stability
         use_mlock=True,
-        verbose=False
+        verbose=False,
+        seed=42        # Added seed for reproducible results
     )
     logger.info("GGUF model loaded successfully.")
     model_ready = True
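
The new loader arguments are all standard `llama_cpp.Llama` constructor parameters. As a minimal sketch, the same settings could be made tunable per deployment through environment variables; the variable names and the placeholder GGUF path below are assumptions for illustration, not part of this commit:

```python
import os

from llama_cpp import Llama

# Hypothetical env-var overrides; defaults mirror the values in this commit.
MODEL_PATH = os.getenv("MODEL_PATH", "models/tinyllama.Q4_K_M.gguf")  # placeholder path

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,                                    # same context window as app.py
    n_threads=int(os.getenv("LLM_THREADS", "2")),  # 2 threads, per this commit
    n_batch=int(os.getenv("LLM_BATCH", "256")),    # smaller batch for stability
    use_mlock=True,                                # pin model pages in RAM
    verbose=False,
    seed=42,                                       # fixed seed for repeatable sampling
)
```

Note that `seed=42` only fixes the sampler's RNG: repeated runs of an identical prompt with identical parameters become repeatable, but any change to the prompt still changes the output.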
@@ -174,17 +175,79 @@ def detect_filters(question_lower: str) -> tuple:
 
     return section_filter, chunk_type_filter
 
-async def generate_llm_response(prompt: str, request_id: str):
-    loop = asyncio.get_running_loop()
-    response = await loop.run_in_executor(
-        None,
-        lambda: llm(prompt, max_tokens=1024, stop=["###", "Question:", "Context:", ""], temperature=0.05, echo=False)
-    )
+def clean_llm_response(raw_response: str) -> str:
+    """Clean and validate LLM response"""
+    if not raw_response:
+        return ""
+
+    # Remove common unwanted patterns
+    cleaned = raw_response.strip()
+
+    # Remove incomplete sentences at the end
+    if cleaned and not cleaned.endswith(('.', '!', '?', ':', '|')):
+        # Find the last complete sentence
+        sentences = re.split(r'[.!?]', cleaned)
+        if len(sentences) > 1:
+            cleaned = '.'.join(sentences[:-1]) + '.'
+
+    return cleaned
 
-    answer = response["choices"][0]["text"].strip()
-    if not answer:
-        raise ValueError("Empty response from LLM")
-    return answer
+async def generate_llm_response(prompt: str, request_id: str, adapter: RequestIdAdapter):
+    """Improved LLM response generation with better error handling"""
+    loop = asyncio.get_running_loop()
+
+    # Multiple generation attempts with different parameters
+    generation_configs = [
+        {
+            "max_tokens": 512,
+            "temperature": 0.1,
+            "top_p": 0.9,
+            "repeat_penalty": 1.1,
+            "stop": ["</s>", "[INST]", "[/INST]", "Question:", "Context:", "###"]
+        },
+        {
+            "max_tokens": 256,
+            "temperature": 0.3,
+            "top_p": 0.8,
+            "repeat_penalty": 1.2,
+            "stop": ["</s>", "\n\n", "Question:", "Context:"]
+        },
+        {
+            "max_tokens": 128,
+            "temperature": 0.5,
+            "top_p": 0.7,
+            "repeat_penalty": 1.15,
+            "stop": ["</s>"]
+        }
+    ]
+
+    for attempt, config in enumerate(generation_configs, 1):
+        try:
+            adapter.info(f"LLM generation attempt {attempt}/{len(generation_configs)} with config: {config}")
+
+            response = await loop.run_in_executor(
+                None,
+                lambda: llm(prompt, echo=False, **config)
+            )
+
+            raw_answer = response["choices"][0]["text"]
+            cleaned_answer = clean_llm_response(raw_answer)
+
+            adapter.info(f"Attempt {attempt} - Raw response length: {len(raw_answer)}, Cleaned length: {len(cleaned_answer)}")
+
+            if cleaned_answer and len(cleaned_answer.strip()) > 10:  # Minimum meaningful response
+                adapter.info(f"Successful generation on attempt {attempt}")
+                return cleaned_answer
+            else:
+                adapter.warning(f"Attempt {attempt} produced insufficient response: '{cleaned_answer}'")
+
+        except Exception as e:
+            adapter.error(f"Attempt {attempt} failed: {e}")
+            continue
+
+    # If all attempts fail, return a fallback message
+    adapter.error("All LLM generation attempts failed")
+    raise ValueError("Unable to generate a meaningful response after multiple attempts")
 
 # -----------------------------
 # ✅ Endpoints
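
The retry loop above escalates through three sampling configurations, trading determinism (low temperature, many stop tokens) for permissiveness on later attempts, and routes every candidate through the new `clean_llm_response` helper. A standalone check of that helper, with the function body copied from this commit and an invented sample string:

```python
import re

def clean_llm_response(raw_response: str) -> str:
    """Same logic as the helper added in this commit."""
    if not raw_response:
        return ""
    cleaned = raw_response.strip()
    # Drop a dangling, unterminated sentence at the end.
    if cleaned and not cleaned.endswith(('.', '!', '?', ':', '|')):
        sentences = re.split(r'[.!?]', cleaned)
        if len(sentences) > 1:
            cleaned = '.'.join(sentences[:-1]) + '.'
    return cleaned

# A generation cut off mid-sentence loses its trailing fragment:
print(clean_llm_response("Approval rests with the CMD. Beyond that the propos"))
# -> Approval rests with the CMD.
```

One caveat: the split treats every '.', '!' and '?' as a sentence boundary, so any '!' or '?' inside the kept text is rewritten to '.' when the sentences are rejoined.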
@@ -252,6 +315,7 @@ async def chat(query: Query, request: Request):
     if not search_results:
         adapter.warning("No relevant context found in vector DB.")
         return {
+            "request_id": request.state.request_id,
             "question": query.question,
             "context_used": "No relevant context found.",
             "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
@@ -292,42 +356,27 @@ async def chat(query: Query, request: Request):
 
     adapter.info(f"Selected context metadata: {context_metadata}")
 
-    # 6. Build Prompt
-    prompt = f"""<|system|>
-You are a precise and factual assistant for NEEPCO's Delegation of Powers (DoP) policy.
-Your task is to answer the user's question based ONLY on the provided context.
-
-- **Formatting Rule:** If the answer contains a list of items or steps, you **MUST** separate each item with a pipe symbol (`|`). For example: `First item|Second item|Third item`.
-
-- **Content Rule:** If the information is not in the provided context, you **MUST** reply with the exact phrase: "The provided policy context does not contain information on this topic."
-
-<|user|>
-
-### Relevant Context:
-
-{context}
-```
-
-### Question:
+    # 6. Build Improved Prompt for TinyLlama
+    prompt = f"""[INST] You are a helpful assistant for NEEPCO's Delegation of Powers policy. Answer the question using only the provided context.
 
-{query.question}
+Context: {context}
 
-<|assistant|>
+Question: {query.question}
 
-### Detailed Answer:
+Provide a clear, direct answer based only on the context above. If the context doesn't contain the information, say "The provided policy context does not contain information on this topic."
 
-"""
+Answer: [/INST]"""
 
     # 7. Generate Response
     answer = "An error occurred while processing your request."
     try:
         adapter.info("Sending prompt to LLM for generation...")
         raw_answer = await asyncio.wait_for(
-            generate_llm_response(prompt, request.state.request_id),
+            generate_llm_response(prompt, request.state.request_id, adapter),
             timeout=LLM_TIMEOUT_SECONDS
         )
 
-        adapter.info(f"LLM generation successful. Raw response: {raw_answer[:250]}...")
+        adapter.info(f"LLM generation successful. Response length: {len(raw_answer)}")
 
         # --- POST-PROCESSING LOGIC ---
         if '|' in raw_answer:
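
The post-processing branch that begins at `if '|' in raw_answer:` continues past the end of this hunk, so its body is not shown. A plausible sketch of what such a branch does, given the old prompt's pipe-delimited list rule; the helper name and sample input are invented:

```python
def format_piped_answer(raw_answer: str) -> str:
    # Hypothetical reading of the post-processing step: render
    # "item|item|item" answers as a bulleted list.
    if '|' in raw_answer:
        items = [part.strip() for part in raw_answer.split('|') if part.strip()]
        return "\n".join(f"- {item}" for item in items)
    return raw_answer

print(format_piped_answer("Obtain administrative approval|Invite open tenders|Award the work"))
# - Obtain administrative approval
# - Invite open tenders
# - Award the work
```

Since the new `[INST]` prompt drops the explicit pipe-formatting rule, this branch now fires only when the model emits pipes on its own.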
@@ -402,4 +451,4 @@ async def collect_feedback(feedback: Feedback, request: Request):
     }
 
     adapter.info(json.dumps(feedback_log))
-    return {"status": "✅ Feedback recorded. Thank you!"}
+    return {"status": "✅ Feedback recorded. Thank you!"}
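
Both the `request_id` field added to the /chat early return and the request-scoped logging used throughout assume `request.state.request_id` is populated before a handler runs. The middleware that does this sits outside the hunks shown here; a minimal FastAPI sketch of the usual pattern, hypothetical and for illustration only:

```python
import uuid

from fastapi import FastAPI, Request

app = FastAPI()

# Hypothetical middleware: attach a per-request ID so handlers and
# log adapters (e.g. app.py's RequestIdAdapter) can correlate entries.
@app.middleware("http")
async def add_request_id(request: Request, call_next):
    request.state.request_id = str(uuid.uuid4())
    response = await call_next(request)
    response.headers["X-Request-ID"] = request.state.request_id  # echo to the client
    return response
```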
 