Update app/app.py
app/app.py  CHANGED  (+87 -38)

@@ -72,17 +72,18 @@ except Exception as e:
     db_ready = False

 # -----------------------------
-# ✅ Load TinyLlama GGUF Model
+# ✅ Load TinyLlama GGUF Model with Improved Settings
 # -----------------------------
 logger.info(f"Loading GGUF model from: {MODEL_PATH}")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
         n_ctx=2048,
-        n_threads=
-        n_batch=
+        n_threads=2,   # Increased threads for better performance
+        n_batch=256,   # Reduced batch size for stability
         use_mlock=True,
-        verbose=False
+        verbose=False,
+        seed=42        # Added seed for reproducible results
     )
     logger.info("GGUF model loaded successfully.")
     model_ready = True
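For orientation: the retry logic added later in this commit assumes the standard llama-cpp-python completion interface, where calling the loaded Llama object returns a dict and the generated text sits at choices[0]["text"]. A minimal sketch of that call shape, with a placeholder model path and question (both illustrative, not taken from the app):

from llama_cpp import Llama

# Placeholder path, for illustration only; the app reads MODEL_PATH from its configuration.
llm = Llama(
    model_path="models/tinyllama-1.1b-chat-q4.gguf",
    n_ctx=2048,
    n_threads=2,
    n_batch=256,
    use_mlock=True,
    seed=42,
    verbose=False,
)

# Calling the object runs a completion; the sampling options mirror the
# generation_configs introduced further down in this commit.
result = llm(
    "[INST] Summarise the approval workflow. [/INST]",
    max_tokens=128,
    temperature=0.1,
    top_p=0.9,
    repeat_penalty=1.1,
    stop=["</s>"],
    echo=False,
)
print(result["choices"][0]["text"])  # same access pattern used by generate_llm_response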
@@ -174,17 +175,79 @@ def detect_filters(question_lower: str) -> tuple:

     return section_filter, chunk_type_filter

-
-
-
-
-
-
+def clean_llm_response(raw_response: str) -> str:
+    """Clean and validate LLM response"""
+    if not raw_response:
+        return ""
+
+    # Remove common unwanted patterns
+    cleaned = raw_response.strip()
+
+    # Remove incomplete sentences at the end
+    if cleaned and not cleaned.endswith(('.', '!', '?', ':', '|')):
+        # Find the last complete sentence
+        sentences = re.split(r'[.!?]', cleaned)
+        if len(sentences) > 1:
+            cleaned = '.'.join(sentences[:-1]) + '.'
+
+    return cleaned

-
-
-
-
+async def generate_llm_response(prompt: str, request_id: str, adapter: RequestIdAdapter):
+    """Improved LLM response generation with better error handling"""
+    loop = asyncio.get_running_loop()
+
+    # Multiple generation attempts with different parameters
+    generation_configs = [
+        {
+            "max_tokens": 512,
+            "temperature": 0.1,
+            "top_p": 0.9,
+            "repeat_penalty": 1.1,
+            "stop": ["</s>", "[INST]", "[/INST]", "Question:", "Context:", "###"]
+        },
+        {
+            "max_tokens": 256,
+            "temperature": 0.3,
+            "top_p": 0.8,
+            "repeat_penalty": 1.2,
+            "stop": ["</s>", "\n\n", "Question:", "Context:"]
+        },
+        {
+            "max_tokens": 128,
+            "temperature": 0.5,
+            "top_p": 0.7,
+            "repeat_penalty": 1.15,
+            "stop": ["</s>"]
+        }
+    ]
+
+    for attempt, config in enumerate(generation_configs, 1):
+        try:
+            adapter.info(f"LLM generation attempt {attempt}/{len(generation_configs)} with config: {config}")
+
+            response = await loop.run_in_executor(
+                None,
+                lambda: llm(prompt, echo=False, **config)
+            )
+
+            raw_answer = response["choices"][0]["text"]
+            cleaned_answer = clean_llm_response(raw_answer)
+
+            adapter.info(f"Attempt {attempt} - Raw response length: {len(raw_answer)}, Cleaned length: {len(cleaned_answer)}")
+
+            if cleaned_answer and len(cleaned_answer.strip()) > 10:  # Minimum meaningful response
+                adapter.info(f"Successful generation on attempt {attempt}")
+                return cleaned_answer
+            else:
+                adapter.warning(f"Attempt {attempt} produced insufficient response: '{cleaned_answer}'")
+
+        except Exception as e:
+            adapter.error(f"Attempt {attempt} failed: {e}")
+            continue
+
+    # If all attempts fail, return a fallback message
+    adapter.error("All LLM generation attempts failed")
+    raise ValueError("Unable to generate a meaningful response after multiple attempts")

 # -----------------------------
 # ✅ Endpoints
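To make the trimming behaviour of the new clean_llm_response helper concrete, here is a small usage sketch (the input strings are invented for illustration and assume the function exactly as defined in the hunk above):

# A dangling fragment with no terminal punctuation is cut back to the last complete sentence.
clean_llm_response("Proposals are routed through the concerned HoD. The remaining")
# -> "Proposals are routed through the concerned HoD."

# An answer already ending in one of ('.', '!', '?', ':', '|') is only stripped of whitespace.
clean_llm_response("  Item one|Item two|Item three|  ")
# -> "Item one|Item two|Item three|"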
@@ -252,6 +315,7 @@ async def chat(query: Query, request: Request):
     if not search_results:
         adapter.warning("No relevant context found in vector DB.")
         return {
+            "request_id": request.state.request_id,
             "question": query.question,
            "context_used": "No relevant context found.",
            "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
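With the added request_id field, the early return on the no-context path serialises to a JSON body along these lines (the id and question values are illustrative):

{
    "request_id": "3f2a9c1e-...",
    "question": "What is the delegation limit for minor works?",
    "context_used": "No relevant context found.",
    "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
}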
@@ -292,42 +356,27 @@ async def chat(query: Query, request: Request):

     adapter.info(f"Selected context metadata: {context_metadata}")

-    # 6. Build Prompt
-    prompt = f"""
-You are a precise and factual assistant for NEEPCO's Delegation of Powers (DoP) policy.
-Your task is to answer the user's question based ONLY on the provided context.
-
-- **Formatting Rule:** If the answer contains a list of items or steps, you **MUST** separate each item with a pipe symbol (`|`). For example: `First item|Second item|Third item`.
-
-- **Content Rule:** If the information is not in the provided context, you **MUST** reply with the exact phrase: "The provided policy context does not contain information on this topic."
-
-<|user|>
-
-### Relevant Context:
-
-{context}
-```
-
-### Question:
+    # 6. Build Improved Prompt for TinyLlama
+    prompt = f"""[INST] You are a helpful assistant for NEEPCO's Delegation of Powers policy. Answer the question using only the provided context.

-{
+Context: {context}

-
+Question: {query.question}

-
+Provide a clear, direct answer based only on the context above. If the context doesn't contain the information, say "The provided policy context does not contain information on this topic."

-"""
+Answer: [/INST]"""

     # 7. Generate Response
     answer = "An error occurred while processing your request."
     try:
         adapter.info("Sending prompt to LLM for generation...")
         raw_answer = await asyncio.wait_for(
-            generate_llm_response(prompt, request.state.request_id),
+            generate_llm_response(prompt, request.state.request_id, adapter),
             timeout=LLM_TIMEOUT_SECONDS
         )

-        adapter.info(f"LLM generation successful.
+        adapter.info(f"LLM generation successful. Response length: {len(raw_answer)}")

         # --- POST-PROCESSING LOGIC ---
         if '|' in raw_answer:
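The body of the post-processing branch is outside this hunk, so only its guard (if '|' in raw_answer:) is visible here. Given the pipe-separated list convention from the earlier prompt, a plausible shape for that step is sketched below; this is an assumption for illustration, not the commit's actual code:

def format_pipe_list(raw_answer: str) -> str:
    """Hypothetical post-processing: turn 'a|b|c' into a bulleted list."""
    if '|' not in raw_answer:
        return raw_answer.strip()
    items = [item.strip() for item in raw_answer.split('|') if item.strip()]
    return "\n".join(f"- {item}" for item in items)

# format_pipe_list("File the proposal|Obtain HoD approval|Forward to CMD")
# -> "- File the proposal\n- Obtain HoD approval\n- Forward to CMD"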
@@ -402,4 +451,4 @@ async def collect_feedback(feedback: Feedback, request: Request):
     }

     adapter.info(json.dumps(feedback_log))
-    return {"status": "✅ Feedback recorded. Thank you!"}
+    return {"status": "✅ Feedback recorded. Thank you!"}