Update app/app.py
app/app.py
CHANGED  +102 −294
@@ -7,6 +7,7 @@ import re
 from fastapi import FastAPI, HTTPException, Request
 from pydantic import BaseModel
 from llama_cpp import Llama
+# Correctly reference the module within the 'app' package
 from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
 
 # -----------------------------
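The only functional change in this hunk is the comment: the import must stay package-qualified because the server is started from the repo root, where `app/` is a package. A minimal launch sketch consistent with that layout — the entry-point file name and the port (7860, the usual Hugging Face Spaces default) are assumptions, not shown in this diff:

    # run.py — sketch; assumes uvicorn is installed and the repo root is the CWD
    import uvicorn

    if __name__ == "__main__":
        # "app.app:app" = package "app", module "app.py", FastAPI instance "app"
        uvicorn.run("app.app:app", host="0.0.0.0", port=7860)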
@@ -21,25 +22,20 @@ class RequestIdAdapter(logging.LoggerAdapter):
 logger = logging.getLogger("app")
 
 # -----------------------------
-# ✅ Configuration
+# ✅ Configuration
 # -----------------------------
 DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
 CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_final.jsonl")
 MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
-LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "90"))
+LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "90"))
 RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.3"))
-TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "3"))
-TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "1"))
-
-# ✅ Single request processing without blocking semaphore
-MAX_CONCURRENT_REQUESTS = 1
-request_in_progress = False
-request_lock = asyncio.Lock()
+TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "3"))
+TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "1"))
 
 # -----------------------------
-# ✅ Initialize FastAPI App
+# ✅ Initialize FastAPI App
 # -----------------------------
-app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="2.
+app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="2.1.0")
 
 @app.middleware("http")
 async def add_request_id(request: Request, call_next):
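Every tunable above is an environment variable with a baked-in default, so retrieval depth and the LLM timeout can be changed per deployment without editing the file. A sketch of a hypothetical override; since the constants are read once at import time, the variables must be set before app.app is imported:

    import os

    # hypothetical overrides — the names match the os.getenv keys above
    os.environ["TOP_K_SEARCH"] = "5"         # retrieve five candidate chunks
    os.environ["TOP_K_CONTEXT"] = "2"        # feed the best two chunks to the prompt
    os.environ["LLM_TIMEOUT_SECONDS"] = "120"

    import app.app  # the module now picks up the overridden values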
@@ -50,7 +46,7 @@ async def add_request_id(request: Request, call_next):
     return response
 
 # -----------------------------
-# ✅ Vector DB Initialization
+# ✅ Vector DB and Data Initialization
 # -----------------------------
 logger.info("Initializing vector DB...")
 try:
@@ -71,16 +67,16 @@ except Exception as e:
     db_ready = False
 
 # -----------------------------
-# ✅ Load GGUF Model
+# ✅ Load TinyLlama GGUF Model
 # -----------------------------
 logger.info(f"Loading GGUF model from: {MODEL_PATH}")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
-        n_ctx=4096,
-        n_threads=4,
-        n_batch=512,
-        use_mlock=True,
+        n_ctx=4096,
+        n_threads=4,
+        n_batch=512,
+        use_mlock=True,
         verbose=False
     )
     logger.info("GGUF model loaded successfully.")
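The four Llama() arguments size the model for a small CPU box: a 4096-token context window, four worker threads, 512-token batches, and use_mlock=True to pin the weights in RAM so they are not paged out. A minimal smoke-test sketch in the same llama_cpp call style the app uses — the path and prompt are placeholders:

    from llama_cpp import Llama

    llm = Llama(model_path="/app/tinyllama_dop_q4_k_m.gguf",  # placeholder path
                n_ctx=4096, n_threads=4, n_batch=512, verbose=False)
    out = llm("Q: What does DoP stand for?\nA:", max_tokens=32, temperature=0.05)
    print(out["choices"][0]["text"].strip())  # the completion text lives in choices[0]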
@@ -105,261 +101,84 @@ class Feedback(BaseModel):
     comment: str | None = None
 
 # -----------------------------
-# ✅
-# -----------------------------
-def classify_query_type(question: str) -> str:
-    """Classify the type of query to choose appropriate search strategy."""
-    question_lower = question.lower()
-
-    if re.search(r'₹|crore|lakh|\d+.*approve|limit.*\d+', question_lower):
-        return "monetary"
-
-    if any(word in question_lower for word in ["who can", "who approve", "authority", "delegation"]):
-        return "authority"
-
-    if any(word in question_lower for word in ["how to", "procedure", "process", "steps", "requirement"]):
-        return "procedure"
-
-    if re.search(r'section|annexure|clause', question_lower):
-        return "section_specific"
-
-    return "general"
-
-def extract_monetary_amount(question: str) -> float:
-    """Extract monetary amount from question for specialized search."""
-    patterns = [
-        r'₹\s*(\d+(?:,\d+)*(?:\.\d+)?)\s*crore',
-        r'(\d+(?:,\d+)*(?:\.\d+)?)\s*crore',
-        r'₹\s*(\d+(?:,\d+)*(?:\.\d+)?)\s*lakh',
-        r'(\d+(?:,\d+)*(?:\.\d+)?)\s*lakh',
-        r'₹\s*(\d+(?:,\d+)*(?:\.\d+)?)'
-    ]
-
-    for pattern in patterns:
-        match = re.search(pattern, question, re.IGNORECASE)
-        if match:
-            amount = float(match.group(1).replace(',', ''))
-            if 'crore' in pattern:
-                return amount * 1e7
-            elif 'lakh' in pattern:
-                return amount * 1e5
-            else:
-                return amount
-    return None
-
-def build_enhanced_prompt(question: str, context: str, query_type: str, search_results: list) -> str:
-    """Build context-aware prompt based on query type and metadata."""
-
-    roles_mentioned = set()
-    sections_mentioned = set()
-
-    for result in search_results:
-        metadata = result.get('metadata', {})
-        if 'role' in metadata:
-            roles_mentioned.add(metadata['role'])
-        if 'section' in metadata:
-            sections_mentioned.add(metadata['section'])
-
-    type_instructions = {
-        "monetary": "Focus on monetary limits, delegation amounts, and approval authorities for the specified amount.",
-        "authority": "Clearly identify the specific roles/positions and their delegation limits.",
-        "procedure": "Provide step-by-step procedures and requirements in a logical order.",
-        "section_specific": "Reference the specific sections, clauses, and policy provisions mentioned.",
-        "general": "Provide comprehensive information based on the policy context."
-    }
-
-    instruction = type_instructions.get(query_type, type_instructions["general"])
-
-    metadata_context = ""
-    if roles_mentioned:
-        metadata_context += f"\nRoles involved: {', '.join(roles_mentioned)}"
-    if sections_mentioned:
-        metadata_context += f"\nSections referenced: {', '.join(sections_mentioned)}"
-
-    prompt = f"""<|system|>
-You are a precise and factual assistant for NEEPCO's Delegation of Powers (DoP) policy.
-Your task is to answer the user's question based ONLY on the provided context.
-
-**Query Type**: {query_type}
-**Specific Instructions**: {instruction}
-
-**Formatting Rules**:
-- For lists or multiple items: Separate each item with a pipe symbol (|)
-- For monetary amounts: Always specify the exact amount and currency
-- For authorities: Always specify the exact role/position and their limits
-- If information is not in context: Reply with "The provided policy context does not contain information on this topic."
-
-{metadata_context}
-</s>
-<|user|>
-### Relevant Policy Context:
-{context}
-
-### Question:
-{question}
-</s>
-<|assistant|>
-### Answer:
-"""
-    return prompt
-
-# -----------------------------
-# ✅ Efficient LLM Response Generation - Restored Original Async Pattern
-# -----------------------------
-async def generate_llm_response(prompt: str, request_id: str):
-    """Async LLM generation using original efficient pattern."""
-    loop = asyncio.get_running_loop()
-
-    def llm_call():
-        return llm(
-            prompt,
-            max_tokens=2048,  # ✅ Restored original token limit
-            stop=["###", "Question:", "Context:", "</s>"],
-            temperature=0.05,  # ✅ Restored original temperature
-            echo=False
-        )
-
-    # ✅ Use original async executor pattern for efficient CPU usage
-    response = await loop.run_in_executor(None, llm_call)
-
-    if response and "choices" in response and len(response["choices"]) > 0:
-        answer = response["choices"][0]["text"].strip()
-        if not answer:
-            raise ValueError("Empty response from LLM")
-        return answer
-    else:
-        raise ValueError("Invalid response from LLM")
-
-# -----------------------------
-# ✅ Endpoints with Lightweight Request Management
+# ✅ Endpoints
 # -----------------------------
 def get_logger_adapter(request: Request):
     return RequestIdAdapter(logger, {'request_id': getattr(request.state, 'request_id', 'N/A')})
 
 @app.get("/")
 async def root():
-    return {
-        "status": "✅ Server is running efficiently",
-        "mode": "CPU optimized for Hugging Face",
-        "model_loaded": model_ready,
-        "db_ready": db_ready
-    }
+    return {"status": "✅ Server is running."}
 
 @app.get("/health")
 async def health_check():
     status = {
         "status": "ok",
         "database_status": "ready" if db_ready else "error",
-        "model_status": "ready" if model_ready else "error"
-        "processing_mode": "efficient_cpu_usage"
+        "model_status": "ready" if model_ready else "error"
     }
     if not db_ready or not model_ready:
         raise HTTPException(status_code=503, detail=status)
     return status
 
+async def generate_llm_response(prompt: str, request_id: str):
+    loop = asyncio.get_running_loop()
+    response = await loop.run_in_executor(
+        None,
+        lambda: llm(prompt, max_tokens=2048, stop=["###", "Question:", "Context:", "</s>"], temperature=0.05, echo=False)
+    )
+    answer = response["choices"][0]["text"].strip()
+    if not answer:
+        raise ValueError("Empty response from LLM")
+    return answer
+
 @app.post("/chat")
 async def chat(query: Query, request: Request):
-        "I can help you find accurate information about approval authorities, monetary limits, procedures, and policy requirements. "
-        "How can I assist you with the DoP policy today?"
-    )
-    return {
-        "request_id": getattr(request.state, 'request_id', 'N/A'),
-        "question": query.question,
-        "context_used": "NA - Greeting",
-        "answer": intro_message
-    }
-
-    if not db_ready or not model_ready:
-        adapter.error("Service unavailable due to initialization failure.")
-        raise HTTPException(status_code=503, detail="Service is not ready. Please check logs.")
-
-    query_type = classify_query_type(query.question)
-    adapter.info(f"Query classified as: {query_type}")
-
-    # Enhanced search strategy
-    if query_type == "monetary":
-        amount = extract_monetary_amount(query.question)
-        if amount:
-            adapter.info(f"Extracted monetary amount: ₹{amount}")
-            try:
-                monetary_results = db.search_by_amount(amount, comparison=">=", top_k=TOP_K_SEARCH)
-                if monetary_results:
-                    search_results = monetary_results
-                    adapter.info(f"Found {len(search_results)} results using monetary search")
-            except:
-                adapter.info("Monetary search not available, falling back to semantic search")
-
-    if not search_results:
-        # Use enhanced search if available, otherwise fallback to basic search
-        try:
-            search_results = db.search_with_context(
-                query.question,
-                top_k=TOP_K_SEARCH,
-                include_related=True
-            )
-            adapter.info(f"Found {len(search_results)} results using enhanced semantic search")
-        except:
-            # Fallback to basic search
-            search_results = db.search(query.question, top_k=TOP_K_SEARCH)
-            adapter.info(f"Found {len(search_results)} results using basic search")
-
-    scores = [f"{result.get('relevance_score', 0):.4f}" for result in search_results]
-    adapter.info(f"Found {len(search_results)} relevant chunks with scores: {scores}")
-
-    # Prepare context with metadata if available
-    context_chunks = []
-    for result in search_results[:TOP_K_CONTEXT]:
-        chunk_text = result['text']
-        metadata = result.get('metadata', {})
-
-        if metadata and (metadata.get('section') or metadata.get('role')):
-            metadata_prefix = f"[Section: {metadata.get('section', 'N/A')}, Role: {metadata.get('role', 'N/A')}] "
-            chunk_text = metadata_prefix + chunk_text
-
-        context_chunks.append(chunk_text)
-
-    context = "\n---\n".join(context_chunks)
-
-    prompt = f"""<|system|>
+    adapter = get_logger_adapter(request)
+    question_lower = query.question.strip().lower()
+
+    # --- GREETING & INTRO HANDLING ---
+    greeting_keywords = ["hello", "hi", "hey", "what can you do", "who are you"]
+    if question_lower in greeting_keywords:
+        adapter.info(f"Handling a greeting or introductory query: '{query.question}'")
+        intro_message = (
+            "Hello! I am an AI assistant specifically trained on NEEPCO's Delegation of Powers (DoP) policy document. "
+            "My purpose is to help you find accurate information and answer questions based on this specific dataset. "
+            "I am currently running on a CPU-based environment. How can I assist you with the DoP policy today?"
+        )
+        return {
+            "request_id": getattr(request.state, 'request_id', 'N/A'),
+            "question": query.question,
+            "context_used": "NA - Greeting",
+            "answer": intro_message
+        }
+
+    if not db_ready or not model_ready:
+        adapter.error("Service unavailable due to initialization failure.")
+        raise HTTPException(status_code=503, detail="Service is not ready. Please check logs.")
+
+    adapter.info(f"Received query: '{query.question}'")
+
+    # 1. Search Vector DB
+    search_results = db.search(query.question, top_k=TOP_K_SEARCH)
+
+    if not search_results:
+        adapter.warning("No relevant context found in vector DB.")
+        return {
+            "question": query.question,
+            "context_used": "No relevant context found.",
+            "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
+        }
+
+    scores = [f"{result['relevance_score']:.4f}" for result in search_results]
+    adapter.info(f"Found {len(search_results)} relevant chunks with scores: {scores}")
+
+    # 2. Prepare Context
+    context_chunks = [result['text'] for result in search_results[:TOP_K_CONTEXT]]
+    context = "\n---\n".join(context_chunks)
+
+    # 3. Build Prompt with Separator Instruction
+    prompt = f"""<|system|>
 You are a precise and factual assistant for NEEPCO's Delegation of Powers (DoP) policy.
 Your task is to answer the user's question based ONLY on the provided context.
 
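The new generate_llm_response is the core concurrency pattern of this rewrite: the blocking llama_cpp call is pushed onto a worker thread with run_in_executor, and the /chat handler (next hunk) bounds the await with asyncio.wait_for. A stripped-down sketch of the same pattern, with slow_call standing in for the model call:

    # sketch: off-loading a blocking call and bounding it with a timeout
    import asyncio
    import time

    def slow_call() -> str:
        time.sleep(2)              # stands in for the blocking llm(...) call
        return "done"

    async def main():
        loop = asyncio.get_running_loop()
        try:
            result = await asyncio.wait_for(
                loop.run_in_executor(None, slow_call),  # runs off the event loop
                timeout=90,                             # cf. LLM_TIMEOUT_SECONDS
            )
            print(result)
        except asyncio.TimeoutError:
            # note: the worker thread keeps running; only the await is abandoned
            print("timed out")

    asyncio.run(main())

One design consequence worth knowing: a timed-out generation still occupies the worker thread until llama_cpp finishes, so on a single-CPU Space a follow-up request may queue behind it.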
@@ -378,52 +197,45 @@ Your task is to answer the user's question based ONLY on the provided context.
 <|assistant|>
 ### Detailed Answer:
 """
-    adapter.info("Using original simple prompt")
-    }
-
-    finally:
-        # Always release the lock
-        async with request_lock:
-            request_in_progress = False
 
+    # 4. Generate Response
+    answer = "An error occurred while processing your request."
+    try:
+        adapter.info("Sending prompt to LLM for generation...")
+        raw_answer = await asyncio.wait_for(
+            generate_llm_response(prompt, request.state.request_id),
+            timeout=LLM_TIMEOUT_SECONDS
+        )
+        adapter.info(f"LLM generation successful. Raw response: {raw_answer[:250]}...")
+
+        # --- POST-PROCESSING LOGIC ---
+        # Check if the model used the pipe separator, indicating a list.
+        if '|' in raw_answer:
+            adapter.info("Pipe separator found. Formatting response as a bulleted list.")
+            # Split the string into a list of items
+            items = raw_answer.split('|')
+            # Clean up each item and format it as a bullet point
+            cleaned_items = [f"* {item.strip()}" for item in items if item.strip()]
+            # Join them back together with newlines
+            answer = "\n".join(cleaned_items)
+        else:
+            # If no separator, use the answer as is.
+            answer = raw_answer
+
+    except asyncio.TimeoutError:
+        adapter.warning(f"LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds.")
+        answer = "Sorry, the request took too long to process. Please try again with a simpler question."
+    except Exception as e:
+        adapter.error(f"An unexpected error occurred during LLM generation: {e}", exc_info=True)
+        answer = "Sorry, an unexpected error occurred while generating a response."
+
+    adapter.info(f"Final answer prepared. Returning to client.")
+    return {
+        "request_id": request.state.request_id,
+        "question": query.question,
+        "context_used": context,
+        "answer": answer
+    }
 
 @app.post("/feedback")
 async def collect_feedback(feedback: Feedback, request: Request):
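The pipe convention is set up in the prompt ("Separate each item with a pipe symbol (|)") and closed out here: any `|` in the raw completion is treated as a list delimiter and rewritten into bullets. A worked example on a hypothetical raw answer — the roles and figures are invented for illustration:

    raw_answer = "CMD: up to ₹50 crore | Director (Finance): up to ₹20 crore | GM: up to ₹5 crore"
    items = raw_answer.split('|')
    cleaned_items = [f"* {item.strip()}" for item in items if item.strip()]
    print("\n".join(cleaned_items))
    # * CMD: up to ₹50 crore
    # * Director (Finance): up to ₹20 crore
    # * GM: up to ₹5 crore

A side effect of this design: a stray `|` in an ordinary prose answer also triggers the split, so the prompt's formatting rule and the post-processor have to stay in sync.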
@@ -439,7 +251,3 @@ async def collect_feedback(feedback: Feedback, request: Request):
     }
     adapter.info(json.dumps(feedback_log))
     return {"status": "✅ Feedback recorded. Thank you!"}
-
-@app.on_event("shutdown")
-async def shutdown_event():
-    logger.info("Application shutting down.")
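The commit also drops the @app.on_event("shutdown") hook without a replacement. If the shutdown log line is still wanted, FastAPI's current idiom is a lifespan context (on_event is deprecated in recent FastAPI releases); a sketch, not part of this commit:

    import logging
    from contextlib import asynccontextmanager
    from fastapi import FastAPI

    logger = logging.getLogger("app")

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        yield                                      # startup work goes before the yield
        logger.info("Application shutting down.")  # runs once on shutdown

    app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="2.1.0", lifespan=lifespan)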