Update app/app.py
app/app.py  CHANGED  (+69 −50)
@@ -12,13 +12,10 @@ from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
 # -----------------------------
 # ✅ Logging Configuration
 # -----------------------------
-# ✅ IMPROVEMENT: More detailed and structured logging format.
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - [%(request_id)s] - %(message)s')
 
-# ✅ IMPROVEMENT: Custom adapter to inject a request ID into every log message for better traceability.
 class RequestIdAdapter(logging.LoggerAdapter):
     def process(self, msg, kwargs):
-        # The request_id is injected into the 'extra' dict.
         return '[%s] %s' % (self.extra['request_id'], msg), kwargs
 
 logger = logging.getLogger("app")
@@ -26,11 +23,10 @@ logger = logging.getLogger("app")
 # -----------------------------
 # ✅ Configuration
 # -----------------------------
-# ✅ IMPROVEMENT: Centralized configuration using environment variables with sensible defaults.
 DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
-CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/
+CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_final.jsonl")
 MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
-LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "
+LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "45"))
 RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.2"))
 TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "5"))
 TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "3"))
@@ -38,17 +34,13 @@ TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "3"))
 # -----------------------------
 # ✅ Initialize FastAPI App
 # -----------------------------
-app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="1.
+app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="1.2.0")
 
-# ✅ IMPROVEMENT: Middleware to add a unique request ID to each incoming request.
-# This helps in tracing a request's entire lifecycle through the logs.
 @app.middleware("http")
 async def add_request_id(request: Request, call_next):
     request_id = str(uuid.uuid4())
-    # Make the request_id available to the logger
     request.state.request_id = request_id
     response = await call_next(request)
-    # Add the request_id to the response headers
     response.headers["X-Request-ID"] = request_id
     return response
 
@@ -63,7 +55,7 @@ try:
         relevance_threshold=RELEVANCE_THRESHOLD
     )
     if not ensure_db_populated(db, CHUNKS_FILE_PATH):
-        logger.warning("DB not populated on startup. RAG will not function correctly
+        logger.warning("DB not populated on startup. RAG will not function correctly.")
        db_ready = False
     else:
         logger.info("Vector DB is populated and ready.")
@@ -80,11 +72,11 @@ logger.info(f"Loading GGUF model from: {MODEL_PATH}")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
-        n_ctx=2048,
-        n_threads=4,
-        n_batch=512,
-        use_mlock=True,
-        verbose=False
+        n_ctx=2048,
+        n_threads=4,
+        n_batch=512,
+        use_mlock=True,
+        verbose=False
     )
     logger.info("GGUF model loaded successfully.")
     model_ready = True
@@ -104,24 +96,21 @@ class Feedback(BaseModel):
     question: str
     answer: str
     context_used: str
-    feedback: str
+    feedback: str
     comment: str | None = None
 
 # -----------------------------
 # ✅ Endpoints
 # -----------------------------
 def get_logger_adapter(request: Request):
-    """Helper to get a logger adapter with the current request_id."""
     return RequestIdAdapter(logger, {'request_id': getattr(request.state, 'request_id', 'N/A')})
 
 @app.get("/")
 async def root():
     return {"status": "✅ Server is running."}
 
-# ✅ IMPROVEMENT: Added a health check endpoint for monitoring.
 @app.get("/health")
 async def health_check():
-    """Provides a detailed health status of the application components."""
     status = {
         "status": "ok",
         "database_status": "ready" if db_ready else "error",
@@ -131,17 +120,12 @@ async def health_check():
         raise HTTPException(status_code=503, detail=status)
     return status
 
-# ✅ IMPROVEMENT: Run synchronous LLM calls in a separate thread to avoid blocking the event loop.
 async def generate_llm_response(prompt: str, request_id: str):
-    """Helper function to run synchronous LLM inference in a thread-safe manner."""
     loop = asyncio.get_running_loop()
-
-    # Use to_thread to run the blocking I/O call in a separate thread
     response = await loop.run_in_executor(
-        None,
-        lambda: llm(prompt, max_tokens=1024, stop=["###", "Question:", "Context:"], temperature=0.1, echo=False)
+        None,
+        lambda: llm(prompt, max_tokens=1024, stop=["###", "Question:", "Context:", "</s>"], temperature=0.1, echo=False)
     )
-
     answer = response["choices"][0]["text"].strip()
     if not answer:
         raise ValueError("Empty response from LLM")
@@ -149,57 +133,93 @@ async def generate_llm_response(prompt: str, request_id: str):
 
 @app.post("/chat")
 async def chat(query: Query, request: Request):
-    # ✅ IMPROVEMENT: Get a logger adapter with the request ID for this specific request.
     adapter = get_logger_adapter(request)
+    question_lower = query.question.strip().lower()
+
+    # --- NEW: GREETING & INTRO HANDLING ---
+    greeting_keywords = ["hello", "hi", "hey", "what can you do", "who are you"]
+    # Check if the question is a simple greeting
+    if question_lower in greeting_keywords:
+        adapter.info(f"Handling a greeting or introductory query: '{query.question}'")
+        intro_message = (
+            "Hello! I am an AI assistant specifically trained on NEEPCO's Delegation of Powers (DoP) policy document. "
+            "My purpose is to help you find accurate information and answer questions based on this specific dataset. "
+            "I am currently running on a CPU-based environment. How can I assist you with the DoP policy today?"
+        )
+        return {
+            "request_id": getattr(request.state, 'request_id', 'N/A'),
+            "question": query.question,
+            "context_used": "N/A - Greeting",
+            "answer": intro_message
+        }
+    # --- END OF GREETING HANDLING ---
 
     if not db_ready or not model_ready:
         adapter.error("Service unavailable due to initialization failure.")
         raise HTTPException(status_code=503, detail="Service is not ready. Please check logs.")
 
-
-    adapter.info(f"Received query: '{question}'")
+    adapter.info(f"Received query: '{query.question}'")
 
     # 1. Search Vector DB
-
-    search_results = db.search(question, top_k=TOP_K_SEARCH)
+    search_results = db.search(query.question, top_k=TOP_K_SEARCH)
 
     if not search_results:
         adapter.warning("No relevant context found in vector DB.")
         return {
-            "question": question,
+            "question": query.question,
             "context_used": "No relevant context found.",
             "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
         }
 
-    # ✅ IMPROVEMENT: Detailed logging of search results.
     scores = [f"{result['relevance_score']:.4f}" for result in search_results]
     adapter.info(f"Found {len(search_results)} relevant chunks with scores: {scores}")
 
     # 2. Prepare Context
     context_chunks = [result['text'] for result in search_results[:TOP_K_CONTEXT]]
     context = "\n---\n".join(context_chunks)
-
-    # For debugging, you can log the full context, but be mindful of log size.
-    # adapter.debug(f"Full context being used:\n{context}")
-
+
    # 3. Build Prompt
     prompt = f"""<|system|>
-
-
-
-
-
-
+You are a precise and factual assistant. Follow the user's instructions exactly as shown in the example.
+---
+**EXAMPLE START**
+
+### Relevant Context:
+```json
+{{"section": "Urgent Local Purchases", "title": "Committee for Purchases Below ₹50,000", "clause": "LPC-2", "composition": [{{"Chairman": "Senior Manager"}}, {{"Members": ["One member from Finance", "One member from Indenter side (not below the rank of Deputy Manager)"]}}]}}
+```
+
+### Question:
+What is the composition of the LPC-2 committee?
+
+### Detailed Answer:
+According to the policy on Urgent Local Purchases (Clause LPC-2), the committee is composed of:
+* **Chairman:** Senior Manager
+* **Members:**
+    * One member from Finance
+    * One member from the Indenter side (not below the rank of Deputy Manager)
+
+**EXAMPLE END**
+---
+</s>
+<|user|>
 ### Relevant Context:
+```json
 {context}
+```
 
 ### Question:
-{question}
+{query.question}
+
+### INSTRUCTIONS FOR YOUR ANSWER:
+1. Based **ONLY** on the "Relevant Context" above, provide a detailed answer to the "Question".
+2. If the context contains a list of items, rules, or procedures, you **MUST list ALL of them**. Do not skip or summarize.
+3. Format your list using Markdown bullet points (`*`).
+4. If the context does not contain the answer, reply **ONLY** with: "The provided policy context does not contain information on this topic."
+</s>
 <|assistant|>
 ### Detailed Answer:
 """
-    adapter.info("Generated prompt for LLM.")
-    # adapter.debug(f"Full prompt for LLM:\n{prompt}")
 
     # 4. Generate Response
     answer = "An error occurred while processing your request."
@@ -220,7 +240,7 @@ async def chat(query: Query, request: Request):
     adapter.info(f"Final answer prepared. Returning to client.")
     return {
         "request_id": request.state.request_id,
-        "question": question,
+        "question": query.question,
         "context_used": context,
         "answer": answer
     }
@@ -228,7 +248,6 @@ async def chat(query: Query, request: Request):
 @app.post("/feedback")
 async def collect_feedback(feedback: Feedback, request: Request):
     adapter = get_logger_adapter(request)
-    # ✅ IMPROVEMENT: Log feedback as a structured JSON object for easier parsing and analysis later.
     feedback_log = {
         "type": "USER_FEEDBACK",
         "request_id": feedback.request_id,
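A note on the logging configuration in this diff: the format string passed to `logging.basicConfig` contains a `%(request_id)s` placeholder, which only resolves when each log record actually carries a `request_id` attribute, normally supplied through the `extra` dict. The `RequestIdAdapter` in the commit instead prepends the ID to the message text. Below is a minimal, self-contained sketch of the `extra`-dict variant; it is illustrative only, not part of the commit:

```python
import logging
import uuid

# Same format string as the app: %(request_id)s must exist on every record.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - [%(request_id)s] - %(message)s",
)

class RequestIdAdapter(logging.LoggerAdapter):
    def process(self, msg, kwargs):
        # Inject request_id into the record's `extra` dict so the formatter
        # can resolve the placeholder, rather than prefixing the message text.
        kwargs.setdefault("extra", {})["request_id"] = self.extra["request_id"]
        return msg, kwargs

logger = logging.getLogger("app")
adapter = RequestIdAdapter(logger, {"request_id": str(uuid.uuid4())})
adapter.info("vector search finished")  # ... - app - INFO - [<uuid>] - vector search finished
```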
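`generate_llm_response` keeps the event loop responsive by pushing the synchronous llama-cpp call onto the default thread pool via `run_in_executor`. The commit also sets `LLM_TIMEOUT_SECONDS` to 45, but the hunk where the timeout is applied is not shown above. One plausible wiring is `asyncio.wait_for` around the executor future, sketched here with a stand-in for the real `llm(...)` call; treat the wiring as an assumption, not the commit's confirmed call site:

```python
import asyncio
import time

LLM_TIMEOUT_SECONDS = 45  # mirrors the commit's new default

def blocking_inference(prompt: str) -> str:
    # Stand-in for the synchronous llm(prompt, ...) call.
    time.sleep(0.1)
    return f"(answer for: {prompt!r})"

async def generate_llm_response(prompt: str) -> str:
    loop = asyncio.get_running_loop()
    # Offload the blocking call to the default thread pool so the event loop
    # keeps serving requests, and bound it with the configured timeout.
    future = loop.run_in_executor(None, blocking_inference, prompt)
    return await asyncio.wait_for(future, timeout=LLM_TIMEOUT_SECONDS)

print(asyncio.run(generate_llm_response("What is the composition of LPC-2?")))
```

On timeout, `asyncio.wait_for` raises `asyncio.TimeoutError`, which the endpoint can map to an error response instead of hanging the worker.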
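Finally, a quick client-side check of the new behaviour: a bare greeting should short-circuit before any retrieval, and every response should carry the `X-Request-ID` header added by the middleware. The base URL and the `{"question": ...}` body shape are assumptions here, since the `Query` model falls outside the shown hunks:

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment

# A bare greeting should return the canned intro without touching the vector DB.
resp = requests.post(f"{BASE_URL}/chat", json={"question": "hello"})
print(resp.headers.get("X-Request-ID"))  # per-request UUID from the middleware
print(resp.json()["context_used"])       # expected: "N/A - Greeting"

# /health returns 200 when both components are up, 503 with details otherwise.
health = requests.get(f"{BASE_URL}/health")
print(health.status_code, health.json())
```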