Update app/app.py
app/app.py CHANGED (+171 -67)
Old version (lines removed by this commit are prefixed with "-"; removed text that could not be recovered is left blank):

@@ -2,7 +2,8 @@ import os
 import json
 import asyncio
 import logging
-
 from pydantic import BaseModel
 from llama_cpp import Llama
 # Correctly reference the module within the 'app' package
@@ -11,52 +12,86 @@ from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
 # -----------------------------
 # ✅ Logging Configuration
 # -----------------------------
-
 logger = logging.getLogger("app")

 # -----------------------------
-# ✅
 # -----------------------------
-

-
-
-

 # -----------------------------
-# ✅ Vector DB and Data
 # -----------------------------
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-logger.

 # -----------------------------
 # ✅ Load TinyLlama GGUF Model
 # -----------------------------
-
-
-
-
-
-
-
-
-
-
-)
-

 # -----------------------------
 # ✅ API Schemas
@@ -65,75 +100,144 @@ class Query(BaseModel):
     question: str

 class Feedback(BaseModel):
     question: str
     answer: str
-

 # -----------------------------
 # ✅ Endpoints
 # -----------------------------
-
-

-async def generate_llm_response(prompt: str):
-    """Helper function to run synchronous LLM inference."""
-    response = llm(prompt, max_tokens=1024, stop=["###"], temperature=0.2, echo=False)
     answer = response["choices"][0]["text"].strip()
     if not answer:
         raise ValueError("Empty response from LLM")
     return answer

 @app.post("/chat")
-async def chat(query: Query):
     question = query.question.strip()
-

-    #
-

     if not search_results:
-
         return {
             "question": question,
             "context_used": "No relevant context found.",
             "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
         }
-
-    # ✅ RECOMMENDED CHANGE: Combine the top 3 contexts for a richer prompt
-    top_k_for_context = 3
-    context_chunks = [result['text'] for result in search_results[:top_k_for_context]]
-    context = "\n---\n".join(context_chunks)

-
-

-
-
-
-
 ### Relevant Context:
 {context}
-### Question: {question}
-### Detailed Answer:"""

-
     try:
-
     except asyncio.TimeoutError:
-
         answer = "Sorry, the request took too long to process. Please try again with a simpler question."
     except Exception as e:
-
         answer = "Sorry, an unexpected error occurred while generating a response."

-
     return {
         "question": question,
         "context_used": context,
         "answer": answer
     }

 @app.post("/feedback")
-async def collect_feedback(feedback: Feedback):
-
-
New version (lines added by this commit are prefixed with "+"):

 import os
 import json
 import asyncio
 import logging
+import uuid
+from fastapi import FastAPI, HTTPException, Request
 from pydantic import BaseModel
 from llama_cpp import Llama
 # Correctly reference the module within the 'app' package
 from app.policy_vector_db import PolicyVectorDB, ensure_db_populated

 # -----------------------------
 # ✅ Logging Configuration
 # -----------------------------
+# ✅ IMPROVEMENT: More detailed and structured logging format.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - [%(request_id)s] - %(message)s')
+
+# ✅ IMPROVEMENT: Custom adapter to inject a request ID into every log message for better traceability.
+class RequestIdAdapter(logging.LoggerAdapter):
+    def process(self, msg, kwargs):
+        # The request_id is injected into the 'extra' dict.
+        return '[%s] %s' % (self.extra['request_id'], msg), kwargs
+
 logger = logging.getLogger("app")
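As a rough illustration of the adapter mechanics (a minimal, self-contained sketch separate from the diff, using a simplified format string rather than the one configured above):

# Sketch only: how a LoggerAdapter like RequestIdAdapter prefixes each message.
import logging

logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

class RequestIdAdapter(logging.LoggerAdapter):
    def process(self, msg, kwargs):
        # Prefix every message with the request ID stored in self.extra.
        return "[%s] %s" % (self.extra["request_id"], msg), kwargs

adapter = RequestIdAdapter(logging.getLogger("demo"), {"request_id": "abc123"})
adapter.info("Received query")  # -> INFO [abc123] Received query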

 # -----------------------------
+# ✅ Configuration
 # -----------------------------
+# ✅ IMPROVEMENT: Centralized configuration using environment variables with sensible defaults.
+DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
+CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_improved.jsonl")
+MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
+LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "45"))
+RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.2"))
+TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "5"))
+TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "3"))

+# -----------------------------
+# ✅ Initialize FastAPI App
+# -----------------------------
+app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="1.1.0")
+
+# ✅ IMPROVEMENT: Middleware to add a unique request ID to each incoming request.
+# This helps in tracing a request's entire lifecycle through the logs.
+@app.middleware("http")
+async def add_request_id(request: Request, call_next):
+    request_id = str(uuid.uuid4())
+    # Make the request_id available to the logger
+    request.state.request_id = request_id
+    response = await call_next(request)
+    # Add the request_id to the response headers
+    response.headers["X-Request-ID"] = request_id
+    return response
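A hedged client-side sketch of what this middleware produces (it assumes the app is reachable at localhost:8000 and that the requests package is installed; URL and question are placeholders):

# Sketch: every response carries the X-Request-ID header set by add_request_id.
import requests

resp = requests.post(
    "http://localhost:8000/chat",
    json={"question": "What is the delegation of power for contract award?"},
)
print(resp.headers.get("X-Request-ID"))  # UUID generated per request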

 # -----------------------------
+# ✅ Vector DB and Data Initialization
 # -----------------------------
+logger.info("Initializing vector DB...")
+try:
+    db = PolicyVectorDB(
+        persist_directory=DB_PERSIST_DIRECTORY,
+        top_k_default=TOP_K_SEARCH,
+        relevance_threshold=RELEVANCE_THRESHOLD
+    )
+    if not ensure_db_populated(db, CHUNKS_FILE_PATH):
+        logger.warning("DB not populated on startup. RAG will not function correctly until data is loaded.")
+        db_ready = False
+    else:
+        logger.info("Vector DB is populated and ready.")
+        db_ready = True
+except Exception as e:
+    logger.error(f"FATAL: Failed to initialize Vector DB: {e}", exc_info=True)
+    db = None
+    db_ready = False
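The /chat handler further down reads each search hit's 'text' and 'relevance_score' keys, so querying the populated DB presumably looks roughly like this sketch (the exact return type of PolicyVectorDB.search is not shown in the diff; the query string is a placeholder):

# Sketch: query the DB the same way the /chat endpoint does.
results = db.search("delegation of powers for civil works", top_k=TOP_K_SEARCH)
for r in results[:TOP_K_CONTEXT]:
    print(f"{r['relevance_score']:.4f}  {r['text'][:80]}")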

 # -----------------------------
 # ✅ Load TinyLlama GGUF Model
 # -----------------------------
+logger.info(f"Loading GGUF model from: {MODEL_PATH}")
+try:
+    llm = Llama(
+        model_path=MODEL_PATH,
+        n_ctx=2048,        # Context window size
+        n_threads=4,       # Number of CPU threads to use
+        n_batch=512,       # Batch size for prompt processing
+        use_mlock=True,    # Use mlock to keep model in memory
+        verbose=False      # Suppress verbose output from llama.cpp
+    )
+    logger.info("GGUF model loaded successfully.")
+    model_ready = True
+except Exception as e:
+    logger.error(f"FATAL: Failed to load GGUF model: {e}", exc_info=True)
+    llm = None
+    model_ready = False

 # -----------------------------
 # ✅ API Schemas
 # -----------------------------
 class Query(BaseModel):
     question: str

 class Feedback(BaseModel):
+    request_id: str
     question: str
     answer: str
+    context_used: str
+    feedback: str  # e.g., "correct", "incorrect", "helpful", "not-helpful"
+    comment: str | None = None

 # -----------------------------
 # ✅ Endpoints
 # -----------------------------
+def get_logger_adapter(request: Request):
+    """Helper to get a logger adapter with the current request_id."""
+    return RequestIdAdapter(logger, {'request_id': getattr(request.state, 'request_id', 'N/A')})
+
+@app.get("/")
+async def root():
+    return {"status": "✅ Server is running."}
+
+# ✅ IMPROVEMENT: Added a health check endpoint for monitoring.
+@app.get("/health")
+async def health_check():
+    """Provides a detailed health status of the application components."""
+    status = {
+        "status": "ok",
+        "database_status": "ready" if db_ready else "error",
+        "model_status": "ready" if model_ready else "error"
+    }
+    if not db_ready or not model_ready:
+        raise HTTPException(status_code=503, detail=status)
+    return status
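For monitoring, the health endpoint can be probed as sketched below (hostname and port are assumptions; on a 503, FastAPI returns the same fields under its detail key):

# Sketch: probe /health and report component status.
import requests

r = requests.get("http://localhost:8000/health")
print(r.status_code)  # 200 when both db_ready and model_ready, otherwise 503
print(r.json())       # e.g. {"status": "ok", "database_status": "ready", "model_status": "ready"}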
+
+# ✅ IMPROVEMENT: Run synchronous LLM calls in a separate thread to avoid blocking the event loop.
+async def generate_llm_response(prompt: str, request_id: str):
+    """Helper function to run synchronous LLM inference in a thread-safe manner."""
+    loop = asyncio.get_running_loop()
+
+    # Use to_thread to run the blocking I/O call in a separate thread
+    response = await loop.run_in_executor(
+        None,  # Use the default thread pool executor
+        lambda: llm(prompt, max_tokens=1024, stop=["###", "Question:", "Context:"], temperature=0.1, echo=False)
+    )

     answer = response["choices"][0]["text"].strip()
     if not answer:
         raise ValueError("Empty response from LLM")
     return answer
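On Python 3.9+, the same off-loading could also be written with asyncio.to_thread, as in this comparison sketch (it reuses the llm object loaded above and is not part of the commit):

# Sketch: equivalent off-loading of the blocking llama_cpp call via asyncio.to_thread.
import asyncio

async def generate_llm_response_alt(prompt: str) -> str:
    response = await asyncio.to_thread(
        llm, prompt, max_tokens=1024, stop=["###"], temperature=0.1, echo=False
    )
    return response["choices"][0]["text"].strip()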

 @app.post("/chat")
+async def chat(query: Query, request: Request):
+    # ✅ IMPROVEMENT: Get a logger adapter with the request ID for this specific request.
+    adapter = get_logger_adapter(request)
+
+    if not db_ready or not model_ready:
+        adapter.error("Service unavailable due to initialization failure.")
+        raise HTTPException(status_code=503, detail="Service is not ready. Please check logs.")
+
     question = query.question.strip()
+    adapter.info(f"Received query: '{question}'")

+    # 1. Search Vector DB
+    adapter.info(f"Searching vector DB with top_k={TOP_K_SEARCH} and threshold={RELEVANCE_THRESHOLD}")
+    search_results = db.search(question, top_k=TOP_K_SEARCH)

     if not search_results:
+        adapter.warning("No relevant context found in vector DB.")
         return {
             "question": question,
             "context_used": "No relevant context found.",
             "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
         }

+    # ✅ IMPROVEMENT: Detailed logging of search results.
+    scores = [f"{result['relevance_score']:.4f}" for result in search_results]
+    adapter.info(f"Found {len(search_results)} relevant chunks with scores: {scores}")

+    # 2. Prepare Context
+    context_chunks = [result['text'] for result in search_results[:TOP_K_CONTEXT]]
+    context = "\n---\n".join(context_chunks)
+    adapter.info(f"Using top {len(context_chunks)} contexts for prompt.")
+    # For debugging, you can log the full context, but be mindful of log size.
+    # adapter.debug(f"Full context being used:\n{context}")
+
+    # 3. Build Prompt
+    prompt = f"""<|system|>
+You are an expert assistant for NEEPCO's Delegation of Powers (DoP) policies. Your task is to answer questions based ONLY on the provided context.
+- If the context contains the answer, provide a detailed and factual response.
+- If the context does not contain the answer, state that the information is not available in the provided policy context.
+- Do not make up information or use external knowledge.
+- Cite the relevant clause or section from the context if possible.
+- Be professional, factual, and concise.</s>
+<|user|>
 ### Relevant Context:
 {context}

+### Question:
+{question}</s>
+<|assistant|>
+### Detailed Answer:
+"""
+    adapter.info("Generated prompt for LLM.")
+    # adapter.debug(f"Full prompt for LLM:\n{prompt}")
+
+    # 4. Generate Response
+    answer = "An error occurred while processing your request."
     try:
+        adapter.info("Sending prompt to LLM for generation...")
+        answer = await asyncio.wait_for(
+            generate_llm_response(prompt, request.state.request_id),
+            timeout=LLM_TIMEOUT_SECONDS
+        )
+        adapter.info(f"LLM generation successful. Raw answer: {answer[:150]}...")
     except asyncio.TimeoutError:
+        adapter.warning(f"LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds.")
         answer = "Sorry, the request took too long to process. Please try again with a simpler question."
     except Exception as e:
+        adapter.error(f"An unexpected error occurred during LLM generation: {e}", exc_info=True)
         answer = "Sorry, an unexpected error occurred while generating a response."

+    adapter.info(f"Final answer prepared. Returning to client.")
     return {
+        "request_id": request.state.request_id,
         "question": question,
         "context_used": context,
         "answer": answer
     }

 @app.post("/feedback")
+async def collect_feedback(feedback: Feedback, request: Request):
+    adapter = get_logger_adapter(request)
+    # ✅ IMPROVEMENT: Log feedback as a structured JSON object for easier parsing and analysis later.
+    feedback_log = {
+        "type": "USER_FEEDBACK",
+        "request_id": feedback.request_id,
+        "question": feedback.question,
+        "answer": feedback.answer,
+        "context_used": feedback.context_used,
+        "feedback": feedback.feedback,
+        "comment": feedback.comment
+    }
+    adapter.info(json.dumps(feedback_log))
+    return {"status": "✅ Feedback recorded. Thank you!"}
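Finally, a hedged end-to-end sketch of how a client could tie /chat and /feedback together through the shared request_id (URL, question, and feedback values are placeholders):

# Sketch: ask a question, then send feedback that references the same request_id.
import requests

BASE = "http://localhost:8000"

chat = requests.post(BASE + "/chat", json={"question": "Who can approve works contracts?"}).json()

requests.post(BASE + "/feedback", json={
    "request_id": chat["request_id"],
    "question": chat["question"],
    "answer": chat["answer"],
    "context_used": chat["context_used"],
    "feedback": "helpful",
    "comment": None,
})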