Kalpokoch committed on
Commit 3aba0c5 · verified · 1 Parent(s): c2db6b3

Update app/app.py

Files changed (1)
  1. app/app.py +171 -67
app/app.py CHANGED
@@ -2,7 +2,8 @@ import os
 import json
 import asyncio
 import logging
-from fastapi import FastAPI, HTTPException
+import uuid
+from fastapi import FastAPI, HTTPException, Request
 from pydantic import BaseModel
 from llama_cpp import Llama
 # Correctly reference the module within the 'app' package
@@ -11,52 +12,86 @@ from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
 # -----------------------------
 # ✅ Logging Configuration
 # -----------------------------
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+# ✅ IMPROVEMENT: More detailed and structured logging format.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - [%(request_id)s] - %(message)s')
+
+# ✅ IMPROVEMENT: Custom adapter to inject a request ID into every log message for better traceability.
+class RequestIdAdapter(logging.LoggerAdapter):
+    def process(self, msg, kwargs):
+        # The request_id is injected into the 'extra' dict.
+        return '[%s] %s' % (self.extra['request_id'], msg), kwargs
+
 logger = logging.getLogger("app")
 
 # -----------------------------
-# ✅ Initialize FastAPI App
+# ✅ Configuration
 # -----------------------------
-app = FastAPI()
+# ✅ IMPROVEMENT: Centralized configuration using environment variables with sensible defaults.
+DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
+CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_improved.jsonl")
+MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
+LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "45"))
+RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.2"))
+TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "5"))
+TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "3"))
 
-@app.get("/")
-async def root():
-    return {"status": "✅ Server is running and ready."}
+# -----------------------------
+# ✅ Initialize FastAPI App
+# -----------------------------
+app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="1.1.0")
+
+# ✅ IMPROVEMENT: Middleware to add a unique request ID to each incoming request.
+# This helps in tracing a request's entire lifecycle through the logs.
+@app.middleware("http")
+async def add_request_id(request: Request, call_next):
+    request_id = str(uuid.uuid4())
+    # Make the request_id available to the logger
+    request.state.request_id = request_id
+    response = await call_next(request)
+    # Add the request_id to the response headers
+    response.headers["X-Request-ID"] = request_id
+    return response
 
 # -----------------------------
-# ✅ Vector DB and Data Configuration
+# ✅ Vector DB and Data Initialization
 # -----------------------------
-DB_PERSIST_DIRECTORY = "/app/vector_database"
-# ✅ CORRECTED FILENAME: Match the output of your chunking script
-CHUNKS_FILE_PATH = "/app/granular_chunks_improved.jsonl"
-
-logger.info("[INFO] Initializing vector DB...")
-db = PolicyVectorDB(
-    persist_directory=DB_PERSIST_DIRECTORY,
-    top_k_default=5,
-    relevance_threshold=0.2  # This threshold is now applied inside the search method
-)
-
-if not ensure_db_populated(db, CHUNKS_FILE_PATH):
-    logger.warning("[WARNING] DB not populated. RAG will not function correctly.")
-else:
-    logger.info("[INFO] Vector DB is ready.")
+logger.info("Initializing vector DB...")
+try:
+    db = PolicyVectorDB(
+        persist_directory=DB_PERSIST_DIRECTORY,
+        top_k_default=TOP_K_SEARCH,
+        relevance_threshold=RELEVANCE_THRESHOLD
+    )
+    if not ensure_db_populated(db, CHUNKS_FILE_PATH):
+        logger.warning("DB not populated on startup. RAG will not function correctly until data is loaded.")
+        db_ready = False
+    else:
+        logger.info("Vector DB is populated and ready.")
+        db_ready = True
+except Exception as e:
+    logger.error(f"FATAL: Failed to initialize Vector DB: {e}", exc_info=True)
+    db = None
+    db_ready = False
 
 # -----------------------------
 # ✅ Load TinyLlama GGUF Model
 # -----------------------------
-MODEL_PATH = "/app/tinyllama_dop_q4_k_m.gguf"
-logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
-
-llm = Llama(
-    model_path=MODEL_PATH,
-    n_ctx=2048,
-    n_threads=2,
-    n_batch=8,
-    use_mlock=False,
-    verbose=False
-)
-logger.info("[INFO] Model loaded successfully.")
+logger.info(f"Loading GGUF model from: {MODEL_PATH}")
+try:
+    llm = Llama(
+        model_path=MODEL_PATH,
+        n_ctx=2048,      # Context window size
+        n_threads=4,     # Number of CPU threads to use
+        n_batch=512,     # Batch size for prompt processing
+        use_mlock=True,  # Use mlock to keep model in memory
+        verbose=False    # Suppress verbose output from llama.cpp
+    )
+    logger.info("GGUF model loaded successfully.")
+    model_ready = True
+except Exception as e:
+    logger.error(f"FATAL: Failed to load GGUF model: {e}", exc_info=True)
+    llm = None
+    model_ready = False
 
 # -----------------------------
 # ✅ API Schemas
@@ -65,75 +100,144 @@ class Query(BaseModel):
     question: str
 
 class Feedback(BaseModel):
+    request_id: str
     question: str
     answer: str
-    feedback: str
+    context_used: str
+    feedback: str  # e.g., "correct", "incorrect", "helpful", "not-helpful"
+    comment: str | None = None
 
 # -----------------------------
 # ✅ Endpoints
 # -----------------------------
-LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "30"))
-logger.info(f"[INFO] LLM_TIMEOUT_SECONDS set to: {LLM_TIMEOUT_SECONDS} seconds.")
+def get_logger_adapter(request: Request):
+    """Helper to get a logger adapter with the current request_id."""
+    return RequestIdAdapter(logger, {'request_id': getattr(request.state, 'request_id', 'N/A')})
+
+@app.get("/")
+async def root():
+    return {"status": "✅ Server is running."}
+
+# ✅ IMPROVEMENT: Added a health check endpoint for monitoring.
+@app.get("/health")
+async def health_check():
+    """Provides a detailed health status of the application components."""
+    status = {
+        "status": "ok",
+        "database_status": "ready" if db_ready else "error",
+        "model_status": "ready" if model_ready else "error"
+    }
+    if not db_ready or not model_ready:
+        raise HTTPException(status_code=503, detail=status)
+    return status
+
+# ✅ IMPROVEMENT: Run synchronous LLM calls in a separate thread to avoid blocking the event loop.
+async def generate_llm_response(prompt: str, request_id: str):
+    """Helper function to run synchronous LLM inference in a thread-safe manner."""
+    loop = asyncio.get_running_loop()
+
+    # Use to_thread to run the blocking I/O call in a separate thread
+    response = await loop.run_in_executor(
+        None,  # Use the default thread pool executor
+        lambda: llm(prompt, max_tokens=1024, stop=["###", "Question:", "Context:"], temperature=0.1, echo=False)
+    )
 
-async def generate_llm_response(prompt: str):
-    """Helper function to run synchronous LLM inference."""
-    response = llm(prompt, max_tokens=1024, stop=["###"], temperature=0.2, echo=False)
     answer = response["choices"][0]["text"].strip()
     if not answer:
         raise ValueError("Empty response from LLM")
     return answer
 
 @app.post("/chat")
-async def chat(query: Query):
+async def chat(query: Query, request: Request):
+    # ✅ IMPROVEMENT: Get a logger adapter with the request ID for this specific request.
+    adapter = get_logger_adapter(request)
+
+    if not db_ready or not model_ready:
+        adapter.error("Service unavailable due to initialization failure.")
+        raise HTTPException(status_code=503, detail="Service is not ready. Please check logs.")
+
     question = query.question.strip()
-    logger.info(f"[QUERY] {question}")
+    adapter.info(f"Received query: '{question}'")
 
-    # The search method now handles filtering internally
-    search_results = db.search(question, top_k=5)
+    # 1. Search Vector DB
+    adapter.info(f"Searching vector DB with top_k={TOP_K_SEARCH} and threshold={RELEVANCE_THRESHOLD}")
+    search_results = db.search(question, top_k=TOP_K_SEARCH)
 
     if not search_results:
-        logger.info("[RESPONSE] No relevant context found.")
+        adapter.warning("No relevant context found in vector DB.")
        return {
            "question": question,
            "context_used": "No relevant context found.",
            "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
        }
-
-    # ✅ RECOMMENDED CHANGE: Combine the top 3 contexts for a richer prompt
-    top_k_for_context = 3
-    context_chunks = [result['text'] for result in search_results[:top_k_for_context]]
-    context = "\n---\n".join(context_chunks)
 
-    top_score = search_results[0]['relevance_score']
-    logger.info(f"[INFO] Using top {len(context_chunks)} contexts (top score: {top_score:.4f})")
+    # ✅ IMPROVEMENT: Detailed logging of search results.
+    scores = [f"{result['relevance_score']:.4f}" for result in search_results]
+    adapter.info(f"Found {len(search_results)} relevant chunks with scores: {scores}")
 
-    prompt = f"""You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies.
-Only use the context provided to answer the question. Do not assume anything not stated in the context.
-When applicable, explain why the action is not allowed, referring to the exact rule or limitation.
-Be professional, factual, and concise.
+    # 2. Prepare Context
+    context_chunks = [result['text'] for result in search_results[:TOP_K_CONTEXT]]
+    context = "\n---\n".join(context_chunks)
+    adapter.info(f"Using top {len(context_chunks)} contexts for prompt.")
+    # For debugging, you can log the full context, but be mindful of log size.
+    # adapter.debug(f"Full context being used:\n{context}")
+
+    # 3. Build Prompt
+    prompt = f"""<|system|>
+You are an expert assistant for NEEPCO's Delegation of Powers (DoP) policies. Your task is to answer questions based ONLY on the provided context.
+- If the context contains the answer, provide a detailed and factual response.
+- If the context does not contain the answer, state that the information is not available in the provided policy context.
+- Do not make up information or use external knowledge.
+- Cite the relevant clause or section from the context if possible.
+- Be professional, factual, and concise.</s>
+<|user|>
 ### Relevant Context:
 {context}
-### Question: {question}
-### Detailed Answer:"""
 
-    answer = "Sorry, I couldn't process your request right now. Please try again later."
+### Question:
+{question}</s>
+<|assistant|>
+### Detailed Answer:
+"""
+    adapter.info("Generated prompt for LLM.")
+    # adapter.debug(f"Full prompt for LLM:\n{prompt}")
+
+    # 4. Generate Response
+    answer = "An error occurred while processing your request."
     try:
-        answer = await asyncio.wait_for(generate_llm_response(prompt), timeout=LLM_TIMEOUT_SECONDS)
+        adapter.info("Sending prompt to LLM for generation...")
+        answer = await asyncio.wait_for(
+            generate_llm_response(prompt, request.state.request_id),
+            timeout=LLM_TIMEOUT_SECONDS
+        )
+        adapter.info(f"LLM generation successful. Raw answer: {answer[:150]}...")
    except asyncio.TimeoutError:
-        logger.warning(f"[TIMEOUT] LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds.")
        answer = "Sorry, the request took too long to process. Please try again with a simpler question."
    except Exception as e:
-        logger.error(f"[ERROR] An unexpected error occurred during LLM generation: {str(e)}")
        answer = "Sorry, an unexpected error occurred while generating a response."
 
-    logger.info(f"[RESPONSE] Answered: {answer[:100]}...")
+    adapter.info(f"Final answer prepared. Returning to client.")
    return {
+        "request_id": request.state.request_id,
        "question": question,
        "context_used": context,
        "answer": answer
    }
 
 @app.post("/feedback")
-async def collect_feedback(feedback: Feedback):
-    logger.info(f"[FEEDBACK] Question: {feedback.question} | Answer: {feedback.answer} | Feedback: {feedback.feedback}")
-    return {"status": "✅ Feedback recorded. Thank you!"}
+async def collect_feedback(feedback: Feedback, request: Request):
+    adapter = get_logger_adapter(request)
+    # ✅ IMPROVEMENT: Log feedback as a structured JSON object for easier parsing and analysis later.
+    feedback_log = {
+        "type": "USER_FEEDBACK",
+        "request_id": feedback.request_id,
+        "question": feedback.question,
+        "answer": feedback.answer,
+        "context_used": feedback.context_used,
+        "feedback": feedback.feedback,
+        "comment": feedback.comment
+    }
+    adapter.info(json.dumps(feedback_log))
+    return {"status": "✅ Feedback recorded. Thank you!"}
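Below the diff, a minimal launch sketch (not part of the commit): it shows how the environment-variable configuration introduced above could be overridden before starting the server. The uvicorn invocation, port, and local paths are assumptions; only the variable names come from app/app.py.

# Sketch only: start the API with overridden settings.
# Assumes uvicorn is installed and the module is importable as "app.app"; adjust to your layout.
import os

os.environ.setdefault("MODEL_PATH", "./models/tinyllama_dop_q4_k_m.gguf")      # hypothetical local path
os.environ.setdefault("DB_PERSIST_DIRECTORY", "./vector_database")             # hypothetical local path
os.environ.setdefault("CHUNKS_FILE_PATH", "./granular_chunks_improved.jsonl")  # hypothetical local path
os.environ.setdefault("LLM_TIMEOUT_SECONDS", "45")   # generation timeout used by /chat
os.environ.setdefault("TOP_K_SEARCH", "5")           # chunks retrieved from the vector DB
os.environ.setdefault("TOP_K_CONTEXT", "3")          # chunks actually placed in the prompt

import uvicorn  # assumed ASGI server

if __name__ == "__main__":
    # The settings must be in the environment before app.app is imported,
    # because the module reads them at import time.
    uvicorn.run("app.app:app", host="0.0.0.0", port=8000)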
 
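A small client-side sketch (also not part of the commit), assuming the service is reachable at http://localhost:8000 and the requests package is available. It exercises the new request-ID round trip: the middleware sets an X-Request-ID header, /chat echoes a request_id in its body, and the expanded Feedback schema sends it back to /feedback.

# Sketch only: exercise the updated endpoints; base URL and question text are placeholders.
import requests

BASE = "http://localhost:8000"  # assumed base URL

# New /health endpoint reports component status (503 if the model or DB failed to load).
health = requests.get(f"{BASE}/health")
print(health.status_code, health.json())

# /chat now returns a request_id, and the middleware adds an X-Request-ID header.
chat = requests.post(f"{BASE}/chat", json={"question": "Example question about DoP limits"})
print(chat.headers.get("X-Request-ID"))
body = chat.json()

# /feedback expects the expanded Feedback schema from this commit.
feedback_payload = {
    "request_id": body.get("request_id", ""),  # the no-context fallback response omits request_id
    "question": body["question"],
    "answer": body["answer"],
    "context_used": body["context_used"],
    "feedback": "helpful",  # e.g. "correct", "incorrect", "helpful", "not-helpful"
    "comment": None,
}
print(requests.post(f"{BASE}/feedback", json=feedback_payload).json())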