Kalpokoch committed
Commit 0194a83 · Parent: 8e47dc8

improvements dec

Files changed (3):
  1. app/app.py +41 -14
  2. app/policy_vector_db.py +64 -27
  3. create_granular_chunks.py +18 -2
app/app.py CHANGED
@@ -344,7 +344,23 @@ async def startup_event():
 
 # -----------------------------
 # ✅ Core Processing Function
-# -----------------------------
+# ✅ Re-ranking function for improving relevance
+def re_rank_by_relevance(results: List[Dict], question: str) -> List[Dict]:
+    """Simple heuristic re-ranking based on question keyword overlap"""
+    question_terms = set(term.lower() for term in question.split() if len(term) > 3)
+
+    for result in results:
+        chunk_terms = set(term.lower() for term in result['text'].split() if len(term) > 3)
+        if question_terms:
+            keyword_overlap = len(question_terms & chunk_terms) / len(question_terms)
+        else:
+            keyword_overlap = 0
+        # Boost score if chunk contains question keywords
+        result['relevance_score'] *= (1 + 0.15 * keyword_overlap)
+
+    return sorted(results, key=lambda x: x['relevance_score'], reverse=True)
+
+
 def get_logger_adapter(request_id: str):
     return RequestIdAdapter(logger, {'request_id': request_id})
 
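The boost is multiplicative, so a chunk matching every long keyword in the question gains 15%. A quick sanity check of that arithmetic, using the `re_rank_by_relevance` added above (sample texts and scores are made up):

```python
# Assumes re_rank_by_relevance from the hunk above is in scope.
results = [
    {'text': 'Leave travel concession rules for regular employees', 'relevance_score': 0.64},
    {'text': 'Approval limits for capital expenditure shall vest with the Director', 'relevance_score': 0.62},
]
ranked = re_rank_by_relevance(results, "approval limits for capital expenditure")
# Question terms longer than 3 chars: approval, limits, capital, expenditure.
# The second chunk matches all four, so 0.62 * (1 + 0.15 * 1.0) = 0.713,
# which now outranks the unboosted 0.64.
for r in ranked:
    print(f"{r['relevance_score']:.3f}  {r['text'][:40]}")
```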
@@ -352,7 +368,14 @@ async def generate_llm_response(prompt: str, request_id: str):
     loop = asyncio.get_running_loop()
     response = await loop.run_in_executor(
         None,
-        lambda: llm(prompt, max_tokens=1024, stop=["###", "Question:", "Context:", "</s>"], temperature=0.05, echo=False)
+        lambda: llm(
+            prompt,
+            max_tokens=512,  # Optimized for CPU performance
+            stop=["###", "Question:", "Context:", "</s>"],
+            temperature=0.1,  # Lower for factuality
+            top_p=0.9,  # Nucleus sampling for consistency
+            echo=False
+        )
     )
     answer = response["choices"][0]["text"].strip()
     if not answer:
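The inline comments cover the intent; for reference, this is how those sampling settings behave in a standalone llama-cpp-python call (a minimal sketch — the model path and prompt are placeholders, not values from this repo):

```python
from llama_cpp import Llama

llm = Llama(model_path="models/policy-model.gguf", n_ctx=2048)  # placeholder path

response = llm(
    "### Question:\nWho can approve works contracts?\n### Answer:\n",
    max_tokens=512,   # hard cap on generated tokens; shorter = faster on CPU
    stop=["###", "</s>"],
    temperature=0.1,  # near-greedy decoding, favours factual restatement
    top_p=0.9,        # nucleus sampling: drop the unlikely tail of the distribution
)
print(response["choices"][0]["text"].strip())
```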
@@ -386,8 +409,11 @@ async def process_chat_request(question: str, request_id: str) -> Dict:
 
     adapter.info(f"Received query: '{question}'")
 
-    # 1. Search Vector DB
+    # 1. Search Vector DB with query expansion
     search_results = db.search(question, top_k=TOP_K_SEARCH)
+
+    # 2. Re-rank results by keyword overlap for better relevance
+    search_results = re_rank_by_relevance(search_results, question)
 
     if not search_results:
         adapter.warning("No relevant context found in vector DB.")
@@ -401,30 +427,31 @@ async def process_chat_request(question: str, request_id: str) -> Dict:
     scores = [f"{result['relevance_score']:.4f}" for result in search_results]
     adapter.info(f"Found {len(search_results)} relevant chunks with scores: {scores}")
 
-    # 2. Prepare Context
+    # 3. Prepare Context
     context_chunks = [result['text'] for result in search_results[:TOP_K_CONTEXT]]
     context = "\n---\n".join(context_chunks)
 
-    # 3. Build Prompt with Separator Instruction
+    # 4. Build Enhanced Prompt
     prompt = f"""<|system|>
-You are a precise and factual assistant for NEEPCO's Delegation of Powers (DoP) policy.
-Your task is to answer the user's question based ONLY on the provided context.
-- **Formatting Rule:** If the answer contains a list of items or steps, you **MUST** separate each item with a pipe symbol (`|`). For example: `First item|Second item|Third item`.
-- **Content Rule:** If the information is not in the provided context, you **MUST** reply with the exact phrase: "The provided policy context does not contain information on this topic."
+You are NEEPCO's Delegation of Powers (DoP) policy expert. Answer ONLY using the provided context.
+- Be concise and factual
+- For lists/steps, use pipe separators: `Item1|Item2|Item3`
+- If information is absent, say: "The provided policy context does not contain information on this topic."
+- Do not assume or infer beyond what is stated
 </s>
 <|user|>
-### Relevant Context:
-```
+### Context:
 {context}
-```
+
 ### Question:
 {question}
+
+Answer based strictly on the context above.
 </s>
 <|assistant|>
-### Detailed Answer:
 """
 
-    # 4. Generate Response
+    # 5. Generate Response
     answer = "An error occurred while processing your request."
     try:
         adapter.info("Sending prompt to LLM for generation...")
 
app/policy_vector_db.py CHANGED
@@ -46,6 +46,37 @@ class PolicyVectorDB:
         """Ensures all metadata values are strings, as required by some ChromaDB versions."""
         return {key: str(value) for key, value in metadata.items()}
 
+    def expand_query(self, query_text: str) -> List[str]:
+        """
+        Generates query variations to improve retrieval.
+        Uses simple heuristics - zero LLM cost.
+        """
+        queries = [query_text]
+
+        # Expand with synonyms for policy-related terms
+        synonyms = {
+            "approval": ["approval", "consent", "authorization", "permission"],
+            "limit": ["limit", "threshold", "ceiling", "maximum"],
+            "authority": ["authority", "official", "person", "representative"],
+            "delegate": ["delegate", "authorize", "empower", "assign"],
+            "power": ["power", "authority", "delegation", "responsibility"],
+            "financial": ["financial", "monetary", "funds", "budget"],
+        }
+
+        for term, variants in synonyms.items():
+            if term in query_text.lower():
+                for variant in variants:
+                    if variant.lower() not in query_text.lower():
+                        expanded = query_text.replace(term, variant)
+                        if expanded not in queries:
+                            queries.append(expanded)
+                            if len(queries) >= 4:
+                                break
+            if len(queries) >= 4:
+                break
+
+        return queries[:4]  # Limit to 4 variations
+
     def add_chunks(self, chunks: List[Dict]):
         """
         Adds a list of chunks to the vector database, skipping any that already exist.
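Since `expand_query` never touches instance state, it can be exercised without building the full DB; an illustrative call (the output follows from the synonym table above):

```python
# expand_query ignores `self`, so the unbound method can be called directly.
queries = PolicyVectorDB.expand_query(None, "What is the approval limit for works contracts?")
print(queries)
# ['What is the approval limit for works contracts?',
#  'What is the consent limit for works contracts?',
#  'What is the authorization limit for works contracts?',
#  'What is the permission limit for works contracts?']
```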
@@ -89,41 +120,47 @@ class PolicyVectorDB:
 
     def search(self, query_text: str, top_k: int = None) -> List[Dict]:
         """
-        Searches the vector database for a given query text.
+        Searches the vector database for a given query text with expansion.
        Returns a list of results filtered by a relevance threshold.
         """
         collection = self._get_collection()
-
-        # ✅ IMPROVEMENT: Add the recommended instruction prefix for BGE retrieval models.
-        instructed_query = f"Represent this sentence for searching relevant passages: {query_text}"
-
-        # ✅ IMPROVEMENT: Normalize embeddings for more accurate similarity search.
-        query_embedding = self.embedding_model.encode([instructed_query], normalize_embeddings=True).tolist()
-
         k = top_k if top_k is not None else self.top_k_default
 
-        # Retrieve more results initially to allow for filtering
-        results = collection.query(
-            query_embeddings=query_embedding,
-            n_results=k * 2,  # Retrieve more to filter by threshold
-            include=["documents", "metadatas", "distances"]
-        )
+        # Expand query for better recall
+        queries = self.expand_query(query_text)
+        all_results = {}
 
-        search_results = []
-        if results and results.get('documents') and results['documents'][0]:
-            for i, doc in enumerate(results['documents'][0]):
-                # The distance for normalized embeddings is often interpreted as 1 - cosine_similarity
-                relevance_score = 1 - results['distances'][0][i]
-
-                if relevance_score >= self.relevance_threshold:
-                    search_results.append({
-                        'text': doc,
-                        'metadata': results['metadatas'][0][i],
-                        'relevance_score': relevance_score
-                    })
+        for query in queries:
+            # Add the recommended instruction prefix for BGE retrieval models.
+            instructed_query = f"Represent this sentence for searching relevant passages: {query}"
+
+            # Normalize embeddings for more accurate similarity search.
+            query_embedding = self.embedding_model.encode([instructed_query], normalize_embeddings=True).tolist()
+
+            # Retrieve more results initially to allow for filtering
+            results = collection.query(
+                query_embeddings=query_embedding,
+                n_results=k * 2,  # Retrieve more to filter by threshold
+                include=["documents", "metadatas", "distances"]
+            )
+
+            if results and results.get('documents') and results['documents'][0]:
+                for i, doc in enumerate(results['documents'][0]):
+                    # The distance for normalized embeddings is often interpreted as 1 - cosine_similarity
+                    relevance_score = 1 - results['distances'][0][i]
+
+                    if relevance_score >= self.relevance_threshold:
+                        key = doc  # Use document text as key
+                        # Keep highest relevance score for duplicate documents
+                        if key not in all_results or relevance_score > all_results[key]['relevance_score']:
+                            all_results[key] = {
+                                'text': doc,
+                                'metadata': results['metadatas'][0][i],
+                                'relevance_score': relevance_score
+                            }
 
         # Sort by relevance score and return the top_k results
-        return sorted(search_results, key=lambda x: x['relevance_score'], reverse=True)[:k]
+        return sorted(all_results.values(), key=lambda x: x['relevance_score'], reverse=True)[:k]
 
 def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str) -> bool:
     """
 
create_granular_chunks.py CHANGED
@@ -70,6 +70,21 @@ def format_remarks(remarks: Any) -> str:
     return str(remarks)
 
 
+def smart_chunk_size(context: Dict) -> int:
+    """
+    Adaptive chunk sizing based on content type.
+    Smaller chunks for dense information, larger for descriptive.
+    """
+    if "delegation" in context:
+        return 1000  # Smaller for dense financial/delegation info
+    elif "composition" in context:
+        return 800  # Smaller for structural/hierarchical info
+    elif "items" in context or "exclusions" in context:
+        return 600  # Smaller for list-based info
+    else:
+        return 1500  # Default for descriptive content
+
+
 def build_descriptive_text(context: Dict) -> str:
     """
     Builds a clear, descriptive, natural language text by combining fields.
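The thresholds map content type to a character budget; illustrative calls (the contexts are toy examples):

```python
# Assumes smart_chunk_size from the hunk above is in scope.
print(smart_chunk_size({"delegation": {"CMD": "Full powers"}}))    # 1000
print(smart_chunk_size({"composition": ["Director (Finance)"]}))   # 800
print(smart_chunk_size({"items": ["civil works"]}))                # 600
print(smart_chunk_size({"description": "General provisions."}))    # 1500
```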
@@ -168,8 +183,9 @@ def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
     # Handler 3: Leaf nodes with delegation, composition or description
     if not has_recursed and ("delegation" in data or "composition" in data or "description" in data):
         text = build_descriptive_text(context)
-        # Split long descriptive text intelligently
-        for chunk_text in split_text_into_chunks(text):
+        # Split long descriptive text intelligently with adaptive chunk size
+        max_size = smart_chunk_size(data)
+        for chunk_text in split_text_into_chunks(text, max_char_length=max_size):
             chunks.append(create_chunk(context, chunk_text))
 
     return chunks
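Only the new `max_char_length` argument of `split_text_into_chunks` is visible in this diff; the helper itself lives elsewhere in the file. A minimal sketch of what such a splitter could look like, assuming sentence-boundary packing (not the actual implementation):

```python
def split_text_into_chunks(text: str, max_char_length: int = 1500) -> list[str]:
    """Pack whole sentences into chunks of at most max_char_length chars."""
    chunks, current = [], ""
    for sentence in text.split(". "):
        candidate = f"{current} {sentence}".strip()
        if len(candidate) <= max_char_length:
            current = candidate
        else:
            if current:
                chunks.append(current)
            current = sentence  # an oversized single sentence becomes its own chunk
    if current:
        chunks.append(current)
    return chunks
```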
 
  return chunks