improvements dec
- app/app.py +41 -14
- app/policy_vector_db.py +64 -27
- create_granular_chunks.py +18 -2
app/app.py
CHANGED
```diff
@@ -344,7 +344,23 @@ async def startup_event():
 
 # -----------------------------
 # ✅ Core Processing Function
-#
+# ✅ Re-ranking function for improving relevance
+def re_rank_by_relevance(results: List[Dict], question: str) -> List[Dict]:
+    """Simple heuristic re-ranking based on question keyword overlap"""
+    question_terms = set(term.lower() for term in question.split() if len(term) > 3)
+
+    for result in results:
+        chunk_terms = set(term.lower() for term in result['text'].split() if len(term) > 3)
+        if question_terms:
+            keyword_overlap = len(question_terms & chunk_terms) / len(question_terms)
+        else:
+            keyword_overlap = 0
+        # Boost score if chunk contains question keywords
+        result['relevance_score'] *= (1 + 0.15 * keyword_overlap)
+
+    return sorted(results, key=lambda x: x['relevance_score'], reverse=True)
+
+
 def get_logger_adapter(request_id: str):
     return RequestIdAdapter(logger, {'request_id': request_id})
 
@@ -352,7 +368,14 @@ async def generate_llm_response(prompt: str, request_id: str):
     loop = asyncio.get_running_loop()
     response = await loop.run_in_executor(
         None,
-        lambda: llm(
+        lambda: llm(
+            prompt,
+            max_tokens=512, # Optimized for CPU performance
+            stop=["###", "Question:", "Context:", "</s>"],
+            temperature=0.1, # Lower for factuality
+            top_p=0.9, # Nucleus sampling for consistency
+            echo=False
+        )
     )
     answer = response["choices"][0]["text"].strip()
     if not answer:
@@ -386,8 +409,11 @@ async def process_chat_request(question: str, request_id: str) -> Dict:
 
     adapter.info(f"Received query: '{question}'")
 
-    # 1. Search Vector DB
+    # 1. Search Vector DB with query expansion
     search_results = db.search(question, top_k=TOP_K_SEARCH)
+
+    # 2. Re-rank results by keyword overlap for better relevance
+    search_results = re_rank_by_relevance(search_results, question)
 
     if not search_results:
         adapter.warning("No relevant context found in vector DB.")
@@ -401,30 +427,31 @@ async def process_chat_request(question: str, request_id: str) -> Dict:
     scores = [f"{result['relevance_score']:.4f}" for result in search_results]
     adapter.info(f"Found {len(search_results)} relevant chunks with scores: {scores}")
 
-    #
+    # 3. Prepare Context
     context_chunks = [result['text'] for result in search_results[:TOP_K_CONTEXT]]
     context = "\n---\n".join(context_chunks)
 
-    #
+    # 4. Build Enhanced Prompt
     prompt = f"""<|system|>
-You are
-
--
--
+You are NEEPCO's Delegation of Powers (DoP) policy expert. Answer ONLY using the provided context.
+- Be concise and factual
+- For lists/steps, use pipe separators: `Item1|Item2|Item3`
+- If information is absent, say: "The provided policy context does not contain information on this topic."
+- Do not assume or infer beyond what is stated
 </s>
 <|user|>
-###
-```
+### Context:
 {context}
-
+
 ### Question:
 {question}
+
+Answer based strictly on the context above.
 </s>
 <|assistant|>
-### Detailed Answer:
 """
 
-    #
+    # 5. Generate Response
    answer = "An error occurred while processing your request."
     try:
         adapter.info("Sending prompt to LLM for generation...")
```
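To see the new heuristic in isolation, here is a minimal, self-contained sketch of the keyword-overlap re-ranking; the toy `results` list and its scores are invented for the example, and the 0.15 boost factor matches the commit.

```python
from typing import Dict, List

def re_rank_by_relevance(results: List[Dict], question: str) -> List[Dict]:
    """Boost each chunk's vector score by its keyword overlap with the question."""
    question_terms = {t.lower() for t in question.split() if len(t) > 3}
    for result in results:
        chunk_terms = {t.lower() for t in result['text'].split() if len(t) > 3}
        overlap = len(question_terms & chunk_terms) / len(question_terms) if question_terms else 0
        result['relevance_score'] *= (1 + 0.15 * overlap)  # full overlap => +15%
    return sorted(results, key=lambda x: x['relevance_score'], reverse=True)

# Invented toy data: the second chunk shares 3 of 5 question keywords
# (approval, limits, capital), so its score rises from 0.78 to ~0.85
# and it overtakes the first chunk.
results = [
    {'text': "General provisions for procurement.", 'relevance_score': 0.80},
    {'text': "Approval limits for capital expenditure proposals.", 'relevance_score': 0.78},
]
for r in re_rank_by_relevance(results, "What are the approval limits for capital expenditure?"):
    print(f"{r['relevance_score']:.4f}  {r['text']}")
```

Because the boost multiplies the vector score rather than replacing it, chunks with no keyword overlap keep their original ranking signal.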
app/policy_vector_db.py
CHANGED
```diff
@@ -46,6 +46,37 @@ class PolicyVectorDB:
         """Ensures all metadata values are strings, as required by some ChromaDB versions."""
         return {key: str(value) for key, value in metadata.items()}
 
+    def expand_query(self, query_text: str) -> List[str]:
+        """
+        Generates query variations to improve retrieval.
+        Uses simple heuristics - zero LLM cost.
+        """
+        queries = [query_text]
+
+        # Expand with synonyms for policy-related terms
+        synonyms = {
+            "approval": ["approval", "consent", "authorization", "permission"],
+            "limit": ["limit", "threshold", "ceiling", "maximum"],
+            "authority": ["authority", "official", "person", "representative"],
+            "delegate": ["delegate", "authorize", "empower", "assign"],
+            "power": ["power", "authority", "delegation", "responsibility"],
+            "financial": ["financial", "monetary", "funds", "budget"],
+        }
+
+        for term, variants in synonyms.items():
+            if term in query_text.lower():
+                for variant in variants:
+                    if variant.lower() not in query_text.lower():
+                        expanded = query_text.replace(term, variant)
+                        if expanded not in queries:
+                            queries.append(expanded)
+                        if len(queries) >= 4:
+                            break
+            if len(queries) >= 4:
+                break
+
+        return queries[:4] # Limit to 4 variations
+
     def add_chunks(self, chunks: List[Dict]):
         """
         Adds a list of chunks to the vector database, skipping any that already exist.
@@ -89,41 +120,47 @@ class PolicyVectorDB:
 
     def search(self, query_text: str, top_k: int = None) -> List[Dict]:
         """
-        Searches the vector database for a given query text.
+        Searches the vector database for a given query text with expansion.
         Returns a list of results filtered by a relevance threshold.
         """
         collection = self._get_collection()
-
-        # ✅ IMPROVEMENT: Add the recommended instruction prefix for BGE retrieval models.
-        instructed_query = f"Represent this sentence for searching relevant passages: {query_text}"
-
-        # ✅ IMPROVEMENT: Normalize embeddings for more accurate similarity search.
-        query_embedding = self.embedding_model.encode([instructed_query], normalize_embeddings=True).tolist()
-
         k = top_k if top_k is not None else self.top_k_default
 
-        #
-
-
-            n_results=k * 2, # Retrieve more to filter by threshold
-            include=["documents", "metadatas", "distances"]
-        )
+        # Expand query for better recall
+        queries = self.expand_query(query_text)
+        all_results = {}
 
-
-
-        for
-
-
-
-
-
-
-
-
-
+        for query in queries:
+            # Add the recommended instruction prefix for BGE retrieval models.
+            instructed_query = f"Represent this sentence for searching relevant passages: {query}"
+
+            # Normalize embeddings for more accurate similarity search.
+            query_embedding = self.embedding_model.encode([instructed_query], normalize_embeddings=True).tolist()
+
+            # Retrieve more results initially to allow for filtering
+            results = collection.query(
+                query_embeddings=query_embedding,
+                n_results=k * 2, # Retrieve more to filter by threshold
+                include=["documents", "metadatas", "distances"]
+            )
+
+            if results and results.get('documents') and results['documents'][0]:
+                for i, doc in enumerate(results['documents'][0]):
+                    # The distance for normalized embeddings is often interpreted as 1 - cosine_similarity
+                    relevance_score = 1 - results['distances'][0][i]
+
+                    if relevance_score >= self.relevance_threshold:
+                        key = doc # Use document text as key
+                        # Keep highest relevance score for duplicate documents
+                        if key not in all_results or relevance_score > all_results[key]['relevance_score']:
+                            all_results[key] = {
+                                'text': doc,
+                                'metadata': results['metadatas'][0][i],
+                                'relevance_score': relevance_score
+                            }
 
         # Sort by relevance score and return the top_k results
-        return sorted(
+        return sorted(all_results.values(), key=lambda x: x['relevance_score'], reverse=True)[:k]
 
 def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str) -> bool:
     """
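A standalone sketch of the expansion logic, with the synonym map trimmed to two entries for brevity. Note one quirk carried over from the commit: the containment check lowercases the query, but `str.replace` is case-sensitive, so a capitalized term (e.g. "Approval") is detected yet never actually substituted.

```python
from typing import List

SYNONYMS = {
    "approval": ["approval", "consent", "authorization", "permission"],
    "limit": ["limit", "threshold", "ceiling", "maximum"],
}

def expand_query(query_text: str) -> List[str]:
    """Cheap synonym-based expansion mirroring the method added above."""
    queries = [query_text]
    for term, variants in SYNONYMS.items():
        if term in query_text.lower():                            # case-insensitive match...
            for variant in variants:
                if variant.lower() not in query_text.lower():
                    expanded = query_text.replace(term, variant)  # ...case-sensitive replace
                    if expanded not in queries:
                        queries.append(expanded)
                    if len(queries) >= 4:
                        break
        if len(queries) >= 4:
            break
    return queries[:4]

print(expand_query("approval limit for works contracts"))
# ['approval limit for works contracts',
#  'consent limit for works contracts',
#  'authorization limit for works contracts',
#  'permission limit for works contracts']
```

Each variant is embedded and queried separately; `search` then deduplicates hits by document text, keeping the highest score per chunk before the final sort, so expansion raises recall without returning the same passage twice.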
create_granular_chunks.py
CHANGED
```diff
@@ -70,6 +70,21 @@ def format_remarks(remarks: Any) -> str:
     return str(remarks)
 
 
+def smart_chunk_size(context: Dict) -> int:
+    """
+    Adaptive chunk sizing based on content type.
+    Smaller chunks for dense information, larger for descriptive.
+    """
+    if "delegation" in context:
+        return 1000 # Smaller for dense financial/delegation info
+    elif "composition" in context:
+        return 800 # Smaller for structural/hierarchical info
+    elif "items" in context or "exclusions" in context:
+        return 600 # Smaller for list-based info
+    else:
+        return 1500 # Default for descriptive content
+
+
 def build_descriptive_text(context: Dict) -> str:
     """
     Builds a clear, descriptive, natural language text by combining fields.
@@ -168,8 +183,9 @@ def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
     # Handler 3: Leaf nodes with delegation, composition or description
     if not has_recursed and ("delegation" in data or "composition" in data or "description" in data):
         text = build_descriptive_text(context)
-        # Split long descriptive text intelligently
-
+        # Split long descriptive text intelligently with adaptive chunk size
+        max_size = smart_chunk_size(data)
+        for chunk_text in split_text_into_chunks(text, max_char_length=max_size):
             chunks.append(create_chunk(context, chunk_text))
 
     return chunks
```
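A quick sketch of how the adaptive sizing plugs into splitting. `split_text_into_chunks` below is a hypothetical stand-in (the repo's actual splitter is not shown in this diff, only its `max_char_length` keyword), and the sample entry is invented.

```python
from typing import Dict, List

def smart_chunk_size(context: Dict) -> int:
    """Pick a max chunk length from whichever fields the entry carries."""
    if "delegation" in context:
        return 1000
    elif "composition" in context:
        return 800
    elif "items" in context or "exclusions" in context:
        return 600
    return 1500

def split_text_into_chunks(text: str, max_char_length: int) -> List[str]:
    """Hypothetical stand-in: greedy word packing up to the size limit."""
    chunks, current, length = [], [], 0
    for word in text.split():
        if current and length + len(word) + 1 > max_char_length:
            chunks.append(" ".join(current))
            current, length = [], 0
        current.append(word)
        length += len(word) + (1 if length else 0)
    if current:
        chunks.append(" ".join(current))
    return chunks

entry = {"delegation": {"authority": "CMD", "limit": "Rs. 50 lakh"}}  # invented example
print(smart_chunk_size(entry))  # -> 1000: dense delegation info gets smaller chunks
print(split_text_into_chunks("The CMD may approve works up to the stated ceiling.", 25))
# -> ['The CMD may approve works', 'up to the stated ceiling.']
```

Note that `smart_chunk_size` is called with `data` (the raw entry) rather than the accumulated `context`, so the size decision keys off the fields of the leaf node itself.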