Kalpokoch committed
Commit 0194a83 · Parent: 8e47dc8

improvements dec

Files changed (3):
  1. app/app.py +41 -14
  2. app/policy_vector_db.py +64 -27
  3. create_granular_chunks.py +18 -2
app/app.py CHANGED
@@ -344,7 +344,23 @@ async def startup_event():
 
 # -----------------------------
 # ✅ Core Processing Function
-# -----------------------------
+# ✅ Re-ranking function for improving relevance
+def re_rank_by_relevance(results: List[Dict], question: str) -> List[Dict]:
+    """Simple heuristic re-ranking based on question keyword overlap"""
+    question_terms = set(term.lower() for term in question.split() if len(term) > 3)
+
+    for result in results:
+        chunk_terms = set(term.lower() for term in result['text'].split() if len(term) > 3)
+        if question_terms:
+            keyword_overlap = len(question_terms & chunk_terms) / len(question_terms)
+        else:
+            keyword_overlap = 0
+        # Boost score if chunk contains question keywords
+        result['relevance_score'] *= (1 + 0.15 * keyword_overlap)
+
+    return sorted(results, key=lambda x: x['relevance_score'], reverse=True)
+
+
 def get_logger_adapter(request_id: str):
     return RequestIdAdapter(logger, {'request_id': request_id})
 
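The boost is multiplicative, so a chunk matching every long keyword in the question gains 15%. A quick sanity check of that arithmetic, using the `re_rank_by_relevance` added above (sample texts and scores are made up):

```python
# Assumes re_rank_by_relevance from the hunk above is in scope.
results = [
    {'text': 'Leave travel concession rules for regular employees', 'relevance_score': 0.64},
    {'text': 'Approval limits for capital expenditure shall vest with the Director', 'relevance_score': 0.62},
]
ranked = re_rank_by_relevance(results, "approval limits for capital expenditure")
# Question terms longer than 3 chars: approval, limits, capital, expenditure.
# The second chunk matches all four, so 0.62 * (1 + 0.15 * 1.0) = 0.713,
# which now outranks the unboosted 0.64.
for r in ranked:
    print(f"{r['relevance_score']:.3f}  {r['text'][:40]}")
```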
@@ -352,7 +368,14 @@ async def generate_llm_response(prompt: str, request_id: str):
     loop = asyncio.get_running_loop()
     response = await loop.run_in_executor(
         None,
-        lambda: llm(prompt, max_tokens=1024, stop=["###", "Question:", "Context:", "</s>"], temperature=0.05, echo=False)
+        lambda: llm(
+            prompt,
+            max_tokens=512,  # Optimized for CPU performance
+            stop=["###", "Question:", "Context:", "</s>"],
+            temperature=0.1,  # Lower for factuality
+            top_p=0.9,  # Nucleus sampling for consistency
+            echo=False
+        )
     )
     answer = response["choices"][0]["text"].strip()
     if not answer:
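The inline comments cover the intent; for reference, this is how those sampling settings behave in a standalone llama-cpp-python call (a minimal sketch — the model path and prompt are placeholders, not values from this repo):

```python
from llama_cpp import Llama

llm = Llama(model_path="models/policy-model.gguf", n_ctx=2048)  # placeholder path

response = llm(
    "### Question:\nWho can approve works contracts?\n### Answer:\n",
    max_tokens=512,   # hard cap on generated tokens; shorter = faster on CPU
    stop=["###", "</s>"],
    temperature=0.1,  # near-greedy decoding, favours factual restatement
    top_p=0.9,        # nucleus sampling: drop the unlikely tail of the distribution
)
print(response["choices"][0]["text"].strip())
```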
@@ -386,8 +409,11 @@ async def process_chat_request(question: str, request_id: str) -> Dict:
 
     adapter.info(f"Received query: '{question}'")
 
-    # 1. Search Vector DB
+    # 1. Search Vector DB with query expansion
     search_results = db.search(question, top_k=TOP_K_SEARCH)
+
+    # 2. Re-rank results by keyword overlap for better relevance
+    search_results = re_rank_by_relevance(search_results, question)
 
     if not search_results:
         adapter.warning("No relevant context found in vector DB.")
@@ -401,30 +427,31 @@ async def process_chat_request(question: str, request_id: str) -> Dict:
     scores = [f"{result['relevance_score']:.4f}" for result in search_results]
     adapter.info(f"Found {len(search_results)} relevant chunks with scores: {scores}")
 
-    # 2. Prepare Context
+    # 3. Prepare Context
     context_chunks = [result['text'] for result in search_results[:TOP_K_CONTEXT]]
     context = "\n---\n".join(context_chunks)
 
-    # 3. Build Prompt with Separator Instruction
+    # 4. Build Enhanced Prompt
     prompt = f"""<|system|>
-You are a precise and factual assistant for NEEPCO's Delegation of Powers (DoP) policy.
-Your task is to answer the user's question based ONLY on the provided context.
-- **Formatting Rule:** If the answer contains a list of items or steps, you **MUST** separate each item with a pipe symbol (`|`). For example: `First item|Second item|Third item`.
-- **Content Rule:** If the information is not in the provided context, you **MUST** reply with the exact phrase: "The provided policy context does not contain information on this topic."
+You are NEEPCO's Delegation of Powers (DoP) policy expert. Answer ONLY using the provided context.
+- Be concise and factual
+- For lists/steps, use pipe separators: `Item1|Item2|Item3`
+- If information is absent, say: "The provided policy context does not contain information on this topic."
+- Do not assume or infer beyond what is stated
 </s>
 <|user|>
-### Relevant Context:
-```
+### Context:
 {context}
-```
+
 ### Question:
 {question}
+
+Answer based strictly on the context above.
 </s>
 <|assistant|>
-### Detailed Answer:
 """
 
-    # 4. Generate Response
+    # 5. Generate Response
     answer = "An error occurred while processing your request."
     try:
         adapter.info("Sending prompt to LLM for generation...")
 
app/policy_vector_db.py CHANGED
@@ -46,6 +46,37 @@ class PolicyVectorDB:
         """Ensures all metadata values are strings, as required by some ChromaDB versions."""
         return {key: str(value) for key, value in metadata.items()}
 
+    def expand_query(self, query_text: str) -> List[str]:
+        """
+        Generates query variations to improve retrieval.
+        Uses simple heuristics - zero LLM cost.
+        """
+        queries = [query_text]
+
+        # Expand with synonyms for policy-related terms
+        synonyms = {
+            "approval": ["approval", "consent", "authorization", "permission"],
+            "limit": ["limit", "threshold", "ceiling", "maximum"],
+            "authority": ["authority", "official", "person", "representative"],
+            "delegate": ["delegate", "authorize", "empower", "assign"],
+            "power": ["power", "authority", "delegation", "responsibility"],
+            "financial": ["financial", "monetary", "funds", "budget"],
+        }
+
+        for term, variants in synonyms.items():
+            if term in query_text.lower():
+                for variant in variants:
+                    if variant.lower() not in query_text.lower():
+                        expanded = query_text.replace(term, variant)
+                        if expanded not in queries:
+                            queries.append(expanded)
+                            if len(queries) >= 4:
+                                break
+            if len(queries) >= 4:
+                break
+
+        return queries[:4]  # Limit to 4 variations
+
     def add_chunks(self, chunks: List[Dict]):
         """
         Adds a list of chunks to the vector database, skipping any that already exist.
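Since `expand_query` never touches instance state, it can be exercised without building the full DB; an illustrative call (the output follows from the synonym table above):

```python
# expand_query ignores `self`, so the unbound method can be called directly.
queries = PolicyVectorDB.expand_query(None, "What is the approval limit for works contracts?")
print(queries)
# ['What is the approval limit for works contracts?',
#  'What is the consent limit for works contracts?',
#  'What is the authorization limit for works contracts?',
#  'What is the permission limit for works contracts?']
```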
@@ -89,41 +120,47 @@ class PolicyVectorDB:
 
     def search(self, query_text: str, top_k: int = None) -> List[Dict]:
         """
-        Searches the vector database for a given query text.
+        Searches the vector database for a given query text with expansion.
        Returns a list of results filtered by a relevance threshold.
         """
         collection = self._get_collection()
-
-        # ✅ IMPROVEMENT: Add the recommended instruction prefix for BGE retrieval models.
-        instructed_query = f"Represent this sentence for searching relevant passages: {query_text}"
-
-        # ✅ IMPROVEMENT: Normalize embeddings for more accurate similarity search.
-        query_embedding = self.embedding_model.encode([instructed_query], normalize_embeddings=True).tolist()
-
         k = top_k if top_k is not None else self.top_k_default
 
-        # Retrieve more results initially to allow for filtering
-        results = collection.query(
-            query_embeddings=query_embedding,
-            n_results=k * 2,  # Retrieve more to filter by threshold
-            include=["documents", "metadatas", "distances"]
-        )
+        # Expand query for better recall
+        queries = self.expand_query(query_text)
+        all_results = {}
 
-        search_results = []
-        if results and results.get('documents') and results['documents'][0]:
-            for i, doc in enumerate(results['documents'][0]):
-                # The distance for normalized embeddings is often interpreted as 1 - cosine_similarity
-                relevance_score = 1 - results['distances'][0][i]
-
-                if relevance_score >= self.relevance_threshold:
-                    search_results.append({
-                        'text': doc,
-                        'metadata': results['metadatas'][0][i],
-                        'relevance_score': relevance_score
-                    })
+        for query in queries:
+            # Add the recommended instruction prefix for BGE retrieval models.
+            instructed_query = f"Represent this sentence for searching relevant passages: {query}"
+
+            # Normalize embeddings for more accurate similarity search.
+            query_embedding = self.embedding_model.encode([instructed_query], normalize_embeddings=True).tolist()
+
+            # Retrieve more results initially to allow for filtering
+            results = collection.query(
+                query_embeddings=query_embedding,
+                n_results=k * 2,  # Retrieve more to filter by threshold
+                include=["documents", "metadatas", "distances"]
+            )
+
+            if results and results.get('documents') and results['documents'][0]:
+                for i, doc in enumerate(results['documents'][0]):
+                    # The distance for normalized embeddings is often interpreted as 1 - cosine_similarity
+                    relevance_score = 1 - results['distances'][0][i]
+
+                    if relevance_score >= self.relevance_threshold:
+                        key = doc  # Use document text as key
+                        # Keep highest relevance score for duplicate documents
+                        if key not in all_results or relevance_score > all_results[key]['relevance_score']:
+                            all_results[key] = {
+                                'text': doc,
+                                'metadata': results['metadatas'][0][i],
+                                'relevance_score': relevance_score
+                            }
 
         # Sort by relevance score and return the top_k results
-        return sorted(search_results, key=lambda x: x['relevance_score'], reverse=True)[:k]
+        return sorted(all_results.values(), key=lambda x: x['relevance_score'], reverse=True)[:k]
 
 def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str) -> bool:
     """
 
create_granular_chunks.py CHANGED
@@ -70,6 +70,21 @@ def format_remarks(remarks: Any) -> str:
     return str(remarks)
 
 
+def smart_chunk_size(context: Dict) -> int:
+    """
+    Adaptive chunk sizing based on content type.
+    Smaller chunks for dense information, larger for descriptive.
+    """
+    if "delegation" in context:
+        return 1000  # Smaller for dense financial/delegation info
+    elif "composition" in context:
+        return 800  # Smaller for structural/hierarchical info
+    elif "items" in context or "exclusions" in context:
+        return 600  # Smaller for list-based info
+    else:
+        return 1500  # Default for descriptive content
+
+
 def build_descriptive_text(context: Dict) -> str:
     """
     Builds a clear, descriptive, natural language text by combining fields.
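The thresholds map content type to a character budget; illustrative calls (the contexts are toy examples):

```python
# Assumes smart_chunk_size from the hunk above is in scope.
print(smart_chunk_size({"delegation": {"CMD": "Full powers"}}))    # 1000
print(smart_chunk_size({"composition": ["Director (Finance)"]}))   # 800
print(smart_chunk_size({"items": ["civil works"]}))                # 600
print(smart_chunk_size({"description": "General provisions."}))    # 1500
```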
@@ -168,8 +183,9 @@ def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
     # Handler 3: Leaf nodes with delegation, composition or description
     if not has_recursed and ("delegation" in data or "composition" in data or "description" in data):
         text = build_descriptive_text(context)
-        # Split long descriptive text intelligently
-        for chunk_text in split_text_into_chunks(text):
+        # Split long descriptive text intelligently with adaptive chunk size
+        max_size = smart_chunk_size(data)
+        for chunk_text in split_text_into_chunks(text, max_char_length=max_size):
             chunks.append(create_chunk(context, chunk_text))
 
     return chunks
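Only the new `max_char_length` argument of `split_text_into_chunks` is visible in this diff; the helper itself lives elsewhere in the file. A minimal sketch of what such a splitter could look like, assuming sentence-boundary packing (not the actual implementation):

```python
def split_text_into_chunks(text: str, max_char_length: int = 1500) -> list[str]:
    """Pack whole sentences into chunks of at most max_char_length chars."""
    chunks, current = [], ""
    for sentence in text.split(". "):
        candidate = f"{current} {sentence}".strip()
        if len(candidate) <= max_char_length:
            current = candidate
        else:
            if current:
                chunks.append(current)
            current = sentence  # an oversized single sentence becomes its own chunk
    if current:
        chunks.append(current)
    return chunks
```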
 
  return chunks