Spaces:

Kalpokoch
/

ChatbotDemo

Sleeping

App Files Files

Kalpokoch committed on Aug 4, 2025

Commit

9df2551

1 Parent(s): a47545a

updated to granular context chunks

Browse files

Files changed (5) hide show

Dockerfile +11 -8
app/app.py +21 -14
app/policy_vector_db.py +53 -42
create_granular_chunks.py +85 -24
processed_chunks.json +0 -0

Dockerfile CHANGED Viewed

@@ -1,6 +1,6 @@
 FROM python:3.11-slim
-# Install required system dependencies
 RUN apt-get update && apt-get install -y \
     git curl build-essential cmake \
     && rm -rf /var/lib/apt/lists/*
@@ -8,26 +8,29 @@ RUN apt-get update && apt-get install -y \
 # Set working directory
 WORKDIR /app
-# Create writable directories
 RUN mkdir -p /app/.cache /app/vector_database && chmod -R 777 /app
-# Set environment variables
 ENV TRANSFORMERS_CACHE=/app/.cache \
     HF_HOME=/app/.cache \
     CHROMADB_DISABLE_TELEMETRY=true
-# Pre-install the specific, known-working version of llama-cpp-python for TinyLlama
 RUN pip install --no-cache-dir llama-cpp-python==0.2.61
 # Install other dependencies from requirements.txt
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
-# Copy the application code and data file
 COPY ./app ./app
-COPY ./processed_chunks.json .
-# Download your fine-tuned TinyLlama GGUF model
 RUN curl -fL -o /app/tinyllama_dop_q4_k_m.gguf \
     https://huggingface.co/Kalpokoch/FinetunedQuantizedTinyLama/resolve/main/tinyllama_dop_q4_k_m.gguf \
     && echo "✅ TinyLlama model downloaded."
@@ -35,5 +38,5 @@ RUN curl -fL -o /app/tinyllama_dop_q4_k_m.gguf \
 # Expose the application port
 EXPOSE 7860
-# Run the FastAPI application
 CMD ["uvicorn", "app.app:app", "--host", "0.0.0.0", "--port", "7860"]

 FROM python:3.11-slim
+# Install required system dependencies needed for llama-cpp-python compilation
 RUN apt-get update && apt-get install -y \
     git curl build-essential cmake \
     && rm -rf /var/lib/apt/lists/*
 # Set working directory
 WORKDIR /app
+# Create writable directories for cache and the persistent vector DB
+# Note: For production, consider using a non-root user and more specific permissions
 RUN mkdir -p /app/.cache /app/vector_database && chmod -R 777 /app
+# Set environment variables for huggingface cache and to disable chroma telemetry
 ENV TRANSFORMERS_CACHE=/app/.cache \
     HF_HOME=/app/.cache \
     CHROMADB_DISABLE_TELEMETRY=true
+# ✅ RECOMMENDATION: To avoid version conflicts, it's best to remove 'llama-cpp-python'
+# from your requirements.txt and rely on this explicit, version-pinned installation.
 RUN pip install --no-cache-dir llama-cpp-python==0.2.61
 # Install other dependencies from requirements.txt
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
+# Copy the application code and the processed data file
 COPY ./app ./app
+# ✅ CORRECTED FILENAME: Ensure this matches the output of your chunking script
+COPY ./granular_chunks_improved.jsonl .
+# Download your fine-tuned TinyLlama GGUF model from Hugging Face
 RUN curl -fL -o /app/tinyllama_dop_q4_k_m.gguf \
     https://huggingface.co/Kalpokoch/FinetunedQuantizedTinyLama/resolve/main/tinyllama_dop_q4_k_m.gguf \
     && echo "✅ TinyLlama model downloaded."
 # Expose the application port
 EXPOSE 7860
+# Run the FastAPI application using uvicorn
 CMD ["uvicorn", "app.app:app", "--host", "0.0.0.0", "--port", "7860"]

app/app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import logging
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from llama_cpp import Llama
 from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
 # -----------------------------
@@ -26,13 +27,14 @@ async def root():
 # ✅ Vector DB and Data Configuration
 # -----------------------------
 DB_PERSIST_DIRECTORY = "/app/vector_database"
-CHUNKS_FILE_PATH = "/app/processed_chunks.json"
 logger.info("[INFO] Initializing vector DB...")
 db = PolicyVectorDB(
     persist_directory=DB_PERSIST_DIRECTORY,
     top_k_default=5,
-    relevance_threshold=0.2
 )
 if not ensure_db_populated(db, CHUNKS_FILE_PATH):
@@ -48,7 +50,7 @@ logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
 llm = Llama(
     model_path=MODEL_PATH,
-    n_ctx=2048, # Increased context window to prevent errors
     n_threads=2,
     n_batch=8,
     use_mlock=False,
@@ -86,14 +88,10 @@ async def chat(query: Query):
     question = query.question.strip()
     logger.info(f"[QUERY] {question}")
-    search_results = db.search(question)
-    filtered = sorted(
-        [r for r in search_results if r["relevance_score"] > db.relevance_threshold],
-        key=lambda x: x["relevance_score"],
-        reverse=True
-    )
-    if not filtered:
         logger.info("[RESPONSE] No relevant context found.")
         return {
             "question": question,
@@ -101,10 +99,19 @@ async def chat(query: Query):
             "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
         }
-    context = filtered[0]["text"]
-    logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f})")
-    prompt = f"""You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies.Only use the context provided. Be precise.### Relevant Context:{context}### Question:{question}### Answer:"""
     answer = "Sorry, I couldn't process your request right now. Please try again later."
     try:

 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from llama_cpp import Llama
+# Correctly reference the module within the 'app' package
 from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
 # -----------------------------
 # ✅ Vector DB and Data Configuration
 # -----------------------------
 DB_PERSIST_DIRECTORY = "/app/vector_database"
+# ✅ CORRECTED FILENAME: Match the output of your chunking script
+CHUNKS_FILE_PATH = "/app/granular_chunks_improved.jsonl"
 logger.info("[INFO] Initializing vector DB...")
 db = PolicyVectorDB(
     persist_directory=DB_PERSIST_DIRECTORY,
     top_k_default=5,
+    relevance_threshold=0.2 # This threshold is now applied inside the search method
 )
 if not ensure_db_populated(db, CHUNKS_FILE_PATH):
 llm = Llama(
     model_path=MODEL_PATH,
+    n_ctx=2048,
     n_threads=2,
     n_batch=8,
     use_mlock=False,
     question = query.question.strip()
     logger.info(f"[QUERY] {question}")
+    # The search method now handles filtering internally
+    search_results = db.search(question, top_k=5)
+    if not search_results:
         logger.info("[RESPONSE] No relevant context found.")
         return {
             "question": question,
             "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
         }
+    # ✅ RECOMMENDED CHANGE: Combine the top 3 contexts for a richer prompt
+    top_k_for_context = 3
+    context_chunks = [result['text'] for result in search_results[:top_k_for_context]]
+    context = "\n---\n".join(context_chunks)
+    top_score = search_results[0]['relevance_score']
+    logger.info(f"[INFO] Using top {len(context_chunks)} contexts (top score: {top_score:.4f})")
+    prompt = f"""You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies. Only use the context provided to answer the question. Be precise.
+### Relevant Context:
+{context}
+### Question: {question}
+### Answer:"""
     answer = "Sorry, I couldn't process your request right now. Please try again later."
     try:

app/policy_vector_db.py CHANGED Viewed

@@ -14,8 +14,9 @@ class PolicyVectorDB:
         self.persist_directory = persist_directory
         self.client = chromadb.PersistentClient(path=persist_directory, settings=Settings(allow_reset=True))
         self.collection_name = "neepco_dop_policies"
         self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cpu')
-        self.collection = None
         self.top_k_default = top_k_default
         self.relevance_threshold = relevance_threshold
@@ -36,29 +37,26 @@ class PolicyVectorDB:
             logger.info("No chunks provided to add.")
             return
-        # Ensure all IDs are strings before checking for existence
-        new_chunks = [chunk for chunk in chunks if chunk.get('id')]
-        existing_ids = set(collection.get(ids=[str(c['id']) for c in new_chunks])['ids'])
-        new_chunks = [chunk for chunk in new_chunks if str(chunk.get('id')) not in existing_ids]
         if not new_chunks:
-            logger.info("No new chunks to add to the database.")
             return
         logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")
         batch_size = 64
         for i in range(0, len(new_chunks), batch_size):
             batch = new_chunks[i:i + batch_size]
-            texts = [chunk['text'] for chunk in batch]
             ids = [str(chunk['id']) for chunk in batch]
-            metadatas = []
-            for chunk in batch:
-                meta = chunk.get('metadata')
-                if not meta:
-                    meta = {"description": "General information chunk."}
-                metadatas.append(self._flatten_metadata(meta))
             embeddings = self.embedding_model.encode(texts, show_progress_bar=False).tolist()
             collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)
@@ -68,45 +66,58 @@ class PolicyVectorDB:
     def search(self, query_text: str, top_k: int = None) -> List[Dict]:
         collection = self._get_collection()
         query_embedding = self.embedding_model.encode([query_text]).tolist()
-        top_k = top_k if top_k else self.top_k_default
         results = collection.query(
             query_embeddings=query_embedding,
-            n_results=top_k,
             include=["documents", "metadatas", "distances"]
         )
         search_results = []
-        if results and results['documents'] and results['documents'][0]:
             for i, doc in enumerate(results['documents'][0]):
-                relevance_score = 1 - results['distances'][0][i]
-                search_results.append({
-                    'text': doc,
-                    'metadata': results['metadatas'][0][i],
-                    'relevance_score': relevance_score
-                })
-        return search_results
 def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str):
     try:
-        if db_instance._get_collection().count() == 0:
-            logger.info("Vector database is empty. Attempting to populate from chunks file.")
-            if not os.path.exists(chunks_file_path):
-                logger.error(f"Chunks file not found at {chunks_file_path}. Cannot populate DB.")
-                return False
-            with open(chunks_file_path, 'r', encoding='utf-8') as f:
-                chunks_to_add = json.load(f)
-            if not chunks_to_add:
-                logger.warning(f"Chunks file at {chunks_file_path} is empty. No data to add to DB.")
-                return False
-            db_instance.add_chunks(chunks_to_add)
-            logger.info("Vector database population attempt complete.")
-            return True
-        else:
             logger.info("Vector database already contains data. Skipping population.")
             return True
     except Exception as e:
         logger.error(f"DB Population Error: {e}", exc_info=True)
         return False

         self.persist_directory = persist_directory
         self.client = chromadb.PersistentClient(path=persist_directory, settings=Settings(allow_reset=True))
         self.collection_name = "neepco_dop_policies"
+        # ✅ Use 'cuda' if a GPU is available for better performance
         self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cpu')
+        self.collection = self._get_collection()
         self.top_k_default = top_k_default
         self.relevance_threshold = relevance_threshold
             logger.info("No chunks provided to add.")
             return
+        chunks_with_ids = [c for c in chunks if c.get('id')]
+        if len(chunks) != len(chunks_with_ids):
+            logger.warning(f"Skipped {len(chunks) - len(chunks_with_ids)} chunks that were missing an 'id'.")
+        if not chunks_with_ids:
+            return
+        existing_ids = set(collection.get(ids=[str(c['id']) for c in chunks_with_ids])['ids'])
+        new_chunks = [chunk for chunk in chunks_with_ids if str(chunk.get('id')) not in existing_ids]
         if not new_chunks:
+            logger.info("All provided chunks already exist in the database.")
             return
         logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")
         batch_size = 64
         for i in range(0, len(new_chunks), batch_size):
             batch = new_chunks[i:i + batch_size]
             ids = [str(chunk['id']) for chunk in batch]
+            texts = [chunk['text'] for chunk in batch]
+            metadatas = [self._flatten_metadata(chunk.get('metadata', {})) for chunk in batch]
             embeddings = self.embedding_model.encode(texts, show_progress_bar=False).tolist()
             collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)
     def search(self, query_text: str, top_k: int = None) -> List[Dict]:
         """Return the stored chunks most relevant to ``query_text``, best first.

         Args:
             query_text: Query string; it is embedded with ``self.embedding_model``.
             top_k: Maximum number of results to return. ``None`` falls back to
                 ``self.top_k_default``.

         Returns:
             At most ``top_k`` dicts with the keys ``text``, ``metadata`` and
             ``relevance_score``, sorted by descending score. Only hits scoring
             at least ``self.relevance_threshold`` are kept. An empty list is
             returned when nothing qualifies or when ``top_k`` is not positive.
         """
         k = top_k if top_k is not None else self.top_k_default
         # A non-positive k would be forwarded as n_results <= 0, which the
         # collection query rejects; answer "no results" without querying.
         if k <= 0:
             return []

         collection = self._get_collection()
         query_embedding = self.embedding_model.encode([query_text]).tolist()
         # Over-fetch (2x) so that k results can still be returned after the
         # threshold filter below discards weak matches.
         results = collection.query(
             query_embeddings=query_embedding,
             n_results=k * 2,
             include=["documents", "metadatas", "distances"]
         )
         if not results or not results.get('documents') or not results['documents'][0]:
             return []

         # NOTE(review): "1 - distance" is only a meaningful similarity if the
         # collection uses cosine distance - confirm against _get_collection().
         search_results = []
         hits = zip(results['documents'][0], results['metadatas'][0], results['distances'][0])
         for doc, metadata, distance in hits:
             relevance_score = 1 - distance
             if relevance_score >= self.relevance_threshold:
                 search_results.append({
                     'text': doc,
                     'metadata': metadata,
                     'relevance_score': relevance_score
                 })
         # Trim to k only after filtering so weak hits never crowd out good ones.
         return sorted(search_results, key=lambda x: x['relevance_score'], reverse=True)[:k]
 def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str) -> bool:
     """Populate the vector DB from a JSONL chunks file if it is currently empty.

     Args:
         db_instance: Database wrapper whose collection is checked and filled.
         chunks_file_path: Path to a JSON Lines file, one chunk object per line.

     Returns:
         True if the collection already held data or chunks were handed to
         ``add_chunks``. False if the file is missing, yields no usable chunk,
         or any exception is raised (the exception is logged, not propagated).
     """
     try:
         if db_instance._get_collection().count() > 0:
             logger.info("Vector database already contains data. Skipping population.")
             return True
         logger.info("Vector database is empty. Attempting to populate from chunks file.")
         if not os.path.exists(chunks_file_path):
             logger.error(f"Chunks file not found at {chunks_file_path}. Cannot populate DB.")
             return False
         chunks_to_add = []
         with open(chunks_file_path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = line.strip()
                 # Blank lines (e.g. a trailing newline) are not errors; skip quietly.
                 if not line:
                     continue
                 try:
                     record = json.loads(line)
                 except json.JSONDecodeError:
                     logger.warning(f"Skipping malformed line in chunks file: {line}")
                     continue
                 # add_chunks calls .get() on every chunk, so one non-object line
                 # would otherwise abort the whole population via the except below.
                 if not isinstance(record, dict):
                     logger.warning(f"Skipping non-object line in chunks file: {line}")
                     continue
                 chunks_to_add.append(record)
         if not chunks_to_add:
             logger.warning(f"Chunks file at {chunks_file_path} is empty or invalid. No data to add.")
             return False
         db_instance.add_chunks(chunks_to_add)
         logger.info("Vector database population attempt complete.")
         return True
     except Exception as e:
         # Startup boundary: report the failure to the caller instead of crashing.
         logger.error(f"DB Population Error: {e}", exc_info=True)
         return False

create_granular_chunks.py CHANGED Viewed

@@ -3,13 +3,13 @@ import re
 from typing import List, Dict, Any
 # Define the input and output filenames
-INPUT_FILE = "combined_context.jsonl" # Or your Kaggle path
-OUTPUT_FILE = "granular_chunks.jsonl"
 # Global counter to ensure all generated IDs are unique
 chunk_counter = 0
-def get_unique_id():
     """Returns a unique, incrementing ID."""
     global chunk_counter
     chunk_counter += 1
@@ -36,18 +36,26 @@ def parse_value_to_int(value_str: str) -> int:
         return 0
 def create_chunk(context: Dict, text_override: str = None, id_override: str = None) -> Dict:
-    """Helper function to create a standardized chunk."""
     chunk_id = id_override if id_override else f"chunk-{get_unique_id()}"
-    text = text_override if text_override else context.get("description", context.get("title", str(context)))
     metadata = {
         "section": context.get("section"),
         "clause": context.get("clause"),
         "title": context.get("title"),
         "description": context.get("description"),
         "authority": context.get("authority"),
         "limit_text": context.get("limit_text"),
         "limit_inr": parse_value_to_int(str(context.get("limit_text", ""))),
     }
     return {
@@ -56,50 +64,103 @@ def create_chunk(context: Dict, text_override: str = None, id_override: str = No
         "metadata": {k: v for k, v in metadata.items() if v is not None}
     }
 def process_chunk(data: Dict, context: Dict) -> List[Dict]:
     """
-    Processes a dictionary from the source file and deconstructs it if possible.
     """
     new_chunks = []
-    # Update context with current data
     current_context = context.copy()
     current_context.update(data)
-    # Case 1: Handle "delegation" structure
     if "delegation" in data and isinstance(data["delegation"], dict):
         for authority, limit_text in data["delegation"].items():
-            text = f"Regarding '{current_context.get('description', current_context.get('title'))}', the power for {authority} is {limit_text}."
             chunk_context = current_context.copy()
             chunk_context["authority"] = authority
-            chunk_context["limit_text"] = limit_text
             new_chunks.append(create_chunk(chunk_context, text_override=text))
         return new_chunks
-    # Case 2: Handle "authority" and "extent_of_power" structure (often in Section II)
     if "authority" in data and "extent_of_power" in data:
-        # This structure is complex, we will create a single, descriptive chunk
-        text = f"Regarding '{current_context.get('title')}', the authority and extent of power are as follows: {json.dumps(data)}."
-        new_chunks.append(create_chunk(current_context, text_override=text))
-        return new_chunks
-    # Recursive step: process nested lists
-    has_nested_chunks = False
     for key, value in data.items():
-        if isinstance(value, list):
-            for item in value:
-                if isinstance(item, dict):
-                    # Recurse and add the results
                     nested_results = process_chunk(item, current_context)
                     if nested_results:
                         new_chunks.extend(nested_results)
                         has_nested_chunks = True
-    # If we processed children, we don't need to keep the parent chunk
     if has_nested_chunks:
         return new_chunks
-    # Base case: If no specific rules were deconstructed, create a single chunk for the item
     new_chunks.append(create_chunk(current_context))
     return new_chunks
@@ -128,7 +189,7 @@ def main():
         for chunk in final_chunks:
             f.write(json.dumps(chunk) + '\n')
-    print(f"Successfully created granular chunks file: '{OUTPUT_FILE}'")
 if __name__ == "__main__":
     main()

 from typing import List, Dict, Any
 # Define the input and output filenames
+INPUT_FILE = "combined_context.jsonl"
+OUTPUT_FILE = "granular_chunks_improved.jsonl"
 # Global counter to ensure all generated IDs are unique
 chunk_counter = 0
+def get_unique_id() -> int:
     """Returns a unique, incrementing ID."""
     global chunk_counter
     chunk_counter += 1
         return 0
 def create_chunk(context: Dict, text_override: str = None, id_override: str = None) -> Dict:
+    """Helper function to create a standardized chunk with rich metadata."""
     chunk_id = id_override if id_override else f"chunk-{get_unique_id()}"
+    # Determine the primary text for the chunk
+    text = text_override
+    if not text:
+        # Create a sensible default text if none is provided
+        text_parts = [context.get("title"), context.get("description")]
+        text = ". ".join(filter(None, text_parts)) or str(context)
     metadata = {
         "section": context.get("section"),
         "clause": context.get("clause"),
+        "subclause_id": context.get("id"),
         "title": context.get("title"),
         "description": context.get("description"),
         "authority": context.get("authority"),
         "limit_text": context.get("limit_text"),
         "limit_inr": parse_value_to_int(str(context.get("limit_text", ""))),
+        "source": context.get("source"),
     }
     return {
         "metadata": {k: v for k, v in metadata.items() if v is not None}
     }
def _describe_item(item: Any) -> Any:
    """Return the description carried by one authority/power list entry."""
    if isinstance(item, dict):
        # The first value is taken as the description; an empty dict has
        # none, so fall back to "" instead of raising StopIteration.
        return next(iter(item.values()), "")
    return str(item)


def _process_authority_power(data: Dict, context: Dict) -> List[Dict]:
    """Build chunks from a record holding "authority" and "extent_of_power".

    The two keys vary in type across the source data, so three shapes are
    handled:

    * both are strings: one chunk pairing them;
    * both are lists: one chunk per positional pair (the lists are assumed
      to correspond index by index; surplus items in the longer list are
      ignored);
    * anything else: a single chunk embedding the raw record as JSON.

    Args:
        data: The record being deconstructed.
        context: Inherited context used for the chunk metadata and title.

    Returns:
        The list of chunks created for this record.
    """
    chunks = []
    # "or" rather than a .get() default, so a present-but-null title also
    # falls back to the placeholder instead of rendering as 'None'.
    title = context.get("title") or "this rule"
    authority = data.get("authority")
    power = data.get("extent_of_power")

    if isinstance(authority, str) and isinstance(power, str):
        text = f"Regarding '{title}', the approving authority is {authority} with '{power}'."
        chunk_context = context.copy()
        chunk_context["authority"] = authority
        chunk_context["limit_text"] = power
        chunks.append(create_chunk(chunk_context, text_override=text))
    elif isinstance(authority, list) and isinstance(power, list):
        for auth_item, power_item in zip(authority, power):
            auth_desc = _describe_item(auth_item)
            power_desc = _describe_item(power_item)
            text = f"For '{title}', the authority for '{auth_desc}' is given '{power_desc}'."
            chunk_context = context.copy()
            chunk_context["authority"] = auth_desc
            chunk_context["limit_text"] = power_desc
            chunks.append(create_chunk(chunk_context, text_override=text))
    else:
        # Unknown structure: keep the information by serialising it verbatim.
        text = f"Regarding '{title}', the authority and power details are as follows: {json.dumps(data)}."
        chunks.append(create_chunk(context, text_override=text))
    return chunks
 def process_chunk(data: Dict, context: Dict) -> List[Dict]:
     """Deconstruct one source record into granular chunks.

     The record is merged over the inherited context (the record's keys win),
     then the first matching rule is applied:

     1. a non-empty "delegation" dict: one chunk per authority/limit pair;
     2. "authority" together with "extent_of_power": delegated to
        ``_process_authority_power``;
     3. non-empty list values: lists of dicts are processed recursively and
        lists of strings give one chunk per string (lists mixing other types
        are ignored);
     4. otherwise: a single chunk for the record itself.

     Args:
         data: The record (or nested sub-record) to process.
         context: Context accumulated from the enclosing records.

     Returns:
         The chunks produced for this record; the generic parent chunk is
         omitted whenever child chunks were created.
     """
     current_context = context.copy()
     current_context.update(data)

     # Rule 1: an empty delegation dict falls through to the other rules so
     # the record still yields a chunk instead of being dropped.
     delegation = data.get("delegation")
     if isinstance(delegation, dict) and delegation:
         desc = current_context.get('description') or current_context.get('title')
         delegation_chunks = []
         for authority, limit_text in delegation.items():
             text = f"Regarding '{desc}', the delegation for {authority} is '{limit_text}'."
             chunk_context = current_context.copy()
             chunk_context["authority"] = authority
             chunk_context["limit_text"] = str(limit_text)
             delegation_chunks.append(create_chunk(chunk_context, text_override=text))
         return delegation_chunks

     # Rule 2: authority / extent_of_power structures.
     if "authority" in data and "extent_of_power" in data:
         return _process_authority_power(data, current_context)

     # Rule 3: nested lists of dictionaries or of plain strings.
     new_chunks = []
     for value in data.values():
         if not isinstance(value, list) or not value:
             continue
         if all(isinstance(item, dict) for item in value):
             for item in value:
                 new_chunks.extend(process_chunk(item, current_context))
         elif all(isinstance(item, str) for item in value):
             title = current_context.get('title')
             for item_text in value:
                 text = f"Regarding '{title}', a relevant item is: {item_text}."
                 new_chunks.append(create_chunk(current_context, text_override=text))

     # Child chunks supersede the generic parent chunk.
     if new_chunks:
         return new_chunks

     # Rule 4: leaf record that cannot be deconstructed further.
     return [create_chunk(current_context)]
         for chunk in final_chunks:
             f.write(json.dumps(chunk) + '\n')
+    print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")
 if __name__ == "__main__":
     main()

processed_chunks.json DELETED Viewed

The diff for this file is too large to render. See raw diff