Spaces:
Sleeping
Sleeping
updated to granular context chunks
Browse files- Dockerfile +11 -8
- app/app.py +21 -14
- app/policy_vector_db.py +53 -42
- create_granular_chunks.py +85 -24
- processed_chunks.json +0 -0
Dockerfile
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
|
| 3 |
-
# Install required system dependencies
|
| 4 |
RUN apt-get update && apt-get install -y \
|
| 5 |
git curl build-essential cmake \
|
| 6 |
&& rm -rf /var/lib/apt/lists/*
|
|
@@ -8,26 +8,29 @@ RUN apt-get update && apt-get install -y \
|
|
| 8 |
# Set working directory
|
| 9 |
WORKDIR /app
|
| 10 |
|
| 11 |
-
# Create writable directories
|
|
|
|
| 12 |
RUN mkdir -p /app/.cache /app/vector_database && chmod -R 777 /app
|
| 13 |
|
| 14 |
-
# Set environment variables
|
| 15 |
ENV TRANSFORMERS_CACHE=/app/.cache \
|
| 16 |
HF_HOME=/app/.cache \
|
| 17 |
CHROMADB_DISABLE_TELEMETRY=true
|
| 18 |
|
| 19 |
-
#
|
|
|
|
| 20 |
RUN pip install --no-cache-dir llama-cpp-python==0.2.61
|
| 21 |
|
| 22 |
# Install other dependencies from requirements.txt
|
| 23 |
COPY requirements.txt .
|
| 24 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 25 |
|
| 26 |
-
# Copy the application code and data file
|
| 27 |
COPY ./app ./app
|
| 28 |
-
|
|
|
|
| 29 |
|
| 30 |
-
# Download your fine-tuned TinyLlama GGUF model
|
| 31 |
RUN curl -fL -o /app/tinyllama_dop_q4_k_m.gguf \
|
| 32 |
https://huggingface.co/Kalpokoch/FinetunedQuantizedTinyLama/resolve/main/tinyllama_dop_q4_k_m.gguf \
|
| 33 |
&& echo "✅ TinyLlama model downloaded."
|
|
@@ -35,5 +38,5 @@ RUN curl -fL -o /app/tinyllama_dop_q4_k_m.gguf \
|
|
| 35 |
# Expose the application port
|
| 36 |
EXPOSE 7860
|
| 37 |
|
| 38 |
-
# Run the FastAPI application
|
| 39 |
CMD ["uvicorn", "app.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
|
| 3 |
+
# Install required system dependencies needed for llama-cpp-python compilation
|
| 4 |
RUN apt-get update && apt-get install -y \
|
| 5 |
git curl build-essential cmake \
|
| 6 |
&& rm -rf /var/lib/apt/lists/*
|
|
|
|
| 8 |
# Set working directory
|
| 9 |
WORKDIR /app
|
| 10 |
|
| 11 |
+
# Create writable directories for cache and the persistent vector DB
|
| 12 |
+
# Note: For production, consider using a non-root user and more specific permissions
|
| 13 |
RUN mkdir -p /app/.cache /app/vector_database && chmod -R 777 /app
|
| 14 |
|
| 15 |
+
# Set environment variables for huggingface cache and to disable chroma telemetry
|
| 16 |
ENV TRANSFORMERS_CACHE=/app/.cache \
|
| 17 |
HF_HOME=/app/.cache \
|
| 18 |
CHROMADB_DISABLE_TELEMETRY=true
|
| 19 |
|
| 20 |
+
# ✅ RECOMMENDATION: To avoid version conflicts, it's best to remove 'llama-cpp-python'
|
| 21 |
+
# from your requirements.txt and rely on this explicit, version-pinned installation.
|
| 22 |
RUN pip install --no-cache-dir llama-cpp-python==0.2.61
|
| 23 |
|
| 24 |
# Install other dependencies from requirements.txt
|
| 25 |
COPY requirements.txt .
|
| 26 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 27 |
|
| 28 |
+
# Copy the application code and the processed data file
|
| 29 |
COPY ./app ./app
|
| 30 |
+
# ✅ CORRECTED FILENAME: Ensure this matches the output of your chunking script
|
| 31 |
+
COPY ./granular_chunks_improved.jsonl .
|
| 32 |
|
| 33 |
+
# Download your fine-tuned TinyLlama GGUF model from Hugging Face
|
| 34 |
RUN curl -fL -o /app/tinyllama_dop_q4_k_m.gguf \
|
| 35 |
https://huggingface.co/Kalpokoch/FinetunedQuantizedTinyLama/resolve/main/tinyllama_dop_q4_k_m.gguf \
|
| 36 |
&& echo "✅ TinyLlama model downloaded."
|
|
|
|
| 38 |
# Expose the application port
|
| 39 |
EXPOSE 7860
|
| 40 |
|
| 41 |
+
# Run the FastAPI application using uvicorn
|
| 42 |
CMD ["uvicorn", "app.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
app/app.py
CHANGED
|
@@ -5,6 +5,7 @@ import logging
|
|
| 5 |
from fastapi import FastAPI, HTTPException
|
| 6 |
from pydantic import BaseModel
|
| 7 |
from llama_cpp import Llama
|
|
|
|
| 8 |
from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
|
| 9 |
|
| 10 |
# -----------------------------
|
|
@@ -26,13 +27,14 @@ async def root():
|
|
| 26 |
# ✅ Vector DB and Data Configuration
|
| 27 |
# -----------------------------
|
| 28 |
DB_PERSIST_DIRECTORY = "/app/vector_database"
|
| 29 |
-
|
|
|
|
| 30 |
|
| 31 |
logger.info("[INFO] Initializing vector DB...")
|
| 32 |
db = PolicyVectorDB(
|
| 33 |
persist_directory=DB_PERSIST_DIRECTORY,
|
| 34 |
top_k_default=5,
|
| 35 |
-
relevance_threshold=0.2
|
| 36 |
)
|
| 37 |
|
| 38 |
if not ensure_db_populated(db, CHUNKS_FILE_PATH):
|
|
@@ -48,7 +50,7 @@ logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
|
|
| 48 |
|
| 49 |
llm = Llama(
|
| 50 |
model_path=MODEL_PATH,
|
| 51 |
-
n_ctx=2048,
|
| 52 |
n_threads=2,
|
| 53 |
n_batch=8,
|
| 54 |
use_mlock=False,
|
|
@@ -86,14 +88,10 @@ async def chat(query: Query):
|
|
| 86 |
question = query.question.strip()
|
| 87 |
logger.info(f"[QUERY] {question}")
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
[r for r in search_results if r["relevance_score"] > db.relevance_threshold],
|
| 92 |
-
key=lambda x: x["relevance_score"],
|
| 93 |
-
reverse=True
|
| 94 |
-
)
|
| 95 |
|
| 96 |
-
if not
|
| 97 |
logger.info("[RESPONSE] No relevant context found.")
|
| 98 |
return {
|
| 99 |
"question": question,
|
|
@@ -101,10 +99,19 @@ async def chat(query: Query):
|
|
| 101 |
"answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
|
| 102 |
}
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
answer = "Sorry, I couldn't process your request right now. Please try again later."
|
| 110 |
try:
|
|
|
|
| 5 |
from fastapi import FastAPI, HTTPException
|
| 6 |
from pydantic import BaseModel
|
| 7 |
from llama_cpp import Llama
|
| 8 |
+
# Correctly reference the module within the 'app' package
|
| 9 |
from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
|
| 10 |
|
| 11 |
# -----------------------------
|
|
|
|
| 27 |
# ✅ Vector DB and Data Configuration
|
| 28 |
# -----------------------------
|
| 29 |
DB_PERSIST_DIRECTORY = "/app/vector_database"
|
| 30 |
+
# ✅ CORRECTED FILENAME: Match the output of your chunking script
|
| 31 |
+
CHUNKS_FILE_PATH = "/app/granular_chunks_improved.jsonl"
|
| 32 |
|
| 33 |
logger.info("[INFO] Initializing vector DB...")
|
| 34 |
db = PolicyVectorDB(
|
| 35 |
persist_directory=DB_PERSIST_DIRECTORY,
|
| 36 |
top_k_default=5,
|
| 37 |
+
relevance_threshold=0.2 # This threshold is now applied inside the search method
|
| 38 |
)
|
| 39 |
|
| 40 |
if not ensure_db_populated(db, CHUNKS_FILE_PATH):
|
|
|
|
| 50 |
|
| 51 |
llm = Llama(
|
| 52 |
model_path=MODEL_PATH,
|
| 53 |
+
n_ctx=2048,
|
| 54 |
n_threads=2,
|
| 55 |
n_batch=8,
|
| 56 |
use_mlock=False,
|
|
|
|
| 88 |
question = query.question.strip()
|
| 89 |
logger.info(f"[QUERY] {question}")
|
| 90 |
|
| 91 |
+
# The search method now handles filtering internally
|
| 92 |
+
search_results = db.search(question, top_k=5)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
+
if not search_results:
|
| 95 |
logger.info("[RESPONSE] No relevant context found.")
|
| 96 |
return {
|
| 97 |
"question": question,
|
|
|
|
| 99 |
"answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
|
| 100 |
}
|
| 101 |
|
| 102 |
+
# ✅ RECOMMENDED CHANGE: Combine the top 3 contexts for a richer prompt
|
| 103 |
+
top_k_for_context = 3
|
| 104 |
+
context_chunks = [result['text'] for result in search_results[:top_k_for_context]]
|
| 105 |
+
context = "\n---\n".join(context_chunks)
|
| 106 |
+
|
| 107 |
+
top_score = search_results[0]['relevance_score']
|
| 108 |
+
logger.info(f"[INFO] Using top {len(context_chunks)} contexts (top score: {top_score:.4f})")
|
| 109 |
+
|
| 110 |
+
prompt = f"""You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies. Only use the context provided to answer the question. Be precise.
|
| 111 |
+
### Relevant Context:
|
| 112 |
+
{context}
|
| 113 |
+
### Question: {question}
|
| 114 |
+
### Answer:"""
|
| 115 |
|
| 116 |
answer = "Sorry, I couldn't process your request right now. Please try again later."
|
| 117 |
try:
|
app/policy_vector_db.py
CHANGED
|
@@ -14,8 +14,9 @@ class PolicyVectorDB:
|
|
| 14 |
self.persist_directory = persist_directory
|
| 15 |
self.client = chromadb.PersistentClient(path=persist_directory, settings=Settings(allow_reset=True))
|
| 16 |
self.collection_name = "neepco_dop_policies"
|
|
|
|
| 17 |
self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cpu')
|
| 18 |
-
self.collection =
|
| 19 |
self.top_k_default = top_k_default
|
| 20 |
self.relevance_threshold = relevance_threshold
|
| 21 |
|
|
@@ -36,29 +37,26 @@ class PolicyVectorDB:
|
|
| 36 |
logger.info("No chunks provided to add.")
|
| 37 |
return
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
if not new_chunks:
|
| 46 |
-
logger.info("
|
| 47 |
return
|
| 48 |
|
| 49 |
logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")
|
| 50 |
batch_size = 64
|
| 51 |
for i in range(0, len(new_chunks), batch_size):
|
| 52 |
batch = new_chunks[i:i + batch_size]
|
| 53 |
-
texts = [chunk['text'] for chunk in batch]
|
| 54 |
ids = [str(chunk['id']) for chunk in batch]
|
| 55 |
-
|
| 56 |
-
metadatas = []
|
| 57 |
-
for chunk in batch:
|
| 58 |
-
meta = chunk.get('metadata')
|
| 59 |
-
if not meta:
|
| 60 |
-
meta = {"description": "General information chunk."}
|
| 61 |
-
metadatas.append(self._flatten_metadata(meta))
|
| 62 |
|
| 63 |
embeddings = self.embedding_model.encode(texts, show_progress_bar=False).tolist()
|
| 64 |
collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)
|
|
@@ -68,45 +66,58 @@ class PolicyVectorDB:
|
|
| 68 |
def search(self, query_text: str, top_k: int = None) -> List[Dict]:
|
| 69 |
collection = self._get_collection()
|
| 70 |
query_embedding = self.embedding_model.encode([query_text]).tolist()
|
| 71 |
-
|
|
|
|
|
|
|
| 72 |
results = collection.query(
|
| 73 |
query_embeddings=query_embedding,
|
| 74 |
-
n_results=
|
| 75 |
include=["documents", "metadatas", "distances"]
|
| 76 |
)
|
| 77 |
|
| 78 |
search_results = []
|
| 79 |
-
if results and results
|
| 80 |
for i, doc in enumerate(results['documents'][0]):
|
| 81 |
-
relevance_score = 1 - results['distances'][0][i]
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str):
|
| 90 |
try:
|
| 91 |
-
if db_instance._get_collection().count()
|
| 92 |
-
logger.info("Vector database is empty. Attempting to populate from chunks file.")
|
| 93 |
-
if not os.path.exists(chunks_file_path):
|
| 94 |
-
logger.error(f"Chunks file not found at {chunks_file_path}. Cannot populate DB.")
|
| 95 |
-
return False
|
| 96 |
-
|
| 97 |
-
with open(chunks_file_path, 'r', encoding='utf-8') as f:
|
| 98 |
-
chunks_to_add = json.load(f)
|
| 99 |
-
|
| 100 |
-
if not chunks_to_add:
|
| 101 |
-
logger.warning(f"Chunks file at {chunks_file_path} is empty. No data to add to DB.")
|
| 102 |
-
return False
|
| 103 |
-
|
| 104 |
-
db_instance.add_chunks(chunks_to_add)
|
| 105 |
-
logger.info("Vector database population attempt complete.")
|
| 106 |
-
return True
|
| 107 |
-
else:
|
| 108 |
logger.info("Vector database already contains data. Skipping population.")
|
| 109 |
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
except Exception as e:
|
| 111 |
logger.error(f"DB Population Error: {e}", exc_info=True)
|
| 112 |
return False
|
|
|
|
| 14 |
self.persist_directory = persist_directory
|
| 15 |
self.client = chromadb.PersistentClient(path=persist_directory, settings=Settings(allow_reset=True))
|
| 16 |
self.collection_name = "neepco_dop_policies"
|
| 17 |
+
# ✅ Use 'cuda' if a GPU is available for better performance
|
| 18 |
self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cpu')
|
| 19 |
+
self.collection = self._get_collection()
|
| 20 |
self.top_k_default = top_k_default
|
| 21 |
self.relevance_threshold = relevance_threshold
|
| 22 |
|
|
|
|
| 37 |
logger.info("No chunks provided to add.")
|
| 38 |
return
|
| 39 |
|
| 40 |
+
chunks_with_ids = [c for c in chunks if c.get('id')]
|
| 41 |
+
if len(chunks) != len(chunks_with_ids):
|
| 42 |
+
logger.warning(f"Skipped {len(chunks) - len(chunks_with_ids)} chunks that were missing an 'id'.")
|
| 43 |
+
if not chunks_with_ids:
|
| 44 |
+
return
|
| 45 |
+
|
| 46 |
+
existing_ids = set(collection.get(ids=[str(c['id']) for c in chunks_with_ids])['ids'])
|
| 47 |
+
new_chunks = [chunk for chunk in chunks_with_ids if str(chunk.get('id')) not in existing_ids]
|
| 48 |
|
| 49 |
if not new_chunks:
|
| 50 |
+
logger.info("All provided chunks already exist in the database.")
|
| 51 |
return
|
| 52 |
|
| 53 |
logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")
|
| 54 |
batch_size = 64
|
| 55 |
for i in range(0, len(new_chunks), batch_size):
|
| 56 |
batch = new_chunks[i:i + batch_size]
|
|
|
|
| 57 |
ids = [str(chunk['id']) for chunk in batch]
|
| 58 |
+
texts = [chunk['text'] for chunk in batch]
|
| 59 |
+
metadatas = [self._flatten_metadata(chunk.get('metadata', {})) for chunk in batch]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
embeddings = self.embedding_model.encode(texts, show_progress_bar=False).tolist()
|
| 62 |
collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)
|
|
|
|
| 66 |
def search(self, query_text: str, top_k: int = None) -> List[Dict]:
    """Return the stored chunks most relevant to ``query_text``, best first.

    Args:
        query_text: Query string to embed and look up in the collection.
        top_k: Maximum number of results to return. ``None`` means use
            ``self.top_k_default``.

    Returns:
        At most ``top_k`` dicts with the keys ``text``, ``metadata`` and
        ``relevance_score``, ordered by descending score. Hits scoring
        below ``self.relevance_threshold`` are dropped. An empty list is
        returned when the limit is not positive or nothing qualifies.
    """
    k = top_k if top_k is not None else self.top_k_default
    if k <= 0:
        # A non-positive limit can never yield results (and a negative one
        # would make the final slice drop hits from the wrong end), so stop
        # before computing an embedding or querying with n_results <= 0.
        return []

    collection = self._get_collection()
    query_embedding = self.embedding_model.encode([query_text]).tolist()

    # Over-fetch (2 * k) so that k hits can still remain after filtering.
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=k * 2,
        include=["documents", "metadatas", "distances"],
    )
    if not results or not results.get('documents') or not results['documents'][0]:
        return []

    # Exactly one query embedding is sent, so only the first row matters.
    documents = results['documents'][0]
    metadatas = results['metadatas'][0]
    distances = results['distances'][0]

    search_results = []
    for doc, metadata, distance in zip(documents, metadatas, distances):
        # NOTE(review): "1 - distance" presumes a distance metric where
        # smaller means more similar (e.g. cosine) -- confirm against the
        # collection's configuration.
        relevance_score = 1 - distance
        if relevance_score >= self.relevance_threshold:
            search_results.append({
                'text': doc,
                'metadata': metadata,
                'relevance_score': relevance_score,
            })

    # Trim to k only *after* filtering so weak hits do not crowd out good ones.
    return sorted(search_results, key=lambda x: x['relevance_score'], reverse=True)[:k]
|
| 93 |
|
| 94 |
def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str):
    """Populate the vector database from a JSONL chunks file if it is empty.

    Args:
        db_instance: Database wrapper whose collection is checked and filled.
        chunks_file_path: Path to a JSON Lines file, one chunk object per line.

    Returns:
        ``True`` when the database already holds data or a population attempt
        was made; ``False`` when the file is missing, yields no usable chunks,
        or any exception occurs (the exception is logged, not re-raised).
    """
    try:
        if db_instance._get_collection().count() > 0:
            logger.info("Vector database already contains data. Skipping population.")
            return True

        logger.info("Vector database is empty. Attempting to populate from chunks file.")
        if not os.path.exists(chunks_file_path):
            logger.error(f"Chunks file not found at {chunks_file_path}. Cannot populate DB.")
            return False

        chunks_to_add = []
        with open(chunks_file_path, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    # Blank separator/trailing lines are not data and should
                    # not be reported as malformed.
                    continue
                try:
                    record = json.loads(stripped)
                except json.JSONDecodeError:
                    logger.warning(f"Skipping malformed line in chunks file: {stripped}")
                    continue
                if not isinstance(record, dict):
                    # Valid JSON that is not an object (list, string, number)
                    # cannot be a chunk; dropping it here keeps one bad line
                    # from aborting the whole population further down.
                    logger.warning(f"Skipping non-object line in chunks file: {stripped}")
                    continue
                chunks_to_add.append(record)

        if not chunks_to_add:
            logger.warning(f"Chunks file at {chunks_file_path} is empty or invalid. No data to add.")
            return False

        db_instance.add_chunks(chunks_to_add)
        logger.info("Vector database population attempt complete.")
        return True
    except Exception as e:
        # Top-level boundary: callers only need a success flag, so log the
        # full traceback and report failure instead of propagating.
        logger.error(f"DB Population Error: {e}", exc_info=True)
        return False
|
create_granular_chunks.py
CHANGED
|
@@ -3,13 +3,13 @@ import re
|
|
| 3 |
from typing import List, Dict, Any
|
| 4 |
|
| 5 |
# Define the input and output filenames
|
| 6 |
-
INPUT_FILE = "combined_context.jsonl"
|
| 7 |
-
OUTPUT_FILE = "
|
| 8 |
|
| 9 |
# Global counter to ensure all generated IDs are unique
|
| 10 |
chunk_counter = 0
|
| 11 |
|
| 12 |
-
def get_unique_id():
|
| 13 |
"""Returns a unique, incrementing ID."""
|
| 14 |
global chunk_counter
|
| 15 |
chunk_counter += 1
|
|
@@ -36,18 +36,26 @@ def parse_value_to_int(value_str: str) -> int:
|
|
| 36 |
return 0
|
| 37 |
|
| 38 |
def create_chunk(context: Dict, text_override: str = None, id_override: str = None) -> Dict:
|
| 39 |
-
"""Helper function to create a standardized chunk."""
|
| 40 |
chunk_id = id_override if id_override else f"chunk-{get_unique_id()}"
|
| 41 |
-
text = text_override if text_override else context.get("description", context.get("title", str(context)))
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
metadata = {
|
| 44 |
"section": context.get("section"),
|
| 45 |
"clause": context.get("clause"),
|
|
|
|
| 46 |
"title": context.get("title"),
|
| 47 |
"description": context.get("description"),
|
| 48 |
"authority": context.get("authority"),
|
| 49 |
"limit_text": context.get("limit_text"),
|
| 50 |
"limit_inr": parse_value_to_int(str(context.get("limit_text", ""))),
|
|
|
|
| 51 |
}
|
| 52 |
|
| 53 |
return {
|
|
@@ -56,50 +64,103 @@ def create_chunk(context: Dict, text_override: str = None, id_override: str = No
|
|
| 56 |
"metadata": {k: v for k, v in metadata.items() if v is not None}
|
| 57 |
}
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
def process_chunk(data: Dict, context: Dict) -> List[Dict]:
|
| 60 |
"""
|
| 61 |
-
Processes a dictionary from the source file and deconstructs it
|
| 62 |
"""
|
| 63 |
new_chunks = []
|
| 64 |
|
| 65 |
-
# Update context with current data
|
| 66 |
current_context = context.copy()
|
| 67 |
current_context.update(data)
|
| 68 |
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
if "delegation" in data and isinstance(data["delegation"], dict):
|
| 71 |
for authority, limit_text in data["delegation"].items():
|
| 72 |
-
|
|
|
|
| 73 |
chunk_context = current_context.copy()
|
| 74 |
chunk_context["authority"] = authority
|
| 75 |
-
chunk_context["limit_text"] = limit_text
|
| 76 |
new_chunks.append(create_chunk(chunk_context, text_override=text))
|
| 77 |
return new_chunks
|
| 78 |
|
| 79 |
-
#
|
| 80 |
if "authority" in data and "extent_of_power" in data:
|
| 81 |
-
|
| 82 |
-
text = f"Regarding '{current_context.get('title')}', the authority and extent of power are as follows: {json.dumps(data)}."
|
| 83 |
-
new_chunks.append(create_chunk(current_context, text_override=text))
|
| 84 |
-
return new_chunks
|
| 85 |
|
| 86 |
-
#
|
| 87 |
-
has_nested_chunks = False
|
| 88 |
for key, value in data.items():
|
| 89 |
-
if isinstance(value, list):
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
nested_results = process_chunk(item, current_context)
|
| 94 |
if nested_results:
|
| 95 |
new_chunks.extend(nested_results)
|
| 96 |
has_nested_chunks = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
-
# If we
|
| 99 |
if has_nested_chunks:
|
| 100 |
return new_chunks
|
| 101 |
|
| 102 |
-
# Base case: If no specific rules were
|
|
|
|
| 103 |
new_chunks.append(create_chunk(current_context))
|
| 104 |
return new_chunks
|
| 105 |
|
|
@@ -128,7 +189,7 @@ def main():
|
|
| 128 |
for chunk in final_chunks:
|
| 129 |
f.write(json.dumps(chunk) + '\n')
|
| 130 |
|
| 131 |
-
print(f"Successfully created granular chunks file: '{OUTPUT_FILE}'")
|
| 132 |
|
| 133 |
if __name__ == "__main__":
|
| 134 |
main()
|
|
|
|
| 3 |
from typing import List, Dict, Any
|
| 4 |
|
| 5 |
# Define the input and output filenames
|
| 6 |
+
INPUT_FILE = "combined_context.jsonl"
|
| 7 |
+
OUTPUT_FILE = "granular_chunks_improved.jsonl"
|
| 8 |
|
| 9 |
# Global counter to ensure all generated IDs are unique
|
| 10 |
chunk_counter = 0
|
| 11 |
|
| 12 |
+
def get_unique_id() -> int:
|
| 13 |
"""Returns a unique, incrementing ID."""
|
| 14 |
global chunk_counter
|
| 15 |
chunk_counter += 1
|
|
|
|
| 36 |
return 0
|
| 37 |
|
| 38 |
def create_chunk(context: Dict, text_override: str = None, id_override: str = None) -> Dict:
|
| 39 |
+
"""Helper function to create a standardized chunk with rich metadata."""
|
| 40 |
chunk_id = id_override if id_override else f"chunk-{get_unique_id()}"
|
|
|
|
| 41 |
|
| 42 |
+
# Determine the primary text for the chunk
|
| 43 |
+
text = text_override
|
| 44 |
+
if not text:
|
| 45 |
+
# Create a sensible default text if none is provided
|
| 46 |
+
text_parts = [context.get("title"), context.get("description")]
|
| 47 |
+
text = ". ".join(filter(None, text_parts)) or str(context)
|
| 48 |
+
|
| 49 |
metadata = {
|
| 50 |
"section": context.get("section"),
|
| 51 |
"clause": context.get("clause"),
|
| 52 |
+
"subclause_id": context.get("id"),
|
| 53 |
"title": context.get("title"),
|
| 54 |
"description": context.get("description"),
|
| 55 |
"authority": context.get("authority"),
|
| 56 |
"limit_text": context.get("limit_text"),
|
| 57 |
"limit_inr": parse_value_to_int(str(context.get("limit_text", ""))),
|
| 58 |
+
"source": context.get("source"),
|
| 59 |
}
|
| 60 |
|
| 61 |
return {
|
|
|
|
| 64 |
"metadata": {k: v for k, v in metadata.items() if v is not None}
|
| 65 |
}
|
| 66 |
|
| 67 |
+
def _entry_description(item: Any) -> Any:
    """Return the first value of a non-empty dict entry, else ``str(item)``."""
    if isinstance(item, dict) and item:
        return next(iter(item.values()))
    # Non-dict entries and empty dicts have no value to extract; fall back to
    # their string form instead of raising StopIteration on an empty dict.
    return str(item)


def _process_authority_power(data: Dict, context: Dict) -> List[Dict]:
    """Build chunks from a record's "authority" / "extent_of_power" fields.

    The two fields come in several shapes, each handled separately:

    * both plain strings -> a single chunk;
    * both lists -> one chunk per positional pair (surplus entries of the
      longer list are ignored, as the lists are assumed to correspond);
    * anything else -> a single chunk embedding the record as JSON.

    Args:
        data: The record holding the "authority" and "extent_of_power" keys.
        context: Inherited context that is copied into every produced chunk.

    Returns:
        The list of chunks created for this record.
    """
    chunks = []
    # "or" also covers a title that is present but null/empty.
    title = context.get("title") or "this rule"
    authority = data.get("authority")
    power = data.get("extent_of_power")

    # Case 1: authority and power are simple strings.
    if isinstance(authority, str) and isinstance(power, str):
        text = f"Regarding '{title}', the approving authority is {authority} with '{power}'."
        chunk_context = context.copy()
        chunk_context["authority"] = authority
        chunk_context["limit_text"] = power
        chunks.append(create_chunk(chunk_context, text_override=text))

    # Case 2: authority and power are parallel lists (usually of dicts).
    elif isinstance(authority, list) and isinstance(power, list):
        for auth_item, power_item in zip(authority, power):
            auth_desc = _entry_description(auth_item)
            power_desc = _entry_description(power_item)

            text = f"For '{title}', the authority for '{auth_desc}' is given '{power_desc}'."
            chunk_context = context.copy()
            chunk_context["authority"] = auth_desc
            chunk_context["limit_text"] = power_desc
            chunks.append(create_chunk(chunk_context, text_override=text))

    # Fallback for any other structure: keep the raw record so nothing is lost.
    else:
        text = f"Regarding '{title}', the authority and power details are as follows: {json.dumps(data)}."
        chunks.append(create_chunk(context, text_override=text))

    return chunks
|
| 107 |
+
|
| 108 |
def process_chunk(data: Dict, context: Dict) -> List[Dict]:
    """Deconstruct one source record into granular chunks.

    Rules are tried in order of specificity:

    1. a non-empty "delegation" mapping -> one chunk per authority/limit pair;
    2. "authority" plus "extent_of_power" -> delegated to
       ``_process_authority_power``;
    3. non-empty list values -> recurse into lists of dicts, or emit one chunk
       per entry for lists of strings (mixed lists are left alone);
    4. otherwise -> a single chunk for the record itself.

    Args:
        data: The record (or nested sub-record) to process.
        context: Fields inherited from enclosing records.

    Returns:
        The chunks produced for this record and everything nested in it.
    """
    new_chunks = []

    # Children inherit the parent's fields; the record's own keys win.
    current_context = context.copy()
    current_context.update(data)

    # Rule 1: "delegation" structure (most specific). An empty mapping is
    # skipped so the record still falls through to a later rule instead of
    # silently producing no chunk at all.
    delegation = data.get("delegation")
    if isinstance(delegation, dict) and delegation:
        # Same for every pair, so look it up once.
        desc = current_context.get('description') or current_context.get('title')
        for authority, limit_text in delegation.items():
            text = f"Regarding '{desc}', the delegation for {authority} is '{limit_text}'."
            chunk_context = current_context.copy()
            chunk_context["authority"] = authority
            chunk_context["limit_text"] = str(limit_text)
            new_chunks.append(create_chunk(chunk_context, text_override=text))
        return new_chunks

    # Rule 2: "authority" and "extent_of_power" structures.
    if "authority" in data and "extent_of_power" in data:
        return _process_authority_power(data, current_context)

    # Rule 3: nested lists of dictionaries or of strings.
    has_nested_chunks = False
    for value in data.values():
        if not isinstance(value, list) or not value:
            continue

        # Sub-rule 3a: list of dictionaries (e.g. subclauses, items).
        if all(isinstance(item, dict) for item in value):
            for item in value:
                nested_results = process_chunk(item, current_context)
                if nested_results:
                    new_chunks.extend(nested_results)
                    has_nested_chunks = True

        # Sub-rule 3b: list of simple strings (e.g. items in an annexure).
        elif all(isinstance(item, str) for item in value):
            title = current_context.get('title')
            for item_text in value:
                text = f"Regarding '{title}', a relevant item is: {item_text}."
                new_chunks.append(create_chunk(current_context, text_override=text))
                has_nested_chunks = True

    # Specific chunks from children make the generic parent chunk redundant.
    if has_nested_chunks:
        return new_chunks

    # Base case: a "leaf" record that cannot be deconstructed further.
    new_chunks.append(create_chunk(current_context))
    return new_chunks
|
| 166 |
|
|
|
|
| 189 |
for chunk in final_chunks:
|
| 190 |
f.write(json.dumps(chunk) + '\n')
|
| 191 |
|
| 192 |
+
print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")
|
| 193 |
|
| 194 |
if __name__ == "__main__":
|
| 195 |
main()
|
processed_chunks.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|