Kalpokoch committed on
Commit
9df2551
·
1 Parent(s): a47545a

updated to granular context chunks

Browse files
Dockerfile CHANGED
@@ -1,6 +1,6 @@
1
  FROM python:3.11-slim
2
 
3
- # Install required system dependencies
4
  RUN apt-get update && apt-get install -y \
5
  git curl build-essential cmake \
6
  && rm -rf /var/lib/apt/lists/*
@@ -8,26 +8,29 @@ RUN apt-get update && apt-get install -y \
8
  # Set working directory
9
  WORKDIR /app
10
 
11
- # Create writable directories
 
12
  RUN mkdir -p /app/.cache /app/vector_database && chmod -R 777 /app
13
 
14
- # Set environment variables
15
  ENV TRANSFORMERS_CACHE=/app/.cache \
16
  HF_HOME=/app/.cache \
17
  CHROMADB_DISABLE_TELEMETRY=true
18
 
19
- # Pre-install the specific, known-working version of llama-cpp-python for TinyLlama
 
20
  RUN pip install --no-cache-dir llama-cpp-python==0.2.61
21
 
22
  # Install other dependencies from requirements.txt
23
  COPY requirements.txt .
24
  RUN pip install --no-cache-dir -r requirements.txt
25
 
26
- # Copy the application code and data file
27
  COPY ./app ./app
28
- COPY ./processed_chunks.json .
 
29
 
30
- # Download your fine-tuned TinyLlama GGUF model
31
  RUN curl -fL -o /app/tinyllama_dop_q4_k_m.gguf \
32
  https://huggingface.co/Kalpokoch/FinetunedQuantizedTinyLama/resolve/main/tinyllama_dop_q4_k_m.gguf \
33
  && echo "✅ TinyLlama model downloaded."
@@ -35,5 +38,5 @@ RUN curl -fL -o /app/tinyllama_dop_q4_k_m.gguf \
35
  # Expose the application port
36
  EXPOSE 7860
37
 
38
- # Run the FastAPI application
39
  CMD ["uvicorn", "app.app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  FROM python:3.11-slim
2
 
3
+ # Install required system dependencies needed for llama-cpp-python compilation
4
  RUN apt-get update && apt-get install -y \
5
  git curl build-essential cmake \
6
  && rm -rf /var/lib/apt/lists/*
 
8
  # Set working directory
9
  WORKDIR /app
10
 
11
+ # Create writable directories for cache and the persistent vector DB
12
+ # Note: For production, consider using a non-root user and more specific permissions
13
  RUN mkdir -p /app/.cache /app/vector_database && chmod -R 777 /app
14
 
15
+ # Set environment variables for huggingface cache and to disable chroma telemetry
16
  ENV TRANSFORMERS_CACHE=/app/.cache \
17
  HF_HOME=/app/.cache \
18
  CHROMADB_DISABLE_TELEMETRY=true
19
 
20
+ # RECOMMENDATION: To avoid version conflicts, it's best to remove 'llama-cpp-python'
21
+ # from your requirements.txt and rely on this explicit, version-pinned installation.
22
  RUN pip install --no-cache-dir llama-cpp-python==0.2.61
23
 
24
  # Install other dependencies from requirements.txt
25
  COPY requirements.txt .
26
  RUN pip install --no-cache-dir -r requirements.txt
27
 
28
+ # Copy the application code and the processed data file
29
  COPY ./app ./app
30
+ # CORRECTED FILENAME: must match OUTPUT_FILE in create_granular_chunks.py (granular_chunks_improved.jsonl)
31
+ COPY ./granular_chunks_improved.jsonl .
32
 
33
+ # Download your fine-tuned TinyLlama GGUF model from Hugging Face
34
  RUN curl -fL -o /app/tinyllama_dop_q4_k_m.gguf \
35
  https://huggingface.co/Kalpokoch/FinetunedQuantizedTinyLama/resolve/main/tinyllama_dop_q4_k_m.gguf \
36
  && echo "✅ TinyLlama model downloaded."
 
38
  # Expose the application port
39
  EXPOSE 7860
40
 
41
+ # Run the FastAPI application using uvicorn
42
  CMD ["uvicorn", "app.app:app", "--host", "0.0.0.0", "--port", "7860"]
app/app.py CHANGED
@@ -5,6 +5,7 @@ import logging
5
  from fastapi import FastAPI, HTTPException
6
  from pydantic import BaseModel
7
  from llama_cpp import Llama
 
8
  from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
9
 
10
  # -----------------------------
@@ -26,13 +27,14 @@ async def root():
26
  # ✅ Vector DB and Data Configuration
27
  # -----------------------------
28
  DB_PERSIST_DIRECTORY = "/app/vector_database"
29
- CHUNKS_FILE_PATH = "/app/processed_chunks.json"
 
30
 
31
  logger.info("[INFO] Initializing vector DB...")
32
  db = PolicyVectorDB(
33
  persist_directory=DB_PERSIST_DIRECTORY,
34
  top_k_default=5,
35
- relevance_threshold=0.2
36
  )
37
 
38
  if not ensure_db_populated(db, CHUNKS_FILE_PATH):
@@ -48,7 +50,7 @@ logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
48
 
49
  llm = Llama(
50
  model_path=MODEL_PATH,
51
- n_ctx=2048, # Increased context window to prevent errors
52
  n_threads=2,
53
  n_batch=8,
54
  use_mlock=False,
@@ -86,14 +88,10 @@ async def chat(query: Query):
86
  question = query.question.strip()
87
  logger.info(f"[QUERY] {question}")
88
 
89
- search_results = db.search(question)
90
- filtered = sorted(
91
- [r for r in search_results if r["relevance_score"] > db.relevance_threshold],
92
- key=lambda x: x["relevance_score"],
93
- reverse=True
94
- )
95
 
96
- if not filtered:
97
  logger.info("[RESPONSE] No relevant context found.")
98
  return {
99
  "question": question,
@@ -101,10 +99,19 @@ async def chat(query: Query):
101
  "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
102
  }
103
 
104
- context = filtered[0]["text"]
105
- logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f})")
106
-
107
- prompt = f"""You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies.Only use the context provided. Be precise.### Relevant Context:{context}### Question:{question}### Answer:"""
 
 
 
 
 
 
 
 
 
108
 
109
  answer = "Sorry, I couldn't process your request right now. Please try again later."
110
  try:
 
5
  from fastapi import FastAPI, HTTPException
6
  from pydantic import BaseModel
7
  from llama_cpp import Llama
8
+ # Correctly reference the module within the 'app' package
9
  from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
10
 
11
  # -----------------------------
 
27
  # ✅ Vector DB and Data Configuration
28
  # -----------------------------
29
  DB_PERSIST_DIRECTORY = "/app/vector_database"
30
+ # CORRECTED FILENAME: must match OUTPUT_FILE in create_granular_chunks.py and the file copied in the Dockerfile
31
+ CHUNKS_FILE_PATH = "/app/granular_chunks_improved.jsonl"
32
 
33
  logger.info("[INFO] Initializing vector DB...")
34
  db = PolicyVectorDB(
35
  persist_directory=DB_PERSIST_DIRECTORY,
36
  top_k_default=5,
37
+ relevance_threshold=0.2 # This threshold is now applied inside the search method
38
  )
39
 
40
  if not ensure_db_populated(db, CHUNKS_FILE_PATH):
 
50
 
51
  llm = Llama(
52
  model_path=MODEL_PATH,
53
+ n_ctx=2048,
54
  n_threads=2,
55
  n_batch=8,
56
  use_mlock=False,
 
88
  question = query.question.strip()
89
  logger.info(f"[QUERY] {question}")
90
 
91
+ # The search method now handles filtering internally
92
+ search_results = db.search(question, top_k=5)
 
 
 
 
93
 
94
+ if not search_results:
95
  logger.info("[RESPONSE] No relevant context found.")
96
  return {
97
  "question": question,
 
99
  "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
100
  }
101
 
102
+ # RECOMMENDED CHANGE: Combine the top 3 contexts for a richer prompt
103
+ top_k_for_context = 3
104
+ context_chunks = [result['text'] for result in search_results[:top_k_for_context]]
105
+ context = "\n---\n".join(context_chunks)
106
+
107
+ top_score = search_results[0]['relevance_score']
108
+ logger.info(f"[INFO] Using top {len(context_chunks)} contexts (top score: {top_score:.4f})")
109
+
110
+ prompt = f"""You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies. Only use the context provided to answer the question. Be precise.
111
+ ### Relevant Context:
112
+ {context}
113
+ ### Question: {question}
114
+ ### Answer:"""
115
 
116
  answer = "Sorry, I couldn't process your request right now. Please try again later."
117
  try:
app/policy_vector_db.py CHANGED
@@ -14,8 +14,9 @@ class PolicyVectorDB:
14
  self.persist_directory = persist_directory
15
  self.client = chromadb.PersistentClient(path=persist_directory, settings=Settings(allow_reset=True))
16
  self.collection_name = "neepco_dop_policies"
 
17
  self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cpu')
18
- self.collection = None
19
  self.top_k_default = top_k_default
20
  self.relevance_threshold = relevance_threshold
21
 
@@ -36,29 +37,26 @@ class PolicyVectorDB:
36
  logger.info("No chunks provided to add.")
37
  return
38
 
39
- # Ensure all IDs are strings before checking for existence
40
- new_chunks = [chunk for chunk in chunks if chunk.get('id')]
41
- existing_ids = set(collection.get(ids=[str(c['id']) for c in new_chunks])['ids'])
42
-
43
- new_chunks = [chunk for chunk in new_chunks if str(chunk.get('id')) not in existing_ids]
 
 
 
44
 
45
  if not new_chunks:
46
- logger.info("No new chunks to add to the database.")
47
  return
48
 
49
  logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")
50
  batch_size = 64
51
  for i in range(0, len(new_chunks), batch_size):
52
  batch = new_chunks[i:i + batch_size]
53
- texts = [chunk['text'] for chunk in batch]
54
  ids = [str(chunk['id']) for chunk in batch]
55
-
56
- metadatas = []
57
- for chunk in batch:
58
- meta = chunk.get('metadata')
59
- if not meta:
60
- meta = {"description": "General information chunk."}
61
- metadatas.append(self._flatten_metadata(meta))
62
 
63
  embeddings = self.embedding_model.encode(texts, show_progress_bar=False).tolist()
64
  collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)
@@ -68,45 +66,58 @@ class PolicyVectorDB:
68
  def search(self, query_text: str, top_k: int = None) -> List[Dict]:
69
  collection = self._get_collection()
70
  query_embedding = self.embedding_model.encode([query_text]).tolist()
71
- top_k = top_k if top_k else self.top_k_default
 
 
72
  results = collection.query(
73
  query_embeddings=query_embedding,
74
- n_results=top_k,
75
  include=["documents", "metadatas", "distances"]
76
  )
77
 
78
  search_results = []
79
- if results and results['documents'] and results['documents'][0]:
80
  for i, doc in enumerate(results['documents'][0]):
81
- relevance_score = 1 - results['distances'][0][i]
82
- search_results.append({
83
- 'text': doc,
84
- 'metadata': results['metadatas'][0][i],
85
- 'relevance_score': relevance_score
86
- })
87
- return search_results
 
 
 
 
 
88
 
89
  def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str):
90
  try:
91
- if db_instance._get_collection().count() == 0:
92
- logger.info("Vector database is empty. Attempting to populate from chunks file.")
93
- if not os.path.exists(chunks_file_path):
94
- logger.error(f"Chunks file not found at {chunks_file_path}. Cannot populate DB.")
95
- return False
96
-
97
- with open(chunks_file_path, 'r', encoding='utf-8') as f:
98
- chunks_to_add = json.load(f)
99
-
100
- if not chunks_to_add:
101
- logger.warning(f"Chunks file at {chunks_file_path} is empty. No data to add to DB.")
102
- return False
103
-
104
- db_instance.add_chunks(chunks_to_add)
105
- logger.info("Vector database population attempt complete.")
106
- return True
107
- else:
108
  logger.info("Vector database already contains data. Skipping population.")
109
  return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  except Exception as e:
111
  logger.error(f"DB Population Error: {e}", exc_info=True)
112
  return False
 
14
  self.persist_directory = persist_directory
15
  self.client = chromadb.PersistentClient(path=persist_directory, settings=Settings(allow_reset=True))
16
  self.collection_name = "neepco_dop_policies"
17
+ # ✅ Embeddings run on CPU here; change device to 'cuda' if a GPU is available for better performance
18
  self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cpu')
19
+ self.collection = self._get_collection()
20
  self.top_k_default = top_k_default
21
  self.relevance_threshold = relevance_threshold
22
 
 
37
  logger.info("No chunks provided to add.")
38
  return
39
 
40
+ chunks_with_ids = [c for c in chunks if c.get('id')]
41
+ if len(chunks) != len(chunks_with_ids):
42
+ logger.warning(f"Skipped {len(chunks) - len(chunks_with_ids)} chunks that were missing an 'id'.")
43
+ if not chunks_with_ids:
44
+ return
45
+
46
+ existing_ids = set(collection.get(ids=[str(c['id']) for c in chunks_with_ids])['ids'])
47
+ new_chunks = [chunk for chunk in chunks_with_ids if str(chunk.get('id')) not in existing_ids]
48
 
49
  if not new_chunks:
50
+ logger.info("All provided chunks already exist in the database.")
51
  return
52
 
53
  logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")
54
  batch_size = 64
55
  for i in range(0, len(new_chunks), batch_size):
56
  batch = new_chunks[i:i + batch_size]
 
57
  ids = [str(chunk['id']) for chunk in batch]
58
+ texts = [chunk['text'] for chunk in batch]
59
+ metadatas = [self._flatten_metadata(chunk.get('metadata', {})) for chunk in batch]
 
 
 
 
 
60
 
61
  embeddings = self.embedding_model.encode(texts, show_progress_bar=False).tolist()
62
  collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)
 
66
  def search(self, query_text: str, top_k: int = None) -> List[Dict]:
67
  collection = self._get_collection()
68
  query_embedding = self.embedding_model.encode([query_text]).tolist()
69
+ k = top_k if top_k is not None else self.top_k_default
70
+
71
+ # Retrieve more results initially to allow for filtering
72
  results = collection.query(
73
  query_embeddings=query_embedding,
74
+ n_results=k * 2,
75
  include=["documents", "metadatas", "distances"]
76
  )
77
 
78
  search_results = []
79
+ if results and results.get('documents') and results['documents'][0]:
80
  for i, doc in enumerate(results['documents'][0]):
81
+ relevance_score = 1 - results['distances'][0][i]
82
+
83
+ # ✅ RECOMMENDED CHANGE: Filter results internally based on the threshold
84
+ if relevance_score >= self.relevance_threshold:
85
+ search_results.append({
86
+ 'text': doc,
87
+ 'metadata': results['metadatas'][0][i],
88
+ 'relevance_score': relevance_score
89
+ })
90
+
91
+ # Return the top k results *after* filtering
92
+ return sorted(search_results, key=lambda x: x['relevance_score'], reverse=True)[:k]
93
 
94
  def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str):
95
  try:
96
+ if db_instance._get_collection().count() > 0:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  logger.info("Vector database already contains data. Skipping population.")
98
  return True
99
+
100
+ logger.info("Vector database is empty. Attempting to populate from chunks file.")
101
+ if not os.path.exists(chunks_file_path):
102
+ logger.error(f"Chunks file not found at {chunks_file_path}. Cannot populate DB.")
103
+ return False
104
+
105
+ # ✅ CORRECTED CODE: Read the JSONL file line-by-line
106
+ chunks_to_add = []
107
+ with open(chunks_file_path, 'r', encoding='utf-8') as f:
108
+ for line in f:
109
+ try:
110
+ chunks_to_add.append(json.loads(line))
111
+ except json.JSONDecodeError:
112
+ logger.warning(f"Skipping malformed line in chunks file: {line.strip()}")
113
+
114
+ if not chunks_to_add:
115
+ logger.warning(f"Chunks file at {chunks_file_path} is empty or invalid. No data to add.")
116
+ return False
117
+
118
+ db_instance.add_chunks(chunks_to_add)
119
+ logger.info("Vector database population attempt complete.")
120
+ return True
121
  except Exception as e:
122
  logger.error(f"DB Population Error: {e}", exc_info=True)
123
  return False
create_granular_chunks.py CHANGED
@@ -3,13 +3,13 @@ import re
3
  from typing import List, Dict, Any
4
 
5
  # Define the input and output filenames
6
- INPUT_FILE = "combined_context.jsonl" # Or your Kaggle path
7
- OUTPUT_FILE = "granular_chunks.jsonl"
8
 
9
  # Global counter to ensure all generated IDs are unique
10
  chunk_counter = 0
11
 
12
- def get_unique_id():
13
  """Returns a unique, incrementing ID."""
14
  global chunk_counter
15
  chunk_counter += 1
@@ -36,18 +36,26 @@ def parse_value_to_int(value_str: str) -> int:
36
  return 0
37
 
38
  def create_chunk(context: Dict, text_override: str = None, id_override: str = None) -> Dict:
39
- """Helper function to create a standardized chunk."""
40
  chunk_id = id_override if id_override else f"chunk-{get_unique_id()}"
41
- text = text_override if text_override else context.get("description", context.get("title", str(context)))
42
 
 
 
 
 
 
 
 
43
  metadata = {
44
  "section": context.get("section"),
45
  "clause": context.get("clause"),
 
46
  "title": context.get("title"),
47
  "description": context.get("description"),
48
  "authority": context.get("authority"),
49
  "limit_text": context.get("limit_text"),
50
  "limit_inr": parse_value_to_int(str(context.get("limit_text", ""))),
 
51
  }
52
 
53
  return {
@@ -56,50 +64,103 @@ def create_chunk(context: Dict, text_override: str = None, id_override: str = No
56
  "metadata": {k: v for k, v in metadata.items() if v is not None}
57
  }
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  def process_chunk(data: Dict, context: Dict) -> List[Dict]:
60
  """
61
- Processes a dictionary from the source file and deconstructs it if possible.
62
  """
63
  new_chunks = []
64
 
65
- # Update context with current data
66
  current_context = context.copy()
67
  current_context.update(data)
68
 
69
- # Case 1: Handle "delegation" structure
 
 
 
 
70
  if "delegation" in data and isinstance(data["delegation"], dict):
71
  for authority, limit_text in data["delegation"].items():
72
- text = f"Regarding '{current_context.get('description', current_context.get('title'))}', the power for {authority} is {limit_text}."
 
73
  chunk_context = current_context.copy()
74
  chunk_context["authority"] = authority
75
- chunk_context["limit_text"] = limit_text
76
  new_chunks.append(create_chunk(chunk_context, text_override=text))
77
  return new_chunks
78
 
79
- # Case 2: Handle "authority" and "extent_of_power" structure (often in Section II)
80
  if "authority" in data and "extent_of_power" in data:
81
- # This structure is complex, we will create a single, descriptive chunk
82
- text = f"Regarding '{current_context.get('title')}', the authority and extent of power are as follows: {json.dumps(data)}."
83
- new_chunks.append(create_chunk(current_context, text_override=text))
84
- return new_chunks
85
 
86
- # Recursive step: process nested lists
87
- has_nested_chunks = False
88
  for key, value in data.items():
89
- if isinstance(value, list):
90
- for item in value:
91
- if isinstance(item, dict):
92
- # Recurse and add the results
93
  nested_results = process_chunk(item, current_context)
94
  if nested_results:
95
  new_chunks.extend(nested_results)
96
  has_nested_chunks = True
 
 
 
 
 
 
 
 
 
 
97
 
98
- # If we processed children, we don't need to keep the parent chunk
99
  if has_nested_chunks:
100
  return new_chunks
101
 
102
- # Base case: If no specific rules were deconstructed, create a single chunk for the item
 
103
  new_chunks.append(create_chunk(current_context))
104
  return new_chunks
105
 
@@ -128,7 +189,7 @@ def main():
128
  for chunk in final_chunks:
129
  f.write(json.dumps(chunk) + '\n')
130
 
131
- print(f"Successfully created granular chunks file: '{OUTPUT_FILE}'")
132
 
133
  if __name__ == "__main__":
134
  main()
 
3
  from typing import List, Dict, Any
4
 
5
  # Define the input and output filenames
6
+ INPUT_FILE = "combined_context.jsonl"
7
+ OUTPUT_FILE = "granular_chunks_improved.jsonl"
8
 
9
  # Global counter to ensure all generated IDs are unique
10
  chunk_counter = 0
11
 
12
+ def get_unique_id() -> int:
13
  """Returns a unique, incrementing ID."""
14
  global chunk_counter
15
  chunk_counter += 1
 
36
  return 0
37
 
38
  def create_chunk(context: Dict, text_override: str = None, id_override: str = None) -> Dict:
39
+ """Helper function to create a standardized chunk with rich metadata."""
40
  chunk_id = id_override if id_override else f"chunk-{get_unique_id()}"
 
41
 
42
+ # Determine the primary text for the chunk
43
+ text = text_override
44
+ if not text:
45
+ # Create a sensible default text if none is provided
46
+ text_parts = [context.get("title"), context.get("description")]
47
+ text = ". ".join(filter(None, text_parts)) or str(context)
48
+
49
  metadata = {
50
  "section": context.get("section"),
51
  "clause": context.get("clause"),
52
+ "subclause_id": context.get("id"),
53
  "title": context.get("title"),
54
  "description": context.get("description"),
55
  "authority": context.get("authority"),
56
  "limit_text": context.get("limit_text"),
57
  "limit_inr": parse_value_to_int(str(context.get("limit_text", ""))),
58
+ "source": context.get("source"),
59
  }
60
 
61
  return {
 
64
  "metadata": {k: v for k, v in metadata.items() if v is not None}
65
  }
66
 
67
+ def _process_authority_power(data: Dict, context: Dict) -> List[Dict]:
68
+ """
69
+ Specifically handles the complex "authority" and "extent_of_power" structures.
70
+ This logic is complex because the data types for these keys vary.
71
+ """
72
+ chunks = []
73
+ title = context.get("title", "this rule")
74
+
75
+ # Case 1: Authority and Power are simple strings
76
+ if isinstance(data.get("authority"), str) and isinstance(data.get("extent_of_power"), str):
77
+ text = f"Regarding '{title}', the approving authority is {data['authority']} with '{data['extent_of_power']}'."
78
+ chunk_context = context.copy()
79
+ chunk_context["authority"] = data['authority']
80
+ chunk_context["limit_text"] = data['extent_of_power']
81
+ chunks.append(create_chunk(chunk_context, text_override=text))
82
+
83
+ # Case 2: Authority and Power are lists of dictionaries (most complex case)
84
+ elif isinstance(data.get("authority"), list) and isinstance(data.get("extent_of_power"), list):
85
+ authorities = data["authority"]
86
+ powers = data["extent_of_power"]
87
+ # Assuming the lists correspond to each other
88
+ for i in range(min(len(authorities), len(powers))):
89
+ auth_item = authorities[i]
90
+ power_item = powers[i]
91
+ # Extract descriptions from the dictionaries
92
+ auth_desc = next(iter(auth_item.values())) if isinstance(auth_item, dict) else str(auth_item)
93
+ power_desc = next(iter(power_item.values())) if isinstance(power_item, dict) else str(power_item)
94
+
95
+ text = f"For '{title}', the authority for '{auth_desc}' is given '{power_desc}'."
96
+ chunk_context = context.copy()
97
+ chunk_context["authority"] = auth_desc
98
+ chunk_context["limit_text"] = power_desc
99
+ chunks.append(create_chunk(chunk_context, text_override=text))
100
+
101
+ # Fallback for any other structure
102
+ else:
103
+ text = f"Regarding '{title}', the authority and power details are as follows: {json.dumps(data)}."
104
+ chunks.append(create_chunk(context, text_override=text))
105
+
106
+ return chunks
107
+
108
  def process_chunk(data: Dict, context: Dict) -> List[Dict]:
109
  """
110
+ Processes a dictionary from the source file and deconstructs it into granular chunks.
111
  """
112
  new_chunks = []
113
 
114
+ # Update context with current data, giving preference to new keys
115
  current_context = context.copy()
116
  current_context.update(data)
117
 
118
+ has_nested_chunks = False
119
+
120
+ # --- Rule-based deconstruction ---
121
+
122
+ # Rule 1: Handle "delegation" structure (most specific)
123
  if "delegation" in data and isinstance(data["delegation"], dict):
124
  for authority, limit_text in data["delegation"].items():
125
+ desc = current_context.get('description') or current_context.get('title')
126
+ text = f"Regarding '{desc}', the delegation for {authority} is '{limit_text}'."
127
  chunk_context = current_context.copy()
128
  chunk_context["authority"] = authority
129
+ chunk_context["limit_text"] = str(limit_text)
130
  new_chunks.append(create_chunk(chunk_context, text_override=text))
131
  return new_chunks
132
 
133
+ # Rule 2: Handle "authority" and "extent_of_power" structures
134
  if "authority" in data and "extent_of_power" in data:
135
+ return _process_authority_power(data, current_context)
 
 
 
136
 
137
+ # Rule 3: Recursively process nested lists of dictionaries or strings
 
138
  for key, value in data.items():
139
+ if isinstance(value, list) and value:
140
+ # Sub-rule 3a: List of dictionaries (e.g., subclauses, items)
141
+ if all(isinstance(item, dict) for item in value):
142
+ for item in value:
143
  nested_results = process_chunk(item, current_context)
144
  if nested_results:
145
  new_chunks.extend(nested_results)
146
  has_nested_chunks = True
147
+
148
+ # Sub-rule 3b: List of simple strings (e.g., items in Annexure A)
149
+ elif all(isinstance(item, str) for item in value):
150
+ title = current_context.get('title')
151
+ for item_text in value:
152
+ text = f"Regarding '{title}', a relevant item is: {item_text}."
153
+ new_chunks.append(create_chunk(current_context, text_override=text))
154
+ has_nested_chunks = True
155
+
156
+ # --- Finalization ---
157
 
158
+ # If we created specific chunks from children, we don't need the generic parent.
159
  if has_nested_chunks:
160
  return new_chunks
161
 
162
+ # Base case: If no specific rules were matched, create a single chunk for the item.
163
+ # This happens for "leaf" nodes that cannot be deconstructed further.
164
  new_chunks.append(create_chunk(current_context))
165
  return new_chunks
166
 
 
189
  for chunk in final_chunks:
190
  f.write(json.dumps(chunk) + '\n')
191
 
192
+ print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")
193
 
194
  if __name__ == "__main__":
195
  main()
processed_chunks.json DELETED
The diff for this file is too large to render. See raw diff