Spaces:

Kalpokoch
/

ChatbotDemo

Sleeping

App Files Files

Kalpokoch committed on Aug 5, 2025

Commit

448f148

verified ·

1 Parent(s): ef8706f

Update create_granular_chunks.py

Browse files

Files changed (1) hide show

create_granular_chunks.py +82 -89

create_granular_chunks.py CHANGED Viewed

@@ -1,36 +1,21 @@
 import os
 import json
 import re
-from typing import List, Dict, Any, Optional
 # --- Configuration ---
 INPUT_FILE = "combined_context.jsonl"
-# As requested, the output filename remains the same.
-OUTPUT_FILE = "granular_chunks_final.jsonl"
 # --- Global State ---
 chunk_counter = 0
-def get_unique_id() -> int:
     """Returns a unique, incrementing ID for each chunk."""
     global chunk_counter
     chunk_counter += 1
     return f"chunk-{chunk_counter}"
-def format_delegation_text(delegation: Any) -> str:
-    """
-    Formats a delegation dictionary or string into a readable string.
-    Handles cases where delegation is a dict or a simple string.
-    """
-    # FIX: Check if the input is a dictionary. If not, it's a descriptive string.
-    if not isinstance(delegation, dict):
-        return str(delegation)
-    parts = [f"the limit for {auth} is {limit}" for auth, limit in delegation.items() if limit and str(limit).lower() != 'nil']
-    if not parts:
-        return "No specific delegation provided."
-    return ", ".join(parts)
 def create_chunk(context: Dict, text: str) -> Dict:
     """Creates a standardized chunk dictionary with rich metadata."""
     metadata = {
@@ -41,7 +26,7 @@ def create_chunk(context: Dict, text: str) -> Dict:
     }
     # Add any other relevant context keys to metadata
     for key, value in context.items():
-        if key not in metadata and isinstance(value, (str, int, float)):
             metadata[key] = value
     return {
@@ -50,77 +35,85 @@ def create_chunk(context: Dict, text: str) -> Dict:
         "metadata": {k: v for k, v in metadata.items() if v is not None}
     }
-def process_complex_rule(data: Dict, parent_context: Dict) -> List[Dict]:
     """
-    This is the core new logic. It identifies complex rules with nested lists
-    (like methods or subclauses) and combines them into a single, rich chunk.
     """
-    context = {**parent_context, **data}
     chunks = []
-    # Identify the key that holds the list of nested rules
-    nested_list_key = None
-    if "methods" in data and isinstance(data.get("methods"), list):
-        nested_list_key = "methods"
-    elif "subclauses" in data and isinstance(data.get("subclauses"), list):
-        nested_list_key = "subclauses"
-    if not nested_list_key:
-        return []
-    base_title = context.get('title', 'a policy')
-    # Use the description from the current level, which is more specific
-    base_desc = context.get('description', '')
-    # --- Build a single, comprehensive text block for this entire rule ---
-    full_text_parts = [f"Regarding the policy for '{base_title}'"]
-    if base_desc:
-        full_text_parts.append(f"specifically for '{base_desc}'")
-    full_text_parts.append("the rules are as follows:")
-    # Iterate through the nested rules and append their details to the text block
-    for item in data[nested_list_key]:
-        if isinstance(item, dict) and "delegation" in item:
-            item_desc = item.get('description') or item.get('method') or item.get('title', '')
-            delegation_text = format_delegation_text(item["delegation"])
-            full_text_parts.append(f"- For '{item_desc}', {delegation_text}.")
-    # Add any remarks from the parent level to the end of the text block
-    if "remarks" in data and isinstance(data["remarks"], list):
-        full_text_parts.append("Important remarks include:")
-        full_text_parts.extend([f"  - {remark}" for remark in data["remarks"]])
-    elif "remarks" in data and isinstance(data["remarks"], str):
-         full_text_parts.append(f"An important remark is: {data['remarks']}")
-    # Create a single, powerful chunk from the combined text
-    if len(full_text_parts) > 2: # Ensure we have more than just the intro
-        final_text = " ".join(full_text_parts)
-        chunks.append(create_chunk(context, final_text))
-    return chunks
-def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
-    """
-    Main processing function. It prioritizes creating comprehensive chunks for complex rules.
-    """
-    context = {**(parent_context or {}), **data}
-    # --- Priority 1: Attempt to process as a complex rule with nested delegations ---
-    complex_chunks = process_complex_rule(data, parent_context or {})
-    if complex_chunks:
-        return complex_chunks
-    # --- Priority 2: Handle simple, flat delegation rules ---
     if "delegation" in data and isinstance(data.get("delegation"), dict):
         base_desc = context.get('description') or context.get('title', 'this rule')
         delegation_text = format_delegation_text(data["delegation"])
         text = f"Regarding '{base_desc}', the delegated financial powers are: {delegation_text}."
-        return [create_chunk(context, text)]
-    # --- Priority 3: Recursively process deeper structures ---
-    # This is important for traversing the JSON but is now secondary to creating comprehensive chunks.
-    chunks = []
     for key, value in data.items():
         if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
             for item in value:
@@ -128,28 +121,28 @@ def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
     if chunks:
         return chunks
-    # --- Fallback: Create a chunk for simple descriptive text if no other rule applies ---
     description = context.get("description")
     title = context.get("title")
-    if description:
         text = f"The policy for '{title}' states: {description}."
-        return [create_chunk(context, text)]
-    return []
 def main():
     """Main function to read, process, and write."""
-    print(f"Starting to process '{INPUT_FILE}' with the best-approach chunking strategy...")
-    final_chunks = []
     try:
         with open(INPUT_FILE, 'r', encoding='utf-8') as f:
             for i, line in enumerate(f):
                 try:
                     data = json.loads(line)
-                    processed = process_entry(data)
-                    if processed:
-                        final_chunks.extend(processed)
                 except json.JSONDecodeError:
                     print(f"Warning: Skipping malformed JSON on line {i+1}")
                     continue
@@ -157,10 +150,10 @@ def main():
         print(f"Error: Input file '{INPUT_FILE}' not found.")
         return
-    print(f"Deconstructed into {len(final_chunks)} comprehensive, self-contained chunks.")
     with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
-        for chunk in final_chunks:
             f.write(json.dumps(chunk) + '\n')
     print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")

 import os
 import json
 import re
+from typing import List, Dict, Any
 # --- Configuration ---
 INPUT_FILE = "combined_context.jsonl"
+OUTPUT_FILE = "granular_chunks_final.jsonl" # Keeping the filename consistent
 # --- Global State ---
 chunk_counter = 0
+def get_unique_id() -> str:
     """Returns a unique, incrementing ID for each chunk."""
     global chunk_counter
     chunk_counter += 1
     return f"chunk-{chunk_counter}"
 def create_chunk(context: Dict, text: str) -> Dict:
     """Creates a standardized chunk dictionary with rich metadata."""
     metadata = {
     }
     # Add any other relevant context keys to metadata
     for key, value in context.items():
+        if key not in metadata and isinstance(value, (str, int, float, bool)):
             metadata[key] = value
     return {
         "metadata": {k: v for k, v in metadata.items() if v is not None}
     }
+def format_delegation_text(delegation: Any) -> str:
+    """Formats a delegation dictionary or string into a readable string."""
+    if not isinstance(delegation, dict):
+        return str(delegation) # Handles cases where it's a simple string
+    parts = [f"the limit for {auth} is {limit}" for auth, limit in delegation.items() if limit and str(limit).lower() != 'nil']
+    return ", ".join(parts) if parts else "No specific delegation provided."
+def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
     """
+    The definitive processing function. It creates highly descriptive and self-contained chunks
+    by applying specific handlers based on the structure of each JSON entry.
     """
+    context = {**(parent_context or {}), **data}
     chunks = []
+    # --- Handler 1: Committee Composition (e.g., LPC-1, LPC-2) ---
+    if "composition" in data and isinstance(data["composition"], list):
+        base_title = context.get('title', 'a committee')
+        composition_parts = []
+        for item in data["composition"]:
+            if isinstance(item, dict):
+                for role, members in item.items():
+                    member_text = f"the {role} is {members}" if isinstance(members, str) else f"the {role} are: {', '.join(members)}"
+                    composition_parts.append(member_text)
+        if composition_parts:
+            full_text = f"Regarding '{base_title}', the composition is: {'; '.join(composition_parts)}."
+            if context.get("approving_authority"):
+                full_text += f" The approving authority is {context['approving_authority']}."
+            if context.get("remarks"):
+                full_text += f" Remarks include: {' '.join(context['remarks'])}"
+            chunks.append(create_chunk(context, full_text))
+        return chunks
+    # --- Handler 2: Complex Nested Rules with Delegations ---
+    # This is the most important handler for creating comprehensive, self-contained chunks.
+    nested_list_key = next((key for key in ["methods", "subclauses"] if key in data and isinstance(data[key], list)), None)
+    if nested_list_key:
+        is_complex_delegation = all(isinstance(item, dict) and "delegation" in item for item in data[nested_list_key])
+        if is_complex_delegation:
+            base_title = context.get('title', 'a policy')
+            base_desc = context.get('description', '')
+            text_parts = [f"Regarding the policy for '{base_title}'"]
+            if base_desc:
+                text_parts.append(f"specifically for '{base_desc}'")
+            text_parts.append(", the rules are as follows:")
+            for item in data[nested_list_key]:
+                item_desc = item.get('description') or item.get('method') or item.get('title', 'a specific method')
+                delegation_text = format_delegation_text(item["delegation"])
+                text_parts.append(f"For '{item_desc}', {delegation_text}.")
+            final_text = " ".join(text_parts)
+            chunks.append(create_chunk(context, final_text))
+            return chunks
+    # --- Handler 3: Simple Item Lists (e.g., Annexure A, Financial Concurrence) ---
+    list_key = next((key for key in ["items", "exclusions"] if key in data and isinstance(data[key], list)), None)
+    if list_key:
+        base_title = context.get('title', 'a policy')
+        prefix = f"Regarding '{base_title}', the following items are {'excluded' if list_key == 'exclusions' else 'included'}:"
+        # Create individual chunks for each item for better specific retrieval
+        for item in data[list_key]:
+            if isinstance(item, str):
+                chunks.append(create_chunk(context, f"A rule regarding '{base_title}' is: {item}."))
+        return chunks
+    # --- Handler 4: Flat Delegation (a rule with a direct delegation dict) ---
     if "delegation" in data and isinstance(data.get("delegation"), dict):
         base_desc = context.get('description') or context.get('title', 'this rule')
         delegation_text = format_delegation_text(data["delegation"])
         text = f"Regarding '{base_desc}', the delegated financial powers are: {delegation_text}."
+        chunks.append(create_chunk(context, text))
+        return chunks
+    # --- Handler 5: Recursive Processor for Generic Nested Structures ---
+    # If no specific handler above matched, traverse deeper into the JSON.
     for key, value in data.items():
         if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
             for item in value:
     if chunks:
         return chunks
+    # --- Fallback Handler: For simple descriptive text nodes ---
     description = context.get("description")
     title = context.get("title")
+    if description and isinstance(description, str):
         text = f"The policy for '{title}' states: {description}."
+        chunks.append(create_chunk(context, text))
+    return chunks
 def main():
     """Main function to read, process, and write."""
+    print(f"Starting to process '{INPUT_FILE}' with the definitive chunking strategy...")
+    all_chunks = []
     try:
         with open(INPUT_FILE, 'r', encoding='utf-8') as f:
             for i, line in enumerate(f):
                 try:
                     data = json.loads(line)
+                    processed_chunks = process_entry(data)
+                    if processed_chunks:
+                        all_chunks.extend(processed_chunks)
                 except json.JSONDecodeError:
                     print(f"Warning: Skipping malformed JSON on line {i+1}")
                     continue
         print(f"Error: Input file '{INPUT_FILE}' not found.")
         return
+    print(f"Deconstructed into {len(all_chunks)} highly descriptive chunks.")
     with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
+        for chunk in all_chunks:
             f.write(json.dumps(chunk) + '\n')
     print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")