Spaces:

Kalpokoch
/

ChatbotDemo

Sleeping

App Files Files

Kalpokoch committed on Aug 16, 2025

Commit

3076a66

verified ·

1 Parent(s): 04c50c5

Update create_granular_chunks.py

Browse files

Files changed (1) hide show

create_granular_chunks.py +141 -91

create_granular_chunks.py CHANGED Viewed

@@ -1,134 +1,175 @@
 import os
 import json
 import re
 from typing import List, Dict, Any
 # --- Configuration ---
 INPUT_FILE = "combined_context.jsonl"
-OUTPUT_FILE = "granular_chunks_final.jsonl" # Keeping the filename consistent
 # --- Global State ---
 chunk_counter = 0
-def get_unique_id() -> str:
-    """Returns a unique, incrementing ID for each chunk."""
     global chunk_counter
     chunk_counter += 1
-    return f"chunk-{chunk_counter}"
-def create_chunk(context: Dict, text: str) -> Dict:
-    """Creates a standardized chunk dictionary with rich metadata."""
     metadata = {
         "section": context.get("section"),
         "clause": context.get("clause") or context.get("Clause"),
         "title": context.get("title"),
-        "source_description": context.get("description"),
     }
-    for key, value in context.items():
-        if key not in metadata and isinstance(value, (str, int, float, bool)):
-            metadata[key] = value
     return {
-        "id": get_unique_id(),
-        "text": text,
-        "metadata": {k: v for k, v in metadata.items() if v is not None}
     }
-def format_delegation_text(delegation: Any) -> str:
-    """
-    Formats a delegation dictionary or string into a readable string.
-    --- ACCURACY FIX ---
-    This function now explicitly includes "NIL" or "---" values instead of skipping them.
-    This is crucial for the model to correctly answer questions about roles with no power.
-    """
-    if not isinstance(delegation, dict):
-        return str(delegation)
-    # Use "is NIL" for None or "---", otherwise use "is [limit]"
-    parts = [f"the limit for {auth} is {limit if limit and str(limit) != '---' else 'NIL'}" for auth, limit in delegation.items()]
-    return ", ".join(parts) if parts else "No specific delegation provided."
-def format_remarks(remarks: Any) -> str:
-    """Safely formats the 'remarks' field, handling various data types."""
     if isinstance(remarks, list):
-        remark_parts = []
-        for item in remarks:
-            if isinstance(item, dict):
-                for key, value in item.items():
-                    remark_parts.append(f"{key}: {value}")
             else:
-                remark_parts.append(str(item))
-        return " ".join(remark_parts)
-    return str(remarks)
-def build_descriptive_text(context: Dict) -> str:
-    """
-    Intelligently builds a single, descriptive, natural language sentence
-    by combining all relevant fields from the context.
-    """
-    text_parts = []
-    if context.get("title"):
-        text_parts.append(f"Regarding the policy for '{context['title']}'")
-    specific_desc = context.get('description') or context.get('method')
-    if specific_desc and specific_desc != context.get('title'):
-         text_parts.append(f"specifically for '{specific_desc}'")
-    if "delegation" in context:
-        delegation_text = format_delegation_text(context["delegation"])
-        text_parts.append(f", the financial delegations are: {delegation_text}.")
-    elif "composition" in context:
-        composition_parts = []
-        for item in context["composition"]:
-            if isinstance(item, dict):
-                for role, members in item.items():
-                    member_text = f"the {role} is {members}" if isinstance(members, str) else f"the {role} are: {', '.join(members)}"
-                    composition_parts.append(member_text)
-        text_parts.append(f", the composition is: {'; '.join(composition_parts)}.")
-    if "remarks" in context and context["remarks"]:
-        remarks_text = format_remarks(context["remarks"])
-        text_parts.append(f" Important remarks include: {remarks_text}")
-    return " ".join(text_parts)
-def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
     """
-    The definitive processing function. It traverses the JSON and uses a set of handlers
-    to create highly descriptive, self-contained chunks.
     """
     context = {**(parent_context or {}), **data}
     chunks = []
-    # --- Handler 1: Simple Item Lists (e.g., Annexure A, Financial Concurrence) ---
     list_key = next((key for key in ["items", "exclusions"] if key in data and isinstance(data.get(key), list)), None)
     if list_key:
-        base_title = context.get('title', 'a policy')
         for item in data[list_key]:
             if isinstance(item, str):
-                chunks.append(create_chunk(context, f"A rule regarding '{base_title}' is: {item}."))
         return chunks
-    # --- Handler 2: Recursive Traversal ---
-    has_recursed = False
     for key, value in data.items():
         if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
             for item in value:
-                chunks.extend(process_entry(item, context))
-            has_recursed = True
-    # --- Handler 3: Leaf Node Creation ---
-    if not has_recursed and ("delegation" in data or "composition" in data or "description" in data):
-        text = build_descriptive_text(context)
-        chunks.append(create_chunk(context, text))
     return chunks
 def main():
-    """Main function to read, process, and write."""
-    print(f"Starting to process '{INPUT_FILE}' with the definitive chunking strategy...")
     all_chunks = []
     try:
         with open(INPUT_FILE, 'r', encoding='utf-8') as f:
             for i, line in enumerate(f):
@@ -144,17 +185,26 @@ def main():
         print(f"Error: Input file '{INPUT_FILE}' not found.")
         return
-    print(f"Deconstructed into {len(all_chunks)} highly descriptive chunks.")
-    # Remove duplicates before writing
-    unique_chunks = {chunk['text']: chunk for chunk in all_chunks}.values()
-    print(f"Removed duplicates, writing {len(unique_chunks)} unique chunks.")
     with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
         for chunk in unique_chunks:
-            f.write(json.dumps(chunk) + '\n')
-    print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")
 if __name__ == "__main__":
     main()

 import os
 import json
 import re
+import hashlib
 from typing import List, Dict, Any
 # --- Configuration ---
 INPUT_FILE = "combined_context.jsonl"
+OUTPUT_FILE = "granular_chunks_final.jsonl"
 # --- Global State ---
 chunk_counter = 0
+# -----------------------
+# Utility Helpers
+# -----------------------
def get_unique_id(context: Dict, role: str = None) -> str:
    """Build a readable, unique chunk ID from an entry's identifying fields.

    The ID has the form ``<section>-<clause>-<title>[-<role>]-<digest>-<n>``.
    Spaces in the readable part are replaced with underscores, ``digest`` is
    the first six hex characters of the SHA-1 of the readable part (before the
    underscore substitution), and ``n`` is the module-level ``chunk_counter``
    after it has been incremented, which is what guarantees uniqueness.

    Args:
        context: Entry fields. ``section``, ``clause`` (or ``Clause``) and
            ``title`` are read; missing or ``None`` values become empty strings.
        role: Optional role name appended to the readable part.

    Returns:
        The generated ID string.
    """
    global chunk_counter
    chunk_counter += 1

    def as_text(value: Any) -> str:
        # A key that is present but None must not leak into the ID as "None".
        return "" if value is None else str(value)

    # The chunk metadata accepts both spellings of the clause key, so the ID
    # must too; otherwise entries using "Clause" get an empty clause segment.
    clause = context.get("clause") or context.get("Clause")
    base_str = "-".join(
        (as_text(context.get("section")), as_text(clause), as_text(context.get("title")))
    )
    if role:
        base_str += f"-{role}"

    digest = hashlib.sha1(base_str.encode()).hexdigest()[:6]
    return f"{base_str.replace(' ', '_')}-{digest}-{chunk_counter}"
def normalize_money(value: str) -> Dict[str, Any]:
    """Extract the first monetary amount from *value* as a whole number.

    Recognises plain numbers with optional thousands separators and an
    optional ``lakh`` (1e5) or ``crore`` (1e7) unit, case-insensitively,
    e.g. ``"₹10 crore" -> 100000000``.

    Args:
        value: Free-form limit text. Non-string input is passed through.

    Returns:
        ``{"original": value, "normalized": n}`` where ``n`` is the amount as
        an ``int`` (fractional part truncated), or ``None`` when *value* is
        not a string or contains no number.
    """
    # Local import keeps this function self-contained; Decimal avoids binary
    # floating-point error, which can land just below the true amount and then
    # be truncated by int() (e.g. a fractional number of crore).
    from decimal import Decimal

    multipliers = {"lakh": 100_000, "crore": 10_000_000}
    result = {"original": value, "normalized": None}
    if not isinstance(value, str):
        return result

    # The number must contain at least one digit, so stray punctuation such as
    # the "." in "Rs. 5 lakh" or the "," in "as per rules, ..." is never
    # captured on its own (which previously made the conversion raise).
    # A leading-dot decimal (".5 lakh") is accepted unless the dot directly
    # follows a letter, as in the abbreviation "Rs.5".
    match = re.search(
        r"((?<![A-Za-z])\.\d+|\d[\d,]*(?:\.\d+)?)\s*(crore|lakh)?",
        value,
        flags=re.IGNORECASE,
    )
    if match is None:
        return result

    amount = Decimal(match.group(1).replace(",", ""))
    unit = match.group(2)
    if unit:
        amount *= multipliers[unit.lower()]
    result["normalized"] = int(amount)
    return result
def create_chunk(context: Dict, text: str, extra_metadata: Dict = None, role: str = None, parent_id: str = None) -> Dict:
    """Assemble one output chunk: ID, stripped text, metadata and parent link.

    Metadata starts from the entry's section, clause, title, description and
    title hierarchy, is overlaid with *extra_metadata*, and finally has every
    ``None`` value removed.

    Args:
        context: Merged entry fields the metadata and ID are derived from.
        text: Chunk text; leading and trailing whitespace is stripped.
        extra_metadata: Optional extra metadata entries; these win on key clashes.
        role: Optional role forwarded to the ID generator.
        parent_id: Stored verbatim under ``"parent_id"`` (may be ``None``).

    Returns:
        A dict with the keys ``id``, ``text``, ``metadata`` and ``parent_id``.
    """
    base_keys = ("section", "clause", "title", "description", "parent_title", "grandparent_title")
    collected = {key: context.get(key) for key in base_keys}
    # Some entries spell the clause key with a capital letter.
    collected["clause"] = collected["clause"] or context.get("Clause")
    collected.update(extra_metadata or {})

    chunk_id = get_unique_id(context, role)
    cleaned = {}
    for name, value in collected.items():
        if value is not None:
            cleaned[name] = value

    return {
        "id": chunk_id,
        "text": text.strip(),
        "metadata": cleaned,
        "parent_id": parent_id,
    }
def format_delegation(delegation: Any, context: Dict, parent_id: str = None) -> List[Dict]:
    """Turn a delegation value into one chunk per role (or one free-text chunk).

    Args:
        delegation: Either a mapping of role -> limit, or any other value that
            is rendered as a single "Delegation rule" sentence.
        context: Merged entry fields; ``title`` is quoted in the chunk text.
        parent_id: Forwarded to every chunk that is created.

    Returns:
        A list of chunks. Mapping input yields one chunk per role whose
        metadata carries ``role``, the raw ``limit`` and ``limit_normalized``.
        A ``None`` or blank delegation yields an empty list.
    """
    chunks = []

    # An absent delegation would otherwise produce a meaningless
    # "Delegation rule: None" chunk.
    if delegation is None or (isinstance(delegation, str) and not delegation.strip()):
        return chunks

    if not isinstance(delegation, dict):
        # Simple free-text delegation.
        chunks.append(create_chunk(context, f"Delegation rule: {delegation}", parent_id=parent_id))
        return chunks

    title = context.get('title')
    for role, limit in delegation.items():
        # Roles with no power must be stated explicitly as NIL so the absence
        # of a limit is visible in the text; blank strings and padded "---"
        # placeholders count as "no limit" too.
        has_no_limit = limit is None or (isinstance(limit, str) and limit.strip() in ("", "---"))
        shown_limit = "NIL" if has_no_limit else limit
        text = f"In the context of '{title}', the limit for {role} is {shown_limit}."
        meta = {
            "role": role,
            "limit": limit,
            "limit_normalized": normalize_money(limit).get("normalized"),
        }
        chunks.append(create_chunk(context, text, meta, role=role, parent_id=parent_id))
    return chunks
def format_remarks(remarks: Any, context: Dict, parent_id: str = None) -> List[Dict]:
    """Emit one chunk per individual remark.

    A list is expanded element by element: each key/value pair of a dict
    element becomes its own ``key: value`` remark, and any other element is
    used as-is. A non-list value becomes a single remark.

    Args:
        remarks: The raw ``remarks`` field of an entry.
        context: Merged entry fields; ``title`` is quoted in the chunk text.
        parent_id: Forwarded to every chunk that is created.

    Returns:
        The list of remark chunks, in source order.
    """
    prefix = f"Remark for '{context.get('title')}': "

    if not isinstance(remarks, list):
        bodies = [f"{remarks}"]
    else:
        bodies = []
        for entry in remarks:
            if isinstance(entry, dict):
                bodies.extend(f"{label}: {detail}" for label, detail in entry.items())
            else:
                bodies.append(f"{entry}")

    return [create_chunk(context, prefix + body, parent_id=parent_id) for body in bodies]
+# -----------------------
+# Processing Logic
+# -----------------------
def process_entry(data: Dict, parent_context: Dict = None, parent_id: str = None) -> List[Dict]:
    """Recursively expand one JSON entry into granular, atomic chunks.

    Args:
        data: The entry (a decoded JSON object) to process.
        parent_context: Merged fields of the enclosing entries; *data* is
            layered on top of it, so children inherit fields they do not set.
        parent_id: Value recorded as ``parent_id`` on chunks created directly
            from this entry.

    Returns:
        All chunks produced by this entry and its nested entries. An entry
        with an ``items`` or ``exclusions`` list produces chunks from that
        list only.
    """
    context = {**(parent_context or {}), **data}
    chunks = []

    # Hierarchy fields: shift the parent's titles one level up.
    if parent_context:
        if parent_context.get("title"):
            context["parent_title"] = parent_context.get("title")
        if parent_context.get("parent_title"):
            context["grandparent_title"] = parent_context.get("parent_title")

    # NOTE(review): the parent link handed to children is whatever "id" field
    # the source data carries, not the ID of a generated chunk; for input
    # without an "id" key this is always None. Confirm whether that is intended.
    child_parent_id = context.get("id", None)

    # Plain item lists (rules, exclusions).
    list_key = next((key for key in ["items", "exclusions"] if key in data and isinstance(data.get(key), list)), None)
    if list_key:
        for item in data[list_key]:
            if isinstance(item, str):
                chunks.append(create_chunk(context, f"Rule under '{context.get('title')}': {item}.", parent_id=parent_id))
            elif isinstance(item, dict):
                # Structured items used to be dropped silently because of the
                # early return below; process them as nested entries instead.
                chunks.extend(process_entry(item, context, parent_id=child_parent_id))
        return chunks

    # Delegation: one chunk per role.
    if "delegation" in data:
        chunks.extend(format_delegation(data["delegation"], context, parent_id=parent_id))

    # Description: a single atomic chunk.
    if data.get("description"):
        chunks.append(create_chunk(context, f"Description: {data['description']}", parent_id=parent_id))

    # Composition: one chunk per role. Anything that is not a list is ignored
    # rather than iterated, so a null or scalar value cannot raise.
    composition = data.get("composition")
    if isinstance(composition, (list, tuple)):
        for item in composition:
            if not isinstance(item, dict):
                continue
            for role, members in item.items():
                if isinstance(members, str):
                    member_text = members
                elif isinstance(members, (list, tuple)):
                    # str() each member so non-string entries cannot break the join.
                    member_text = ", ".join(str(member) for member in members)
                else:
                    member_text = str(members)
                chunks.append(create_chunk(context,
                                           f"Committee composition: {role} = {member_text}",
                                           extra_metadata={"role": role},
                                           parent_id=parent_id))

    # Remarks: one chunk per remark.
    if "remarks" in data and data["remarks"]:
        chunks.extend(format_remarks(data["remarks"], context, parent_id=parent_id))

    # Recurse into nested lists of dicts (subclauses, methods, etc.).
    # "composition" and "remarks" are lists of dicts too, but they were fully
    # consumed above; walking them again would treat each member or remark as
    # a nested entry and could emit duplicate chunks.
    already_handled = ("composition", "remarks")
    for key, value in data.items():
        if key in already_handled:
            continue
        if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
            for item in value:
                chunks.extend(process_entry(item, context, parent_id=child_parent_id))
    return chunks
+# -----------------------
+# Main
+# -----------------------
 def main():
+    print(f"Processing '{INPUT_FILE}' with improved chunking...")
     all_chunks = []
+    # Read file
     try:
         with open(INPUT_FILE, 'r', encoding='utf-8') as f:
             for i, line in enumerate(f):
         print(f"Error: Input file '{INPUT_FILE}' not found.")
         return
+    print(f"Generated {len(all_chunks)} raw chunks.")
+    # Deduplicate based on text+metadata hash
+    seen = set()
+    unique_chunks = []
+    for ch in all_chunks:
+        sig = json.dumps((ch["text"], ch["metadata"]), sort_keys=True)
+        if sig not in seen:
+            seen.add(sig)
+            unique_chunks.append(ch)
+    print(f"Deduplicated to {len(unique_chunks)} unique chunks.")
+    # Write output
     with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
         for chunk in unique_chunks:
+            f.write(json.dumps(chunk, ensure_ascii=False) + '\n')
+    print(f"Successfully wrote improved granular chunks to {OUTPUT_FILE}")
 if __name__ == "__main__":
     main()