Update create_granular_chunks.py

create_granular_chunks.py (+76 -62) CHANGED
@@ -5,25 +5,51 @@ import re
 from typing import List, Dict, Any
 import nltk
 
+# --- Tokenizer Import ---
+import tiktoken  # pip install tiktoken
+
 # Download punkt tokenizer if not already done (Ensure this runs once in your environment setup)
 nltk.download('punkt')
-nltk.download('punkt_tab')  # Also download punkt_tab to avoid LookupError
 
 # --- Configuration ---
 INPUT_FILE = "combined_context.jsonl"
 OUTPUT_FILE = "granular_chunks_final.jsonl"
+
+# Token-based chunking parameters (typical LLM embedding context ~512 tokens)
+MAX_TOKENS = 400
+OVERLAP_TOKENS = 50
+TOKENIZER_MODEL = "cl100k_base"  # use "cl100k_base" for OpenAI, adjust as needed
+
+# --- Keyword Enhancement ---
+FINANCIAL_KEYWORDS = [
+    "₹", "INR", "crore", "lakh", "limit", "delegation", "expenditure", "budget", "revenue", "capital",
+    "surplus", "investment", "write-off", "dividend", "pay", "salary", "contract value"
+]
+AUTHORITY_KEYWORDS = [
+    "CMD", "Chairman", "Board", "Director", "ED", "Executive Director", "CGM", "GM", "DGM", "Sr. M",
+    "Manager", "HOD", "Head of Finance", "Finance Head", "Project Head"
+]
+
+def get_encoding():
+    return tiktoken.get_encoding(TOKENIZER_MODEL)
 
 # --- Global State ---
 chunk_counter = 0
 
-
 def get_unique_id() -> str:
-    """Returns a unique, incrementing ID for each chunk."""
     global chunk_counter
     chunk_counter += 1
     return f"chunk-{chunk_counter}"
 
+def enhance_chunk_with_keywords(text: str, metadata: dict) -> dict:
+    """Add keywords (financial and authority) to metadata if present in text."""
+    present_financial = [kw for kw in FINANCIAL_KEYWORDS if kw.lower() in text.lower()]
+    present_authority = [kw for kw in AUTHORITY_KEYWORDS if kw.lower() in text.lower()]
+    if present_financial:
+        metadata['financial_keywords'] = present_financial
+    if present_authority:
+        metadata['authority_keywords'] = present_authority
+    return metadata
 
 def create_chunk(context: Dict, text: str) -> Dict:
     """Creates a standardized chunk dictionary with rich metadata."""
@@ -33,31 +59,27 @@ def create_chunk(context: Dict, text: str) -> Dict:
         "title": context.get("title"),
         "source_description": context.get("description"),
     }
-    # Add other primitive metadata keys
     for key, value in context.items():
         if key not in metadata and isinstance(value, (str, int, float, bool)):
             metadata[key] = value
 
+    # --- Keyword Enhancement ---
+    metadata = enhance_chunk_with_keywords(text, metadata)
+
     return {
         "id": get_unique_id(),
        "text": text.strip(),
         "metadata": {k: v for k, v in metadata.items() if v is not None}
     }
 
-
 def format_delegation_text(delegation: Any) -> str:
-    """
-    Formats a delegation dictionary or string into a readable string.
-    Explicitly includes "NIL" or "---" to capture no power cases.
-    """
     if not isinstance(delegation, dict):
         return str(delegation)
     parts = [f"the limit for {auth} is {limit if limit and str(limit) != '---' else 'NIL'}"
              for auth, limit in delegation.items()]
     return ", ".join(parts) if parts else "No specific delegation provided."
 
-
 def format_remarks(remarks: Any) -> str:
-    """Safely formats the 'remarks' field, handling various data types."""
     if isinstance(remarks, list):
         remark_parts = []
         for item in remarks:
@@ -69,21 +91,13 @@ def format_remarks(remarks: Any) -> str:
         return " ".join(remark_parts)
     return str(remarks)
 
-
 def build_descriptive_text(context: Dict) -> str:
-    """
-    Builds a clear, descriptive, natural language text by combining fields.
-    Focused for best relevance and contextual richness.
-    """
     text_parts = []
-
     if context.get("title"):
         text_parts.append(f"Regarding the policy '{context['title']}'")
-
     specific_desc = context.get('description') or context.get('method')
     if specific_desc and specific_desc != context.get('title'):
         text_parts.append(f"specifically for '{specific_desc}'")
-
     if "delegation" in context:
         delegation_text = format_delegation_text(context["delegation"])
         text_parts.append(f", financial delegations are: {delegation_text}.")
@@ -96,68 +110,72 @@ def build_descriptive_text(context: Dict) -> str:
                        else f"the {role} are: {', '.join(members)}")
         composition_parts.append(member_text)
     text_parts.append(f", the composition is: {'; '.join(composition_parts)}.")
-
     if "remarks" in context and context["remarks"]:
         remarks_text = format_remarks(context["remarks"])
         text_parts.append(f" Important remarks include: {remarks_text}")
-
-    # Join all parts into a flowing sentence
     return " ".join(text_parts).strip()
 
+def count_tokens(text: str) -> int:
+    encoding = get_encoding()
+    return len(encoding.encode(text))
+
+def get_token_overlap(text: str, overlap_tokens: int) -> str:
+    """Return the last `overlap_tokens` worth of text from the input string."""
+    encoding = get_encoding()
+    tokens = encoding.encode(text)
+    if len(tokens) <= overlap_tokens:
+        return text
+    # Decode only the last overlap_tokens tokens
+    overlapped = encoding.decode(tokens[-overlap_tokens:])
+    # Remove possible split word inconsistencies by finding last complete sentence
+    # This is optional: can simply return overlapped
+    last_period = overlapped.rfind('.')
+    if last_period != -1 and last_period < len(overlapped) - 2:
+        return overlapped[last_period+1:].strip()
+    return overlapped.strip()
+
-def split_text_into_chunks(text):
+def split_text_by_tokens(text: str, max_tokens: int = MAX_TOKENS, overlap_tokens: int = OVERLAP_TOKENS) -> List[str]:
+    """Split text into chunks based on token count, with specified overlap."""
+    encoding = get_encoding()
+    sents = nltk.tokenize.sent_tokenize(text, language='english')
     chunks = []
     current_chunk = ""
+    current_tokens = 0
+    for sentence in sents:
+        sentence_tokens = len(encoding.encode(sentence))
+        if current_tokens + sentence_tokens <= max_tokens:
             current_chunk += (" " + sentence) if current_chunk else sentence
+            current_tokens += sentence_tokens
         else:
             chunks.append(current_chunk.strip())
+            # Overlap logic
+            if overlap_tokens < current_tokens:
+                overlap_text = get_token_overlap(current_chunk, overlap_tokens)
+                current_chunk = overlap_text + " " + sentence
+                current_tokens = len(encoding.encode(current_chunk))
            else:
                 current_chunk = sentence
+                current_tokens = sentence_tokens
     if current_chunk:
         chunks.append(current_chunk.strip())
     return chunks
 
-
 def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
-    """
-    Processes a JSON policy entry and returns granular, context-rich chunks.
-    Applies recursive traversal and implements chunk size limiting.
-    """
     context = {**(parent_context or {}), **data}
     chunks = []
 
     # Handler 1: Simple Item Lists
     list_key = next((key for key in ["items", "exclusions"] if key in data and isinstance(data.get(key), list)), None)
     if list_key:
         base_title = context.get('title', 'a policy')
         for item in data[list_key]:
             if isinstance(item, str):
-                # Build chunk text with clear descriptive prefix for relevance
                 text = f"A rule regarding '{base_title}' is: {item}."
-
-                for sub_chunk in split_text_into_chunks(text):
+                for sub_chunk in split_text_by_tokens(text):
                     chunks.append(create_chunk(context, sub_chunk))
         return chunks
 
-    # Handler 2: Recursive traversal for nested
+    # Handler 2: Recursive traversal for nested dicts/lists
     has_recursed = False
     for key, value in data.items():
         if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
@@ -168,16 +186,13 @@ def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
     # Handler 3: Leaf nodes with delegation, composition or description
     if not has_recursed and ("delegation" in data or "composition" in data or "description" in data):
         text = build_descriptive_text(context)
-
-        for chunk_text in split_text_into_chunks(text):
+        for chunk_text in split_text_by_tokens(text):
             chunks.append(create_chunk(context, chunk_text))
 
     return chunks
 
-
 def main():
-    print(f"Starting to process '{INPUT_FILE}' for improved granular chunking...")
+    print(f"Starting to process '{INPUT_FILE}' with token-based chunking and keyword enhancement...")
     all_chunks = []
 
     try:
@@ -208,10 +223,9 @@ def main():
         # Write output in JSONL format for later vector DB ingestion
         with open(OUTPUT_FILE, 'w', encoding='utf-8') as outf:
             for chunk in unique_chunks:
                outf.write(json.dumps(chunk, ensure_ascii=False) + "\n")
 
     print(f"Successfully wrote improved granular chunks to '{OUTPUT_FILE}'.")
 
-
 if __name__ == "__main__":
     main()
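
A quick way to sanity-check the new token-based splitting is to run it on an oversized string. A minimal sketch, assuming the file above is importable as create_granular_chunks and that tiktoken and the NLTK punkt data are available; the sample text is invented for illustration:

    # sanity_check_chunking.py -- illustrative sketch, not part of this commit
    from create_granular_chunks import split_text_by_tokens, count_tokens

    # Invented sample, repeated so it exceeds MAX_TOKENS (400) and must be split.
    sample = ("The CMD may approve capital expenditure up to 50 crore. "
              "Proposals above that limit require Board approval. ") * 60

    chunks = split_text_by_tokens(sample)
    print(f"{len(chunks)} chunks")
    for c in chunks[:3]:
        print(count_tokens(c), repr(c[:60]))

Each chunk should come out at roughly 400 tokens or fewer, and consecutive chunks should share about OVERLAP_TOKENS (50 tokens) of text carried over by get_token_overlap.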
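The keyword enhancement can be spot-checked the same way. Another illustrative sketch (the sentence is made up); enhance_chunk_with_keywords is what create_chunk now calls before assembling the {"id", "text", "metadata"} record:

    from create_granular_chunks import enhance_chunk_with_keywords

    meta = enhance_chunk_with_keywords(
        "Write-off of bad debts up to 2 lakh may be approved by the GM.", {})
    print(meta)
    # {'financial_keywords': ['lakh', 'write-off'], 'authority_keywords': ['ED', 'GM']}

Note the 'ED' hit: the check is a case-insensitive substring match, so "approved" contains "ed". If that over-matching matters, a word-boundary test per keyword (re.search(rf"\b{re.escape(kw)}\b", text, re.IGNORECASE)) would be stricter.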