import json
import re
from typing import Any, Dict, List, Optional

# Input/output filenames for the chunking pipeline.
INPUT_FILE = "combined_context.jsonl"
OUTPUT_FILE = "granular_chunks_improved.jsonl"

# Global counter to ensure all generated chunk IDs are unique.
chunk_counter = 0


def get_unique_id() -> int:
    """Return a unique, monotonically incrementing ID."""
    global chunk_counter
    chunk_counter += 1
    return chunk_counter


def parse_value_to_int(value_str: str) -> int:
    """Convert a financial string like '₹5 crore' or '₹50 lakh' to an int (INR).

    Returns 0 for non-string input, empty/placeholder values ("nil", "---"),
    or anything that cannot be parsed as a number.
    """
    if not isinstance(value_str, str):
        return 0
    # Normalize: lowercase, drop the rupee sign, and strip digit-grouping
    # commas (e.g. "₹5,00,000" -> "500000") so the regex captures the whole
    # number instead of stopping at the first comma.
    value_str = value_str.lower().replace('₹', '').replace(',', '').strip()
    if value_str in ("nil", "---", ""):
        return 0
    try:
        num_part = re.findall(r'[\d.]+', value_str)
        if not num_part:
            return 0
        num = float(num_part[0])
        # Indian numbering units: 1 crore = 10,000,000; 1 lakh = 100,000.
        if 'crore' in value_str:
            return int(num * 1_00_00_000)
        if 'lakh' in value_str:
            return int(num * 1_00_000)
        return int(num)
    except (ValueError, IndexError):
        # float(".") or similar junk: treat as unparseable.
        return 0


def create_chunk(context: Dict, text_override: Optional[str] = None,
                 id_override: Optional[str] = None) -> Dict:
    """Create a standardized chunk with rich metadata.

    Args:
        context: Accumulated key/value context for this chunk; recognized keys
            (section, clause, id, title, description, authority, limit_text,
            source) are copied into the chunk metadata.
        text_override: Explicit chunk text; if falsy, a default is built from
            the context's title/description.
        id_override: Explicit chunk ID; otherwise "chunk-<n>" is generated.

    Returns:
        Dict with "id", "text", and a "metadata" dict (None values dropped).
    """
    chunk_id = id_override if id_override else f"chunk-{get_unique_id()}"
    text = text_override
    if not text:
        # Default text is "<title>. <description>" (whichever parts exist),
        # falling back to the raw context repr if both are missing.
        text_parts = [context.get("title"), context.get("description")]
        text = ". ".join(filter(None, text_parts)) or str(context)
    metadata = {
        "section": context.get("section"),
        "clause": context.get("clause"),
        "subclause_id": context.get("id"),
        "title": context.get("title"),
        "description": context.get("description"),
        "authority": context.get("authority"),
        "limit_text": context.get("limit_text"),
        "limit_inr": parse_value_to_int(str(context.get("limit_text", ""))),
        "source": context.get("source"),
    }
    return {
        "id": chunk_id,
        "text": text,
        "metadata": {k: v for k, v in metadata.items() if v is not None},
    }


def _process_authority_power(data: Dict, context: Dict) -> List[Dict]:
    """Handle the "authority" / "extent_of_power" structures.

    These keys vary in type across the source data: both may be plain
    strings, or parallel lists of single-entry dicts; anything else falls
    through to a generic JSON dump.
    """
    chunks: List[Dict] = []
    title = context.get("title", "this rule")

    # Case 1: authority and power are simple strings.
    if isinstance(data.get("authority"), str) and isinstance(data.get("extent_of_power"), str):
        text = (
            f"Regarding '{title}', the approving authority is "
            f"{data['authority']} with '{data['extent_of_power']}'."
        )
        chunk_context = context.copy()
        chunk_context["authority"] = data['authority']
        chunk_context["limit_text"] = data['extent_of_power']
        chunks.append(create_chunk(chunk_context, text_override=text))

    # Case 2: authority and power are lists of dicts (most complex case).
    elif isinstance(data.get("authority"), list) and isinstance(data.get("extent_of_power"), list):
        authorities = data["authority"]
        powers = data["extent_of_power"]
        # Assumes the two lists correspond positionally; extra tail entries
        # on the longer list are ignored.
        for i in range(min(len(authorities), len(powers))):
            auth_item = authorities[i]
            power_item = powers[i]
            # Each dict entry is assumed to hold one descriptive value.
            auth_desc = next(iter(auth_item.values())) if isinstance(auth_item, dict) else str(auth_item)
            power_desc = next(iter(power_item.values())) if isinstance(power_item, dict) else str(power_item)
            text = f"For '{title}', the authority for '{auth_desc}' is given '{power_desc}'."
            chunk_context = context.copy()
            chunk_context["authority"] = auth_desc
            chunk_context["limit_text"] = power_desc
            chunks.append(create_chunk(chunk_context, text_override=text))

    # Fallback for any other structure: dump the raw data verbatim.
    else:
        text = f"Regarding '{title}', the authority and power details are as follows: {json.dumps(data)}."
        chunks.append(create_chunk(context, text_override=text))

    return chunks


def process_chunk(data: Dict, context: Dict) -> List[Dict]:
    """Deconstruct a dict from the source file into granular chunks.

    Recursively walks nested lists, carrying an accumulated context so each
    leaf chunk inherits section/clause/title metadata from its ancestors.
    """
    new_chunks: List[Dict] = []
    # Merge current data into the inherited context (new keys win).
    current_context = context.copy()
    current_context.update(data)
    has_nested_chunks = False

    # --- Rule-based deconstruction ---

    # Rule 1: "delegation" mapping of authority -> limit (most specific).
    if "delegation" in data and isinstance(data["delegation"], dict):
        for authority, limit_text in data["delegation"].items():
            desc = current_context.get('description') or current_context.get('title')
            text = f"Regarding '{desc}', the delegation for {authority} is '{limit_text}'."
            chunk_context = current_context.copy()
            chunk_context["authority"] = authority
            chunk_context["limit_text"] = str(limit_text)
            new_chunks.append(create_chunk(chunk_context, text_override=text))
        return new_chunks

    # Rule 2: paired "authority" / "extent_of_power" structures.
    if "authority" in data and "extent_of_power" in data:
        return _process_authority_power(data, current_context)

    # Rule 3: recursively process nested lists of dicts or strings.
    for key, value in data.items():
        if isinstance(value, list) and value:
            # Sub-rule 3a: list of dicts (e.g. subclauses, items).
            if all(isinstance(item, dict) for item in value):
                for item in value:
                    nested_results = process_chunk(item, current_context)
                    if nested_results:
                        new_chunks.extend(nested_results)
                        has_nested_chunks = True
            # Sub-rule 3b: list of plain strings (e.g. Annexure A items).
            elif all(isinstance(item, str) for item in value):
                title = current_context.get('title')
                for item_text in value:
                    text = f"Regarding '{title}', a relevant item is: {item_text}."
                    new_chunks.append(create_chunk(current_context, text_override=text))
                has_nested_chunks = True

    # --- Finalization ---
    # If children produced specific chunks, skip the generic parent chunk.
    if has_nested_chunks:
        return new_chunks

    # Base case: leaf node that cannot be deconstructed further.
    new_chunks.append(create_chunk(current_context))
    return new_chunks


def main():
    """Read INPUT_FILE line by line, deconstruct, and write OUTPUT_FILE."""
    print(f"Starting to process '{INPUT_FILE}'...")
    final_chunks: List[Dict] = []
    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                try:
                    data = json.loads(line)
                    final_chunks.extend(process_chunk(data, {}))
                except json.JSONDecodeError:
                    # Skip bad lines rather than aborting the whole run.
                    print(f"Warning: Skipping malformed JSON on line {i+1}")
                    continue
    except FileNotFoundError:
        print(f"Error: Input file '{INPUT_FILE}' not found.")
        return

    print(f"Deconstructed into {len(final_chunks)} granular chunks.")

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        for chunk in final_chunks:
            # ensure_ascii=False keeps ₹ and other non-ASCII text readable;
            # the file is already UTF-8 encoded.
            f.write(json.dumps(chunk, ensure_ascii=False) + '\n')

    print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")


if __name__ == "__main__":
    main()