Kalpokoch committed on
Commit
448f148
·
verified ·
1 Parent(s): ef8706f

Update create_granular_chunks.py

Browse files
Files changed (1) hide show
  1. create_granular_chunks.py +82 -89
create_granular_chunks.py CHANGED
@@ -1,36 +1,21 @@
1
  import os
2
  import json
3
  import re
4
- from typing import List, Dict, Any, Optional
5
 
6
  # --- Configuration ---
7
  INPUT_FILE = "combined_context.jsonl"
8
- # As requested, the output filename remains the same.
9
- OUTPUT_FILE = "granular_chunks_final.jsonl"
10
 
11
  # --- Global State ---
12
  chunk_counter = 0
13
 
14
def get_unique_id() -> str:
    """Return a unique, incrementing ID string for each chunk (e.g. 'chunk-1').

    Fix: the return annotation previously said ``int``, but the function
    returns the formatted f-string ``f"chunk-{chunk_counter}"``, which is a
    ``str``. The annotation now matches the actual return type.

    Relies on the module-level counter ``chunk_counter`` as shared state;
    not thread-safe (plain read-increment-write on a global).
    """
    global chunk_counter
    chunk_counter += 1
    return f"chunk-{chunk_counter}"
19
 
20
def format_delegation_text(delegation: Any) -> str:
    """
    Turn a delegation entry into readable prose.

    Accepts either a mapping of authority -> limit or a plain descriptive
    string. Limits that are empty/falsy or spelled 'nil' (any case) are
    omitted; if nothing remains, a fixed placeholder sentence is returned.
    """
    if not isinstance(delegation, dict):
        # Non-dict delegations are already descriptive text.
        return str(delegation)
    kept = [
        f"the limit for {auth} is {limit}"
        for auth, limit in delegation.items()
        if limit and str(limit).lower() != 'nil'
    ]
    return ", ".join(kept) if kept else "No specific delegation provided."
33
-
34
  def create_chunk(context: Dict, text: str) -> Dict:
35
  """Creates a standardized chunk dictionary with rich metadata."""
36
  metadata = {
@@ -41,7 +26,7 @@ def create_chunk(context: Dict, text: str) -> Dict:
41
  }
42
  # Add any other relevant context keys to metadata
43
  for key, value in context.items():
44
- if key not in metadata and isinstance(value, (str, int, float)):
45
  metadata[key] = value
46
 
47
  return {
@@ -50,77 +35,85 @@ def create_chunk(context: Dict, text: str) -> Dict:
50
  "metadata": {k: v for k, v in metadata.items() if v is not None}
51
  }
52
 
53
def process_complex_rule(data: Dict, parent_context: Dict) -> List[Dict]:
    """
    Combine a complex rule that carries a nested list of sub-rules
    ("methods" or "subclauses") into a single, comprehensive chunk so the
    delegation details stay together in one retrievable text block.

    Returns an empty list when the entry has no such nested list, or when
    no delegation lines could be produced from it.
    """
    context = {**parent_context, **data}
    chunks = []

    # Locate which nested-rule list (if any) this entry carries;
    # "methods" takes priority over "subclauses", as before.
    nested_list_key = None
    for candidate in ("methods", "subclauses"):
        if candidate in data and isinstance(data.get(candidate), list):
            nested_list_key = candidate
            break

    if nested_list_key is None:
        return []

    base_title = context.get('title', 'a policy')
    # Use the description from the current level, which is more specific.
    base_desc = context.get('description', '')

    # Assemble one comprehensive text block for the whole rule.
    full_text_parts = [f"Regarding the policy for '{base_title}'"]
    if base_desc:
        full_text_parts.append(f"specifically for '{base_desc}'")
    full_text_parts.append("the rules are as follows:")

    # Append one line per nested rule that actually has a delegation.
    for item in data[nested_list_key]:
        if not (isinstance(item, dict) and "delegation" in item):
            continue
        item_desc = item.get('description') or item.get('method') or item.get('title', '')
        delegation_text = format_delegation_text(item["delegation"])
        full_text_parts.append(f"- For '{item_desc}', {delegation_text}.")

    # Parent-level remarks (list or single string) go at the end.
    remarks = data.get("remarks")
    if isinstance(remarks, list):
        full_text_parts.append("Important remarks include:")
        for remark in remarks:
            full_text_parts.append(f" - {remark}")
    elif isinstance(remarks, str):
        full_text_parts.append(f"An important remark is: {remarks}")

    # Emit a chunk only when we collected more than the intro sentences.
    if len(full_text_parts) > 2:
        chunks.append(create_chunk(context, " ".join(full_text_parts)))

    return chunks
 
 
 
 
 
 
 
102
 
103
- def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
104
- """
105
- Main processing function. It prioritizes creating comprehensive chunks for complex rules.
106
- """
107
- context = {**(parent_context or {}), **data}
108
-
109
- # --- Priority 1: Attempt to process as a complex rule with nested delegations ---
110
- complex_chunks = process_complex_rule(data, parent_context or {})
111
- if complex_chunks:
112
- return complex_chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
- # --- Priority 2: Handle simple, flat delegation rules ---
115
  if "delegation" in data and isinstance(data.get("delegation"), dict):
116
  base_desc = context.get('description') or context.get('title', 'this rule')
117
  delegation_text = format_delegation_text(data["delegation"])
118
  text = f"Regarding '{base_desc}', the delegated financial powers are: {delegation_text}."
119
- return [create_chunk(context, text)]
 
120
 
121
- # --- Priority 3: Recursively process deeper structures ---
122
- # This is important for traversing the JSON but is now secondary to creating comprehensive chunks.
123
- chunks = []
124
  for key, value in data.items():
125
  if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
126
  for item in value:
@@ -128,28 +121,28 @@ def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
128
  if chunks:
129
  return chunks
130
 
131
- # --- Fallback: Create a chunk for simple descriptive text if no other rule applies ---
132
  description = context.get("description")
133
  title = context.get("title")
134
- if description:
135
  text = f"The policy for '{title}' states: {description}."
136
- return [create_chunk(context, text)]
137
 
138
- return []
139
 
140
  def main():
141
  """Main function to read, process, and write."""
142
- print(f"Starting to process '{INPUT_FILE}' with the best-approach chunking strategy...")
143
- final_chunks = []
144
 
145
  try:
146
  with open(INPUT_FILE, 'r', encoding='utf-8') as f:
147
  for i, line in enumerate(f):
148
  try:
149
  data = json.loads(line)
150
- processed = process_entry(data)
151
- if processed:
152
- final_chunks.extend(processed)
153
  except json.JSONDecodeError:
154
  print(f"Warning: Skipping malformed JSON on line {i+1}")
155
  continue
@@ -157,10 +150,10 @@ def main():
157
  print(f"Error: Input file '{INPUT_FILE}' not found.")
158
  return
159
 
160
- print(f"Deconstructed into {len(final_chunks)} comprehensive, self-contained chunks.")
161
 
162
  with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
163
- for chunk in final_chunks:
164
  f.write(json.dumps(chunk) + '\n')
165
 
166
  print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")
 
1
  import os
2
  import json
3
  import re
4
+ from typing import List, Dict, Any
5
 
6
  # --- Configuration ---
7
  INPUT_FILE = "combined_context.jsonl"
8
+ OUTPUT_FILE = "granular_chunks_final.jsonl" # Keeping the filename consistent
 
9
 
10
  # --- Global State ---
11
  chunk_counter = 0
12
 
13
def get_unique_id() -> str:
    """Return the next sequential chunk identifier (e.g. 'chunk-1').

    Advances the module-level ``chunk_counter`` as shared state; callers
    get a fresh id on every invocation.
    """
    global chunk_counter
    chunk_counter = chunk_counter + 1
    return f"chunk-{chunk_counter}"
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  def create_chunk(context: Dict, text: str) -> Dict:
20
  """Creates a standardized chunk dictionary with rich metadata."""
21
  metadata = {
 
26
  }
27
  # Add any other relevant context keys to metadata
28
  for key, value in context.items():
29
+ if key not in metadata and isinstance(value, (str, int, float, bool)):
30
  metadata[key] = value
31
 
32
  return {
 
35
  "metadata": {k: v for k, v in metadata.items() if v is not None}
36
  }
37
 
38
def format_delegation_text(delegation: Any) -> str:
    """Render a delegation mapping (or free-text note) as one readable fragment."""
    if not isinstance(delegation, dict):
        # A non-dict delegation is already a descriptive string.
        return str(delegation)
    fragments = []
    for authority, limit in delegation.items():
        # Skip empty/falsy limits and any-case 'nil' placeholders.
        if limit and str(limit).lower() != 'nil':
            fragments.append(f"the limit for {authority} is {limit}")
    if not fragments:
        return "No specific delegation provided."
    return ", ".join(fragments)
44
+
45
+ def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
46
  """
47
+ The definitive processing function. It creates highly descriptive and self-contained chunks
48
+ by applying specific handlers based on the structure of each JSON entry.
49
  """
50
+ context = {**(parent_context or {}), **data}
51
  chunks = []
 
 
 
 
 
 
 
 
 
 
52
 
53
+ # --- Handler 1: Committee Composition (e.g., LPC-1, LPC-2) ---
54
+ if "composition" in data and isinstance(data["composition"], list):
55
+ base_title = context.get('title', 'a committee')
56
+ composition_parts = []
57
+ for item in data["composition"]:
58
+ if isinstance(item, dict):
59
+ for role, members in item.items():
60
+ member_text = f"the {role} is {members}" if isinstance(members, str) else f"the {role} are: {', '.join(members)}"
61
+ composition_parts.append(member_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
+ if composition_parts:
64
+ full_text = f"Regarding '{base_title}', the composition is: {'; '.join(composition_parts)}."
65
+ if context.get("approving_authority"):
66
+ full_text += f" The approving authority is {context['approving_authority']}."
67
+ if context.get("remarks"):
68
+ full_text += f" Remarks include: {' '.join(context['remarks'])}"
69
+ chunks.append(create_chunk(context, full_text))
70
+ return chunks
71
 
72
+ # --- Handler 2: Complex Nested Rules with Delegations ---
73
+ # This is the most important handler for creating comprehensive, self-contained chunks.
74
+ nested_list_key = next((key for key in ["methods", "subclauses"] if key in data and isinstance(data[key], list)), None)
75
+ if nested_list_key:
76
+ is_complex_delegation = all(isinstance(item, dict) and "delegation" in item for item in data[nested_list_key])
77
+ if is_complex_delegation:
78
+ base_title = context.get('title', 'a policy')
79
+ base_desc = context.get('description', '')
80
+
81
+ text_parts = [f"Regarding the policy for '{base_title}'"]
82
+ if base_desc:
83
+ text_parts.append(f"specifically for '{base_desc}'")
84
+ text_parts.append(", the rules are as follows:")
85
+
86
+ for item in data[nested_list_key]:
87
+ item_desc = item.get('description') or item.get('method') or item.get('title', 'a specific method')
88
+ delegation_text = format_delegation_text(item["delegation"])
89
+ text_parts.append(f"For '{item_desc}', {delegation_text}.")
90
+
91
+ final_text = " ".join(text_parts)
92
+ chunks.append(create_chunk(context, final_text))
93
+ return chunks
94
+
95
+ # --- Handler 3: Simple Item Lists (e.g., Annexure A, Financial Concurrence) ---
96
+ list_key = next((key for key in ["items", "exclusions"] if key in data and isinstance(data[key], list)), None)
97
+ if list_key:
98
+ base_title = context.get('title', 'a policy')
99
+ prefix = f"Regarding '{base_title}', the following items are {'excluded' if list_key == 'exclusions' else 'included'}:"
100
+
101
+ # Create individual chunks for each item for better specific retrieval
102
+ for item in data[list_key]:
103
+ if isinstance(item, str):
104
+ chunks.append(create_chunk(context, f"A rule regarding '{base_title}' is: {item}."))
105
+ return chunks
106
 
107
+ # --- Handler 4: Flat Delegation (a rule with a direct delegation dict) ---
108
  if "delegation" in data and isinstance(data.get("delegation"), dict):
109
  base_desc = context.get('description') or context.get('title', 'this rule')
110
  delegation_text = format_delegation_text(data["delegation"])
111
  text = f"Regarding '{base_desc}', the delegated financial powers are: {delegation_text}."
112
+ chunks.append(create_chunk(context, text))
113
+ return chunks
114
 
115
+ # --- Handler 5: Recursive Processor for Generic Nested Structures ---
116
+ # If no specific handler above matched, traverse deeper into the JSON.
 
117
  for key, value in data.items():
118
  if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
119
  for item in value:
 
121
  if chunks:
122
  return chunks
123
 
124
+ # --- Fallback Handler: For simple descriptive text nodes ---
125
  description = context.get("description")
126
  title = context.get("title")
127
+ if description and isinstance(description, str):
128
  text = f"The policy for '{title}' states: {description}."
129
+ chunks.append(create_chunk(context, text))
130
 
131
+ return chunks
132
 
133
  def main():
134
  """Main function to read, process, and write."""
135
+ print(f"Starting to process '{INPUT_FILE}' with the definitive chunking strategy...")
136
+ all_chunks = []
137
 
138
  try:
139
  with open(INPUT_FILE, 'r', encoding='utf-8') as f:
140
  for i, line in enumerate(f):
141
  try:
142
  data = json.loads(line)
143
+ processed_chunks = process_entry(data)
144
+ if processed_chunks:
145
+ all_chunks.extend(processed_chunks)
146
  except json.JSONDecodeError:
147
  print(f"Warning: Skipping malformed JSON on line {i+1}")
148
  continue
 
150
  print(f"Error: Input file '{INPUT_FILE}' not found.")
151
  return
152
 
153
+ print(f"Deconstructed into {len(all_chunks)} highly descriptive chunks.")
154
 
155
  with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
156
+ for chunk in all_chunks:
157
  f.write(json.dumps(chunk) + '\n')
158
 
159
  print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")