Kalpokoch committed
Commit 3076a66 · verified · 1 Parent(s): 04c50c5

Update create_granular_chunks.py

Files changed (1)
  1. create_granular_chunks.py +141 -91
create_granular_chunks.py CHANGED
@@ -1,134 +1,175 @@
  import os
  import json
  import re
+ import hashlib
  from typing import List, Dict, Any

  # --- Configuration ---
  INPUT_FILE = "combined_context.jsonl"
- OUTPUT_FILE = "granular_chunks_final.jsonl" # Keeping the filename consistent
+ OUTPUT_FILE = "granular_chunks_final.jsonl"

  # --- Global State ---
  chunk_counter = 0

- def get_unique_id() -> str:
-     """Returns a unique, incrementing ID for each chunk."""
+
+ # -----------------------
+ # Utility Helpers
+ # -----------------------
+
+ def get_unique_id(context: Dict, role: str = None) -> str:
+     """Generate semantic ID using section/clause/title and optional role, ensure uniqueness via hash."""
      global chunk_counter
      chunk_counter += 1
-     return f"chunk-{chunk_counter}"
+     base_str = f"{context.get('section','')}-{context.get('clause','')}-{context.get('title','')}"
+     if role:
+         base_str += f"-{role}"
+     digest = hashlib.sha1(base_str.encode()).hexdigest()[:6]
+     return f"{base_str.replace(' ', '_')}-{digest}-{chunk_counter}"

- def create_chunk(context: Dict, text: str) -> Dict:
-     """Creates a standardized chunk dictionary with rich metadata."""
+
+ def normalize_money(value: str) -> Dict[str, Any]:
+     """
+     Try to normalize monetary values (₹10 crore -> 100000000).
+     Returns dict with human text and normalized number
+     """
+     multipliers = {
+         "lakh": 1e5,
+         "crore": 1e7
+     }
+     result = {"original": value, "normalized": None}
+     if not isinstance(value, str):
+         return result
+     match = re.search(r"₹?\s*([\d,.]+)\s*(crore|lakh)?", value, flags=re.IGNORECASE)
+     if match:
+         number = float(match.group(1).replace(",", ""))
+         unit = match.group(2).lower() if match.group(2) else None
+         if unit in multipliers:
+             number *= multipliers[unit]
+         result["normalized"] = int(number)
+     return result
+
+
+ def create_chunk(context: Dict, text: str, extra_metadata: Dict = None, role: str = None, parent_id: str = None) -> Dict:
+     """Creates a standardized chunk dictionary with traceable metadata."""
      metadata = {
          "section": context.get("section"),
          "clause": context.get("clause") or context.get("Clause"),
          "title": context.get("title"),
-         "source_description": context.get("description"),
+         "description": context.get("description"),
+         "parent_title": context.get("parent_title"),
+         "grandparent_title": context.get("grandparent_title"),
      }
-     for key, value in context.items():
-         if key not in metadata and isinstance(value, (str, int, float, bool)):
-             metadata[key] = value
-
+     # Merge with extras and flatten
+     if extra_metadata:
+         metadata.update(extra_metadata)
      return {
-         "id": get_unique_id(),
-         "text": text,
-         "metadata": {k: v for k, v in metadata.items() if v is not None}
+         "id": get_unique_id(context, role),
+         "text": text.strip(),
+         "metadata": {k: v for k, v in metadata.items() if v is not None},
+         "parent_id": parent_id
      }

- def format_delegation_text(delegation: Any) -> str:
-     """
-     Formats a delegation dictionary or string into a readable string.
-     --- ACCURACY FIX ---
-     This function now explicitly includes "NIL" or "---" values instead of skipping them.
-     This is crucial for the model to correctly answer questions about roles with no power.
-     """
-     if not isinstance(delegation, dict):
-         return str(delegation)
-     # Use "is NIL" for None or "---", otherwise use "is [limit]"
-     parts = [f"the limit for {auth} is {limit if limit and str(limit) != '---' else 'NIL'}" for auth, limit in delegation.items()]
-     return ", ".join(parts) if parts else "No specific delegation provided."
-
- def format_remarks(remarks: Any) -> str:
-     """Safely formats the 'remarks' field, handling various data types."""
+
+ def format_delegation(delegation: Any, context: Dict, parent_id: str = None) -> List[Dict]:
+     """Return chunks for delegations in natural + structured formats."""
+     chunks = []
+     if isinstance(delegation, dict):
+         for role, limit in delegation.items():
+             norm_val = normalize_money(limit)
+             text = f"In the context of '{context.get('title')}', the limit for {role} is {limit if limit not in [None,'---'] else 'NIL'}."
+             meta = {"role": role, "limit": limit, "limit_normalized": norm_val.get("normalized")}
+             chunks.append(create_chunk(context, text, meta, role=role, parent_id=parent_id))
+     else:
+         # simple string delegation
+         chunks.append(create_chunk(context, f"Delegation rule: {delegation}", parent_id=parent_id))
+     return chunks
+
+
+ def format_remarks(remarks: Any, context: Dict, parent_id: str = None) -> List[Dict]:
+     """Split remarks into individual atomic chunks."""
+     chunks = []
      if isinstance(remarks, list):
-         remark_parts = []
-         for item in remarks:
-             if isinstance(item, dict):
-                 for key, value in item.items():
-                     remark_parts.append(f"{key}: {value}")
+         for r in remarks:
+             if isinstance(r, dict):
+                 for k, v in r.items():
+                     text = f"Remark for '{context.get('title')}': {k}: {v}"
+                     chunks.append(create_chunk(context, text, parent_id=parent_id))
              else:
-                 remark_parts.append(str(item))
-         return " ".join(remark_parts)
-     return str(remarks)
+                 text = f"Remark for '{context.get('title')}': {r}"
+                 chunks.append(create_chunk(context, text, parent_id=parent_id))
+     else:
+         text = f"Remark for '{context.get('title')}': {remarks}"
+         chunks.append(create_chunk(context, text, parent_id=parent_id))
+     return chunks

- def build_descriptive_text(context: Dict) -> str:
-     """
-     Intelligently builds a single, descriptive, natural language sentence
-     by combining all relevant fields from the context.
-     """
-     text_parts = []
-
-     if context.get("title"):
-         text_parts.append(f"Regarding the policy for '{context['title']}'")
-
-     specific_desc = context.get('description') or context.get('method')
-     if specific_desc and specific_desc != context.get('title'):
-         text_parts.append(f"specifically for '{specific_desc}'")
-
-     if "delegation" in context:
-         delegation_text = format_delegation_text(context["delegation"])
-         text_parts.append(f", the financial delegations are: {delegation_text}.")
-     elif "composition" in context:
-         composition_parts = []
-         for item in context["composition"]:
-             if isinstance(item, dict):
-                 for role, members in item.items():
-                     member_text = f"the {role} is {members}" if isinstance(members, str) else f"the {role} are: {', '.join(members)}"
-                     composition_parts.append(member_text)
-         text_parts.append(f", the composition is: {'; '.join(composition_parts)}.")
-
-     if "remarks" in context and context["remarks"]:
-         remarks_text = format_remarks(context["remarks"])
-         text_parts.append(f" Important remarks include: {remarks_text}")
-
-     return " ".join(text_parts)
+
+ # -----------------------
+ # Processing Logic
+ # -----------------------

- def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
+ def process_entry(data: Dict, parent_context: Dict = None, parent_id: str = None) -> List[Dict]:
      """
-     The definitive processing function. It traverses the JSON and uses a set of handlers
-     to create highly descriptive, self-contained chunks.
+     Recursive processor that expands JSON entries into granular atomic chunks.
      """
      context = {**(parent_context or {}), **data}
      chunks = []

-     # --- Handler 1: Simple Item Lists (e.g., Annexure A, Financial Concurrence) ---
+     # Hierarchy fields
+     if parent_context:
+         if parent_context.get("title"):
+             context["parent_title"] = parent_context.get("title")
+         if parent_context.get("parent_title"):
+             context["grandparent_title"] = parent_context.get("parent_title")
+
+     # Handle list of plain items (rules, exclusions)
      list_key = next((key for key in ["items", "exclusions"] if key in data and isinstance(data.get(key), list)), None)
      if list_key:
-         base_title = context.get('title', 'a policy')
          for item in data[list_key]:
              if isinstance(item, str):
-                 chunks.append(create_chunk(context, f"A rule regarding '{base_title}' is: {item}."))
+                 chunks.append(create_chunk(context, f"Rule under '{context.get('title')}': {item}.", parent_id=parent_id))
          return chunks

-     # --- Handler 2: Recursive Traversal ---
-     has_recursed = False
+     # Handle delegation
+     if "delegation" in data:
+         chunks.extend(format_delegation(data["delegation"], context, parent_id=parent_id))
+
+     # Handle description (atomic chunk)
+     if data.get("description"):
+         chunks.append(create_chunk(context, f"Description: {data['description']}", parent_id=parent_id))
+
+     # Handle composition
+     if "composition" in data:
+         for item in data["composition"]:
+             if isinstance(item, dict):
+                 for role, members in item.items():
+                     member_text = members if isinstance(members, str) else ", ".join(members)
+                     chunks.append(create_chunk(context,
+                                                f"Committee composition: {role} = {member_text}",
+                                                extra_metadata={"role": role},
+                                                parent_id=parent_id))
+
+     # Handle remarks
+     if "remarks" in data and data["remarks"]:
+         chunks.extend(format_remarks(data["remarks"], context, parent_id=parent_id))
+
+     # Recurse into nested dict lists (subclauses, methods, etc.)
      for key, value in data.items():
          if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
              for item in value:
-                 chunks.extend(process_entry(item, context))
-                 has_recursed = True
-
-     # --- Handler 3: Leaf Node Creation ---
-     if not has_recursed and ("delegation" in data or "composition" in data or "description" in data):
-         text = build_descriptive_text(context)
-         chunks.append(create_chunk(context, text))
+                 chunks.extend(process_entry(item, context, parent_id=context.get("id", None)))

      return chunks

+
+ # -----------------------
+ # Main
+ # -----------------------
+
  def main():
-     """Main function to read, process, and write."""
-     print(f"Starting to process '{INPUT_FILE}' with the definitive chunking strategy...")
+     print(f"Processing '{INPUT_FILE}' with improved chunking...")
      all_chunks = []
-
+
+     # Read file
      try:
          with open(INPUT_FILE, 'r', encoding='utf-8') as f:
              for i, line in enumerate(f):
@@ -144,17 +185,26 @@ def main():
              print(f"Error: Input file '{INPUT_FILE}' not found.")
              return

-     print(f"Deconstructed into {len(all_chunks)} highly descriptive chunks.")
+     print(f"Generated {len(all_chunks)} raw chunks.")

-     # Remove duplicates before writing
-     unique_chunks = {chunk['text']: chunk for chunk in all_chunks}.values()
-     print(f"Removed duplicates, writing {len(unique_chunks)} unique chunks.")
+     # Deduplicate based on text+metadata hash
+     seen = set()
+     unique_chunks = []
+     for ch in all_chunks:
+         sig = json.dumps((ch["text"], ch["metadata"]), sort_keys=True)
+         if sig not in seen:
+             seen.add(sig)
+             unique_chunks.append(ch)
+
+     print(f"Deduplicated to {len(unique_chunks)} unique chunks.")

+     # Write output
      with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
          for chunk in unique_chunks:
-             f.write(json.dumps(chunk) + '\n')
+             f.write(json.dumps(chunk, ensure_ascii=False) + '\n')

-     print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")
+     print(f"Successfully wrote improved granular chunks to {OUTPUT_FILE}")

  if __name__ == "__main__":
      main()
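
For anyone sanity-checking the new helpers, here is a minimal usage sketch. The sample policy entry and its field values are hypothetical, and it assumes create_granular_chunks.py is importable from the working directory:

    from create_granular_chunks import normalize_money, process_entry

    # normalize_money keeps the original string and adds a numeric value
    # when it can parse one; unparseable strings keep normalized=None.
    print(normalize_money("₹10 crore"))  # {'original': '₹10 crore', 'normalized': 100000000}
    print(normalize_money("₹2.5 lakh"))  # {'original': '₹2.5 lakh', 'normalized': 250000}
    print(normalize_money("---"))        # {'original': '---', 'normalized': None}

    # A hypothetical entry with a delegation table: each role becomes its own
    # chunk, with the limit kept verbatim in the text and normalized in metadata.
    entry = {
        "section": "4",
        "title": "Works Contracts",
        "delegation": {"CMD": "₹10 crore", "Director (Finance)": "---"},
    }
    for chunk in process_entry(entry):
        print(chunk["id"], "->", chunk["text"])

Note that the "---" limit is rendered as NIL in the chunk text, carrying over the accuracy fix from the previous version of format_delegation_text.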