Kalpokoch committed on
Commit
04c50c5
·
verified ·
1 Parent(s): ac06855

Update create_granular_chunks.py

Browse files
Files changed (1) hide show
  1. create_granular_chunks.py +12 -17
create_granular_chunks.py CHANGED
@@ -24,7 +24,6 @@ def create_chunk(context: Dict, text: str) -> Dict:
24
  "title": context.get("title"),
25
  "source_description": context.get("description"),
26
  }
27
- # Add any other relevant context keys to metadata
28
  for key, value in context.items():
29
  if key not in metadata and isinstance(value, (str, int, float, bool)):
30
  metadata[key] = value
@@ -36,14 +35,20 @@ def create_chunk(context: Dict, text: str) -> Dict:
36
  }
37
 
38
  def format_delegation_text(delegation: Any) -> str:
39
- """Formats a delegation dictionary or string into a readable string."""
 
 
 
 
 
40
  if not isinstance(delegation, dict):
41
  return str(delegation)
42
- parts = [f"the limit for {auth} is {limit}" for auth, limit in delegation.items() if limit and str(limit).lower() != 'nil']
 
43
  return ", ".join(parts) if parts else "No specific delegation provided."
44
 
45
  def format_remarks(remarks: Any) -> str:
46
- """Safely formats the 'remarks' field, handling strings, lists of strings, and lists of dictionaries."""
47
  if isinstance(remarks, list):
48
  remark_parts = []
49
  for item in remarks:
@@ -57,22 +62,18 @@ def format_remarks(remarks: Any) -> str:
57
 
58
  def build_descriptive_text(context: Dict) -> str:
59
  """
60
- BEST PRACTICE: Intelligently builds a single, descriptive, natural language sentence
61
- by combining all relevant fields from the context. This creates the most semantically
62
- rich chunk possible for the embedding model.
63
  """
64
  text_parts = []
65
 
66
- # Start with the highest-level identifiers
67
  if context.get("title"):
68
  text_parts.append(f"Regarding the policy for '{context['title']}'")
69
 
70
- # Add the specific description, method, or sub-clause title
71
- specific_desc = context.get('description') or context.get('method') or context.get('title')
72
  if specific_desc and specific_desc != context.get('title'):
73
  text_parts.append(f"specifically for '{specific_desc}'")
74
 
75
- # Add the core rule or delegation information
76
  if "delegation" in context:
77
  delegation_text = format_delegation_text(context["delegation"])
78
  text_parts.append(f", the financial delegations are: {delegation_text}.")
@@ -85,7 +86,6 @@ def build_descriptive_text(context: Dict) -> str:
85
  composition_parts.append(member_text)
86
  text_parts.append(f", the composition is: {'; '.join(composition_parts)}.")
87
 
88
- # Append any remarks for crucial extra context
89
  if "remarks" in context and context["remarks"]:
90
  remarks_text = format_remarks(context["remarks"])
91
  text_parts.append(f" Important remarks include: {remarks_text}")
@@ -101,7 +101,6 @@ def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
101
  chunks = []
102
 
103
  # --- Handler 1: Simple Item Lists (e.g., Annexure A, Financial Concurrence) ---
104
- # This creates small, specific chunks for each item in a list.
105
  list_key = next((key for key in ["items", "exclusions"] if key in data and isinstance(data.get(key), list)), None)
106
  if list_key:
107
  base_title = context.get('title', 'a policy')
@@ -111,7 +110,6 @@ def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
111
  return chunks
112
 
113
  # --- Handler 2: Recursive Traversal ---
114
- # This is the main recursive step. It traverses deeper into nested lists of dictionaries.
115
  has_recursed = False
116
  for key, value in data.items():
117
  if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
@@ -120,9 +118,6 @@ def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
120
  has_recursed = True
121
 
122
  # --- Handler 3: Leaf Node Creation ---
123
- # If the function has not recursed and the entry contains a core piece of information
124
- # (like 'delegation' or 'composition'), it's a "leaf" node. We create a single,
125
- # comprehensive chunk for it. This is the core of the new strategy.
126
  if not has_recursed and ("delegation" in data or "composition" in data or "description" in data):
127
  text = build_descriptive_text(context)
128
  chunks.append(create_chunk(context, text))
 
24
  "title": context.get("title"),
25
  "source_description": context.get("description"),
26
  }
 
27
  for key, value in context.items():
28
  if key not in metadata and isinstance(value, (str, int, float, bool)):
29
  metadata[key] = value
 
35
  }
36
 
37
  def format_delegation_text(delegation: Any) -> str:
38
+ """
39
+ Formats a delegation dictionary or string into a readable string.
40
+ --- ACCURACY FIX ---
41
+ This function now explicitly includes "NIL" or "---" values instead of skipping them.
42
+ This is crucial for the model to correctly answer questions about roles with no power.
43
+ """
44
  if not isinstance(delegation, dict):
45
  return str(delegation)
46
+ # Use "is NIL" for None or "---", otherwise use "is [limit]"
47
+ parts = [f"the limit for {auth} is {limit if limit and str(limit) != '---' else 'NIL'}" for auth, limit in delegation.items()]
48
  return ", ".join(parts) if parts else "No specific delegation provided."
49
 
50
  def format_remarks(remarks: Any) -> str:
51
+ """Safely formats the 'remarks' field, handling various data types."""
52
  if isinstance(remarks, list):
53
  remark_parts = []
54
  for item in remarks:
 
62
 
63
  def build_descriptive_text(context: Dict) -> str:
64
  """
65
+ Intelligently builds a single, descriptive, natural language sentence
66
+ by combining all relevant fields from the context.
 
67
  """
68
  text_parts = []
69
 
 
70
  if context.get("title"):
71
  text_parts.append(f"Regarding the policy for '{context['title']}'")
72
 
73
+ specific_desc = context.get('description') or context.get('method')
 
74
  if specific_desc and specific_desc != context.get('title'):
75
  text_parts.append(f"specifically for '{specific_desc}'")
76
 
 
77
  if "delegation" in context:
78
  delegation_text = format_delegation_text(context["delegation"])
79
  text_parts.append(f", the financial delegations are: {delegation_text}.")
 
86
  composition_parts.append(member_text)
87
  text_parts.append(f", the composition is: {'; '.join(composition_parts)}.")
88
 
 
89
  if "remarks" in context and context["remarks"]:
90
  remarks_text = format_remarks(context["remarks"])
91
  text_parts.append(f" Important remarks include: {remarks_text}")
 
101
  chunks = []
102
 
103
  # --- Handler 1: Simple Item Lists (e.g., Annexure A, Financial Concurrence) ---
 
104
  list_key = next((key for key in ["items", "exclusions"] if key in data and isinstance(data.get(key), list)), None)
105
  if list_key:
106
  base_title = context.get('title', 'a policy')
 
110
  return chunks
111
 
112
  # --- Handler 2: Recursive Traversal ---
 
113
  has_recursed = False
114
  for key, value in data.items():
115
  if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
 
118
  has_recursed = True
119
 
120
  # --- Handler 3: Leaf Node Creation ---
 
 
 
121
  if not has_recursed and ("delegation" in data or "composition" in data or "description" in data):
122
  text = build_descriptive_text(context)
123
  chunks.append(create_chunk(context, text))