Spaces:
Sleeping
Sleeping
Update create_granular_chunks.py
Browse files- create_granular_chunks.py +12 -17
create_granular_chunks.py
CHANGED
|
@@ -24,7 +24,6 @@ def create_chunk(context: Dict, text: str) -> Dict:
|
|
| 24 |
"title": context.get("title"),
|
| 25 |
"source_description": context.get("description"),
|
| 26 |
}
|
| 27 |
-
# Add any other relevant context keys to metadata
|
| 28 |
for key, value in context.items():
|
| 29 |
if key not in metadata and isinstance(value, (str, int, float, bool)):
|
| 30 |
metadata[key] = value
|
|
@@ -36,14 +35,20 @@ def create_chunk(context: Dict, text: str) -> Dict:
|
|
| 36 |
}
|
| 37 |
|
| 38 |
def format_delegation_text(delegation: Any) -> str:
|
| 39 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
if not isinstance(delegation, dict):
|
| 41 |
return str(delegation)
|
| 42 |
-
|
|
|
|
| 43 |
return ", ".join(parts) if parts else "No specific delegation provided."
|
| 44 |
|
| 45 |
def format_remarks(remarks: Any) -> str:
|
| 46 |
-
"""Safely formats the 'remarks' field, handling
|
| 47 |
if isinstance(remarks, list):
|
| 48 |
remark_parts = []
|
| 49 |
for item in remarks:
|
|
@@ -57,22 +62,18 @@ def format_remarks(remarks: Any) -> str:
|
|
| 57 |
|
| 58 |
def build_descriptive_text(context: Dict) -> str:
|
| 59 |
"""
|
| 60 |
-
|
| 61 |
-
by combining all relevant fields from the context.
|
| 62 |
-
rich chunk possible for the embedding model.
|
| 63 |
"""
|
| 64 |
text_parts = []
|
| 65 |
|
| 66 |
-
# Start with the highest-level identifiers
|
| 67 |
if context.get("title"):
|
| 68 |
text_parts.append(f"Regarding the policy for '{context['title']}'")
|
| 69 |
|
| 70 |
-
|
| 71 |
-
specific_desc = context.get('description') or context.get('method') or context.get('title')
|
| 72 |
if specific_desc and specific_desc != context.get('title'):
|
| 73 |
text_parts.append(f"specifically for '{specific_desc}'")
|
| 74 |
|
| 75 |
-
# Add the core rule or delegation information
|
| 76 |
if "delegation" in context:
|
| 77 |
delegation_text = format_delegation_text(context["delegation"])
|
| 78 |
text_parts.append(f", the financial delegations are: {delegation_text}.")
|
|
@@ -85,7 +86,6 @@ def build_descriptive_text(context: Dict) -> str:
|
|
| 85 |
composition_parts.append(member_text)
|
| 86 |
text_parts.append(f", the composition is: {'; '.join(composition_parts)}.")
|
| 87 |
|
| 88 |
-
# Append any remarks for crucial extra context
|
| 89 |
if "remarks" in context and context["remarks"]:
|
| 90 |
remarks_text = format_remarks(context["remarks"])
|
| 91 |
text_parts.append(f" Important remarks include: {remarks_text}")
|
|
@@ -101,7 +101,6 @@ def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
|
|
| 101 |
chunks = []
|
| 102 |
|
| 103 |
# --- Handler 1: Simple Item Lists (e.g., Annexure A, Financial Concurrence) ---
|
| 104 |
-
# This creates small, specific chunks for each item in a list.
|
| 105 |
list_key = next((key for key in ["items", "exclusions"] if key in data and isinstance(data.get(key), list)), None)
|
| 106 |
if list_key:
|
| 107 |
base_title = context.get('title', 'a policy')
|
|
@@ -111,7 +110,6 @@ def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
|
|
| 111 |
return chunks
|
| 112 |
|
| 113 |
# --- Handler 2: Recursive Traversal ---
|
| 114 |
-
# This is the main recursive step. It traverses deeper into nested lists of dictionaries.
|
| 115 |
has_recursed = False
|
| 116 |
for key, value in data.items():
|
| 117 |
if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
|
|
@@ -120,9 +118,6 @@ def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
|
|
| 120 |
has_recursed = True
|
| 121 |
|
| 122 |
# --- Handler 3: Leaf Node Creation ---
|
| 123 |
-
# If the function has not recursed and the entry contains a core piece of information
|
| 124 |
-
# (like 'delegation' or 'composition'), it's a "leaf" node. We create a single,
|
| 125 |
-
# comprehensive chunk for it. This is the core of the new strategy.
|
| 126 |
if not has_recursed and ("delegation" in data or "composition" in data or "description" in data):
|
| 127 |
text = build_descriptive_text(context)
|
| 128 |
chunks.append(create_chunk(context, text))
|
|
|
|
| 24 |
"title": context.get("title"),
|
| 25 |
"source_description": context.get("description"),
|
| 26 |
}
|
|
|
|
| 27 |
for key, value in context.items():
|
| 28 |
if key not in metadata and isinstance(value, (str, int, float, bool)):
|
| 29 |
metadata[key] = value
|
|
|
|
| 35 |
}
|
| 36 |
|
| 37 |
def format_delegation_text(delegation: Any) -> str:
    """Format a delegation mapping (or any other value) as a readable string.

    Every authority in the mapping is reported, including those without any
    power: an empty/missing limit or the "---" placeholder is rendered as
    "NIL" instead of being skipped, so the resulting text still states
    explicitly that the role has no delegation.

    Args:
        delegation: A dict mapping an authority name to its limit. Any
            non-dict value is returned as ``str(delegation)``.

    Returns:
        A comma-separated list of "the limit for <authority> is <limit>"
        clauses, or "No specific delegation provided." for an empty dict.
    """
    if not isinstance(delegation, dict):
        return str(delegation)

    def _render_limit(limit: Any) -> Any:
        # Falsy limits (None, "", 0) carry no power and are reported as NIL.
        if not limit:
            return "NIL"
        # Compare the placeholder after stripping whitespace so that padded
        # values such as " --- " or a blank "   " are also reported as NIL.
        if str(limit).strip() in ("", "---"):
            return "NIL"
        return limit

    parts = [
        f"the limit for {auth} is {_render_limit(limit)}"
        for auth, limit in delegation.items()
    ]
    return ", ".join(parts) if parts else "No specific delegation provided."
|
| 49 |
|
| 50 |
def format_remarks(remarks: Any) -> str:
|
| 51 |
+
"""Safely formats the 'remarks' field, handling various data types."""
|
| 52 |
if isinstance(remarks, list):
|
| 53 |
remark_parts = []
|
| 54 |
for item in remarks:
|
|
|
|
| 62 |
|
| 63 |
def build_descriptive_text(context: Dict) -> str:
|
| 64 |
"""
|
| 65 |
+
Intelligently builds a single, descriptive, natural language sentence
|
| 66 |
+
by combining all relevant fields from the context.
|
|
|
|
| 67 |
"""
|
| 68 |
text_parts = []
|
| 69 |
|
|
|
|
| 70 |
if context.get("title"):
|
| 71 |
text_parts.append(f"Regarding the policy for '{context['title']}'")
|
| 72 |
|
| 73 |
+
specific_desc = context.get('description') or context.get('method')
|
|
|
|
| 74 |
if specific_desc and specific_desc != context.get('title'):
|
| 75 |
text_parts.append(f"specifically for '{specific_desc}'")
|
| 76 |
|
|
|
|
| 77 |
if "delegation" in context:
|
| 78 |
delegation_text = format_delegation_text(context["delegation"])
|
| 79 |
text_parts.append(f", the financial delegations are: {delegation_text}.")
|
|
|
|
| 86 |
composition_parts.append(member_text)
|
| 87 |
text_parts.append(f", the composition is: {'; '.join(composition_parts)}.")
|
| 88 |
|
|
|
|
| 89 |
if "remarks" in context and context["remarks"]:
|
| 90 |
remarks_text = format_remarks(context["remarks"])
|
| 91 |
text_parts.append(f" Important remarks include: {remarks_text}")
|
|
|
|
| 101 |
chunks = []
|
| 102 |
|
| 103 |
# --- Handler 1: Simple Item Lists (e.g., Annexure A, Financial Concurrence) ---
|
|
|
|
| 104 |
list_key = next((key for key in ["items", "exclusions"] if key in data and isinstance(data.get(key), list)), None)
|
| 105 |
if list_key:
|
| 106 |
base_title = context.get('title', 'a policy')
|
|
|
|
| 110 |
return chunks
|
| 111 |
|
| 112 |
# --- Handler 2: Recursive Traversal ---
|
|
|
|
| 113 |
has_recursed = False
|
| 114 |
for key, value in data.items():
|
| 115 |
if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
|
|
|
|
| 118 |
has_recursed = True
|
| 119 |
|
| 120 |
# --- Handler 3: Leaf Node Creation ---
|
|
|
|
|
|
|
|
|
|
| 121 |
if not has_recursed and ("delegation" in data or "composition" in data or "description" in data):
|
| 122 |
text = build_descriptive_text(context)
|
| 123 |
chunks.append(create_chunk(context, text))
|