field_semantic_mapping / utils_grouping.py
Tanishq Salkar
initial visual mapping code added to hf
db81e28
import re
def clean_section_title(raw_title):
    """
    Turns '2. PURCHASE PRICE (U.S. currency)' -> 'Purchase Price'

    Args:
        raw_title: Raw section heading text. May be None or empty.

    Returns:
        The heading with a leading number/letter marker (e.g. "1.", "A.")
        and any parenthesised text removed, converted to title case.
        Returns "General Information" when the input is empty or when
        nothing is left after cleaning.
    """
    if not raw_title:
        return "General Information"

    # Remove leading numbers/bullets (e.g., "1.", "A."). The optional \s*
    # lets the marker be stripped even when the text starts with whitespace.
    clean = re.sub(r'^\s*[A-Z0-9]+\.\s*', '', raw_title)

    # Remove things in parentheses (e.g., "(U.S. Currency)")
    clean = re.sub(r'\s*\(.*?\)', '', clean).strip()

    # A title made up only of a marker, parentheses or whitespace would
    # otherwise come back empty; use the same default as a missing title.
    if not clean:
        return "General Information"

    # Title Case
    titled = clean.title()

    # str.title() treats an apostrophe as a word boundary and produces
    # "Buyer'S" / "Don'T". Lower-case a single letter that directly follows
    # an apostrophe inside a word; names such as "O'Neil" are left alone.
    return re.sub(r"(?<=\w)['’]\w\b", lambda m: m.group(0).lower(), titled)
# Values of a field's "semanticType" that group_fields_by_section() skips:
# such fields are not added to any group and do not receive a "groupId".
UNGROUPABLE_TYPES = ["signature", "initial"]
def group_fields_by_section(fields):
    """
    Organizes flat fields into logical groups based on the
    'section' context extracted by the Vision model.

    Fields whose "semanticType" is listed in UNGROUPABLE_TYPES are skipped:
    they are not added to any group and get no "groupId".

    Args:
        fields: Iterable of field dicts. Each grouped field must have an
            "id" key; "section", "semanticType", "page" and "rect" (a dict
            with a "y" entry) are optional.

    Returns:
        A tuple ``(sorted_groups, fields)``. ``sorted_groups`` is a list of
        ``{"id", "title", "fieldIds"}`` dicts ordered by the page and y
        position of the first field placed in each group. ``fields`` is the
        same object that was passed in; every grouped field has been
        mutated in place to carry a "groupId" key.

    Raises:
        KeyError: If a field that should be grouped has no "id".
    """
    groups_map = {}
    # (page, y) of the first field of each group, captured when the group is
    # created so sorting does not have to search `fields` again by id.
    first_position = {}

    for f in fields:
        # Ungroupable fields (see UNGROUPABLE_TYPES) are left untouched.
        if f.get("semanticType") in UNGROUPABLE_TYPES:
            continue

        field_id = f["id"]

        # Normalise the raw section text into a display title.
        raw_section = f.get("section", "General Information")
        group_title = clean_section_title(raw_section)

        # Create a stable ID for the group (title part capped at 30 chars).
        group_id = f"grp_{group_title.lower().replace(' ', '_')[:30]}"

        # Create group if not exists
        if group_id not in groups_map:
            groups_map[group_id] = {
                "id": group_id,
                "title": group_title,
                "fieldIds": [],
            }
            # `or` guards against keys that are present but None, which
            # would otherwise break the comparison during sorting.
            rect = f.get("rect") or {}
            first_position[group_id] = (f.get("page") or 0, rect.get("y") or 0)

        # Link field to group
        groups_map[group_id]["fieldIds"].append(field_id)

        # Mutate the field object to include the link
        f["groupId"] = group_id

    # Sort groups by the page/y-position of their first field. sorted() is
    # stable, so groups with equal positions keep their creation order.
    sorted_groups = sorted(
        groups_map.values(),
        key=lambda g: first_position[g["id"]],
    )

    return sorted_groups, fields