import re
from typing import Any, Dict, List, Optional, Tuple

# Title used whenever a field has no usable section label.
_DEFAULT_SECTION_TITLE = "General Information"

# Leading enumerator such as "1." or "A." at the very start of the title.
_LEADING_ENUMERATOR_RE = re.compile(r'^[A-Z0-9]+\.\s*')
# Parenthetical remark such as "(U.S. currency)", plus the whitespace before it.
_PARENTHETICAL_RE = re.compile(r'\s*\(.*?\)')
# A run of letters, optionally followed by an apostrophe and a short (1-2
# letter) suffix such as 's, 't or 'll. Keeping the suffix inside the match
# lets us capitalize the word as a whole ("BUYER'S" -> "Buyer's") instead of
# str.title()'s "Buyer'S", while "O'NEIL" still becomes "O'Neil".
_WORD_RE = re.compile(r"[^\W\d_]+(?:['’][^\W\d_]{1,2}(?![^\W\d_]))?")


def clean_section_title(raw_title: Optional[str]) -> str:
    """Normalize a raw section heading into a display title.

    Example: '2. PURCHASE PRICE (U.S. currency)' -> 'Purchase Price'

    Steps: trim surrounding whitespace, drop a leading enumerator
    ("1.", "A."), drop parenthetical remarks, then title-case each word.

    Args:
        raw_title: The heading text; may be None or empty.

    Returns:
        The cleaned title, or "General Information" when the input is
        empty or nothing remains after cleaning.
    """
    if not raw_title:
        return _DEFAULT_SECTION_TITLE

    # Strip first so the ^-anchored enumerator pattern can match even when
    # the heading arrives with leading whitespace.
    cleaned = _LEADING_ENUMERATOR_RE.sub('', raw_title.strip())
    cleaned = _PARENTHETICAL_RE.sub('', cleaned).strip()

    # Headings like "1." or "(U.S. currency)" reduce to nothing; fall back
    # rather than producing a group with an empty title.
    if not cleaned:
        return _DEFAULT_SECTION_TITLE

    return _WORD_RE.sub(lambda match: match.group(0).capitalize(), cleaned)


UNGROUPABLE_TYPES = ["signature", "initial"]


def group_fields_by_section(
    fields: List[Dict[str, Any]],
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Organize flat fields into groups keyed by their cleaned 'section'.

    Fields whose "semanticType" is listed in UNGROUPABLE_TYPES are skipped
    and receive no "groupId". Every other field is appended to the group
    derived from its "section" value and is mutated in place to carry a
    "groupId" key.

    Args:
        fields: Field dicts. Each grouped field must have an "id"; the
            optional "section", "page" and "rect" (with a "y" entry) keys
            are read as well.

    Returns:
        A ``(groups, fields)`` tuple. ``groups`` is a list of
        ``{"id", "title", "fieldIds"}`` dicts ordered by the page and
        y-position of each group's first field; ``fields`` is the same
        list object that was passed in.

    Raises:
        KeyError: If a groupable field has no "id" key.
    """
    groups_map: Dict[str, Dict[str, Any]] = {}
    # (page, y) of the first field placed in each group. Recording it here
    # avoids re-scanning `fields` for every group when sorting.
    first_position: Dict[str, Tuple[Any, Any]] = {}

    for field in fields:
        if field.get("semanticType") in UNGROUPABLE_TYPES:
            continue

        raw_section = field.get("section", "General Information")
        group_title = clean_section_title(raw_section)

        # Stable ID derived from the title; truncated to keep IDs short.
        group_id = f"grp_{group_title.lower().replace(' ', '_')[:30]}"

        group = groups_map.get(group_id)
        if group is None:
            group = {"id": group_id, "title": group_title, "fieldIds": []}
            groups_map[group_id] = group
            # Treat a missing or None page/rect/y as 0 so sorting cannot
            # fail on None comparisons or on rect=None.
            rect = field.get("rect") or {}
            first_position[group_id] = (
                field.get("page") or 0,
                rect.get("y") or 0,
            )

        # Link field to group, and group to field.
        group["fieldIds"].append(field["id"])
        field["groupId"] = group_id

    sorted_groups = sorted(
        groups_map.values(),
        key=lambda group: first_position[group["id"]],
    )

    return sorted_groups, fields