Spaces:

tsalkar
/

field_semantic_mapping

Runtime error

App Files Files Community

Tanishq Salkar committed on Feb 4

Commit

db81e28

1 Parent(s): d3a7652

initial visual mapping code added to hf

Browse files

Files changed (8) hide show

api.py +190 -0
config.py +18 -0
engine_mapping.py +114 -0
engine_vision.py +162 -0
requirements.txt +69 -0
schema_definitions.py +75 -0
utils_geometry.py +109 -0
utils_grouping.py +59 -0

api.py ADDED Viewed

	@@ -0,0 +1,190 @@

+import os
+import json
+import asyncio
+import requests
+import fitz
+import shutil
+import tempfile
+from datetime import datetime
+from fastapi import FastAPI, UploadFile, File, HTTPException
+import config
+import utils_geometry as utils
+from engine_vision import process_page_smart
+from engine_mapping import map_fields_to_schema
+from utils_grouping import group_fields_by_section
+# ASGI application instance; the /process-pdf route below is registered on it.
+app = FastAPI(title="Smart Contract Processor API")
+def get_fields_from_local_api(pdf_path):
+    """
+    Sends the PDF to the local model_api to get Bounding Boxes i.e neon green boxes surrounding the fields.
+    Identical logic to main.py, just adapted to take a specific path.
+    """
+    print(f"Sending to Model API: {config.COMMON_FORMS_API_URL}")
+    fields_by_page = {}
+    try:
+        with open(pdf_path, 'rb') as f:
+            response = requests.post(
+                config.COMMON_FORMS_API_URL,
+                files={'file': f},
+                stream=True,
+                timeout=60
+            )
+            for line in response.iter_lines():
+                if not line: continue
+                data = json.loads(line)
+                if data.get("status") == "success":
+                    fields_by_page[data["page"]] = data.get("fields", [])
+                elif data.get("status") == "error":
+                    print(f"Model API Error on page {data.get('page')}: {data.get('msg')}")
+    except Exception as e:
+        print(f"API Connection Error: {e}")
+        return None
+    return fields_by_page
+def get_pdf_metadata(doc, filename: str):
+    """
+    Extract PDF metadata including page sizes for ClaiPDFCollection.
+    """
+    page_sizes = []
+    for page in doc:
+        rect = page.rect
+        page_sizes.append({
+            "rotation": page.rotation,
+            "width": rect.width,
+            "height": rect.height
+        })
+    # Get title from PDF metadata or use filename
+    pdf_title = doc.metadata.get("title", "") if doc.metadata else ""
+    if not pdf_title:
+        pdf_title = os.path.splitext(filename)[0] if filename else "Document"
+    return {
+        "name": filename or "document.pdf",
+        "title": pdf_title,
+        "pageSizes": page_sizes
+    }
+def resolve_intermediate_format(all_fields, pdf_metadata):
+    """
+    Returns an intermediate format that will be transformed to ClaiSchema
+    in the Next.js layer. Uses tempIds for internal reference.
+    This format is consumed by transform-to-clai-schema.ts which generates
+    proper ClaiSchema-compliant IDs using TypeScript utilities.
+    """
+    groups, updated_fields = group_fields_by_section(all_fields)
+    participants = {}
+    final_fields = []
+    routing_counter = 1
+    for f in updated_fields:
+        raw_role = str(f.get("role", "System")).strip().title()
+        participant_temp_id = None
+        if raw_role.lower() not in ["system", "n/a", "unknown", "none", ""]:
+            participant_temp_id = f"part_{raw_role.lower().replace(' ', '_')}"
+            if participant_temp_id not in participants:
+                participants[participant_temp_id] = {
+                    "tempId": participant_temp_id,
+                    "role": "signer",
+                    "type": "unknown",
+                    "label": raw_role,
+                    "routingOrder": routing_counter,
+                    "definer": "PREPARER"
+                }
+                routing_counter += 1
+        final_fields.append({
+            "tempId": f["id"],
+            "aliasId": f.get("aliasId"),
+            "groupTempId": f.get("groupId"),
+            "participantTempId": participant_temp_id,
+            "label": f["label"],
+            "semanticType": f["semanticType"],
+            "isDynamic": f.get("isDynamic", False),
+            "page": f["page"],
+            "rect": f["rect"]
+        })
+    # Transform groups to use tempId
+    groups_with_temp_ids = []
+    for g in groups:
+        groups_with_temp_ids.append({
+            "tempId": g["id"],
+            "title": g["title"],
+            "fieldTempIds": g["fieldIds"]
+        })
+    return {
+        "participants": list(participants.values()),
+        "groups": groups_with_temp_ids,
+        "fields": final_fields,
+        "pdfMetadata": pdf_metadata
+    }
+# ==============================================================================
+# API ENDPOINT (Replaces async main())
+# ==============================================================================
+@app.post("/process-pdf")
+async def process_pdf(file: UploadFile = File(...)):
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+        shutil.copyfileobj(file.file, tmp)
+        tmp_path = tmp.name
+    doc = None
+    try:
+        utils.setup_debug_dir()
+        print(f"Starting process for uploaded file: {file.filename}")
+        raw_fields = await asyncio.to_thread(get_fields_from_local_api, tmp_path)
+        if not raw_fields:
+            raise HTTPException(status_code=500, detail="Failed to extract fields from Model API (Local Port 8000).")
+        doc = fitz.open(tmp_path)
+        # Extract PDF metadata for ClaiPDFCollection
+        pdf_metadata = get_pdf_metadata(doc, file.filename)
+        # Extract text context for vision processing
+        text_sample = ""
+        for i in range(min(2, len(doc))):
+            text_sample += doc[i].get_text()
+        global_ctx = " ".join(text_sample.split())[:1500]
+        # Process pages with vision and mapping
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENT_PAGES)
+        tasks = []
+        for page_num, fields in raw_fields.items():
+            tasks.append(process_page_smart(semaphore, doc, page_num, fields, global_ctx))
+        results = await asyncio.gather(*tasks)
+        flat_results = [item for sublist in results for item in sublist]
+        mapped_results = await map_fields_to_schema(flat_results)
+        # Return intermediate format for Next.js transformation
+        intermediate_response = resolve_intermediate_format(mapped_results, pdf_metadata)
+        return intermediate_response
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail=str(e))
+    finally:
+        if doc:
+            doc.close()
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+        print(f"Cleanup complete for {tmp_path}")

config.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import os
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "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")
+COMMON_FORMS_API_URL = "https://tsalkar-pdf-field-extractor-backend.hf.space/extract-fields-stream"
+TARGET_FILE = "docs/template_pdf.pdf"
+OUTPUT_FILE = "final_output_smart_schema2.json"
+DEBUG_DIR = "debug_artifacts"
+BOX_COLOR = "#00FF00"     # Neon Green
+BOX_WIDTH = 3
+BADGE_COLOR = "#000000"
+BADGE_BG = "#00FF00"
+# --- PERFORMANCE ---
+MAX_CONCURRENT_PAGES = 15
+VISION_BATCH_SIZE = 20
+MAPPING_BATCH_SIZE = 50

engine_mapping.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import json
+from openai import AsyncOpenAI
+import config
+import utils_geometry as utils
+from schema_definitions import REAL_ESTATE_SCHEMA_MAP
+import re
+# Module-level async OpenAI client used by map_fields_to_schema. A different
+# client can be constructed here to use a different key/model for mapping.
+client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
+async def map_fields_to_schema(extracted_fields):
+    print(f"\nPhase 2: Mapping {len(extracted_fields)} fields to Schema...")
+    inputs = []
+    for f in extracted_fields:
+        inputs.append({
+            "uuid": f["id"],
+            "label": f["label"],
+            "role": f["role"],
+            "detected_type": f["detected_type"]
+        })
+    mapped_updates = {}
+    batches = [inputs[i:i + config.MAPPING_BATCH_SIZE] for i in range(0, len(inputs), config.MAPPING_BATCH_SIZE)]
+    schema_str = json.dumps(REAL_ESTATE_SCHEMA_MAP, indent=0)
+    for i, batch in enumerate(batches):
+        print(f"   Mapping Batch {i+1}/{len(batches)}...")
+        input_str = json.dumps(batch, indent=0)
+        system_prompt = f"""
+        You are a Data Schema Mapper for Real Estate Contracts.
+        TASK:
+        Map the input fields to the provided "Schema Definition".
+        RULES:
+        1. Match based on meaning: "Sale Price" -> "purchasePrice".
+        2. STRICTLY CHECK TYPES:
+           - Input "detected_type" MUST be compatible with Schema "type".
+        3. Use "role" to disambiguate: "Name" + "Buyer" -> "buyerName".
+        4. If no good match exists, return null for "schema_key".
+        SCHEMA DEFINITION:
+        {schema_str}
+        Return JSON: {{ "mappings": [ {{ "uuid": "...", "schema_key": "..." }} ] }}
+        """
+        try:
+            response = await client.chat.completions.create(
+                model="gpt-4o",
+                response_format={"type": "json_object"},
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": f"INPUTS:\n{input_str}"}
+                ],
+                temperature=0.0
+            )
+            data = json.loads(response.choices[0].message.content)
+            utils.save_debug_json(data, f"mapping_batch_{i}_llm")
+            for m in data.get("mappings", []):
+                mapped_updates[m["uuid"]] = m.get("schema_key")
+        except Exception as e:
+            print(f"Mapping Error Batch {i}: {e}")
+    # Apply Updates
+    # for f in extracted_fields:
+    #     matched_key = mapped_updates.get(f["id"])
+    #     f["aliasId"] = matched_key
+    #     if matched_key and matched_key in REAL_ESTATE_SCHEMA_MAP:
+    #         f["semanticType"] = REAL_ESTATE_SCHEMA_MAP[matched_key]["type"]
+    #     else:
+    #         f["semanticType"] = f["detected_type"]
+    # return extracted_fields
+    # Apply Updates
+    for f in extracted_fields:
+        matched_key = mapped_updates.get(f["id"])
+        # 1. Canonical Match (It exists in your strict schema)
+        if matched_key and matched_key in REAL_ESTATE_SCHEMA_MAP:
+            f["aliasId"] = matched_key
+            f["semanticType"] = REAL_ESTATE_SCHEMA_MAP[matched_key]["type"]
+            f["isDynamic"] = False
+        # 2. Dynamic Match (It's valid data, but not in your schema yet)
+        else:
+            # Generate a safe slug: "Loan Amount" -> "custom_loan_amount"
+            # We sanitize the label to make it a valid JSON key
+            # clean_slug = "".join(c if c.isalnum() else "_" for c in f["label"].lower())
+            # clean_slug = clean_slug.strip("_")[:50] # Limit length
+            parts =  re.split(r'[^a-zA-Z0-9]+', f["label"].lower())
+            result = parts[0]
+            current_len = len(result)
+            max_length = 20
+            for word in parts[1:]:
+                add = word.capitalize()
+                if current_len + len(add) > max_length:
+                    break
+                result += add
+            f["aliasId"] = f"{result}"
+            f["semanticType"] = f.get("detected_type", "shortText")
+            f["isDynamic"] = True
+    return extracted_fields

engine_vision.py ADDED Viewed

	@@ -0,0 +1,162 @@

+import json
+import asyncio
+import base64
+import uuid
+import time
+import random
+from openai import AsyncOpenAI
+import config
+import utils_geometry as utils
+from schema_definitions import VALID_FIELD_TYPES
+# Module-level async OpenAI client used by process_single_batch.
+client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
+# ==========================================
+# WORKER: PROCESS SINGLE BATCH (With Retries)
+# ==========================================
+async def process_single_batch(semaphore, doc, page_num, batch_idx, batch_fields, global_context):
+    async with semaphore:
+        prompt_items = []
+        for f in batch_fields:
+            anchors = f["debug_anchors"]
+            prompt_items.append(
+                f"- Box ID {f['temp_id']}:\n"
+                f"  Spatial Hints -> Left: '{anchors['left']}' | Above: '{anchors['above']}'\n"
+                f"  PDF Type Hint: {f.get('ft', 'text')}"
+            )
+        # B. Render Image
+        img_bytes = await asyncio.to_thread(utils.render_hollow_debug_image, doc, page_num, batch_fields)
+        if not img_bytes: return []
+        # Save debug artifact
+        tag = f"{page_num}_batch_{batch_idx}"
+        utils.save_debug_image(img_bytes, tag)
+        b64_img = base64.b64encode(img_bytes).decode('utf-8')
+        # C. System Prompt (UPDATED with section_context)
+        valid_types_str = ", ".join(VALID_FIELD_TYPES)
+        system_prompt = f"""
+        You are an expert Legal Document Processor.
+        CONTEXT: Real Estate Contract. Global Context: "{global_context}"
+        TASK: Analyze the Neon Green Boxes (IDs {batch_fields[0]['temp_id']} to {batch_fields[-1]['temp_id']}).
+        OUTPUT RULES:
+        For each box, return JSON with:
+        1. "visual_evidence": Text closest to the box.
+        2. "section_context": The BOLD HEADER or SECTION TITLE this field belongs to (e.g. "Purchase Price", "Property Description", "Closing Date").
+        3. "final_label": Precise natural label (e.g. "Purchase Price", "Seller Signature").
+        4. "role": Who fills this out? Choose ONLY from:
+           [Buyer, Seller, Agent, Broker, President, Reviewer, Disclosing Party, Receiving Party, N/A].
+           - If ambiguous, infer from section header (e.g. "Tenant's Signature" -> "Tenant").
+           - If strictly administrative (e.g. "Office Use Only"), return "System".
+        5. "detected_type": MUST be one of [{valid_types_str}].
+           - If it looks like money ($), use "dollar".
+           - If it looks like a date, use "date".
+           - If it's a signature, use "signature".
+        INPUT DATA:
+        {chr(10).join(prompt_items)}
+        Return JSON: {{ "fields": [ {{ "box_id": 1, ... }} ] }}
+        """
+        # D. Retry Logic (Restored from your original code)
+        MAX_RETRIES = 5
+        BASE_DELAY = 2
+        batch_results = []
+        page_height = doc[page_num].rect.height
+        for attempt in range(MAX_RETRIES):
+            try:
+                response = await client.chat.completions.create(
+                    model="gpt-4o", # Use gpt-4o for best vision
+                    response_format={"type": "json_object"},
+                    messages=[
+                        {"role": "user", "content": [
+                            {"type": "text", "text": system_prompt},
+                            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64_img}"}}
+                        ]}
+                    ],
+                    temperature=0.0
+                )
+                content = response.choices[0].message.content
+                parsed = json.loads(content)
+                utils.save_debug_json(parsed, f"{tag}_vision_response")
+                results_map = {item["box_id"]: item for item in parsed.get("fields", [])}
+                for f in batch_fields:
+                    res = results_map.get(f["temp_id"], {})
+                    label = res.get("final_label", "")
+                    # Fallback Geometry Logic
+                    if not label or label == "Unknown":
+                        anchors = f["debug_anchors"]
+                        label = anchors["left"] if anchors["left"] else (anchors["above"] if anchors["above"] else "Unknown Field")
+                    norm_rect = utils.normalize_bbox_to_top_left(f["bbox"], page_height)
+                    batch_results.append({
+                        "id": f.get("name", str(uuid.uuid4())[:8]),
+                        "temp_id": f["temp_id"],
+                        "label": label,
+                        "section": res.get("section_context", "General Information"), # <--- Capturing Section Context
+                        "role": res.get("role", "System"),
+                        "detected_type": res.get("detected_type", "shortText"),
+                        "uiType": "checkbox" if f.get("ft") == "/Btn" else "text",
+                        "page": page_num,
+                        "rect": {
+                            "x": norm_rect["x0"], "y": norm_rect["y0"],
+                            "width": norm_rect["x1"] - norm_rect["x0"], "height": norm_rect["y1"] - norm_rect["y0"]
+                        },
+                        "debug_evidence": res.get("visual_evidence", "N/A")
+                    })
+                break # Success, exit retry loop
+            except Exception as e:
+                error_msg = str(e)
+                if "429" in error_msg or "Rate limit" in error_msg:
+                    wait_time = (BASE_DELAY * (2 ** attempt)) + (random.random() * 0.5)
+                    print(f"Rate Limit ({tag}). Waiting {wait_time:.2f}s...")
+                    await asyncio.sleep(wait_time) # Use await sleep for async!
+                else:
+                    print(f"Error {tag}: {e}")
+                    break
+        return batch_results
+#
+# ==========================================
+# ORCHESTRATOR: PROCESS PAGE
+# ==========================================
+async def process_page_smart(semaphore, doc, page_num, fields, global_context):
+    page = doc[page_num]
+    page_words = utils.get_words_from_page(page)
+    page_height = page.rect.height
+    # 1. Pre-calc anchors
+    for idx, f in enumerate(fields):
+        f["temp_id"] = idx + 1
+        f["debug_anchors"] = utils.calculate_smart_anchors(f["bbox"], page_words, page_height)
+    # 2. Create Batches
+    batches = [fields[i:i + config.VISION_BATCH_SIZE] for i in range(0, len(fields), config.VISION_BATCH_SIZE)]
+    print(f"📄 Page {page_num}: Queuing {len(batches)} batches for {len(fields)} fields...")
+    # 3. Spawn Parallel Tasks (Restored Concurrency)
+    tasks = []
+    for batch_idx, batch_fields in enumerate(batches):
+        task = asyncio.create_task(
+            process_single_batch(semaphore, doc, page_num, batch_idx, batch_fields, global_context)
+        )
+        tasks.append(task)
+    # 4. Gather Results
+    results = await asyncio.gather(*tasks)
+    return [item for sublist in results for item in sublist]

requirements.txt ADDED Viewed

	@@ -0,0 +1,69 @@

+# ------------------------------
+# Core numerical stack (avoid NumPy 2.x ABI issues)
+# ------------------------------
+numpy==1.26.4
+scipy<1.13
+# ------------------------------
+# Web / API service
+# ------------------------------
+fastapi>=0.104.0
+uvicorn[standard]>=0.24.0
+python-multipart>=0.0.6
+requests>=2.31.0
+# ------------------------------
+# PDF parsing / rendering used by your API files (fitz)
+# ------------------------------
+PyMuPDF>=1.23.0
+# ------------------------------
+# OpenAI client used by engine_vision / engine_mapping
+# ------------------------------
+openai>=1.0.0
+# ------------------------------
+# Imaging helpers
+# ------------------------------
+pillow>=10.0.0
+opencv-python-headless>=4.9.0.80
+# ------------------------------
+# CommonForms + model deps (PDF field extractor service)
+# ------------------------------
+commonforms>=0.2.1
+formalpdf==0.1.6
+cryptography>=3.1
+pypdf>=6.1.1
+# ------------------------------
+# Hugging Face / RF-DETR compatibility pins
+# (prevents: ImportError: cannot import name 'torch_int')
+# ------------------------------
+huggingface-hub==0.36.0
+transformers==4.57.3
+peft==0.18.1
+rfdetr==1.4.0.post0
+# ------------------------------
+# Torch stack (match torchvision exactly)
+# ------------------------------
+torch==2.8.0
+torchvision==0.23.0
+# ------------------------------
+# ONNX inference
+# ------------------------------
+onnx>=1.16.0
+onnxruntime>=1.23.1
+onnxslim>=0.1.71
+# ------------------------------
+# Ultralytics (YOLO utilities used by CommonForms stack)
+# ------------------------------
+ultralytics>=8.3.204
+# ------------------------------
+# FastAPI baseline constraint
+# ------------------------------
+pydantic>=2.6,<3

schema_definitions.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""
+Schema definitions for PDF field extraction.
+This module loads the real estate alias IDs from a JSON file that is exported
+from the TypeScript source (real-estate-property-alias-ids.ts) using the
+export-alias-ids.js script.
+To update the schema:
+1. Modify the TypeScript source file
+2. Run: node scripts/export-alias-ids.js
+3. The JSON file will be updated automatically
+"""
+import json
+import os
+# Valid field types that the Vision model can detect.
+# The vision engine joins this list into its prompt as the only allowed values
+# for "detected_type".
+VALID_FIELD_TYPES = [
+    "checkbox",
+    "date",
+    "dollar",
+    "email",
+    "fullName",
+    "initial",
+    "longText",
+    "number",
+    "phone",
+    "shortText",
+    "signature",
+    "usAddress",
+    "ssn",
+    "iban"
+]
+def load_real_estate_schema():
+    """
+    Load the real estate schema from the exported JSON file.
+    Falls back to an empty dict if the file doesn't exist.
+    """
+    json_path = os.path.join(os.path.dirname(__file__), "real_estate_alias_ids.json")
+    if not os.path.exists(json_path):
+        print(f"Warning: {json_path} not found. Run 'node scripts/export-alias-ids.js' to generate it.")
+        return {}
+    try:
+        with open(json_path, "r") as f:
+            data = json.load(f)
+        # Transform the JSON format to match the expected format
+        # JSON has: { key: { description, type, name } }
+        # We need: { key: { desc, type } }
+        schema = {}
+        for key, value in data.items():
+            schema[key] = {
+                "desc": value.get("description", ""),
+                "type": value.get("type", "shortText"),
+                "name": value.get("name", key)
+            }
+        return schema
+    except Exception as e:
+        print(f"Error loading real estate schema: {e}")
+        return {}
+# Load the schema on module import (import-time side effect: reads the JSON
+# file once and prints a status line).
+REAL_ESTATE_SCHEMA_MAP = load_real_estate_schema()
+# Print info about loaded schema; an empty map is allowed and only warned about.
+if REAL_ESTATE_SCHEMA_MAP:
+    print(f"Loaded {len(REAL_ESTATE_SCHEMA_MAP)} alias IDs from real_estate_alias_ids.json")
+else:
+    print("Warning: No alias IDs loaded. Schema mapping will create dynamic fields for all detections.")

utils_geometry.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import os
+import json
+import shutil
+import fitz  # PyMuPDF
+import io
+from PIL import Image, ImageDraw, ImageFont
+import config
+def setup_debug_dir():
+    if os.path.exists(config.DEBUG_DIR):
+        shutil.rmtree(config.DEBUG_DIR)
+    os.makedirs(config.DEBUG_DIR)
+    print(f"Debug directory cleared: {config.DEBUG_DIR}/")
+def save_debug_image(image_bytes, name):
+    path = os.path.join(config.DEBUG_DIR, f"{name}.jpg")
+    with open(path, "wb") as f:
+        f.write(image_bytes)
+    return path
+def save_debug_json(data, name):
+    path = os.path.join(config.DEBUG_DIR, f"{name}.json")
+    with open(path, "w") as f:
+        json.dump(data, f, indent=2)
+def normalize_bbox_to_top_left(bbox, page_height):
+    """Convert PDF Bottom-Left coords to Image Top-Left coords."""
+    return {
+        "x0": bbox["x0"],
+        "y0": page_height - bbox["y1"],
+        "x1": bbox["x1"],
+        "y1": page_height - bbox["y0"]
+    }
+def get_words_from_page(page):
+    return page.get_text("words")
+def calculate_smart_anchors(field_bbox, words, page_height):
+    norm_bbox = normalize_bbox_to_top_left(field_bbox, page_height)
+    fx0, fy0, fx1, fy1 = norm_bbox["x0"], norm_bbox["y0"], norm_bbox["x1"], norm_bbox["y1"]
+    SEARCH_RADIUS = 150
+    Y_ALIGNMENT_TOLERANCE = 12
+    closest_left = []
+    closest_right = []
+    closest_above = []
+    for w in words:
+        wx0, wy0, wx1, wy1, text = w[0], w[1], w[2], w[3], w[4]
+        w_center_y = (wy0 + wy1) / 2
+        f_center_y = (fy0 + fy1) / 2
+        # Left
+        if wx1 < fx0 and abs(w_center_y - f_center_y) < Y_ALIGNMENT_TOLERANCE:
+            if fx0 - wx1 < SEARCH_RADIUS: closest_left.append((fx0 - wx1, text))
+        # Right
+        if wx0 > fx1 and abs(w_center_y - f_center_y) < Y_ALIGNMENT_TOLERANCE:
+            if wx0 - fx1 < SEARCH_RADIUS: closest_right.append((wx0 - fx1, text))
+        # Above
+        overlap = max(0, min(fx1, wx1) - max(fx0, wx0))
+        if wy1 < fy0 and overlap > 0:
+            if fy0 - wy1 < SEARCH_RADIUS: closest_above.append((fy0 - wy1, text))
+    closest_left.sort(key=lambda x: x[0])
+    closest_right.sort(key=lambda x: x[0])
+    closest_above.sort(key=lambda x: x[0])
+    def join_text(candidates): return " ".join([c[1] for c in candidates[:4]])
+    return {
+        "left": join_text(closest_left),
+        "right": join_text(closest_right),
+        "above": join_text(closest_above)
+    }
+def render_hollow_debug_image(doc, page_num, fields):
+    if page_num >= len(doc): return None
+    page = doc[page_num]
+    zoom = 2.0
+    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
+    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+    draw = ImageDraw.Draw(img)
+    scale_x = pix.width / page.rect.width
+    scale_y = pix.height / page.rect.height
+    page_h = page.rect.height
+    try: font = ImageFont.truetype("arial.ttf", 30)
+    except: font = ImageFont.load_default()
+    for f in fields:
+        vis_id = f["temp_id"]
+        bbox = f["bbox"]
+        x0_bl = bbox["x0"] * scale_x
+        y0_bl = (page_h - bbox["y1"]) * scale_y
+        x1_bl = bbox["x1"] * scale_x
+        y1_bl = (page_h - bbox["y0"]) * scale_y
+        draw.rectangle([x0_bl, y0_bl, x1_bl, y1_bl], outline=config.BOX_COLOR, width=config.BOX_WIDTH)
+        badge_w, badge_h = 50, 35
+        bx0, by0 = x0_bl - 10, y0_bl - badge_h - 2
+        draw.rectangle([bx0, by0, bx0 + badge_w, by0 + badge_h], fill=config.BADGE_BG)
+        draw.text((bx0 + 10, by0 + 5), str(vis_id), fill=config.BADGE_COLOR, font=font)
+    buffer = io.BytesIO()
+    img.save(buffer, format="JPEG", quality=85)
+    return buffer.getvalue()

utils_grouping.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import re
+def clean_section_title(raw_title):
+    """
+    Turns '2. PURCHASE PRICE (U.S. currency)' -> 'Purchase Price'
+    """
+    if not raw_title: return "General Information"
+    # Remove leading numbers/bullets (e.g., "1.", "A.")
+    clean = re.sub(r'^[A-Z0-9]+\.\s*', '', raw_title)
+    # Remove things in parentheses (e.g., "(U.S. Currency)")
+    clean = re.sub(r'\s*\(.*?\)', '', clean)
+    # Title Case
+    return clean.strip().title()
+UNGROUPABLE_TYPES = ["signature", "initial"]
+def group_fields_by_section(fields):
+    """
+    Organizes flat fields into logical groups based on the
+    'section' context extracted by the Vision model.
+    """
+    groups_map = {}
+    for f in fields:
+        # Get the raw section from Vision (now populated!)
+        if f.get("semanticType") in UNGROUPABLE_TYPES:
+            continue
+        raw_section = f.get("section", "General Information")
+        group_title = clean_section_title(raw_section)
+        # Create a stable ID for the group
+        group_id = f"grp_{group_title.lower().replace(' ', '_')[:30]}"
+        # Create group if not exists
+        if group_id not in groups_map:
+            groups_map[group_id] = {
+                "id": group_id,
+                "title": group_title,
+                "fieldIds": []
+            }
+        # Link field to group
+        groups_map[group_id]["fieldIds"].append(f["id"])
+        # Mutate the field object to include the link
+        f["groupId"] = group_id
+    # Sort groups by the page/y-position of their first field
+    sorted_groups = sorted(
+        groups_map.values(),
+        key=lambda g: [
+            next((x for x in fields if x["id"] == g["fieldIds"][0]), {}).get("page", 0),
+            next((x for x in fields if x["id"] == g["fieldIds"][0]), {}).get("rect", {}).get("y", 0)
+        ]
+    )
+    return sorted_groups, fields