Spaces:

tsalkar
/

field_semantic_mapping

Runtime error

File size: 6,528 Bytes

db81e28

import os
import json
import asyncio
import requests
import fitz  
import shutil
import tempfile
from datetime import datetime
from fastapi import FastAPI, UploadFile, File, HTTPException
import config
import utils_geometry as utils
from engine_vision import process_page_smart
from engine_mapping import map_fields_to_schema
from utils_grouping import group_fields_by_section 

app = FastAPI(title="Smart Contract Processor API")

# code just to create a new commit
def get_fields_from_local_api(pdf_path):
    """
    Send the PDF at ``pdf_path`` to the model API and collect the detected
    field bounding boxes (the neon green boxes surrounding the fields).

    Per the original author's note this mirrors the logic in main.py, adapted
    to take an explicit path.

    The endpoint (``config.COMMON_FORMS_API_URL``) is read as a stream of
    newline-delimited JSON objects. Objects with ``status == "success"``
    contribute ``fields`` (default ``[]``) under their ``page`` key; objects
    with ``status == "error"`` are logged and skipped; anything else is ignored.

    Returns:
        dict mapping page -> list of fields, or ``None`` if anything raised
        (unreadable file, connection failure, non-2xx response, malformed
        JSON line, or a success record without a ``page`` key).
    """
    print(f"Sending to Model API: {config.COMMON_FORMS_API_URL}")
    fields_by_page = {}
    try:
        with open(pdf_path, 'rb') as f:
            # With stream=True the connection is only released once the body
            # is fully consumed or the response is closed; the context manager
            # guarantees the close even if parsing aborts midway.
            with requests.post(
                config.COMMON_FORMS_API_URL,
                files={'file': f},
                stream=True,
                timeout=60,
            ) as response:
                # Fail fast on HTTP errors instead of trying to parse an
                # error page as the NDJSON stream.
                response.raise_for_status()

                for line in response.iter_lines():
                    if not line:
                        # Skip blank lines in the stream.
                        continue
                    data = json.loads(line)
                    status = data.get("status")
                    if status == "success":
                        fields_by_page[data["page"]] = data.get("fields", [])
                    elif status == "error":
                        print(f"Model API Error on page {data.get('page')}: {data.get('msg')}")

    except Exception as e:
        # Best-effort boundary: callers treat None as "extraction failed".
        print(f"API Connection Error: {e}")
        return None

    return fields_by_page


def get_pdf_metadata(doc, filename: str):
    """
    Build the PDF metadata payload (name, title, page sizes) for ClaiPDFCollection.

    ``doc`` must be iterable over pages exposing ``rect`` (with ``width`` and
    ``height``) and ``rotation``, and must expose a ``metadata`` mapping (or a
    falsy value). The title comes from the embedded metadata when present,
    otherwise from ``filename`` without its extension, otherwise "Document".
    """
    def _page_size(page):
        # One entry per page: rotation plus the page rectangle's dimensions.
        box = page.rect
        return {
            "rotation": page.rotation,
            "width": box.width,
            "height": box.height,
        }

    sizes = [_page_size(page) for page in doc]

    # Title precedence: embedded metadata -> filename stem -> fixed fallback.
    embedded_title = doc.metadata.get("title", "") if doc.metadata else ""
    if embedded_title:
        title = embedded_title
    elif filename:
        title = os.path.splitext(filename)[0]
    else:
        title = "Document"

    return {
        "name": filename or "document.pdf",
        "title": title,
        "pageSizes": sizes,
    }


def resolve_intermediate_format(all_fields, pdf_metadata):
    """
    Assemble the intermediate payload that the Next.js layer turns into
    ClaiSchema. Everything is cross-referenced through tempIds; the real
    ClaiSchema-compliant IDs are generated later by transform-to-clai-schema.ts.

    Fields are first grouped via ``group_fields_by_section``. Each field whose
    role is not a placeholder (system / n/a / unknown / none / empty) is
    attached to a participant keyed by that role; participants receive a
    routing order in order of first appearance.

    Returns a dict with "participants", "groups", "fields" and "pdfMetadata".
    """
    groups, updated_fields = group_fields_by_section(all_fields)

    # Role values that do not correspond to a real participant.
    placeholder_roles = ("system", "n/a", "unknown", "none", "")

    participants_by_id = {}
    field_entries = []

    for field in updated_fields:
        role_label = str(field.get("role", "System")).strip().title()
        role_key = role_label.lower()

        owner_id = None
        if role_key not in placeholder_roles:
            owner_id = f"part_{role_key.replace(' ', '_')}"
            if owner_id not in participants_by_id:
                participants_by_id[owner_id] = {
                    "tempId": owner_id,
                    "role": "signer",
                    "type": "unknown",
                    "label": role_label,
                    # 1-based position in order of first appearance.
                    "routingOrder": len(participants_by_id) + 1,
                    "definer": "PREPARER"
                }

        entry = {
            "tempId": field["id"],
            "aliasId": field.get("aliasId"),
            "groupTempId": field.get("groupId"),
            "participantTempId": owner_id,
            "label": field["label"],
            "semanticType": field["semanticType"],
            "isDynamic": field.get("isDynamic", False),
            "page": field["page"],
            "rect": field["rect"]
        }
        field_entries.append(entry)

    # Re-key the groups so they expose tempId-based references.
    group_entries = [
        {
            "tempId": group["id"],
            "title": group["title"],
            "fieldTempIds": group["fieldIds"]
        }
        for group in groups
    ]

    return {
        "participants": list(participants_by_id.values()),
        "groups": group_entries,
        "fields": field_entries,
        "pdfMetadata": pdf_metadata
    }

# ==============================================================================
# API ENDPOINT (Replaces async main())
# ==============================================================================

@app.post("/process-pdf")
async def process_pdf(file: UploadFile = File(...)):
    """
    Process an uploaded PDF and return the intermediate field-mapping payload.

    Steps:
      1. Spool the upload to a temporary ``.pdf`` file.
      2. Ask the model API for per-page field boxes (in a worker thread,
         since that call is blocking).
      3. Open the PDF, collect its metadata and a short text sample from the
         first two pages as global context.
      4. Run ``process_page_smart`` for every page concurrently, bounded by
         ``config.MAX_CONCURRENT_PAGES``, then map the flattened results to
         the schema with ``map_fields_to_schema``.
      5. Return ``resolve_intermediate_format(...)`` for the Next.js layer.

    Raises:
        HTTPException(500): when the model API yields no fields, or when any
            other step fails (the original error text is used as the detail).
    """
    tmp_path = None
    doc = None
    try:
        # Create the temp file inside the try so it is removed in `finally`
        # even when copying the upload fails midway.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp_path = tmp.name
            shutil.copyfileobj(file.file, tmp)

        utils.setup_debug_dir()

        print(f"Starting process for uploaded file: {file.filename}")
        raw_fields = await asyncio.to_thread(get_fields_from_local_api, tmp_path)

        # None (call failed) and {} (no page succeeded) are both failures.
        if not raw_fields:
            raise HTTPException(status_code=500, detail="Failed to extract fields from Model API (Local Port 8000).")

        doc = fitz.open(tmp_path)

        # Extract PDF metadata for ClaiPDFCollection
        pdf_metadata = get_pdf_metadata(doc, file.filename)

        # Extract text context for vision processing: first two pages,
        # whitespace-normalised and capped at 1500 characters.
        text_sample = "".join(doc[i].get_text() for i in range(min(2, len(doc))))
        global_ctx = " ".join(text_sample.split())[:1500]

        # Process pages with vision and mapping
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENT_PAGES)
        tasks = [
            process_page_smart(semaphore, doc, page_num, fields, global_ctx)
            for page_num, fields in raw_fields.items()
        ]

        results = await asyncio.gather(*tasks)
        # Each page result is iterated as a sequence of items; flatten them.
        flat_results = [item for sublist in results for item in sublist]
        mapped_results = await map_fields_to_schema(flat_results)

        # Return intermediate format for Next.js transformation
        return resolve_intermediate_format(mapped_results, pdf_metadata)

    except HTTPException:
        # Deliberate HTTP errors must reach the client unchanged; re-wrapping
        # them would replace their detail with str(exception).
        raise

    except Exception as e:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e)) from e

    finally:
        # `is not None` rather than truthiness: the document supports len(),
        # so a zero-page document would be falsy and never get closed.
        if doc is not None:
            doc.close()
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)
        print(f"Cleanup complete for {tmp_path}")