"""FastAPI service that turns an uploaded PDF into an intermediate field schema.

The single endpoint, ``POST /process-pdf``, sends the PDF to the model API for
field bounding boxes, runs the vision and mapping engines over the detected
fields, and returns an intermediate structure (participants, groups, fields,
PDF metadata) keyed by temporary IDs.
"""

import asyncio
import json
import os
import shutil
import tempfile
import traceback
from datetime import datetime

import fitz
import requests
from fastapi import FastAPI, File, HTTPException, UploadFile

import config
import utils_geometry as utils
from engine_vision import process_page_smart
from engine_mapping import map_fields_to_schema
from utils_grouping import group_fields_by_section

app = FastAPI(title="Smart Contract Processor API")

# code just to create a new commit

# Role labels (lower-cased) that do not correspond to a real participant.
_NON_PARTICIPANT_ROLES = frozenset({"system", "n/a", "unknown", "none", ""})


def get_fields_from_local_api(pdf_path):
    """
    Send the PDF to the model API and collect the detected field boxes per page.

    The API is expected to stream one JSON object per line. Lines whose
    ``status`` is ``"success"`` contribute ``fields`` for their ``page``;
    lines whose ``status`` is ``"error"`` are logged and skipped.

    Args:
        pdf_path: Path of the PDF file on disk to upload.

    Returns:
        A dict mapping page identifier (as sent by the API) to its list of
        fields, or ``None`` if the request or the parsing of the stream failed.
    """
    print(f"Sending to Model API: {config.COMMON_FORMS_API_URL}")

    fields_by_page = {}

    try:
        with open(pdf_path, "rb") as pdf_file:
            # A streamed response keeps its connection open until it is closed,
            # so use it as a context manager to release it deterministically.
            with requests.post(
                config.COMMON_FORMS_API_URL,
                files={"file": pdf_file},
                stream=True,
                timeout=60,
            ) as response:
                # Fail fast on HTTP errors instead of parsing an error body
                # as if it were a field stream.
                response.raise_for_status()

                for line in response.iter_lines():
                    if not line:
                        continue

                    data = json.loads(line)
                    status = data.get("status")
                    if status == "success":
                        fields_by_page[data["page"]] = data.get("fields", [])
                    elif status == "error":
                        print(
                            f"Model API Error on page {data.get('page')}: "
                            f"{data.get('msg')}"
                        )
    except Exception as e:
        # Deliberate best-effort boundary: any failure (network, HTTP status,
        # malformed JSON) is reported to the caller as ``None``.
        print(f"API Connection Error: {e}")
        return None

    return fields_by_page


def get_pdf_metadata(doc, filename: str):
    """
    Extract PDF metadata, including per-page sizes, for ClaiPDFCollection.

    Args:
        doc: An opened PDF document; iterated page by page, reading each
            page's ``rect`` and ``rotation``, plus ``doc.metadata``.
        filename: Original name of the uploaded file; may be empty or ``None``.

    Returns:
        A dict with ``name``, ``title`` and ``pageSizes`` (a list of
        ``{"rotation", "width", "height"}`` dicts, one per page).
    """
    page_sizes = [
        {
            "rotation": page.rotation,
            "width": page.rect.width,
            "height": page.rect.height,
        }
        for page in doc
    ]

    # Prefer the title embedded in the PDF; fall back to the file name
    # (without extension), then to a generic placeholder.
    pdf_title = doc.metadata.get("title", "") if doc.metadata else ""
    if not pdf_title:
        pdf_title = os.path.splitext(filename)[0] if filename else "Document"

    return {
        "name": filename or "document.pdf",
        "title": pdf_title,
        "pageSizes": page_sizes,
    }


def resolve_intermediate_format(all_fields, pdf_metadata):
    """
    Build the intermediate format that the Next.js layer turns into ClaiSchema.

    Uses tempIds for internal reference. This format is consumed by
    transform-to-clai-schema.ts, which generates proper ClaiSchema-compliant
    IDs using TypeScript utilities.

    Args:
        all_fields: Mapped field dicts; passed to ``group_fields_by_section``.
            Each resulting field must provide ``id``, ``label``,
            ``semanticType``, ``page`` and ``rect``.
        pdf_metadata: Metadata dict returned unchanged under ``pdfMetadata``.

    Returns:
        A dict with ``participants``, ``groups``, ``fields`` and
        ``pdfMetadata``.
    """
    groups, updated_fields = group_fields_by_section(all_fields)

    participants = {}
    final_fields = []
    routing_counter = 1

    for f in updated_fields:
        raw_role = str(f.get("role", "System")).strip().title()
        participant_temp_id = None

        if raw_role.lower() not in _NON_PARTICIPANT_ROLES:
            participant_temp_id = f"part_{raw_role.lower().replace(' ', '_')}"

            # Participants are registered once, in order of first appearance,
            # which also determines their routing order.
            if participant_temp_id not in participants:
                participants[participant_temp_id] = {
                    "tempId": participant_temp_id,
                    "role": "signer",
                    "type": "unknown",
                    "label": raw_role,
                    "routingOrder": routing_counter,
                    "definer": "PREPARER",
                }
                routing_counter += 1

        final_fields.append({
            "tempId": f["id"],
            "aliasId": f.get("aliasId"),
            "groupTempId": f.get("groupId"),
            "participantTempId": participant_temp_id,
            "label": f["label"],
            "semanticType": f["semanticType"],
            "isDynamic": f.get("isDynamic", False),
            "page": f["page"],
            "rect": f["rect"],
        })

    # Transform groups to use tempId
    groups_with_temp_ids = [
        {
            "tempId": g["id"],
            "title": g["title"],
            "fieldTempIds": g["fieldIds"],
        }
        for g in groups
    ]

    return {
        "participants": list(participants.values()),
        "groups": groups_with_temp_ids,
        "fields": final_fields,
        "pdfMetadata": pdf_metadata,
    }


# ==============================================================================
# API ENDPOINT (Replaces async main())
# ==============================================================================

@app.post("/process-pdf")
async def process_pdf(file: UploadFile = File(...)):
    """
    Process an uploaded PDF and return the intermediate field schema.

    The upload is copied to a temporary file, sent to the model API for field
    detection, then each page with detected fields is processed by the vision
    engine and the results are mapped to the schema.

    Args:
        file: The uploaded PDF.

    Returns:
        The dict produced by ``resolve_intermediate_format``.

    Raises:
        HTTPException: With status 500 if field extraction yields nothing or
            any other step fails.
    """
    tmp_path = None
    doc = None

    try:
        # Created inside the try block so the file is removed in ``finally``
        # even if copying the upload fails part-way through.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp_path = tmp.name
            shutil.copyfileobj(file.file, tmp)

        utils.setup_debug_dir()

        print(f"Starting process for uploaded file: {file.filename}")

        # The model API call is blocking, so run it off the event loop.
        raw_fields = await asyncio.to_thread(get_fields_from_local_api, tmp_path)

        if not raw_fields:
            raise HTTPException(
                status_code=500,
                detail="Failed to extract fields from Model API (Local Port 8000).",
            )

        doc = fitz.open(tmp_path)

        # Extract PDF metadata for ClaiPDFCollection
        pdf_metadata = get_pdf_metadata(doc, file.filename)

        # Extract text context for vision processing: text of the first two
        # pages, whitespace-normalised and capped at 1500 characters.
        text_sample = "".join(
            doc[i].get_text() for i in range(min(2, len(doc)))
        )
        global_ctx = " ".join(text_sample.split())[:1500]

        # Process pages with vision and mapping
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENT_PAGES)
        tasks = [
            process_page_smart(semaphore, doc, page_num, fields, global_ctx)
            for page_num, fields in raw_fields.items()
        ]

        results = await asyncio.gather(*tasks)
        flat_results = [item for sublist in results for item in sublist]

        mapped_results = await map_fields_to_schema(flat_results)

        # Return intermediate format for Next.js transformation
        return resolve_intermediate_format(mapped_results, pdf_metadata)

    except HTTPException:
        # Already a well-formed HTTP error; re-wrapping it below would
        # replace its detail with str(e).
        raise
    except Exception as e:
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e)) from e

    finally:
        # Compare against None rather than relying on truthiness: the document
        # supports len(), so an empty document would likely be falsy and
        # would otherwise be left open.
        if doc is not None:
            doc.close()
        if tmp_path is not None:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            print(f"Cleanup complete for {tmp_path}")