Spaces:
Runtime error
Runtime error
| import os | |
| import json | |
| import asyncio | |
| import requests | |
| import fitz | |
| import shutil | |
| import tempfile | |
| from datetime import datetime | |
| from fastapi import FastAPI, UploadFile, File, HTTPException | |
| import config | |
| import utils_geometry as utils | |
| from engine_vision import process_page_smart | |
| from engine_mapping import map_fields_to_schema | |
| from utils_grouping import group_fields_by_section | |
# FastAPI application instance for this module.
app = FastAPI(title="Smart Contract Processor API")
# NOTE(review): the comment below carries no information about the code; it
# appears to exist only to force a new commit and can likely be removed.
# code just to create a new commit
def get_fields_from_local_api(pdf_path):
    """
    Send the PDF to the local model API and collect bounding boxes per page.

    The bounding boxes are the boxes surrounding the form fields. The response
    is read as a stream of newline-delimited JSON objects:

    * ``status == "success"``: the object's ``fields`` list (default ``[]``)
      is stored under its ``page`` value.
    * ``status == "error"``: the page and message are printed and the object
      is skipped.
    * any other status is ignored.

    Args:
        pdf_path: Path of the PDF file to upload.

    Returns:
        A dict mapping each reported page to its list of fields, or ``None``
        when the request fails, the API answers with an HTTP error status, or
        the stream cannot be parsed.
    """
    print(f"Sending to Model API: {config.COMMON_FORMS_API_URL}")
    fields_by_page = {}
    try:
        with open(pdf_path, "rb") as pdf_file:
            response = requests.post(
                config.COMMON_FORMS_API_URL,
                files={"file": pdf_file},
                stream=True,
                timeout=60,
            )
        # With stream=True the connection is only released once the body is
        # fully consumed or the response is closed, so close it explicitly
        # even when parsing bails out half-way through.
        with response:
            # Do not try to parse an HTTP error page as the NDJSON stream.
            response.raise_for_status()
            for line in response.iter_lines():
                if not line:
                    # Skip blank keep-alive lines.
                    continue
                data = json.loads(line)
                status = data.get("status")
                if status == "success":
                    fields_by_page[data["page"]] = data.get("fields", [])
                elif status == "error":
                    print(f"Model API Error on page {data.get('page')}: {data.get('msg')}")
    except Exception as e:
        # Deliberately best-effort: callers treat None as "extraction failed".
        print(f"API Connection Error: {e}")
        return None
    return fields_by_page
def get_pdf_metadata(doc, filename: str):
    """
    Build the PDF metadata dict (name, title, per-page sizes).

    Args:
        doc: Iterable of pages exposing ``rect`` (with ``width``/``height``)
            and ``rotation``; must also expose a ``metadata`` mapping or a
            falsy value.
        filename: Original file name; may be empty/None.

    Returns:
        ``{"name": ..., "title": ..., "pageSizes": [...]}`` where the title
        comes from the document metadata, falling back to the file name
        without its extension, and finally to ``"Document"``.
    """

    def _page_size(pg):
        # Read the rectangle once, then describe the page.
        box = pg.rect
        return {
            "rotation": pg.rotation,
            "width": box.width,
            "height": box.height,
        }

    sizes = [_page_size(pg) for pg in doc]

    # Prefer the embedded title; fall back to the file stem, then a constant.
    title = doc.metadata.get("title", "") if doc.metadata else ""
    if not title and filename:
        title = os.path.splitext(filename)[0]
    elif not title:
        title = "Document"

    return {
        "name": filename or "document.pdf",
        "title": title,
        "pageSizes": sizes,
    }
def resolve_intermediate_format(all_fields, pdf_metadata):
    """
    Assemble the intermediate payload that the Next.js layer turns into ClaiSchema.

    Everything is keyed by temporary ids; per the original notes,
    transform-to-clai-schema.ts later replaces them with proper
    ClaiSchema-compliant ids.

    Args:
        all_fields: Mapped field dicts, passed to ``group_fields_by_section``.
        pdf_metadata: Metadata dict returned unchanged under ``pdfMetadata``.

    Returns:
        A dict with ``participants``, ``groups``, ``fields`` and
        ``pdfMetadata`` keys.
    """
    # Role values that do not correspond to a real participant.
    non_participant_roles = ("system", "n/a", "unknown", "none", "")

    groups, updated_fields = group_fields_by_section(all_fields)

    participants_by_id = {}
    field_entries = []
    for field in updated_fields:
        role_label = str(field.get("role", "System")).strip().title()
        role_key = role_label.lower()

        if role_key in non_participant_roles:
            participant_id = None
        else:
            participant_id = f"part_{role_key.replace(' ', '_')}"
            if participant_id not in participants_by_id:
                # Routing order follows first appearance: 1, 2, 3, ...
                participants_by_id[participant_id] = {
                    "tempId": participant_id,
                    "role": "signer",
                    "type": "unknown",
                    "label": role_label,
                    "routingOrder": len(participants_by_id) + 1,
                    "definer": "PREPARER",
                }

        field_entries.append({
            "tempId": field["id"],
            "aliasId": field.get("aliasId"),
            "groupTempId": field.get("groupId"),
            "participantTempId": participant_id,
            "label": field["label"],
            "semanticType": field["semanticType"],
            "isDynamic": field.get("isDynamic", False),
            "page": field["page"],
            "rect": field["rect"],
        })

    # Re-key the groups so they use tempId naming as well.
    group_entries = [
        {
            "tempId": group["id"],
            "title": group["title"],
            "fieldTempIds": group["fieldIds"],
        }
        for group in groups
    ]

    return {
        "participants": list(participants_by_id.values()),
        "groups": group_entries,
        "fields": field_entries,
        "pdfMetadata": pdf_metadata,
    }
| # ============================================================================== | |
| # API ENDPOINT (Replaces async main()) | |
| # ============================================================================== | |
async def process_pdf(file: UploadFile = File(...)):
    """
    Process an uploaded PDF and return the intermediate field/participant format.

    Steps: spool the upload to a temporary ``.pdf`` file, ask the model API
    for per-page field boxes (in a worker thread), open the PDF, run
    ``process_page_smart`` for every reported page, map the results with
    ``map_fields_to_schema`` and assemble the response with
    ``resolve_intermediate_format``.

    NOTE(review): no route decorator (e.g. ``@app.post(...)``) is visible on
    this function in the reviewed source - confirm it is registered on ``app``.

    Args:
        file: The uploaded PDF.

    Returns:
        The dict produced by ``resolve_intermediate_format``.

    Raises:
        HTTPException: 500 when the model API yields no fields, or when any
            other step fails (``detail`` carries the error text).
    """
    tmp_path = None
    doc = None
    try:
        # Spool the upload inside the try block so the temp file is removed
        # even when copying the upload fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp_path = tmp.name
            shutil.copyfileobj(file.file, tmp)

        utils.setup_debug_dir()
        print(f"Starting process for uploaded file: {file.filename}")

        # The model API client is blocking; keep it off the event loop.
        raw_fields = await asyncio.to_thread(get_fields_from_local_api, tmp_path)
        if not raw_fields:
            raise HTTPException(status_code=500, detail="Failed to extract fields from Model API (Local Port 8000).")

        doc = fitz.open(tmp_path)

        # Extract PDF metadata for ClaiPDFCollection
        pdf_metadata = get_pdf_metadata(doc, file.filename)

        # Text of the first two pages, whitespace-collapsed and capped at
        # 1500 characters, serves as global context for vision processing.
        text_sample = "".join(doc[i].get_text() for i in range(min(2, len(doc))))
        global_ctx = " ".join(text_sample.split())[:1500]

        # Process pages with vision and mapping
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENT_PAGES)
        tasks = [
            process_page_smart(semaphore, doc, page_num, fields, global_ctx)
            for page_num, fields in raw_fields.items()
        ]
        results = await asyncio.gather(*tasks)
        flat_results = [item for sublist in results for item in sublist]
        mapped_results = await map_fields_to_schema(flat_results)

        # Return intermediate format for Next.js transformation
        return resolve_intermediate_format(mapped_results, pdf_metadata)
    except HTTPException:
        # Already a well-formed HTTP error; re-wrapping it would replace its
        # detail with str(exc) and force the status to 500.
        raise
    except Exception as e:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Compare against None: relying on truthiness could skip close() for
        # a document object that evaluates as falsy.
        if doc is not None:
            doc.close()
        if tmp_path is not None:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            print(f"Cleanup complete for {tmp_path}")