# Source: Hugging Face Space file view (Space status at capture: Runtime error)
# File size: 6,528 Bytes, commit db81e28
import os
import json
import asyncio
import requests
import fitz
import shutil
import tempfile
from datetime import datetime
from fastapi import FastAPI, UploadFile, File, HTTPException
import config
import utils_geometry as utils
from engine_vision import process_page_smart
from engine_mapping import map_fields_to_schema
from utils_grouping import group_fields_by_section
app = FastAPI(title="Smart Contract Processor API")
# code just to create a new commit
def get_fields_from_local_api(pdf_path):
    """
    Send the PDF at ``pdf_path`` to the model API and collect the detected
    field bounding boxes per page.

    The file is POSTed to ``config.COMMON_FORMS_API_URL`` and the response is
    consumed as a stream of newline-delimited JSON objects. Every object with
    ``"status": "success"`` stores its ``"fields"`` list (empty when absent)
    under its ``"page"`` value; objects with ``"status": "error"`` are logged
    and skipped.

    Args:
        pdf_path: Path of the PDF file on disk.

    Returns:
        A dict mapping page -> list of fields, or ``None`` when the request
        fails, the API answers with an HTTP error status, or a streamed line
        cannot be processed.
    """
    print(f"Sending to Model API: {config.COMMON_FORMS_API_URL}")
    fields_by_page = {}
    try:
        with open(pdf_path, 'rb') as f:
            # With stream=True the connection stays open until the body is
            # fully consumed or the response is closed; the context manager
            # releases it even when parsing aborts half-way through.
            with requests.post(
                config.COMMON_FORMS_API_URL,
                files={'file': f},
                stream=True,
                timeout=60
            ) as response:
                # Fail fast on 4xx/5xx instead of parsing an error page as
                # if it were a stream of results.
                response.raise_for_status()
                for line in response.iter_lines():
                    if not line:
                        # Skip blank keep-alive lines.
                        continue
                    data = json.loads(line)
                    status = data.get("status")
                    if status == "success":
                        fields_by_page[data["page"]] = data.get("fields", [])
                    elif status == "error":
                        print(f"Model API Error on page {data.get('page')}: {data.get('msg')}")
    except Exception as e:
        # Deliberate best-effort boundary: any failure is reported to the
        # caller as None rather than propagated.
        print(f"API Connection Error: {e}")
        return None
    return fields_by_page
def get_pdf_metadata(doc, filename: str):
    """
    Build the metadata dict (name, title, per-page sizes) for a PDF document.

    ``doc`` is iterated page by page; each page contributes its ``rotation``
    and the ``width``/``height`` of its ``rect``. The title comes from the
    document's ``metadata["title"]`` when present and non-empty, otherwise
    from ``filename`` without its extension, otherwise ``"Document"``.
    """
    def _describe(pg):
        # One entry per page: rotation plus the page rectangle's dimensions.
        box = pg.rect
        return {
            "rotation": pg.rotation,
            "width": box.width,
            "height": box.height
        }

    sizes = [_describe(pg) for pg in doc]

    # Prefer the title embedded in the PDF; fall back to the file name.
    meta = doc.metadata
    title = meta.get("title", "") if meta else ""
    if not title:
        if filename:
            title, _extension = os.path.splitext(filename)
        else:
            title = "Document"

    return {
        "name": filename or "document.pdf",
        "title": title,
        "pageSizes": sizes
    }
def resolve_intermediate_format(all_fields, pdf_metadata):
    """
    Assemble the intermediate response: participants, groups, fields and the
    PDF metadata, all cross-referenced through temporary IDs.

    Per the original author's notes, this structure is later turned into
    ClaiSchema by the Next.js layer (transform-to-clai-schema.ts), which
    replaces the tempIds with proper schema-compliant IDs.
    """
    # Role names (lower-cased) that do not correspond to a real participant.
    non_participant_roles = ("system", "n/a", "unknown", "none", "")

    section_groups, grouped_fields = group_fields_by_section(all_fields)

    participants_by_id = {}
    field_entries = []
    for field in grouped_fields:
        role_label = str(field.get("role", "System")).strip().title()
        role_key = role_label.lower()

        if role_key in non_participant_roles:
            participant_id = None
        else:
            participant_id = "part_" + role_key.replace(" ", "_")
            if participant_id not in participants_by_id:
                # Routing order follows first appearance: 1, 2, 3, ...
                participants_by_id[participant_id] = {
                    "tempId": participant_id,
                    "role": "signer",
                    "type": "unknown",
                    "label": role_label,
                    "routingOrder": len(participants_by_id) + 1,
                    "definer": "PREPARER"
                }

        field_entries.append({
            "tempId": field["id"],
            "aliasId": field.get("aliasId"),
            "groupTempId": field.get("groupId"),
            "participantTempId": participant_id,
            "label": field["label"],
            "semanticType": field["semanticType"],
            "isDynamic": field.get("isDynamic", False),
            "page": field["page"],
            "rect": field["rect"]
        })

    # Re-key the groups so they expose tempId-style names.
    group_entries = [
        {
            "tempId": group["id"],
            "title": group["title"],
            "fieldTempIds": group["fieldIds"]
        }
        for group in section_groups
    ]

    return {
        "participants": list(participants_by_id.values()),
        "groups": group_entries,
        "fields": field_entries,
        "pdfMetadata": pdf_metadata
    }
# ==============================================================================
# API ENDPOINT (Replaces async main())
# ==============================================================================
@app.post("/process-pdf")
async def process_pdf(file: UploadFile = File(...)):
    """
    Process an uploaded PDF and return the intermediate field/group/participant
    structure produced by ``resolve_intermediate_format``.

    Steps: spool the upload to a temporary ``.pdf`` file, ask the model API
    for per-page fields, open the document, build a short text context from
    the first two pages, run ``process_page_smart`` for every page
    (concurrency bounded by ``config.MAX_CONCURRENT_PAGES``), map the results
    with ``map_fields_to_schema`` and format the response.

    Raises:
        HTTPException: status 500 when the model API yields no fields or any
            processing step fails.
    """
    tmp_path = None
    doc = None
    try:
        # Created inside the try so a failed copy still reaches the cleanup
        # in ``finally`` instead of leaving the temp file behind.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp_path = tmp.name
            shutil.copyfileobj(file.file, tmp)

        utils.setup_debug_dir()
        print(f"Starting process for uploaded file: {file.filename}")

        # The model API call is blocking; run it off the event loop.
        raw_fields = await asyncio.to_thread(get_fields_from_local_api, tmp_path)
        if not raw_fields:
            raise HTTPException(status_code=500, detail="Failed to extract fields from Model API (Local Port 8000).")

        doc = fitz.open(tmp_path)

        # Extract PDF metadata for ClaiPDFCollection
        pdf_metadata = get_pdf_metadata(doc, file.filename)

        # Extract text context for vision processing: first two pages,
        # whitespace-normalised and capped at 1500 characters.
        text_sample = "".join(
            doc[i].get_text() for i in range(min(2, len(doc)))
        )
        global_ctx = " ".join(text_sample.split())[:1500]

        # Process pages with vision and mapping
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENT_PAGES)
        tasks = [
            process_page_smart(semaphore, doc, page_num, fields, global_ctx)
            for page_num, fields in raw_fields.items()
        ]
        results = await asyncio.gather(*tasks)
        flat_results = [item for sublist in results for item in sublist]
        mapped_results = await map_fields_to_schema(flat_results)

        # Return intermediate format for Next.js transformation
        return resolve_intermediate_format(mapped_results, pdf_metadata)
    except HTTPException:
        # Already a well-formed HTTP error; re-wrapping it would mangle its
        # detail message.
        raise
    except Exception as e:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Compare against None: the document supports len(), so a zero-page
        # document would be falsy and never get closed.
        if doc is not None:
            doc.close()
        if tmp_path is not None:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            print(f"Cleanup complete for {tmp_path}")