Tanishq Salkar committed on
Commit
db81e28
·
1 Parent(s): d3a7652

initial visual mapping code added to hf

Browse files
Files changed (8) hide show
  1. api.py +190 -0
  2. config.py +18 -0
  3. engine_mapping.py +114 -0
  4. engine_vision.py +162 -0
  5. requirements.txt +69 -0
  6. schema_definitions.py +75 -0
  7. utils_geometry.py +109 -0
  8. utils_grouping.py +59 -0
api.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import asyncio
4
+ import requests
5
+ import fitz
6
+ import shutil
7
+ import tempfile
8
+ from datetime import datetime
9
+ from fastapi import FastAPI, UploadFile, File, HTTPException
10
+ import config
11
+ import utils_geometry as utils
12
+ from engine_vision import process_page_smart
13
+ from engine_mapping import map_fields_to_schema
14
+ from utils_grouping import group_fields_by_section
15
+
16
+ app = FastAPI(title="Smart Contract Processor API")
17
+
18
def get_fields_from_local_api(pdf_path):
    """
    Send the PDF to the model API and collect the detected field bounding
    boxes (the "neon green boxes" drawn around each field).

    The endpoint streams one JSON object per line, each describing one page:
    {"status": "success"|"error", "page": int, "fields": [...] | "msg": str}.

    Args:
        pdf_path: Filesystem path of the PDF to upload.

    Returns:
        dict mapping page number -> list of field dicts, or None when the
        API cannot be reached or returns an HTTP error status.
    """
    print(f"Sending to Model API: {config.COMMON_FORMS_API_URL}")
    fields_by_page = {}
    try:
        with open(pdf_path, 'rb') as f:
            response = requests.post(
                config.COMMON_FORMS_API_URL,
                files={'file': f},
                stream=True,
                timeout=60
            )
            # Bug fix: fail fast on HTTP error statuses instead of trying
            # to parse an HTML/plain-text error page as JSON lines.
            response.raise_for_status()

            for line in response.iter_lines():
                if not line:
                    continue
                data = json.loads(line)
                if data.get("status") == "success":
                    fields_by_page[data["page"]] = data.get("fields", [])
                elif data.get("status") == "error":
                    print(f"Model API Error on page {data.get('page')}: {data.get('msg')}")

    except Exception as e:
        # Best-effort: the caller treats None as "extraction failed".
        print(f"API Connection Error: {e}")
        return None

    return fields_by_page
48
+
49
+
50
def get_pdf_metadata(doc, filename: str):
    """
    Collect PDF-level metadata in the shape expected by ClaiPDFCollection:
    a display name/title plus per-page size and rotation.

    Args:
        doc: An open (PyMuPDF-style) document, iterable over pages.
        filename: Original upload filename; may be empty/None.

    Returns:
        dict with "name", "title" and "pageSizes" keys.
    """
    page_sizes = [
        {
            "rotation": page.rotation,
            "width": page.rect.width,
            "height": page.rect.height
        }
        for page in doc
    ]

    # Prefer the title embedded in the PDF; fall back to the filename stem,
    # then a generic placeholder.
    embedded_title = (doc.metadata or {}).get("title", "")
    if embedded_title:
        title = embedded_title
    elif filename:
        title = os.path.splitext(filename)[0]
    else:
        title = "Document"

    return {
        "name": filename or "document.pdf",
        "title": title,
        "pageSizes": page_sizes
    }
73
+
74
+
75
def resolve_intermediate_format(all_fields, pdf_metadata):
    """
    Build the intermediate payload that the Next.js layer transforms into
    ClaiSchema. Everything here uses tempIds for internal reference; the
    final ClaiSchema-compliant IDs are generated by
    transform-to-clai-schema.ts.
    """
    groups, updated_fields = group_fields_by_section(all_fields)

    participants = {}
    final_fields = []
    next_routing_order = 1

    # Roles that do not correspond to a real signing participant.
    non_participant_roles = {"system", "n/a", "unknown", "none", ""}

    for field in updated_fields:
        role_label = str(field.get("role", "System")).strip().title()
        participant_temp_id = None

        if role_label.lower() not in non_participant_roles:
            participant_temp_id = f"part_{role_label.lower().replace(' ', '_')}"
            # First time we see a role, register it as a signer with the
            # next routing order.
            if participant_temp_id not in participants:
                participants[participant_temp_id] = {
                    "tempId": participant_temp_id,
                    "role": "signer",
                    "type": "unknown",
                    "label": role_label,
                    "routingOrder": next_routing_order,
                    "definer": "PREPARER"
                }
                next_routing_order += 1

        final_fields.append({
            "tempId": field["id"],
            "aliasId": field.get("aliasId"),
            "groupTempId": field.get("groupId"),
            "participantTempId": participant_temp_id,
            "label": field["label"],
            "semanticType": field["semanticType"],
            "isDynamic": field.get("isDynamic", False),
            "page": field["page"],
            "rect": field["rect"]
        })

    # Re-key groups onto tempIds for the transformer.
    groups_with_temp_ids = [
        {
            "tempId": group["id"],
            "title": group["title"],
            "fieldTempIds": group["fieldIds"]
        }
        for group in groups
    ]

    return {
        "participants": list(participants.values()),
        "groups": groups_with_temp_ids,
        "fields": final_fields,
        "pdfMetadata": pdf_metadata
    }
133
+
134
+ # ==============================================================================
135
+ # API ENDPOINT (Replaces async main())
136
+ # ==============================================================================
137
+
138
@app.post("/process-pdf")
async def process_pdf(file: UploadFile = File(...)):
    """
    End-to-end pipeline for an uploaded PDF:
      1. Persist the upload to a temp file.
      2. Ask the model API for raw field bounding boxes.
      3. Run vision + schema mapping over every page concurrently.
      4. Return the intermediate format consumed by the Next.js layer.

    Raises:
        HTTPException(500) when field extraction or any processing step fails.
    """
    # delete=False so the path survives the `with` block and can be
    # reopened by PyMuPDF and the model-API uploader.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        shutil.copyfileobj(file.file, tmp)
        tmp_path = tmp.name

    doc = None
    try:
        utils.setup_debug_dir()

        print(f"Starting process for uploaded file: {file.filename}")
        # requests is blocking; keep it off the event loop.
        raw_fields = await asyncio.to_thread(get_fields_from_local_api, tmp_path)

        if not raw_fields:
            raise HTTPException(status_code=500, detail="Failed to extract fields from Model API (Local Port 8000).")

        doc = fitz.open(tmp_path)

        # Extract PDF metadata for ClaiPDFCollection
        pdf_metadata = get_pdf_metadata(doc, file.filename)

        # Text sample from the first two pages gives the vision model
        # document-level context (whitespace-normalized, capped at 1500 chars).
        text_sample = ""
        for i in range(min(2, len(doc))):
            text_sample += doc[i].get_text()
        global_ctx = " ".join(text_sample.split())[:1500]

        # Process pages with vision and mapping, bounded by a semaphore.
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENT_PAGES)
        tasks = [
            process_page_smart(semaphore, doc, page_num, fields, global_ctx)
            for page_num, fields in raw_fields.items()
        ]

        results = await asyncio.gather(*tasks)
        flat_results = [item for sublist in results for item in sublist]
        mapped_results = await map_fields_to_schema(flat_results)

        # Return intermediate format for Next.js transformation
        return resolve_intermediate_format(mapped_results, pdf_metadata)

    except HTTPException:
        # Bug fix: deliberate HTTPExceptions (e.g. the extraction failure
        # above) were previously caught by the generic handler below and
        # re-wrapped, mangling their detail. Re-raise them untouched.
        raise

    except Exception as e:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))

    finally:
        if doc:
            doc.close()
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        print(f"Cleanup complete for {tmp_path}")
config.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# --- SECRETS ---
# SECURITY FIX: a live OpenAI key was previously hard-coded here as the
# getenv fallback. Secrets must never be committed; the key is now taken
# exclusively from the environment. (The leaked key lives in git history
# and must be revoked.)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")

# --- SERVICES ---
COMMON_FORMS_API_URL = "https://tsalkar-pdf-field-extractor-backend.hf.space/extract-fields-stream"

# --- FILES ---
TARGET_FILE = "docs/template_pdf.pdf"
OUTPUT_FILE = "final_output_smart_schema2.json"
DEBUG_DIR = "debug_artifacts"

# --- DEBUG RENDERING ---
BOX_COLOR = "#00FF00"  # Neon Green
BOX_WIDTH = 3
BADGE_COLOR = "#000000"
BADGE_BG = "#00FF00"

# --- PERFORMANCE ---
MAX_CONCURRENT_PAGES = 15
VISION_BATCH_SIZE = 20
MAPPING_BATCH_SIZE = 50
engine_mapping.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from openai import AsyncOpenAI
3
+ import config
4
+ import utils_geometry as utils
5
+ from schema_definitions import REAL_ESTATE_SCHEMA_MAP
6
+ import re
7
+
8
+ # You can use a different client here if you want a different key/model!
9
+ client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
10
+
11
# Maximum length of a generated dynamic alias slug.
_SLUG_MAX_LENGTH = 20


def _camel_slug(label, max_length=_SLUG_MAX_LENGTH):
    """
    Build a camelCase slug from a free-text label, e.g.
    "Loan Amount" -> "loanAmount". Words are appended until adding the
    next one would exceed ``max_length`` characters.
    """
    # Filter out empty parts so labels with leading/trailing punctuation
    # (or made entirely of punctuation) still yield a sane key.
    parts = [p for p in re.split(r'[^a-zA-Z0-9]+', str(label).lower()) if p]
    if not parts:
        return "field"
    slug = parts[0]
    for word in parts[1:]:
        piece = word.capitalize()
        # Bug fix: track the growing slug length on every iteration —
        # previously the length was computed once before the loop, so the
        # max-length cap was effectively ignored.
        if len(slug) + len(piece) > max_length:
            break
        slug += piece
    return slug


async def map_fields_to_schema(extracted_fields):
    """
    Phase 2: map extracted fields onto the canonical real-estate schema.

    Fields are sent to the LLM in batches of config.MAPPING_BATCH_SIZE,
    collecting {uuid -> schema_key} mappings. Each field dict is then
    mutated in place:
      - canonical match: aliasId = schema key, semanticType taken from the
        schema, isDynamic = False;
      - otherwise: aliasId = camelCase slug of the label, semanticType =
        detected_type, isDynamic = True.

    Returns the same list, updated in place.
    """
    print(f"\nPhase 2: Mapping {len(extracted_fields)} fields to Schema...")

    inputs = [
        {
            "uuid": f["id"],
            "label": f["label"],
            "role": f["role"],
            "detected_type": f["detected_type"]
        }
        for f in extracted_fields
    ]

    mapped_updates = {}
    batches = [inputs[i:i + config.MAPPING_BATCH_SIZE] for i in range(0, len(inputs), config.MAPPING_BATCH_SIZE)]

    schema_str = json.dumps(REAL_ESTATE_SCHEMA_MAP, indent=0)

    for i, batch in enumerate(batches):
        print(f"  Mapping Batch {i+1}/{len(batches)}...")
        input_str = json.dumps(batch, indent=0)

        system_prompt = f"""
        You are a Data Schema Mapper for Real Estate Contracts.

        TASK:
        Map the input fields to the provided "Schema Definition".

        RULES:
        1. Match based on meaning: "Sale Price" -> "purchasePrice".
        2. STRICTLY CHECK TYPES:
           - Input "detected_type" MUST be compatible with Schema "type".
        3. Use "role" to disambiguate: "Name" + "Buyer" -> "buyerName".
        4. If no good match exists, return null for "schema_key".

        SCHEMA DEFINITION:
        {schema_str}

        Return JSON: {{ "mappings": [ {{ "uuid": "...", "schema_key": "..." }} ] }}
        """

        try:
            response = await client.chat.completions.create(
                model="gpt-4o",
                response_format={"type": "json_object"},
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"INPUTS:\n{input_str}"}
                ],
                temperature=0.0
            )

            data = json.loads(response.choices[0].message.content)
            utils.save_debug_json(data, f"mapping_batch_{i}_llm")

            for m in data.get("mappings", []):
                mapped_updates[m["uuid"]] = m.get("schema_key")

        except Exception as e:
            # Best-effort: a failed batch leaves its fields unmapped; they
            # fall through to the dynamic-slug branch below.
            print(f"Mapping Error Batch {i}: {e}")

    # Apply updates to every field.
    for f in extracted_fields:
        matched_key = mapped_updates.get(f["id"])

        if matched_key and matched_key in REAL_ESTATE_SCHEMA_MAP:
            # 1. Canonical match: the key exists in the strict schema.
            f["aliasId"] = matched_key
            f["semanticType"] = REAL_ESTATE_SCHEMA_MAP[matched_key]["type"]
            f["isDynamic"] = False
        else:
            # 2. Dynamic match: valid data, but not in the schema yet.
            # "Loan Amount" -> "loanAmount"
            f["aliasId"] = _camel_slug(f["label"])
            f["semanticType"] = f.get("detected_type", "shortText")
            f["isDynamic"] = True

    return extracted_fields
engine_vision.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import asyncio
3
+ import base64
4
+ import uuid
5
+ import time
6
+ import random
7
+ from openai import AsyncOpenAI
8
+ import config
9
+ import utils_geometry as utils
10
+ from schema_definitions import VALID_FIELD_TYPES
11
+
12
+ client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
13
+ #updated the code to push
14
+
15
+ # ==========================================
16
+ # WORKER: PROCESS SINGLE BATCH (With Retries)
17
+ # ==========================================
18
async def process_single_batch(semaphore, doc, page_num, batch_idx, batch_fields, global_context):
    """
    Analyze one batch of detected fields on a page with the vision model.

    Renders the page with neon-green boxes and numbered badges over the
    batch's fields, sends the image plus per-box spatial text hints to
    GPT-4o, and returns one enriched dict per field (label, section, role,
    detected_type, normalized rect, ...). Returns [] when rendering fails
    or when retries are exhausted / a non-rate-limit error occurs.
    """
    async with semaphore:
        # A. Build textual hints: nearest words to the left of / above each box.
        prompt_items = []
        for f in batch_fields:
            anchors = f["debug_anchors"]
            prompt_items.append(
                f"- Box ID {f['temp_id']}:\n"
                f"  Spatial Hints -> Left: '{anchors['left']}' | Above: '{anchors['above']}'\n"
                f"  PDF Type Hint: {f.get('ft', 'text')}"
            )

        # B. Render Image (blocking PyMuPDF/PIL work runs off the event loop)
        img_bytes = await asyncio.to_thread(utils.render_hollow_debug_image, doc, page_num, batch_fields)
        if not img_bytes: return []

        # Save debug artifact
        tag = f"{page_num}_batch_{batch_idx}"
        utils.save_debug_image(img_bytes, tag)
        b64_img = base64.b64encode(img_bytes).decode('utf-8')

        # C. System Prompt (UPDATED with section_context)
        valid_types_str = ", ".join(VALID_FIELD_TYPES)

        system_prompt = f"""
        You are an expert Legal Document Processor.
        CONTEXT: Real Estate Contract. Global Context: "{global_context}"

        TASK: Analyze the Neon Green Boxes (IDs {batch_fields[0]['temp_id']} to {batch_fields[-1]['temp_id']}).

        OUTPUT RULES:
        For each box, return JSON with:
        1. "visual_evidence": Text closest to the box.
        2. "section_context": The BOLD HEADER or SECTION TITLE this field belongs to (e.g. "Purchase Price", "Property Description", "Closing Date").
        3. "final_label": Precise natural label (e.g. "Purchase Price", "Seller Signature").
        4. "role": Who fills this out? Choose ONLY from:
           [Buyer, Seller, Agent, Broker, President, Reviewer, Disclosing Party, Receiving Party, N/A].
           - If ambiguous, infer from section header (e.g. "Tenant's Signature" -> "Tenant").
           - If strictly administrative (e.g. "Office Use Only"), return "System".
        5. "detected_type": MUST be one of [{valid_types_str}].
           - If it looks like money ($), use "dollar".
           - If it looks like a date, use "date".
           - If it's a signature, use "signature".

        INPUT DATA:
        {chr(10).join(prompt_items)}

        Return JSON: {{ "fields": [ {{ "box_id": 1, ... }} ] }}
        """

        # D. Retry loop: exponential backoff with jitter on rate limits;
        # any other error aborts the batch (best-effort, partial results OK).
        MAX_RETRIES = 5
        BASE_DELAY = 2
        batch_results = []
        page_height = doc[page_num].rect.height

        for attempt in range(MAX_RETRIES):
            try:
                response = await client.chat.completions.create(
                    model="gpt-4o",  # Use gpt-4o for best vision
                    response_format={"type": "json_object"},
                    messages=[
                        {"role": "user", "content": [
                            {"type": "text", "text": system_prompt},
                            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64_img}"}}
                        ]}
                    ],
                    temperature=0.0
                )

                content = response.choices[0].message.content
                parsed = json.loads(content)
                utils.save_debug_json(parsed, f"{tag}_vision_response")

                # Index the model's answers by box id so every input field
                # gets an entry even if the model skipped some boxes.
                results_map = {item["box_id"]: item for item in parsed.get("fields", [])}

                for f in batch_fields:
                    res = results_map.get(f["temp_id"], {})
                    label = res.get("final_label", "")

                    # Fallback Geometry Logic: when the model returned no
                    # usable label, reuse the nearest left/above anchor text.
                    if not label or label == "Unknown":
                        anchors = f["debug_anchors"]
                        label = anchors["left"] if anchors["left"] else (anchors["above"] if anchors["above"] else "Unknown Field")

                    norm_rect = utils.normalize_bbox_to_top_left(f["bbox"], page_height)

                    batch_results.append({
                        # Prefer the PDF widget name; otherwise a short random id.
                        "id": f.get("name", str(uuid.uuid4())[:8]),
                        "temp_id": f["temp_id"],
                        "label": label,
                        "section": res.get("section_context", "General Information"),  # <--- Capturing Section Context
                        "role": res.get("role", "System"),
                        "detected_type": res.get("detected_type", "shortText"),
                        # "/Btn" is the PDF widget type for buttons/checkboxes.
                        "uiType": "checkbox" if f.get("ft") == "/Btn" else "text",
                        "page": page_num,
                        "rect": {
                            "x": norm_rect["x0"], "y": norm_rect["y0"],
                            "width": norm_rect["x1"] - norm_rect["x0"], "height": norm_rect["y1"] - norm_rect["y0"]
                        },
                        "debug_evidence": res.get("visual_evidence", "N/A")
                    })
                break  # Success, exit retry loop

            except Exception as e:
                error_msg = str(e)
                if "429" in error_msg or "Rate limit" in error_msg:
                    wait_time = (BASE_DELAY * (2 ** attempt)) + (random.random() * 0.5)
                    print(f"Rate Limit ({tag}). Waiting {wait_time:.2f}s...")
                    await asyncio.sleep(wait_time)  # Use await sleep for async!
                else:
                    print(f"Error {tag}: {e}")
                    break

        return batch_results
132
+
133
+ #
134
+ # ==========================================
135
+ # ORCHESTRATOR: PROCESS PAGE
136
+ # ==========================================
137
async def process_page_smart(semaphore, doc, page_num, fields, global_context):
    """
    Orchestrate vision processing for one page: pre-compute spatial text
    anchors for every field, split the fields into batches of
    config.VISION_BATCH_SIZE, run all batches concurrently (bounded by the
    shared semaphore), and return the flattened list of results.
    """
    current_page = doc[page_num]
    words = utils.get_words_from_page(current_page)
    height = current_page.rect.height

    # 1. Assign 1-based batch-visible IDs and pre-compute anchors.
    for position, field in enumerate(fields, start=1):
        field["temp_id"] = position
        field["debug_anchors"] = utils.calculate_smart_anchors(field["bbox"], words, height)

    # 2. Slice the fields into fixed-size batches.
    step = config.VISION_BATCH_SIZE
    chunks = [fields[start:start + step] for start in range(0, len(fields), step)]

    print(f"📄 Page {page_num}: Queuing {len(chunks)} batches for {len(fields)} fields...")

    # 3. Launch every batch as its own task so they run concurrently.
    pending = [
        asyncio.create_task(
            process_single_batch(semaphore, doc, page_num, chunk_idx, chunk, global_context)
        )
        for chunk_idx, chunk in enumerate(chunks)
    ]

    # 4. Gather and flatten the per-batch result lists.
    gathered = await asyncio.gather(*pending)
    flattened = []
    for batch_result in gathered:
        flattened.extend(batch_result)
    return flattened
requirements.txt ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------
2
+ # Core numerical stack (avoid NumPy 2.x ABI issues)
3
+ # ------------------------------
4
+ numpy==1.26.4
5
+ scipy<1.13
6
+
7
+ # ------------------------------
8
+ # Web / API service
9
+ # ------------------------------
10
+ fastapi>=0.104.0
11
+ uvicorn[standard]>=0.24.0
12
+ python-multipart>=0.0.6
13
+ requests>=2.31.0
14
+
15
+ # ------------------------------
16
+ # PDF parsing / rendering used by your API files (fitz)
17
+ # ------------------------------
18
+ PyMuPDF>=1.23.0
19
+
20
+ # ------------------------------
21
+ # OpenAI client used by engine_vision / engine_mapping
22
+ # ------------------------------
23
+ openai>=1.0.0
24
+
25
+ # ------------------------------
26
+ # Imaging helpers
27
+ # ------------------------------
28
+ pillow>=10.0.0
29
+ opencv-python-headless>=4.9.0.80
30
+
31
+ # ------------------------------
32
+ # CommonForms + model deps (PDF field extractor service)
33
+ # ------------------------------
34
+ commonforms>=0.2.1
35
+ formalpdf==0.1.6
36
+ cryptography>=3.1
37
+ pypdf>=6.1.1
38
+
39
+ # ------------------------------
40
+ # Hugging Face / RF-DETR compatibility pins
41
+ # (prevents: ImportError: cannot import name 'torch_int')
42
+ # ------------------------------
43
+ huggingface-hub==0.36.0
44
+ transformers==4.57.3
45
+ peft==0.18.1
46
+ rfdetr==1.4.0.post0
47
+
48
+ # ------------------------------
49
+ # Torch stack (match torchvision exactly)
50
+ # ------------------------------
51
+ torch==2.8.0
52
+ torchvision==0.23.0
53
+
54
+ # ------------------------------
55
+ # ONNX inference
56
+ # ------------------------------
57
+ onnx>=1.16.0
58
+ onnxruntime>=1.23.1
59
+ onnxslim>=0.1.71
60
+
61
+ # ------------------------------
62
+ # Ultralytics (YOLO utilities used by CommonForms stack)
63
+ # ------------------------------
64
+ ultralytics>=8.3.204
65
+
66
+ # ------------------------------
67
+ # FastAPI baseline constraint
68
+ # ------------------------------
69
+ pydantic>=2.6,<3
schema_definitions.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Schema definitions for PDF field extraction.
3
+
4
+ This module loads the real estate alias IDs from a JSON file that is exported
5
+ from the TypeScript source (real-estate-property-alias-ids.ts) using the
6
+ export-alias-ids.js script.
7
+
8
+ To update the schema:
9
+ 1. Modify the TypeScript source file
10
+ 2. Run: node scripts/export-alias-ids.js
11
+ 3. The JSON file will be updated automatically
12
+ """
13
+
14
+ import json
15
+ import os
16
+
17
# Valid field types that the Vision model can detect.
# engine_vision embeds this list in its prompt (via ", ".join(...)) so the
# model only emits one of these values for "detected_type".
VALID_FIELD_TYPES = [
    "checkbox",
    "date",
    "dollar",
    "email",
    "fullName",
    "initial",
    "longText",
    "number",
    "phone",
    "shortText",
    "signature",
    "usAddress",
    "ssn",
    "iban"
]
34
+
35
+
36
def load_real_estate_schema():
    """
    Load the real estate schema from the exported JSON file
    (real_estate_alias_ids.json, generated by export-alias-ids.js).

    Returns an empty dict when the file is missing or unreadable.
    """
    json_path = os.path.join(os.path.dirname(__file__), "real_estate_alias_ids.json")

    if not os.path.exists(json_path):
        print(f"Warning: {json_path} not found. Run 'node scripts/export-alias-ids.js' to generate it.")
        return {}

    try:
        with open(json_path, "r") as handle:
            raw = json.load(handle)

        # Re-shape each entry: the export uses {description, type, name},
        # the mapper expects {desc, type, name}.
        return {
            alias: {
                "desc": entry.get("description", ""),
                "type": entry.get("type", "shortText"),
                "name": entry.get("name", alias)
            }
            for alias, entry in raw.items()
        }
    except Exception as e:
        print(f"Error loading real estate schema: {e}")
        return {}
66
+
67
+
68
# Load the schema once at import time so consumers (e.g. engine_mapping)
# can reference REAL_ESTATE_SCHEMA_MAP as a module-level constant.
REAL_ESTATE_SCHEMA_MAP = load_real_estate_schema()

# Print info about loaded schema
if REAL_ESTATE_SCHEMA_MAP:
    print(f"Loaded {len(REAL_ESTATE_SCHEMA_MAP)} alias IDs from real_estate_alias_ids.json")
else:
    print("Warning: No alias IDs loaded. Schema mapping will create dynamic fields for all detections.")
utils_geometry.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import shutil
4
+ import fitz # PyMuPDF
5
+ import io
6
+ from PIL import Image, ImageDraw, ImageFont
7
+ import config
8
+
9
def setup_debug_dir():
    """Reset the debug artifacts directory to a fresh, empty state."""
    target = config.DEBUG_DIR
    if os.path.exists(target):
        shutil.rmtree(target)
    os.makedirs(target)
    print(f"Debug directory cleared: {target}/")
14
+
15
def save_debug_image(image_bytes, name):
    """Write raw JPEG bytes into the debug directory; return the file path."""
    destination = os.path.join(config.DEBUG_DIR, f"{name}.jpg")
    with open(destination, "wb") as handle:
        handle.write(image_bytes)
    return destination
20
+
21
def save_debug_json(data, name):
    """Serialize `data` as pretty-printed JSON into the debug directory."""
    destination = os.path.join(config.DEBUG_DIR, f"{name}.json")
    with open(destination, "w") as handle:
        json.dump(data, handle, indent=2)
25
+
26
def normalize_bbox_to_top_left(bbox, page_height):
    """Convert a PDF bottom-left-origin bbox into top-left-origin image space."""
    top = page_height - bbox["y1"]
    bottom = page_height - bbox["y0"]
    return {"x0": bbox["x0"], "y0": top, "x1": bbox["x1"], "y1": bottom}


def get_words_from_page(page):
    """Return the page's words as (x0, y0, x1, y1, text, ...) tuples."""
    return page.get_text("words")


def calculate_smart_anchors(field_bbox, words, page_height):
    """
    Find the text nearest to a field box in three directions.

    A word is a candidate when it sits within 150pt to the left/right of
    the box at roughly the same vertical center (±12pt), or directly above
    with some horizontal overlap. The four closest candidates per
    direction are joined into a hint string.

    Returns:
        {"left": str, "right": str, "above": str}
    """
    box = normalize_bbox_to_top_left(field_bbox, page_height)
    fx0, fy0, fx1, fy1 = box["x0"], box["y0"], box["x1"], box["y1"]
    field_mid_y = (fy0 + fy1) / 2

    SEARCH_RADIUS = 150
    Y_ALIGNMENT_TOLERANCE = 12

    left_hits, right_hits, above_hits = [], [], []

    for word in words:
        wx0, wy0, wx1, wy1, text = word[0], word[1], word[2], word[3], word[4]
        word_mid_y = (wy0 + wy1) / 2
        aligned = abs(word_mid_y - field_mid_y) < Y_ALIGNMENT_TOLERANCE

        # Word entirely to the left, vertically aligned, within radius.
        if wx1 < fx0 and aligned and fx0 - wx1 < SEARCH_RADIUS:
            left_hits.append((fx0 - wx1, text))
        # Word entirely to the right, vertically aligned, within radius.
        if wx0 > fx1 and aligned and wx0 - fx1 < SEARCH_RADIUS:
            right_hits.append((wx0 - fx1, text))
        # Word above the box with horizontal overlap, within radius.
        horizontal_overlap = max(0, min(fx1, wx1) - max(fx0, wx0))
        if wy1 < fy0 and horizontal_overlap > 0 and fy0 - wy1 < SEARCH_RADIUS:
            above_hits.append((fy0 - wy1, text))

    def nearest_text(hits):
        hits.sort(key=lambda item: item[0])
        return " ".join(hit_text for _, hit_text in hits[:4])

    return {
        "left": nearest_text(left_hits),
        "right": nearest_text(right_hits),
        "above": nearest_text(above_hits)
    }
76
+
77
def render_hollow_debug_image(doc, page_num, fields):
    """
    Rasterize a page at 2x zoom and draw a hollow neon-green rectangle
    plus a numbered badge over each field's bbox.

    Returns JPEG bytes, or None when page_num is out of range.
    """
    if page_num >= len(doc):
        return None

    page = doc[page_num]
    zoom = 2.0
    pixmap = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
    image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
    canvas = ImageDraw.Draw(image)

    # PDF points -> pixel scale factors; page height drives the y-flip.
    scale_w = pixmap.width / page.rect.width
    scale_h = pixmap.height / page.rect.height
    page_h = page.rect.height

    try:
        font = ImageFont.truetype("arial.ttf", 30)
    except:
        font = ImageFont.load_default()

    for field in fields:
        badge_number = field["temp_id"]
        box = field["bbox"]
        # Convert bottom-left PDF coordinates to top-left pixel coordinates.
        left = box["x0"] * scale_w
        top = (page_h - box["y1"]) * scale_h
        right = box["x1"] * scale_w
        bottom = (page_h - box["y0"]) * scale_h

        canvas.rectangle([left, top, right, bottom], outline=config.BOX_COLOR, width=config.BOX_WIDTH)

        # Black-on-green ID badge just above the box's top-left corner.
        badge_w, badge_h = 50, 35
        badge_left, badge_top = left - 10, top - badge_h - 2
        canvas.rectangle([badge_left, badge_top, badge_left + badge_w, badge_top + badge_h], fill=config.BADGE_BG)
        canvas.text((badge_left + 10, badge_top + 5), str(badge_number), fill=config.BADGE_COLOR, font=font)

    out = io.BytesIO()
    image.save(out, format="JPEG", quality=85)
    return out.getvalue()
utils_grouping.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def clean_section_title(raw_title):
    """
    Normalize a raw section heading into a display title, e.g.
    '2. PURCHASE PRICE (U.S. currency)' -> 'Purchase Price'.
    Empty or missing titles fall back to 'General Information'.
    """
    if not raw_title:
        return "General Information"

    # Strip a leading bullet/number prefix such as "1." or "A.".
    without_prefix = re.sub(r'^[A-Z0-9]+\.\s*', '', raw_title)
    # Drop parenthesized qualifiers such as "(U.S. Currency)".
    without_parens = re.sub(r'\s*\(.*?\)', '', without_prefix)
    return without_parens.strip().title()
15
+
16
# Field types that stand alone and are never grouped under a section header.
UNGROUPABLE_TYPES = ["signature", "initial"]
17
+
18
def group_fields_by_section(fields):
    """
    Organize flat fields into logical groups based on the 'section'
    context extracted by the Vision model.

    Fields whose semanticType is in UNGROUPABLE_TYPES (signatures,
    initials) are left ungrouped. Each grouped field is mutated in place
    to carry its 'groupId'.

    Returns:
        (sorted_groups, fields): groups ordered by the page and
        y-position of their first member field, plus the (mutated) fields.
    """
    groups_map = {}

    for f in fields:
        # Signatures/initials never belong to a section group.
        if f.get("semanticType") in UNGROUPABLE_TYPES:
            continue

        raw_section = f.get("section", "General Information")
        group_title = clean_section_title(raw_section)

        # Stable slug-style ID derived from the cleaned title.
        group_id = f"grp_{group_title.lower().replace(' ', '_')[:30]}"

        # Create group if not exists
        if group_id not in groups_map:
            groups_map[group_id] = {
                "id": group_id,
                "title": group_title,
                "fieldIds": []
            }

        # Link field to group
        groups_map[group_id]["fieldIds"].append(f["id"])

        # Mutate the field object to include the link
        f["groupId"] = group_id

    # Perf fix: index fields once so the sort key is an O(1) lookup per
    # group instead of two linear scans of `fields` for every group.
    # NOTE(review): assumes field ids are unique — looks true upstream
    # (ids come from widget names / uuid4); confirm if duplicates possible.
    fields_by_id = {f["id"]: f for f in fields}

    def _first_field_position(group):
        first = fields_by_id.get(group["fieldIds"][0], {})
        return (first.get("page", 0), first.get("rect", {}).get("y", 0))

    # Sort groups by the page/y-position of their first field.
    sorted_groups = sorted(groups_map.values(), key=_first_field_position)

    return sorted_groups, fields