Spaces:
Sleeping
Sleeping
Alfonso Velasco committed on
Commit ·
2ddaa4e
1
Parent(s): 1af4bc8
fix coordinate scaling error
Browse files
app.py
CHANGED
|
@@ -43,7 +43,7 @@ except Exception as e:
|
|
| 43 |
class DocumentRequest(BaseModel):
|
| 44 |
pdf: str = None
|
| 45 |
image: str = None
|
| 46 |
-
split_wide_pages: bool = True
|
| 47 |
|
| 48 |
@app.get("/")
|
| 49 |
def home():
|
|
@@ -66,10 +66,10 @@ async def extract_document(request: DocumentRequest):
|
|
| 66 |
except Exception as e:
|
| 67 |
raise HTTPException(status_code=500, detail=str(e))
|
| 68 |
|
| 69 |
-
def process_image_chunk(image: Image.Image, offset_x: float = 0, offset_y: float = 0) -> List[Dict]:
|
| 70 |
"""
|
| 71 |
-
Process a single image or image chunk and return extractions
|
| 72 |
-
|
| 73 |
"""
|
| 74 |
img_width, img_height = image.size
|
| 75 |
|
|
@@ -78,7 +78,7 @@ def process_image_chunk(image: Image.Image, offset_x: float = 0, offset_y: float = 0) -> List[Dict]:
|
|
| 78 |
image,
|
| 79 |
truncation=True,
|
| 80 |
padding="max_length",
|
| 81 |
-
max_length=1024,
|
| 82 |
return_tensors="pt"
|
| 83 |
)
|
| 84 |
except Exception as e:
|
|
@@ -114,18 +114,12 @@ def process_image_chunk(image: Image.Image, offset_x: float = 0, offset_y: float = 0) -> List[Dict]:
|
|
| 114 |
if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
|
| 115 |
continue
|
| 116 |
|
| 117 |
-
# Convert to chunk coordinates
|
| 118 |
x = (x_norm / 1000.0) * img_width
|
| 119 |
y = (y_norm / 1000.0) * img_height
|
| 120 |
x2 = (x2_norm / 1000.0) * img_width
|
| 121 |
y2 = (y2_norm / 1000.0) * img_height
|
| 122 |
|
| 123 |
-
# Add offset to get coordinates in full page space
|
| 124 |
-
x += offset_x
|
| 125 |
-
y += offset_y
|
| 126 |
-
x2 += offset_x
|
| 127 |
-
y2 += offset_y
|
| 128 |
-
|
| 129 |
width = x2 - x
|
| 130 |
height = y2 - y
|
| 131 |
|
|
@@ -192,46 +186,48 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
|
|
| 192 |
chunk_width = MAX_WIDTH
|
| 193 |
|
| 194 |
for chunk_idx in range(num_chunks):
|
| 195 |
-
# Calculate chunk boundaries
|
| 196 |
start_x = chunk_idx * (chunk_width - OVERLAP)
|
| 197 |
end_x = min(start_x + chunk_width, img_width)
|
| 198 |
|
| 199 |
-
# Crop chunk
|
| 200 |
chunk = full_image.crop((start_x, 0, end_x, img_height))
|
| 201 |
|
| 202 |
print(f" Processing chunk {chunk_idx + 1}/{num_chunks}: x={start_x}-{end_x}")
|
| 203 |
|
| 204 |
-
# Process chunk
|
| 205 |
-
|
| 206 |
-
chunk_results = process_image_chunk(
|
| 207 |
-
chunk,
|
| 208 |
-
offset_x=chunk_offset_pdf,
|
| 209 |
-
offset_y=0
|
| 210 |
-
)
|
| 211 |
|
| 212 |
-
#
|
| 213 |
for result in chunk_results:
|
| 214 |
bbox = result['bbox']
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
bbox['
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
page_results.extend(chunk_results)
|
| 221 |
|
| 222 |
print(f" Total extractions from all chunks: {len(page_results)}")
|
| 223 |
|
| 224 |
else:
|
| 225 |
-
# Process full page
|
| 226 |
-
chunk_results = process_image_chunk(full_image)
|
| 227 |
|
| 228 |
-
# Scale coordinates
|
| 229 |
for result in chunk_results:
|
| 230 |
bbox = result['bbox']
|
| 231 |
-
bbox['x'] =
|
| 232 |
-
bbox['y'] =
|
| 233 |
-
bbox['width'] =
|
| 234 |
-
bbox['height'] =
|
| 235 |
|
| 236 |
page_results = chunk_results
|
| 237 |
|
|
@@ -278,7 +274,10 @@ def process_image(image_bytes):
|
|
| 278 |
image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|
| 279 |
img_width, img_height = image.size
|
| 280 |
|
| 281 |
-
|
|
|
|
|
|
|
|
|
|
| 282 |
|
| 283 |
return {
|
| 284 |
"document_type": "image",
|
|
|
|
| 43 |
class DocumentRequest(BaseModel):
|
| 44 |
pdf: str = None
|
| 45 |
image: str = None
|
| 46 |
+
split_wide_pages: bool = True
|
| 47 |
|
| 48 |
@app.get("/")
|
| 49 |
def home():
|
|
|
|
| 66 |
except Exception as e:
|
| 67 |
raise HTTPException(status_code=500, detail=str(e))
|
| 68 |
|
| 69 |
+
def process_image_chunk(image: Image.Image) -> List[Dict]:
|
| 70 |
"""
|
| 71 |
+
Process a single image or image chunk and return extractions with coordinates
|
| 72 |
+
relative to the chunk (0,0 at top-left of chunk).
|
| 73 |
"""
|
| 74 |
img_width, img_height = image.size
|
| 75 |
|
|
|
|
| 78 |
image,
|
| 79 |
truncation=True,
|
| 80 |
padding="max_length",
|
| 81 |
+
max_length=1024,
|
| 82 |
return_tensors="pt"
|
| 83 |
)
|
| 84 |
except Exception as e:
|
|
|
|
| 114 |
if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
|
| 115 |
continue
|
| 116 |
|
| 117 |
+
# Convert normalized coordinates to chunk pixel coordinates
|
| 118 |
x = (x_norm / 1000.0) * img_width
|
| 119 |
y = (y_norm / 1000.0) * img_height
|
| 120 |
x2 = (x2_norm / 1000.0) * img_width
|
| 121 |
y2 = (y2_norm / 1000.0) * img_height
|
| 122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
width = x2 - x
|
| 124 |
height = y2 - y
|
| 125 |
|
|
|
|
| 186 |
chunk_width = MAX_WIDTH
|
| 187 |
|
| 188 |
for chunk_idx in range(num_chunks):
|
| 189 |
+
# Calculate chunk boundaries in rendered image pixels
|
| 190 |
start_x = chunk_idx * (chunk_width - OVERLAP)
|
| 191 |
end_x = min(start_x + chunk_width, img_width)
|
| 192 |
|
| 193 |
+
# Crop chunk from rendered image
|
| 194 |
chunk = full_image.crop((start_x, 0, end_x, img_height))
|
| 195 |
|
| 196 |
print(f" Processing chunk {chunk_idx + 1}/{num_chunks}: x={start_x}-{end_x}")
|
| 197 |
|
| 198 |
+
# Process chunk (returns coordinates relative to chunk)
|
| 199 |
+
chunk_results = process_image_chunk(chunk)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
+
# Transform chunk-relative coordinates to full page coordinates
|
| 202 |
for result in chunk_results:
|
| 203 |
bbox = result['bbox']
|
| 204 |
+
|
| 205 |
+
# Add chunk offset (in rendered image pixels)
|
| 206 |
+
bbox['x'] += start_x
|
| 207 |
+
# y stays the same (no vertical splitting)
|
| 208 |
+
# bbox['y'] is already correct
|
| 209 |
+
|
| 210 |
+
# Now scale from rendered image pixels to PDF points
|
| 211 |
+
bbox['x'] = bbox['x'] / RENDER_SCALE
|
| 212 |
+
bbox['y'] = bbox['y'] / RENDER_SCALE
|
| 213 |
+
bbox['width'] = bbox['width'] / RENDER_SCALE
|
| 214 |
+
bbox['height'] = bbox['height'] / RENDER_SCALE
|
| 215 |
|
| 216 |
page_results.extend(chunk_results)
|
| 217 |
|
| 218 |
print(f" Total extractions from all chunks: {len(page_results)}")
|
| 219 |
|
| 220 |
else:
|
| 221 |
+
# Process full page (no splitting needed)
|
| 222 |
+
chunk_results = process_image_chunk(full_image)
|
| 223 |
|
| 224 |
+
# Scale coordinates from rendered image pixels to PDF points
|
| 225 |
for result in chunk_results:
|
| 226 |
bbox = result['bbox']
|
| 227 |
+
bbox['x'] = bbox['x'] / RENDER_SCALE
|
| 228 |
+
bbox['y'] = bbox['y'] / RENDER_SCALE
|
| 229 |
+
bbox['width'] = bbox['width'] / RENDER_SCALE
|
| 230 |
+
bbox['height'] = bbox['height'] / RENDER_SCALE
|
| 231 |
|
| 232 |
page_results = chunk_results
|
| 233 |
|
|
|
|
| 274 |
image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|
| 275 |
img_width, img_height = image.size
|
| 276 |
|
| 277 |
+
# Process the image
|
| 278 |
+
results = process_image_chunk(image)
|
| 279 |
+
|
| 280 |
+
# Coordinates are already in image pixels, no scaling needed for standalone images
|
| 281 |
|
| 282 |
return {
|
| 283 |
"document_type": "image",
|