Spaces:

alfonsovelp
/

llm_document

Sleeping

App Files Community

Alfonso Velasco committed on Oct 21, 2025

Commit

2ddaa4e

1 Parent(s): 1af4bc8

fix coordinate scaling error

Browse files

Files changed (1) hide show

app.py +33 -34

app.py CHANGED Viewed

@@ -43,7 +43,7 @@ except Exception as e:
 class DocumentRequest(BaseModel):
     pdf: str = None
     image: str = None
-    split_wide_pages: bool = True  # New option to split wide pages
 @app.get("/")
 def home():
@@ -66,10 +66,10 @@ async def extract_document(request: DocumentRequest):
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
-def process_image_chunk(image: Image.Image, offset_x: float = 0, offset_y: float = 0) -> List[Dict]:
     """
-    Process a single image or image chunk and return extractions.
-    offset_x and offset_y are used when processing chunks of a larger image.
     """
     img_width, img_height = image.size
@@ -78,7 +78,7 @@ def process_image_chunk(image: Image.Image, offset_x: float = 0, offset_y: float
             image,
             truncation=True,
             padding="max_length",
-            max_length=1024,  # Increased limit
             return_tensors="pt"
         )
     except Exception as e:
@@ -114,18 +114,12 @@ def process_image_chunk(image: Image.Image, offset_x: float = 0, offset_y: float
             if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
                 continue
-            # Convert to chunk coordinates
             x = (x_norm / 1000.0) * img_width
             y = (y_norm / 1000.0) * img_height
             x2 = (x2_norm / 1000.0) * img_width
             y2 = (y2_norm / 1000.0) * img_height
-            # Add offset to get coordinates in full page space
-            x += offset_x
-            y += offset_y
-            x2 += offset_x
-            y2 += offset_y
             width = x2 - x
             height = y2 - y
@@ -192,46 +186,48 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
                 chunk_width = MAX_WIDTH
                 for chunk_idx in range(num_chunks):
-                    # Calculate chunk boundaries
                     start_x = chunk_idx * (chunk_width - OVERLAP)
                     end_x = min(start_x + chunk_width, img_width)
-                    # Crop chunk
                     chunk = full_image.crop((start_x, 0, end_x, img_height))
                     print(f"  Processing chunk {chunk_idx + 1}/{num_chunks}: x={start_x}-{end_x}")
-                    # Process chunk and adjust coordinates
-                    chunk_offset_pdf = start_x / RENDER_SCALE
-                    chunk_results = process_image_chunk(
-                        chunk,
-                        offset_x=chunk_offset_pdf,
-                        offset_y=0
-                    )
-                    # Scale coordinates back to PDF space
                     for result in chunk_results:
                         bbox = result['bbox']
-                        bbox['x'] /= RENDER_SCALE
-                        bbox['y'] /= RENDER_SCALE
-                        bbox['width'] /= RENDER_SCALE
-                        bbox['height'] /= RENDER_SCALE
                     page_results.extend(chunk_results)
                 print(f"  Total extractions from all chunks: {len(page_results)}")
             else:
-                # Process full page
-                chunk_results = process_image_chunk(full_image, 0, 0)
-                # Scale coordinates back to PDF space
                 for result in chunk_results:
                     bbox = result['bbox']
-                    bbox['x'] = (bbox['x'] / img_width) * page_width
-                    bbox['y'] = (bbox['y'] / img_height) * page_height
-                    bbox['width'] = (bbox['width'] / img_width) * page_width
-                    bbox['height'] = (bbox['height'] / img_height) * page_height
                 page_results = chunk_results
@@ -278,7 +274,10 @@ def process_image(image_bytes):
     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     img_width, img_height = image.size
-    results = process_image_chunk(image, 0, 0)
     return {
         "document_type": "image",

 class DocumentRequest(BaseModel):
     pdf: str = None
     image: str = None
+    split_wide_pages: bool = True
 @app.get("/")
 def home():
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
+def process_image_chunk(image: Image.Image) -> List[Dict]:
     """
+    Process a single image or image chunk and return extractions with coordinates
+    relative to the chunk (0,0 at top-left of chunk).
     """
     img_width, img_height = image.size
             image,
             truncation=True,
             padding="max_length",
+            max_length=1024,
             return_tensors="pt"
         )
     except Exception as e:
             if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
                 continue
+            # Convert normalized coordinates to chunk pixel coordinates
             x = (x_norm / 1000.0) * img_width
             y = (y_norm / 1000.0) * img_height
             x2 = (x2_norm / 1000.0) * img_width
             y2 = (y2_norm / 1000.0) * img_height
             width = x2 - x
             height = y2 - y
                 chunk_width = MAX_WIDTH
                 for chunk_idx in range(num_chunks):
+                    # Calculate chunk boundaries in rendered image pixels
                     start_x = chunk_idx * (chunk_width - OVERLAP)
                     end_x = min(start_x + chunk_width, img_width)
+                    # Crop chunk from rendered image
                     chunk = full_image.crop((start_x, 0, end_x, img_height))
                     print(f"  Processing chunk {chunk_idx + 1}/{num_chunks}: x={start_x}-{end_x}")
+                    # Process chunk (returns coordinates relative to chunk)
+                    chunk_results = process_image_chunk(chunk)
+                    # Transform chunk-relative coordinates to full page coordinates
                     for result in chunk_results:
                         bbox = result['bbox']
+                        # Add chunk offset (in rendered image pixels)
+                        bbox['x'] += start_x
+                        # y stays the same (no vertical splitting)
+                        # bbox['y'] is already correct
+                        # Now scale from rendered image pixels to PDF points
+                        bbox['x'] = bbox['x'] / RENDER_SCALE
+                        bbox['y'] = bbox['y'] / RENDER_SCALE
+                        bbox['width'] = bbox['width'] / RENDER_SCALE
+                        bbox['height'] = bbox['height'] / RENDER_SCALE
                     page_results.extend(chunk_results)
                 print(f"  Total extractions from all chunks: {len(page_results)}")
             else:
+                # Process full page (no splitting needed)
+                chunk_results = process_image_chunk(full_image)
+                # Scale coordinates from rendered image pixels to PDF points
                 for result in chunk_results:
                     bbox = result['bbox']
+                    bbox['x'] = bbox['x'] / RENDER_SCALE
+                    bbox['y'] = bbox['y'] / RENDER_SCALE
+                    bbox['width'] = bbox['width'] / RENDER_SCALE
+                    bbox['height'] = bbox['height'] / RENDER_SCALE
                 page_results = chunk_results
     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     img_width, img_height = image.size
+    # Process the image
+    results = process_image_chunk(image)
+    # Coordinates are already in image pixels, no scaling needed for standalone images
     return {
         "document_type": "image",