Spaces:

alfonsovelp
/

llm_document

Sleeping

App Files Community

Alfonso Velasco committed on Oct 23, 2025

Commit

50304f8

1 Parent(s): 0f430a1

fix chunk

Browse files

Files changed (1) hide show

app.py +57 -68

app.py CHANGED Viewed

@@ -11,7 +11,6 @@ import tempfile
 import os
 import math
-# Fix the OMP_NUM_THREADS issue
 os.environ['OMP_NUM_THREADS'] = '1'
 os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
@@ -72,10 +71,7 @@ async def extract_document(request: DocumentRequest):
         raise HTTPException(status_code=500, detail=str(e))
 def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]:
-    """
-    Process a single image or image chunk and return extractions with coordinates
-    relative to the chunk (0,0 at top-left of chunk).
-    """
     img_width, img_height = image.size
     if img_width < 1 or img_height < 1:
@@ -121,7 +117,6 @@ def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]
     except RuntimeError as e:
         if "CUDA" in str(e):
             print(f"CUDA error encountered: {e}")
-            print("Falling back to CPU...")
             encoding = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in encoding.items()}
             model.cpu()
             with torch.no_grad():
@@ -149,15 +144,12 @@ def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]
     for idx, (token, box) in enumerate(zip(tokens, boxes)):
         try:
             if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>']:
-                x_norm = box[0]
-                y_norm = box[1]
-                x2_norm = box[2]
-                y2_norm = box[3]
                 if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
                     continue
-                # Convert normalized coordinates to chunk pixel coordinates
                 x = (x_norm / 1000.0) * img_width
                 y = (y_norm / 1000.0) * img_height
                 x2 = (x2_norm / 1000.0) * img_width
@@ -191,26 +183,15 @@ def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]
     return results
-def should_split_page(rendered_width: int, rendered_height: int,
-                     original_rotation: int, max_width: int) -> Tuple[bool, str]:
-    """
-    Determine if a page should be split and in which direction.
-    For rotated pages, we check against the RENDERED dimensions.
-    """
     if rendered_width > max_width:
         return (True, "horizontal")
     return (False, None)
 def split_image_intelligently(image: Image.Image, max_width: int,
                              overlap_ratio: float = 0.1) -> List[Tuple[Image.Image, int]]:
-    """
-    Split image into overlapping chunks along the width.
-    Returns:
-        List of (chunk_image, x_offset) tuples where x_offset is the pixel position
-        in the RENDERED image where this chunk starts.
-    """
     img_width, img_height = image.size
     if img_width <= max_width:
@@ -242,13 +223,11 @@ def split_image_intelligently(image: Image.Image, max_width: int,
 def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
     """
-    Process PDF and extract structured content with proper coordinate handling for rotated pages.
-    KEY INSIGHT: For rotated pages, PyMuPDF renders them already rotated. So:
-    - A page with rotation=270° and original size 1224x792 gets rendered as if it were 792x1224
-    - The rendered image dimensions match the "effective" dimensions
-    - We split based on rendered dimensions
-    - Coordinates in results should be in the EFFECTIVE coordinate space
     """
     RENDER_SCALE = 3.0
     MAX_WIDTH = 2000  # Maximum width for a chunk in rendered pixels
@@ -276,7 +255,7 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
                 print(f"  Original dimensions: {original_width}x{original_height}")
                 print(f"  Rotation: {original_rotation}°")
-                # CRITICAL: Determine effective dimensions (what the page looks like after rotation)
                 if original_rotation in [90, 270]:
                     effective_pdf_width = original_height
                     effective_pdf_height = original_width
@@ -286,25 +265,41 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
                 print(f"  Effective PDF dimensions (after rotation): {effective_pdf_width}x{effective_pdf_height}")
-                # Render page - PyMuPDF automatically handles rotation
                 mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
                 pix = page.get_pixmap(matrix=mat)
                 img_data = pix.tobytes("png")
                 full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
                 rendered_width, rendered_height = full_image.size
-                print(f"  Rendered dimensions: {rendered_width}x{rendered_height}")
-                # Verify: rendered dimensions should match effective dimensions * scale
                 expected_rendered_width = effective_pdf_width * RENDER_SCALE
                 expected_rendered_height = effective_pdf_height * RENDER_SCALE
-                print(f"  Expected rendered: {expected_rendered_width}x{expected_rendered_height}")
                 page_results = []
-                # Decide if we need to split based on RENDERED dimensions
                 should_split_decision, split_direction = should_split_page(
-                    rendered_width, rendered_height, original_rotation, MAX_WIDTH
                 )
                 if split_wide and should_split_decision:
@@ -320,28 +315,26 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
                         chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)
                         print(f"      Extracted {len(chunk_results)} items from chunk {chunk_idx + 1}")
-                        if chunk_results:
                             print(f"      Sample items from chunk {chunk_idx + 1}:")
                             for i, item in enumerate(chunk_results[:3]):
                                 print(f"        Item {i+1}: text='{item['text']}', chunk_x={item['bbox']['x']:.1f}px")
-                        # Transform coordinates:
-                        # 1. Add x_offset to move from chunk coordinates to full rendered image coordinates
-                        # 2. Divide by RENDER_SCALE to convert to PDF points in effective coordinate space
                         for result in chunk_results:
                             bbox = result['bbox']
-                            # Step 1: Chunk coordinates -> Rendered image coordinates
-                            chunk_x = bbox['x']
-                            rendered_x = chunk_x + x_offset
-                            # Step 2: Rendered coordinates -> PDF points (effective coordinate space)
-                            pdf_x = rendered_x / RENDER_SCALE
-                            pdf_y = bbox['y'] / RENDER_SCALE
-                            pdf_width = bbox['width'] / RENDER_SCALE
-                            pdf_height = bbox['height'] / RENDER_SCALE
-                            # Update bbox with PDF coordinates
                             bbox['x'] = pdf_x
                             bbox['y'] = pdf_y
                             bbox['width'] = pdf_width
@@ -349,7 +342,7 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
                             # Debug first item
                             if result == chunk_results[0]:
-                                print(f"      Coordinate transform: chunk_x={chunk_x:.1f}px + offset={x_offset}px = rendered_x={rendered_x:.1f}px → pdf_x={pdf_x:.1f}pts")
                         page_results.extend(chunk_results)
@@ -360,13 +353,12 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
                     print("  Processing full page without splitting...")
                     chunk_results = process_image_chunk(full_image, max_tokens=MAX_TOKENS)
-                    # Scale coordinates from rendered image pixels to PDF points
                     for result in chunk_results:
                         bbox = result['bbox']
-                        bbox['x'] = bbox['x'] / RENDER_SCALE
-                        bbox['y'] = bbox['y'] / RENDER_SCALE
-                        bbox['width'] = bbox['width'] / RENDER_SCALE
-                        bbox['height'] = bbox['height'] / RENDER_SCALE
                     page_results = chunk_results
                     print(f"  Extracted {len(chunk_results)} items")
@@ -383,13 +375,13 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
                     print(f"    X: {min(x_coords):.1f} to {max(x_coords):.1f} (effective width: {effective_pdf_width:.1f})")
                     print(f"    Y: {min(y_coords):.1f} to {max(y_coords):.1f} (effective height: {effective_pdf_height:.1f})")
-                    # Warn if coordinates exceed page dimensions
-                    if max(x_coords) > effective_pdf_width:
-                        print(f"  WARNING: Some X coordinates exceed effective page width!")
-                    if max(y_coords) > effective_pdf_height:
-                        print(f"  WARNING: Some Y coordinates exceed effective page height!")
-                # Return results with proper dimensions
                 all_results.append({
                     "page": page_num + 1,
                     "page_dimensions": {
@@ -427,10 +419,7 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
     }
 def deduplicate_results(results: List[Dict], tolerance: float = 10.0) -> List[Dict]:
-    """
-    Remove duplicate extractions using spatial clustering.
-    Tolerance is in PDF points.
-    """
     if not results:
         return []
@@ -479,7 +468,7 @@ def process_image(image_bytes):
     print(f"Processing single image: {img_width}x{img_height}")
-    should_split_decision, _ = should_split_page(img_width, img_height, 0, 2000)
     if should_split_decision:
         print("  Image is wide, splitting into chunks...")

 import os
 import math
 os.environ['OMP_NUM_THREADS'] = '1'
 os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
         raise HTTPException(status_code=500, detail=str(e))
 def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]:
+    """Process a single image chunk and return extractions."""
     img_width, img_height = image.size
     if img_width < 1 or img_height < 1:
     except RuntimeError as e:
         if "CUDA" in str(e):
             print(f"CUDA error encountered: {e}")
             encoding = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in encoding.items()}
             model.cpu()
             with torch.no_grad():
     for idx, (token, box) in enumerate(zip(tokens, boxes)):
         try:
             if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>']:
+                x_norm, y_norm, x2_norm, y2_norm = box
                 if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
                     continue
+                # Convert normalized coordinates to pixel coordinates
                 x = (x_norm / 1000.0) * img_width
                 y = (y_norm / 1000.0) * img_height
                 x2 = (x2_norm / 1000.0) * img_width
     return results
+def should_split_page(rendered_width: int, rendered_height: int, max_width: int) -> Tuple[bool, str]:
+    """Determine if a page should be split based on rendered dimensions."""
     if rendered_width > max_width:
         return (True, "horizontal")
     return (False, None)
 def split_image_intelligently(image: Image.Image, max_width: int,
                              overlap_ratio: float = 0.1) -> List[Tuple[Image.Image, int]]:
+    """Split image into overlapping chunks along the width."""
     img_width, img_height = image.size
     if img_width <= max_width:
 def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
     """
+    Process PDF with proper handling of rotated pages.
+    KEY FIX: We now work with ACTUAL rendered dimensions instead of assuming
+    they match the effective dimensions. We map coordinates based on the
+    actual render, then transform them to the effective coordinate space.
     """
     RENDER_SCALE = 3.0
     MAX_WIDTH = 2000  # Maximum width for a chunk in rendered pixels
                 print(f"  Original dimensions: {original_width}x{original_height}")
                 print(f"  Rotation: {original_rotation}°")
+                # Determine effective dimensions (what the page looks like when properly oriented)
                 if original_rotation in [90, 270]:
                     effective_pdf_width = original_height
                     effective_pdf_height = original_width
                 print(f"  Effective PDF dimensions (after rotation): {effective_pdf_width}x{effective_pdf_height}")
+                # Render the page - PyMuPDF may not rotate it as expected
                 mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
                 pix = page.get_pixmap(matrix=mat)
                 img_data = pix.tobytes("png")
                 full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
                 rendered_width, rendered_height = full_image.size
+                print(f"  Actual rendered dimensions: {rendered_width}x{rendered_height}")
+                # Detect if dimensions don't match expectations
                 expected_rendered_width = effective_pdf_width * RENDER_SCALE
                 expected_rendered_height = effective_pdf_height * RENDER_SCALE
+                dimensions_swapped = False
+                if (abs(rendered_width - expected_rendered_height) < 10 and
+                    abs(rendered_height - expected_rendered_width) < 10):
+                    print(f"  ⚠️  Dimensions are swapped! Rotating image 90° to match expected orientation.")
+                    # Rotate the image to match expected orientation
+                    full_image = full_image.rotate(-90, expand=True)
+                    rendered_width, rendered_height = full_image.size
+                    print(f"  After rotation: {rendered_width}x{rendered_height}")
+                    dimensions_swapped = True
+                # Calculate the scale factor from rendered pixels to effective PDF points
+                # This handles any discrepancies between expected and actual rendering
+                scale_x = rendered_width / (effective_pdf_width * RENDER_SCALE)
+                scale_y = rendered_height / (effective_pdf_height * RENDER_SCALE)
+                print(f"  Scale factors: x={scale_x:.4f}, y={scale_y:.4f}")
                 page_results = []
+                # Decide if we need to split
                 should_split_decision, split_direction = should_split_page(
+                    rendered_width, rendered_height, MAX_WIDTH
                 )
                 if split_wide and should_split_decision:
                         chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)
                         print(f"      Extracted {len(chunk_results)} items from chunk {chunk_idx + 1}")
+                        if chunk_results and chunk_idx < 2:
                             print(f"      Sample items from chunk {chunk_idx + 1}:")
                             for i, item in enumerate(chunk_results[:3]):
                                 print(f"        Item {i+1}: text='{item['text']}', chunk_x={item['bbox']['x']:.1f}px")
+                        # Transform coordinates from chunk space to PDF effective space
                         for result in chunk_results:
                             bbox = result['bbox']
+                            # Step 1: Chunk coordinates -> Full rendered image coordinates
+                            rendered_x = bbox['x'] + x_offset
+                            rendered_y = bbox['y']
+                            # Step 2: Rendered coordinates -> PDF points in effective space
+                            # Account for the actual render scale and any dimension swapping
+                            pdf_x = rendered_x / (RENDER_SCALE * scale_x)
+                            pdf_y = rendered_y / (RENDER_SCALE * scale_y)
+                            pdf_width = bbox['width'] / (RENDER_SCALE * scale_x)
+                            pdf_height = bbox['height'] / (RENDER_SCALE * scale_y)
                             bbox['x'] = pdf_x
                             bbox['y'] = pdf_y
                             bbox['width'] = pdf_width
                             # Debug first item
                             if result == chunk_results[0]:
+                                print(f"      Transform: chunk_x={bbox['x'] - pdf_x + rendered_x - x_offset:.1f}px + offset={x_offset}px = rendered_x={rendered_x:.1f}px → pdf_x={pdf_x:.1f}pts")
                         page_results.extend(chunk_results)
                     print("  Processing full page without splitting...")
                     chunk_results = process_image_chunk(full_image, max_tokens=MAX_TOKENS)
                     for result in chunk_results:
                         bbox = result['bbox']
+                        bbox['x'] = bbox['x'] / (RENDER_SCALE * scale_x)
+                        bbox['y'] = bbox['y'] / (RENDER_SCALE * scale_y)
+                        bbox['width'] = bbox['width'] / (RENDER_SCALE * scale_x)
+                        bbox['height'] = bbox['height'] / (RENDER_SCALE * scale_y)
                     page_results = chunk_results
                     print(f"  Extracted {len(chunk_results)} items")
                     print(f"    X: {min(x_coords):.1f} to {max(x_coords):.1f} (effective width: {effective_pdf_width:.1f})")
                     print(f"    Y: {min(y_coords):.1f} to {max(y_coords):.1f} (effective height: {effective_pdf_height:.1f})")
+                    if max(x_coords) > effective_pdf_width + 10:
+                        print(f"  ⚠️  WARNING: Some X coordinates still exceed effective page width!")
+                    elif max(x_coords) > effective_pdf_width:
+                        print(f"  ℹ️  Note: Max X slightly exceeds width (likely edge items), but within tolerance")
+                    else:
+                        print(f"  ✓ All coordinates within expected bounds")
                 all_results.append({
                     "page": page_num + 1,
                     "page_dimensions": {
     }
 def deduplicate_results(results: List[Dict], tolerance: float = 10.0) -> List[Dict]:
+    """Remove duplicate extractions using spatial clustering."""
     if not results:
         return []
     print(f"Processing single image: {img_width}x{img_height}")
+    should_split_decision, _ = should_split_page(img_width, img_height, 2000)
     if should_split_decision:
         print("  Image is wide, splitting into chunks...")