Spaces:

alfonsovelp
/

llm_document

Running on T4

App Files Files Community

Alfonso Velasco committed on Oct 23, 2025

Commit

330c438

1 Parent(s): 259596e

fix chunk

Browse files

Files changed (1) hide show

app.py +82 -67

app.py CHANGED Viewed

@@ -208,104 +208,92 @@ def should_split_page(rendered_width: int, rendered_height: int,
     # For rotated pages (90 or 270), the page has already been rotated in the rendered image
     # So we just check the rendered dimensions directly
-    aspect_ratio = rendered_width / rendered_height if rendered_height > 0 else 1
-    # Don't split if page is portrait or square-ish
-    if aspect_ratio <= 1.3:
-        return False, "none"
-    # Check if page is too wide
     if rendered_width > max_width:
-        # For very wide pages (like 2-page spreads), split horizontally
-        if aspect_ratio > 1.8:
-            return True, "horizontal"
-        # For moderately wide pages, try to fit
-        else:
-            return True, "horizontal"
-    return False, "none"
-def split_image_intelligently(image: Image.Image, max_width: int, overlap_ratio: float = 0.15) -> List[Tuple[Image.Image, int]]:
     """
-    Split an image into overlapping chunks intelligently.
-    Returns list of (chunk_image, x_offset) tuples.
     """
     img_width, img_height = image.size
-    chunks = []
     # Calculate overlap in pixels
     overlap_pixels = int(max_width * overlap_ratio)
-    # Calculate effective step size
     step_size = max_width - overlap_pixels
-    if step_size <= 0:
-        step_size = max_width // 2
-    # Calculate number of chunks needed
-    num_chunks = math.ceil((img_width - overlap_pixels) / step_size)
-    # If we'd only need 2 chunks and the second would be very small, just use 2 equal chunks
-    if num_chunks == 2:
-        second_chunk_width = img_width - step_size
-        if second_chunk_width < max_width * 0.6:  # If second chunk would be less than 60% of max
-            # Split into two equal chunks with overlap
-            chunk_width = (img_width + overlap_pixels) // 2
-            chunks.append((image.crop((0, 0, chunk_width, img_height)), 0))
-            chunks.append((image.crop((img_width - chunk_width, 0, img_width, img_height)),
-                          img_width - chunk_width))
-            return chunks
-    # Standard overlapping chunks
-    for i in range(num_chunks):
-        start_x = i * step_size
-        end_x = min(start_x + max_width, img_width)
-        # Ensure we don't create tiny slivers
-        if end_x - start_x < max_width * 0.3:  # Skip if less than 30% of max width
-            continue
-        chunk = image.crop((start_x, 0, end_x, img_height))
-        chunks.append((chunk, start_x))
-        print(f"  Chunk {i+1}/{num_chunks}: x={start_x}-{end_x} (width={end_x-start_x})")
     return chunks
-def process_pdf(pdf_bytes, split_wide: bool = True):
-    """Process PDF document, optionally splitting wide pages into chunks"""
     all_results = []
-    with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_file:
         tmp_file.write(pdf_bytes)
         tmp_file.flush()
-        try:
-            pdf_document = fitz.open(tmp_file.name)
-        except Exception as e:
-            os.unlink(tmp_file.name)
-            raise HTTPException(status_code=400, detail=f"Failed to open PDF: {str(e)}")
-        # Configuration
-        RENDER_SCALE = 2.0
-        MAX_WIDTH = 2000  # Increased for better quality
-        MAX_TOKENS = 768  # Increased token limit for complex documents
         for page_num in range(len(pdf_document)):
             try:
                 page = pdf_document[page_num]
-                page_rect = page.rect
-                # Original page dimensions before any rotation
-                original_width = page_rect.width
-                original_height = page_rect.height
                 original_rotation = page.rotation
-                print(f"\nPage {page_num + 1}:")
                 print(f"  Original dimensions: {original_width}x{original_height}")
                 print(f"  Rotation: {original_rotation}°")
-                # Render page - PyMuPDF automatically applies rotation
                 mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
                 pix = page.get_pixmap(matrix=mat)
                 img_data = pix.tobytes("png")
@@ -342,17 +330,33 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
                         chunk_width, chunk_height = chunk_image.size
                         print(f"    Processing chunk {chunk_idx + 1}: offset={x_offset}, size={chunk_width}x{chunk_height}")
                         # Process chunk with increased token limit
                         chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)
-                        print(f"      Extracted {len(chunk_results)} items")
                         # Transform chunk-relative coordinates to full page coordinates
                         for result in chunk_results:
                             bbox = result['bbox']
                             # Add chunk offset (in rendered image pixels)
                             bbox['x'] += x_offset
                             # Scale from rendered image pixels to PDF points
                             # Use effective dimensions for proper scaling
                             bbox['x'] = bbox['x'] / RENDER_SCALE
@@ -362,6 +366,8 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
                         page_results.extend(chunk_results)
                 else:
                     # Process full page without splitting
                     print("  Processing full page without splitting...")
@@ -382,6 +388,11 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
                 unique_results = deduplicate_results(page_results)
                 print(f"  After deduplication: {len(unique_results)} unique items")
                 # Return results with both original and effective dimensions
                 all_results.append({
                     "page": page_num + 1,
@@ -394,6 +405,10 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
                         "height": effective_pdf_height
                     },
                     "rotation": original_rotation,
                     "extractions": unique_results
                 })

     # For rotated pages (90 or 270), the page has already been rotated in the rendered image
     # So we just check the rendered dimensions directly
+    # Check if width exceeds max_width
     if rendered_width > max_width:
+        return (True, "horizontal")
+    # Could add vertical splitting logic here if needed
+    # if rendered_height > max_height:
+    #     return (True, "vertical")
+    return (False, None)
+def split_image_intelligently(image: Image.Image, max_width: int,
+                             overlap_ratio: float = 0.1) -> List[Tuple[Image.Image, int]]:
     """
+    Split image into overlapping chunks.
+    Returns:
+        List of (chunk_image, x_offset) tuples where x_offset is the pixel position
+        in the original image where this chunk starts.
     """
     img_width, img_height = image.size
+    if img_width <= max_width:
+        return [(image, 0)]
     # Calculate overlap in pixels
     overlap_pixels = int(max_width * overlap_ratio)
     step_size = max_width - overlap_pixels
+    chunks = []
+    x_position = 0
+    while x_position < img_width:
+        # Calculate the right edge of this chunk
+        right_edge = min(x_position + max_width, img_width)
+        # If what remains after this chunk would be a very small last chunk, extend this chunk to the image edge instead
+        if right_edge < img_width and (img_width - right_edge) < (max_width * 0.3):
+            right_edge = img_width
+        # Crop the chunk
+        chunk = image.crop((x_position, 0, right_edge, img_height))
+        chunks.append((chunk, x_position))
+        print(f"  Created chunk at x={x_position}, width={right_edge - x_position}")
+        # If we've reached the end, break
+        if right_edge >= img_width:
+            break
+        # Move to next chunk position
+        x_position += step_size
     return chunks
+def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
+    """
+    Process PDF and extract structured content.
+    """
+    RENDER_SCALE = 3.0  # High resolution for better OCR
+    MAX_WIDTH = 2000  # Maximum width for a single chunk (in rendered pixels)
+    MAX_TOKENS = 768  # Increased from 512 for better coverage
     all_results = []
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
         tmp_file.write(pdf_bytes)
         tmp_file.flush()
+        pdf_document = fitz.open(tmp_file.name)
         for page_num in range(len(pdf_document)):
             try:
                 page = pdf_document[page_num]
+                # Get original page dimensions and rotation
+                original_rect = page.rect
+                original_width = original_rect.width
+                original_height = original_rect.height
                 original_rotation = page.rotation
+                print(f"\nProcessing page {page_num + 1}:")
                 print(f"  Original dimensions: {original_width}x{original_height}")
                 print(f"  Rotation: {original_rotation}°")
+                # Render page at high resolution
+                # PyMuPDF automatically handles rotation when rendering
                 mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
                 pix = page.get_pixmap(matrix=mat)
                 img_data = pix.tobytes("png")
                         chunk_width, chunk_height = chunk_image.size
                         print(f"    Processing chunk {chunk_idx + 1}: offset={x_offset}, size={chunk_width}x{chunk_height}")
+                        # IMPORTANT: Save chunk for debugging
+                        if chunk_idx == 0:
+                            print(f"      DEBUG: Saving first chunk for inspection")
+                            # chunk_image.save(f"/tmp/debug_chunk_{page_num}_{chunk_idx}.png")
                         # Process chunk with increased token limit
                         chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)
+                        print(f"      Extracted {len(chunk_results)} items from chunk {chunk_idx + 1}")
+                        # DEBUG: Print sample of first few items
+                        if chunk_results:
+                            print(f"      Sample items from chunk {chunk_idx + 1}:")
+                            for i, item in enumerate(chunk_results[:3]):
+                                print(f"        Item {i+1}: text='{item['text']}', x={item['bbox']['x']:.1f}")
                         # Transform chunk-relative coordinates to full page coordinates
                         for result in chunk_results:
                             bbox = result['bbox']
                             # Add chunk offset (in rendered image pixels)
+                            original_chunk_x = bbox['x']
                             bbox['x'] += x_offset
+                            # DEBUG: Print transformation for first item in each chunk
+                            if result == chunk_results[0]:
+                                print(f"      Coordinate transform: chunk_x={original_chunk_x:.1f} + offset={x_offset} = page_x={bbox['x']:.1f}")
                             # Scale from rendered image pixels to PDF points
                             # Use effective dimensions for proper scaling
                             bbox['x'] = bbox['x'] / RENDER_SCALE
                         page_results.extend(chunk_results)
+                    print(f"  Total items before deduplication: {len(page_results)}")
                 else:
                     # Process full page without splitting
                     print("  Processing full page without splitting...")
                 unique_results = deduplicate_results(page_results)
                 print(f"  After deduplication: {len(unique_results)} unique items")
+                # DEBUG: Print x-coordinate range of results
+                if unique_results:
+                    x_coords = [item['bbox']['x'] for item in unique_results]
+                    print(f"  X-coordinate range: {min(x_coords):.1f} to {max(x_coords):.1f}")
                 # Return results with both original and effective dimensions
                 all_results.append({
                     "page": page_num + 1,
                         "height": effective_pdf_height
                     },
                     "rotation": original_rotation,
+                    "rendered_dimensions": {
+                        "width": rendered_width,
+                        "height": rendered_height
+                    },
                     "extractions": unique_results
                 })