Alfonso Velasco committed on
Commit
259596e
·
1 Parent(s): c9e5fd6
Files changed (1) hide show
  1. app.py +228 -104
app.py CHANGED
@@ -1,6 +1,6 @@
1
  from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel
3
- from typing import Dict, Any, List
4
  from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
5
  import torch
6
  from PIL import Image
@@ -9,6 +9,7 @@ import base64
9
  import fitz # PyMuPDF
10
  import tempfile
11
  import os
 
12
 
13
  # Fix the OMP_NUM_THREADS issue
14
  os.environ['OMP_NUM_THREADS'] = '1'
@@ -73,7 +74,7 @@ async def extract_document(request: DocumentRequest):
73
  print(f"Error in extract_document: {error_details}")
74
  raise HTTPException(status_code=500, detail=str(e))
75
 
76
- def process_image_chunk(image: Image.Image) -> List[Dict]:
77
  """
78
  Process a single image or image chunk and return extractions with coordinates
79
  relative to the chunk (0,0 at top-left of chunk).
@@ -90,7 +91,7 @@ def process_image_chunk(image: Image.Image) -> List[Dict]:
90
  image,
91
  truncation=True,
92
  padding="max_length",
93
- max_length=512, # Reduced from 1024 for better stability
94
  return_tensors="pt"
95
  )
96
  except Exception as e:
@@ -98,11 +99,11 @@ def process_image_chunk(image: Image.Image) -> List[Dict]:
98
  try:
99
  encoding = processor(
100
  image,
101
- text=[""] * 512,
102
- boxes=[[0, 0, 0, 0]] * 512,
103
  truncation=True,
104
  padding="max_length",
105
- max_length=512,
106
  return_tensors="pt"
107
  )
108
  except Exception as e2:
@@ -198,6 +199,79 @@ def process_image_chunk(image: Image.Image) -> List[Dict]:
198
 
199
  return results
200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  def process_pdf(pdf_bytes, split_wide: bool = True):
202
  """Process PDF document, optionally splitting wide pages into chunks"""
203
  all_results = []
@@ -212,102 +286,86 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
212
  os.unlink(tmp_file.name)
213
  raise HTTPException(status_code=400, detail=f"Failed to open PDF: {str(e)}")
214
 
 
215
  RENDER_SCALE = 2.0
216
- MAX_WIDTH = 1800 # Reduced from 2000 for better stability
217
- OVERLAP = 150 # Reduced overlap
218
 
219
  for page_num in range(len(pdf_document)):
220
  try:
221
  page = pdf_document[page_num]
222
  page_rect = page.rect
223
- page_width = page_rect.width
224
- page_height = page_rect.height
225
 
226
- print(f"Page {page_num + 1}: {page_width}x{page_height}, rotation={page.rotation}°")
 
 
 
227
 
228
- # Render page
 
 
 
 
229
  mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
230
  pix = page.get_pixmap(matrix=mat)
231
  img_data = pix.tobytes("png")
232
  full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
233
- img_width, img_height = full_image.size
 
 
234
 
235
- print(f"Rendered image: {img_width}x{img_height}")
 
 
 
 
 
 
 
 
 
236
 
237
  page_results = []
238
 
239
- # Check if page is too wide and should be split
240
- if split_wide and img_width > MAX_WIDTH:
241
- print(f"Page is wide ({img_width}px), splitting into chunks...")
242
-
243
- # Calculate proper number of chunks with safer logic
244
- step_size = MAX_WIDTH - OVERLAP
245
- if step_size <= 0:
246
- step_size = MAX_WIDTH // 2 # Fallback
247
-
248
- num_chunks = max(1, ((img_width - OVERLAP) + step_size - 1) // step_size)
249
 
250
- print(f"Will create {num_chunks} chunks with step size {step_size}")
 
251
 
252
- for chunk_idx in range(num_chunks):
253
- # Calculate chunk boundaries in rendered image pixels
254
- start_x = chunk_idx * step_size
255
- end_x = min(start_x + MAX_WIDTH, img_width)
256
-
257
- # Ensure chunk has valid dimensions
258
- if end_x <= start_x:
259
- print(f" Skipping invalid chunk {chunk_idx + 1}: start_x={start_x}, end_x={end_x}")
260
- continue
261
-
262
- chunk_actual_width = end_x - start_x
263
-
264
- # Skip chunks that are too narrow
265
- if chunk_actual_width < 100:
266
- print(f" Skipping narrow chunk {chunk_idx + 1}: width={chunk_actual_width}")
267
- continue
268
 
269
- print(f" Processing chunk {chunk_idx + 1}/{num_chunks}: x={start_x}-{end_x} (width={chunk_actual_width})")
 
 
270
 
271
- try:
272
- # Crop chunk from rendered image
273
- chunk = full_image.crop((start_x, 0, end_x, img_height))
274
 
275
- # Verify chunk dimensions
276
- verify_width, verify_height = chunk.size
277
- print(f" Chunk actual size: {verify_width}x{verify_height}")
278
 
279
- # Process chunk (returns coordinates relative to chunk)
280
- chunk_results = process_image_chunk(chunk)
281
- print(f" Extracted {len(chunk_results)} items from chunk")
282
-
283
- # Transform chunk-relative coordinates to full page coordinates
284
- for result in chunk_results:
285
- bbox = result['bbox']
286
-
287
- # Add chunk offset (in rendered image pixels)
288
- bbox['x'] += start_x
289
- # y stays the same (no vertical splitting)
290
-
291
- # Now scale from rendered image pixels to PDF points
292
- bbox['x'] = bbox['x'] / RENDER_SCALE
293
- bbox['y'] = bbox['y'] / RENDER_SCALE
294
- bbox['width'] = bbox['width'] / RENDER_SCALE
295
- bbox['height'] = bbox['height'] / RENDER_SCALE
296
-
297
- page_results.extend(chunk_results)
298
-
299
- except Exception as e:
300
- print(f" Error processing chunk {chunk_idx + 1}: {e}")
301
- import traceback
302
- traceback.print_exc()
303
- continue
304
 
305
- print(f" Total extractions from all chunks: {len(page_results)}")
306
 
307
  else:
308
- # Process full page (no splitting needed)
309
- print("Processing full page without splitting")
310
- chunk_results = process_image_chunk(full_image)
311
 
312
  # Scale coordinates from rendered image pixels to PDF points
313
  for result in chunk_results:
@@ -318,35 +376,24 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
318
  bbox['height'] = bbox['height'] / RENDER_SCALE
319
 
320
  page_results = chunk_results
 
321
 
322
- # Remove duplicates from overlapping chunks
323
- unique_results = []
324
- seen_boxes = set()
325
-
326
- DEDUP_TOLERANCE = 5 # pixels tolerance for deduplication
327
-
328
- for result in page_results:
329
- bbox = result['bbox']
330
- box_tuple = (
331
- round(bbox['x'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE,
332
- round(bbox['y'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE,
333
- round(bbox['width'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE,
334
- round(bbox['height'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE
335
- )
336
-
337
- if box_tuple not in seen_boxes:
338
- seen_boxes.add(box_tuple)
339
- unique_results.append(result)
340
-
341
- print(f" After deduplication: {len(unique_results)} unique extractions")
342
 
 
343
  all_results.append({
344
  "page": page_num + 1,
345
  "page_dimensions": {
346
- "width": page_width,
347
- "height": page_height
 
 
 
 
348
  },
349
- "rotation": page.rotation,
350
  "extractions": unique_results
351
  })
352
 
@@ -358,6 +405,7 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
358
  all_results.append({
359
  "page": page_num + 1,
360
  "page_dimensions": {"width": 0, "height": 0},
 
361
  "rotation": 0,
362
  "extractions": [],
363
  "error": str(e)
@@ -372,15 +420,87 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
372
  "pages": all_results
373
  }
374
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  def process_image(image_bytes):
376
  """Process single image"""
377
  image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
378
  img_width, img_height = image.size
379
 
380
- # Process the image
381
- results = process_image_chunk(image)
382
 
383
- # Coordinates are already in image pixels, no scaling needed for standalone images
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
  return {
386
  "document_type": "image",
@@ -389,4 +509,8 @@ def process_image(image_bytes):
389
  "height": img_height
390
  },
391
  "extractions": results
392
- }
 
 
 
 
 
1
  from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel
3
+ from typing import Dict, Any, List, Tuple, Optional
4
  from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
5
  import torch
6
  from PIL import Image
 
9
  import fitz # PyMuPDF
10
  import tempfile
11
  import os
12
+ import math
13
 
14
  # Fix the OMP_NUM_THREADS issue
15
  os.environ['OMP_NUM_THREADS'] = '1'
 
74
  print(f"Error in extract_document: {error_details}")
75
  raise HTTPException(status_code=500, detail=str(e))
76
 
77
+ def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]:
78
  """
79
  Process a single image or image chunk and return extractions with coordinates
80
  relative to the chunk (0,0 at top-left of chunk).
 
91
  image,
92
  truncation=True,
93
  padding="max_length",
94
+ max_length=max_tokens,
95
  return_tensors="pt"
96
  )
97
  except Exception as e:
 
99
  try:
100
  encoding = processor(
101
  image,
102
+ text=[""] * max_tokens,
103
+ boxes=[[0, 0, 0, 0]] * max_tokens,
104
  truncation=True,
105
  padding="max_length",
106
+ max_length=max_tokens,
107
  return_tensors="pt"
108
  )
109
  except Exception as e2:
 
199
 
200
  return results
201
 
202
def should_split_page(rendered_width: int, rendered_height: int,
                      original_rotation: int, max_width: int) -> Tuple[bool, str]:
    """
    Decide whether a rendered page image should be split into chunks.

    Args:
        rendered_width: Width of the rendered page image, in pixels.
        rendered_height: Height of the rendered page image, in pixels.
        original_rotation: Page rotation in degrees. Unused here: rotation is
            already applied by the renderer, so the rendered dimensions are
            checked directly.
        max_width: Maximum chunk width in pixels before splitting is required.

    Returns:
        (should_split, split_direction) where split_direction is "horizontal"
        or "none".
    """
    # Guard against a degenerate zero-height image.
    aspect_ratio = rendered_width / rendered_height if rendered_height > 0 else 1

    # Portrait or roughly square pages are never split.
    if aspect_ratio <= 1.3:
        return False, "none"

    # Wide landscape pages that exceed the width budget are split horizontally.
    # NOTE: the previous version branched on aspect_ratio > 1.8 here, but both
    # branches returned (True, "horizontal"), so the check collapses to a
    # single return.
    if rendered_width > max_width:
        return True, "horizontal"

    return False, "none"
227
+
228
def split_image_intelligently(image: "Image.Image", max_width: int, overlap_ratio: float = 0.15) -> List[Tuple["Image.Image", int]]:
    """
    Split a wide image into horizontally overlapping chunks.

    Args:
        image: Source image (only ``.size`` and ``.crop`` are used).
        max_width: Maximum width of each chunk, in pixels.
        overlap_ratio: Fraction of max_width shared between adjacent chunks.

    Returns:
        List of (chunk_image, x_offset) tuples, where x_offset is the chunk's
        left edge in the source image's pixel coordinates.
    """
    img_width, img_height = image.size
    chunks = []

    # An image that already fits in one chunk needs no splitting at all.
    # (Previously, narrow images could fall through to the sliver filter
    # below and come back with NO chunks, silently losing the whole page.)
    if img_width <= max_width:
        return [(image, 0)]

    # Overlap between adjacent chunks, in pixels.
    overlap_pixels = int(max_width * overlap_ratio)

    # Horizontal distance between the left edges of consecutive chunks.
    step_size = max_width - overlap_pixels
    if step_size <= 0:
        step_size = max_width // 2  # fallback for pathological overlap ratios

    # Number of chunks needed to cover the full width. Clamp to at least 1:
    # math.ceil can yield 0 (or less) when img_width <= overlap_pixels.
    num_chunks = max(1, math.ceil((img_width - overlap_pixels) / step_size))

    # If we'd only need 2 chunks and the second would be very small, use two
    # equal-width chunks with overlap instead.
    if num_chunks == 2:
        second_chunk_width = img_width - step_size
        if second_chunk_width < max_width * 0.6:  # second chunk under 60% of max
            chunk_width = (img_width + overlap_pixels) // 2
            chunks.append((image.crop((0, 0, chunk_width, img_height)), 0))
            chunks.append((image.crop((img_width - chunk_width, 0, img_width, img_height)),
                           img_width - chunk_width))
            return chunks

    # Standard overlapping chunks.
    for i in range(num_chunks):
        start_x = i * step_size
        end_x = min(start_x + max_width, img_width)

        # Skip tiny slivers at the right edge; their content is already
        # covered by the previous chunk's overlap.
        if end_x - start_x < max_width * 0.3:
            continue

        chunk = image.crop((start_x, 0, end_x, img_height))
        chunks.append((chunk, start_x))

        print(f"    Chunk {i + 1}/{num_chunks}: x={start_x}-{end_x} (width={end_x - start_x})")

    return chunks
274
+
275
  def process_pdf(pdf_bytes, split_wide: bool = True):
276
  """Process PDF document, optionally splitting wide pages into chunks"""
277
  all_results = []
 
286
  os.unlink(tmp_file.name)
287
  raise HTTPException(status_code=400, detail=f"Failed to open PDF: {str(e)}")
288
 
289
+ # Configuration
290
  RENDER_SCALE = 2.0
291
+ MAX_WIDTH = 2000 # Increased for better quality
292
+ MAX_TOKENS = 768 # Increased token limit for complex documents
293
 
294
  for page_num in range(len(pdf_document)):
295
  try:
296
  page = pdf_document[page_num]
297
  page_rect = page.rect
 
 
298
 
299
+ # Original page dimensions before any rotation
300
+ original_width = page_rect.width
301
+ original_height = page_rect.height
302
+ original_rotation = page.rotation
303
 
304
+ print(f"\nPage {page_num + 1}:")
305
+ print(f" Original dimensions: {original_width}x{original_height}")
306
+ print(f" Rotation: {original_rotation}°")
307
+
308
+ # Render page - PyMuPDF automatically applies rotation
309
  mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
310
  pix = page.get_pixmap(matrix=mat)
311
  img_data = pix.tobytes("png")
312
  full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
313
+ rendered_width, rendered_height = full_image.size
314
+
315
+ print(f" Rendered dimensions: {rendered_width}x{rendered_height}")
316
 
317
+ # Determine effective dimensions after rotation for coordinate mapping
318
+ if original_rotation in [90, 270]:
319
+ # Page has been rotated, so effective dimensions are swapped
320
+ effective_pdf_width = original_height
321
+ effective_pdf_height = original_width
322
+ else:
323
+ effective_pdf_width = original_width
324
+ effective_pdf_height = original_height
325
+
326
+ print(f" Effective PDF dimensions: {effective_pdf_width}x{effective_pdf_height}")
327
 
328
  page_results = []
329
 
330
+ # Decide if we need to split
331
+ should_split, split_direction = should_split_page(
332
+ rendered_width, rendered_height, original_rotation, MAX_WIDTH
333
+ )
334
+
335
+ if split_wide and should_split:
336
+ print(f" Splitting page ({split_direction})...")
 
 
 
337
 
338
+ chunks = split_image_intelligently(full_image, MAX_WIDTH, overlap_ratio=0.2)
339
+ print(f" Created {len(chunks)} chunks")
340
 
341
+ for chunk_idx, (chunk_image, x_offset) in enumerate(chunks):
342
+ chunk_width, chunk_height = chunk_image.size
343
+ print(f" Processing chunk {chunk_idx + 1}: offset={x_offset}, size={chunk_width}x{chunk_height}")
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
+ # Process chunk with increased token limit
346
+ chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)
347
+ print(f" Extracted {len(chunk_results)} items")
348
 
349
+ # Transform chunk-relative coordinates to full page coordinates
350
+ for result in chunk_results:
351
+ bbox = result['bbox']
352
 
353
+ # Add chunk offset (in rendered image pixels)
354
+ bbox['x'] += x_offset
 
355
 
356
+ # Scale from rendered image pixels to PDF points
357
+ # Use effective dimensions for proper scaling
358
+ bbox['x'] = bbox['x'] / RENDER_SCALE
359
+ bbox['y'] = bbox['y'] / RENDER_SCALE
360
+ bbox['width'] = bbox['width'] / RENDER_SCALE
361
+ bbox['height'] = bbox['height'] / RENDER_SCALE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
 
363
+ page_results.extend(chunk_results)
364
 
365
  else:
366
+ # Process full page without splitting
367
+ print(" Processing full page without splitting...")
368
+ chunk_results = process_image_chunk(full_image, max_tokens=MAX_TOKENS)
369
 
370
  # Scale coordinates from rendered image pixels to PDF points
371
  for result in chunk_results:
 
376
  bbox['height'] = bbox['height'] / RENDER_SCALE
377
 
378
  page_results = chunk_results
379
+ print(f" Extracted {len(chunk_results)} items")
380
 
381
+ # Enhanced deduplication with spatial clustering
382
+ unique_results = deduplicate_results(page_results)
383
+ print(f" After deduplication: {len(unique_results)} unique items")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
+ # Return results with both original and effective dimensions
386
  all_results.append({
387
  "page": page_num + 1,
388
  "page_dimensions": {
389
+ "width": original_width,
390
+ "height": original_height
391
+ },
392
+ "effective_dimensions": {
393
+ "width": effective_pdf_width,
394
+ "height": effective_pdf_height
395
  },
396
+ "rotation": original_rotation,
397
  "extractions": unique_results
398
  })
399
 
 
405
  all_results.append({
406
  "page": page_num + 1,
407
  "page_dimensions": {"width": 0, "height": 0},
408
+ "effective_dimensions": {"width": 0, "height": 0},
409
  "rotation": 0,
410
  "extractions": [],
411
  "error": str(e)
 
420
  "pages": all_results
421
  }
422
 
423
def deduplicate_results(results: List[Dict], tolerance: float = 10.0) -> List[Dict]:
    """
    Collapse near-duplicate extractions produced by overlapping chunks.

    Two extractions are treated as duplicates when their bbox centers lie
    within ``tolerance`` (PDF points) of each other and their widths and
    heights agree within 30%. From each duplicate cluster, the extraction
    with the longest text is kept.
    """
    if not results:
        return []

    def _center(box: Dict) -> Tuple[float, float]:
        # Midpoint of a bbox dict keyed by x/y/width/height.
        return box['x'] + box['width'] / 2, box['y'] + box['height'] / 2

    kept: List[Dict] = []
    consumed = set()

    for idx, anchor in enumerate(results):
        if idx in consumed:
            continue

        abox = anchor['bbox']
        ax, ay = _center(abox)

        # Gather every later, still-unclaimed extraction close to the anchor.
        group = [anchor]
        group_indices = {idx}

        for jdx in range(idx + 1, len(results)):
            if jdx in consumed:
                continue

            cbox = results[jdx]['bbox']
            cx, cy = _center(cbox)

            # Centers must be within tolerance (Euclidean distance).
            if math.hypot(ax - cx, ay - cy) >= tolerance:
                continue

            # Sizes must roughly match; zero-sized dims default to ratio 1.
            ratio_w = abox['width'] / cbox['width'] if cbox['width'] > 0 else 1
            ratio_h = abox['height'] / cbox['height'] if cbox['height'] > 0 else 1

            if 0.7 < ratio_w < 1.3 and 0.7 < ratio_h < 1.3:
                group.append(results[jdx])
                group_indices.add(jdx)

        # Keep the richest extraction of the cluster (longest text wins).
        kept.append(max(group, key=lambda item: len(item.get('text', ''))))
        consumed.update(group_indices)

    return kept
472
+
473
def process_image(image_bytes):
    """
    Process a single standalone image.

    Wide images are split into overlapping chunks (mirroring the PDF page
    handling) and the per-chunk extractions are merged and deduplicated.
    Coordinates are returned in the image's own pixel space — no scaling
    is applied for standalone images.

    Args:
        image_bytes: Raw encoded image bytes (any format PIL can open).

    Returns:
        Dict with document_type, image_dimensions, and extractions.
    """
    # Named once instead of repeating the literals 2000 / 768 inline;
    # keep these in sync with MAX_WIDTH / MAX_TOKENS in process_pdf.
    MAX_WIDTH = 2000
    MAX_TOKENS = 768

    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    img_width, img_height = image.size

    print(f"Processing single image: {img_width}x{img_height}")

    # Check if image should be split (rotation 0: standalone images are
    # never pre-rotated).
    should_split, _ = should_split_page(img_width, img_height, 0, MAX_WIDTH)

    if should_split:
        print("  Image is wide, splitting into chunks...")
        chunks = split_image_intelligently(image, MAX_WIDTH, overlap_ratio=0.2)

        all_results = []
        for chunk_idx, (chunk_image, x_offset) in enumerate(chunks):
            chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)

            # Shift chunk-relative x coordinates back into full-image space.
            for result in chunk_results:
                result['bbox']['x'] += x_offset

            all_results.extend(chunk_results)

        # Overlapping chunks see the same content twice; deduplicate.
        results = deduplicate_results(all_results)
    else:
        # Process the image as-is
        results = process_image_chunk(image, max_tokens=MAX_TOKENS)

    print(f"  Total extractions: {len(results)}")

    return {
        "document_type": "image",
        "image_dimensions": {
            "width": img_width,
            "height": img_height
        },
        "extractions": results
    }
513
+
514
+ if __name__ == "__main__":
515
+ import uvicorn
516
+ uvicorn.run(app, host="0.0.0.0", port=7860)