Alfonso Velasco committed on
Commit
1af4bc8
·
1 Parent(s): dd88d34
Files changed (1) hide show
  1. app.py +150 -185
app.py CHANGED
@@ -1,6 +1,6 @@
1
  from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel
3
- from typing import Dict, Any
4
  from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
5
  import torch
6
  from PIL import Image
@@ -29,7 +29,6 @@ try:
29
  model.to(device)
30
  except Exception as e:
31
  print(f"Error loading model: {e}")
32
- # Fallback to no OCR if there's an issue
33
  processor = LayoutLMv3Processor.from_pretrained(
34
  "microsoft/layoutlmv3-base",
35
  apply_ocr=False
@@ -44,6 +43,7 @@ except Exception as e:
44
  class DocumentRequest(BaseModel):
45
  pdf: str = None
46
  image: str = None
 
47
 
48
  @app.get("/")
49
  def home():
@@ -52,190 +52,25 @@ def home():
52
  @app.post("/extract")
53
  async def extract_document(request: DocumentRequest):
54
  try:
55
- # Determine input type
56
  file_data = request.pdf or request.image
57
  if not file_data:
58
  raise HTTPException(status_code=400, detail="No PDF or image provided")
59
 
60
- # Decode base64
61
  file_bytes = base64.b64decode(file_data)
62
 
63
- # Check if PDF or image
64
  if file_bytes.startswith(b'%PDF'):
65
- return process_pdf(file_bytes)
66
  else:
67
  return process_image(file_bytes)
68
 
69
  except Exception as e:
70
  raise HTTPException(status_code=500, detail=str(e))
71
 
72
- def process_pdf(pdf_bytes):
73
- """Process PDF document with proper coordinate scaling for any orientation"""
74
- all_results = []
75
-
76
- with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_file:
77
- tmp_file.write(pdf_bytes)
78
- tmp_file.flush()
79
-
80
- pdf_document = fitz.open(tmp_file.name)
81
-
82
- # Define render scale
83
- RENDER_SCALE = 2.0
84
-
85
- for page_num in range(len(pdf_document)):
86
- page = pdf_document[page_num]
87
-
88
- # CRITICAL FIX: Get the actual page rectangle
89
- # This accounts for rotation and gives us the true page dimensions
90
- page_rect = page.rect
91
- page_width = page_rect.width
92
- page_height = page_rect.height
93
-
94
- print(f"Page {page_num + 1}: {page_width}x{page_height}, rotation={page.rotation}°")
95
-
96
- # Render page at consistent resolution
97
- # The matrix handles rotation automatically
98
- mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
99
- pix = page.get_pixmap(matrix=mat)
100
- img_data = pix.tobytes("png")
101
- image = Image.open(io.BytesIO(img_data)).convert("RGB")
102
-
103
- # Store rendered image dimensions
104
- img_width, img_height = image.size
105
-
106
- print(f"Rendered image: {img_width}x{img_height}")
107
-
108
- # CRITICAL: Verify the scaling is correct
109
- # The rendered image should be RENDER_SCALE times the page size
110
- expected_width = page_width * RENDER_SCALE
111
- expected_height = page_height * RENDER_SCALE
112
-
113
- if abs(img_width - expected_width) > 5 or abs(img_height - expected_height) > 5:
114
- print(f"WARNING: Image size mismatch! Expected {expected_width}x{expected_height}")
115
-
116
- try:
117
- # Try with OCR - increased max_length for wide documents
118
- encoding = processor(
119
- image,
120
- truncation=True,
121
- padding="max_length",
122
- max_length=1024, # Increased from 512 to handle wider documents
123
- return_tensors="pt"
124
- )
125
- except Exception as ocr_error:
126
- print(f"OCR failed: {ocr_error}, using fallback")
127
- # Fallback: process without OCR
128
- encoding = processor(
129
- image,
130
- text=[""] * 512, # Dummy text
131
- boxes=[[0, 0, 0, 0]] * 512, # Dummy boxes
132
- truncation=True,
133
- padding="max_length",
134
- max_length=512,
135
- return_tensors="pt"
136
- )
137
-
138
- encoding = {k: v.to(device) for k, v in encoding.items() if isinstance(v, torch.Tensor)}
139
-
140
- with torch.no_grad():
141
- outputs = model(**encoding)
142
-
143
- tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
144
- boxes = encoding["bbox"][0].tolist()
145
-
146
- page_results = []
147
- processed_boxes = set() # Track processed boxes to avoid duplicates
148
-
149
- for token, box in zip(tokens, boxes):
150
- if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>']:
151
- # LayoutLMv3 returns normalized coordinates (0-1000)
152
- # These are normalized relative to the INPUT IMAGE dimensions
153
- x_norm = box[0]
154
- y_norm = box[1]
155
- x2_norm = box[2]
156
- y2_norm = box[3]
157
-
158
- # Skip invalid boxes
159
- if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
160
- continue
161
-
162
- # STEP 1: Convert normalized (0-1000) to rendered image pixel coordinates
163
- # CRITICAL: Use the ACTUAL rendered image dimensions
164
- x_img = (x_norm / 1000.0) * img_width
165
- y_img = (y_norm / 1000.0) * img_height
166
- x2_img = (x2_norm / 1000.0) * img_width
167
- y2_img = (y2_norm / 1000.0) * img_height
168
-
169
- # STEP 2: Scale back to PDF page coordinates
170
- # The image was rendered at RENDER_SCALE times the PDF size
171
- x = x_img / RENDER_SCALE
172
- y = y_img / RENDER_SCALE
173
- x2 = x2_img / RENDER_SCALE
174
- y2 = y2_img / RENDER_SCALE
175
-
176
- width = x2 - x
177
- height = y2 - y
178
-
179
- # Skip boxes that are too small
180
- if width < 1 or height < 1:
181
- continue
182
-
183
- # Validate bounds
184
- if x < 0 or y < 0 or x2 > page_width or y2 > page_height:
185
- # Allow small tolerance for rounding errors
186
- if (x < -2 or y < -2 or
187
- x2 > page_width + 2 or y2 > page_height + 2):
188
- print(f"Skipping out of bounds box: ({x:.1f},{y:.1f}) to ({x2:.1f},{y2:.1f})")
189
- continue
190
- # Clamp to valid bounds
191
- x = max(0, x)
192
- y = max(0, y)
193
- x2 = min(page_width, x2)
194
- y2 = min(page_height, y2)
195
- width = x2 - x
196
- height = y2 - y
197
-
198
- # Create box tuple for duplicate checking
199
- box_tuple = (round(x), round(y), round(width), round(height))
200
- if box_tuple in processed_boxes:
201
- continue
202
- processed_boxes.add(box_tuple)
203
-
204
- # Clean up token text (remove ## prefix from subwords)
205
- clean_token = token.replace('##', '')
206
-
207
- page_results.append({
208
- "text": clean_token,
209
- "bbox": {
210
- "x": x,
211
- "y": y,
212
- "width": width,
213
- "height": height
214
- }
215
- })
216
-
217
- all_results.append({
218
- "page": page_num + 1,
219
- "page_dimensions": {
220
- "width": page_width,
221
- "height": page_height
222
- },
223
- "rotation": page.rotation,
224
- "extractions": page_results
225
- })
226
-
227
- pdf_document.close()
228
- os.unlink(tmp_file.name) # Clean up temp file
229
-
230
- return {
231
- "document_type": "pdf",
232
- "total_pages": len(all_results),
233
- "pages": all_results
234
- }
235
-
236
- def process_image(image_bytes):
237
- """Process single image with proper coordinate scaling"""
238
- image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
239
  img_width, img_height = image.size
240
 
241
  try:
@@ -243,19 +78,18 @@ def process_image(image_bytes):
243
  image,
244
  truncation=True,
245
  padding="max_length",
246
- max_length=512,
247
  return_tensors="pt"
248
  )
249
  except Exception as e:
250
  print(f"OCR failed: {e}, using fallback")
251
- # Fallback without OCR
252
  encoding = processor(
253
  image,
254
- text=[""] * 512,
255
- boxes=[[0, 0, 0, 0]] * 512,
256
  truncation=True,
257
  padding="max_length",
258
- max_length=512,
259
  return_tensors="pt"
260
  )
261
 
@@ -272,37 +106,37 @@ def process_image(image_bytes):
272
 
273
  for token, box in zip(tokens, boxes):
274
  if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>']:
275
- # LayoutLMv3 returns normalized coordinates (0-1000)
276
- # For images, we directly have the correct dimensions
277
  x_norm = box[0]
278
  y_norm = box[1]
279
  x2_norm = box[2]
280
  y2_norm = box[3]
281
 
282
- # Skip invalid boxes
283
  if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
284
  continue
285
 
286
- # Convert to actual image coordinates
287
  x = (x_norm / 1000.0) * img_width
288
  y = (y_norm / 1000.0) * img_height
289
  x2 = (x2_norm / 1000.0) * img_width
290
  y2 = (y2_norm / 1000.0) * img_height
291
 
 
 
 
 
 
 
292
  width = x2 - x
293
  height = y2 - y
294
 
295
- # Skip boxes that are too small
296
  if width < 1 or height < 1:
297
  continue
298
 
299
- # Check for duplicates
300
  box_tuple = (round(x), round(y), round(width), round(height))
301
  if box_tuple in processed_boxes:
302
  continue
303
  processed_boxes.add(box_tuple)
304
 
305
- # Clean up token text
306
  clean_token = token.replace('##', '')
307
 
308
  results.append({
@@ -315,6 +149,137 @@ def process_image(image_bytes):
315
  }
316
  })
317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  return {
319
  "document_type": "image",
320
  "image_dimensions": {
 
1
from typing import Any, Dict, List, Optional

import torch
from fastapi import FastAPI, HTTPException
from PIL import Image
from pydantic import BaseModel
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
 
29
  model.to(device)
30
  except Exception as e:
31
  print(f"Error loading model: {e}")
 
32
  processor = LayoutLMv3Processor.from_pretrained(
33
  "microsoft/layoutlmv3-base",
34
  apply_ocr=False
 
43
class DocumentRequest(BaseModel):
    """Request payload for /extract.

    Exactly one of `pdf` or `image` should be provided, each containing the
    base64-encoded bytes of the document. The handler sniffs the actual type
    from the decoded bytes, so either field is accepted for either format.
    """
    # BUG FIX: `str = None` is an invalid annotation (and rejected by
    # Pydantic v2); the fields are optional and must be typed as such.
    pdf: Optional[str] = None    # base64-encoded PDF bytes
    image: Optional[str] = None  # base64-encoded image bytes
    split_wide_pages: bool = True  # split very wide pages into overlapping chunks
47
 
48
  @app.get("/")
49
  def home():
 
52
@app.post("/extract")
async def extract_document(request: DocumentRequest):
    """Extract token/bounding-box data from a base64-encoded PDF or image.

    The document type is detected from the decoded bytes (`%PDF` magic
    header), not from which request field was populated.

    Raises:
        HTTPException 400 if no document was supplied.
        HTTPException 500 for any unexpected processing failure.
    """
    try:
        file_data = request.pdf or request.image
        if not file_data:
            raise HTTPException(status_code=400, detail="No PDF or image provided")

        file_bytes = base64.b64decode(file_data)

        if file_bytes.startswith(b'%PDF'):
            return process_pdf(pdf_bytes=file_bytes, split_wide=request.split_wide_pages)
        return process_image(file_bytes)

    except HTTPException:
        # BUG FIX: let deliberate HTTP errors (e.g. the 400 above) propagate
        # unchanged instead of being swallowed by the generic handler below
        # and re-emitted as a misleading 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
68
 
69
+ def process_image_chunk(image: Image.Image, offset_x: float = 0, offset_y: float = 0) -> List[Dict]:
70
+ """
71
+ Process a single image or image chunk and return extractions.
72
+ offset_x and offset_y are used when processing chunks of a larger image.
73
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  img_width, img_height = image.size
75
 
76
  try:
 
78
  image,
79
  truncation=True,
80
  padding="max_length",
81
+ max_length=1024, # Increased limit
82
  return_tensors="pt"
83
  )
84
  except Exception as e:
85
  print(f"OCR failed: {e}, using fallback")
 
86
  encoding = processor(
87
  image,
88
+ text=[""] * 1024,
89
+ boxes=[[0, 0, 0, 0]] * 1024,
90
  truncation=True,
91
  padding="max_length",
92
+ max_length=1024,
93
  return_tensors="pt"
94
  )
95
 
 
106
 
107
  for token, box in zip(tokens, boxes):
108
  if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>']:
 
 
109
  x_norm = box[0]
110
  y_norm = box[1]
111
  x2_norm = box[2]
112
  y2_norm = box[3]
113
 
 
114
  if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
115
  continue
116
 
117
+ # Convert to chunk coordinates
118
  x = (x_norm / 1000.0) * img_width
119
  y = (y_norm / 1000.0) * img_height
120
  x2 = (x2_norm / 1000.0) * img_width
121
  y2 = (y2_norm / 1000.0) * img_height
122
 
123
+ # Add offset to get coordinates in full page space
124
+ x += offset_x
125
+ y += offset_y
126
+ x2 += offset_x
127
+ y2 += offset_y
128
+
129
  width = x2 - x
130
  height = y2 - y
131
 
 
132
  if width < 1 or height < 1:
133
  continue
134
 
 
135
  box_tuple = (round(x), round(y), round(width), round(height))
136
  if box_tuple in processed_boxes:
137
  continue
138
  processed_boxes.add(box_tuple)
139
 
 
140
  clean_token = token.replace('##', '')
141
 
142
  results.append({
 
149
  }
150
  })
151
 
152
+ return results
153
+
154
def process_pdf(pdf_bytes, split_wide: bool = True):
    """Process a PDF page by page with LayoutLMv3.

    Each page is rendered to an RGB image at RENDER_SCALE and run through
    `process_image_chunk`. Pages wider than MAX_WIDTH rendered pixels are
    optionally split into overlapping vertical strips (so OCR/token limits
    are not exceeded); duplicates created by the overlap are removed.

    Args:
        pdf_bytes: Raw PDF file contents.
        split_wide: When True, split very wide pages into chunks.

    Returns:
        Dict with ``document_type``, ``total_pages`` and per-page extraction
        results; bounding boxes are expressed in PDF point coordinates.
    """
    all_results = []

    # Write the PDF to disk for fitz. Close the handle before opening it with
    # fitz (opening a still-open NamedTemporaryFile fails on Windows).
    with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_file:
        tmp_file.write(pdf_bytes)
        tmp_file.flush()
        tmp_path = tmp_file.name

    pdf_document = fitz.open(tmp_path)
    try:
        RENDER_SCALE = 2.0
        MAX_WIDTH = 2000  # max rendered width (px) before splitting
        OVERLAP = 200     # horizontal overlap (px) between adjacent chunks

        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            page_rect = page.rect
            page_width = page_rect.width
            page_height = page_rect.height

            print(f"Page {page_num + 1}: {page_width}x{page_height}, rotation={page.rotation}°")

            # Render the page; the matrix handles rotation automatically.
            mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
            pix = page.get_pixmap(matrix=mat)
            img_data = pix.tobytes("png")
            full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
            img_width, img_height = full_image.size

            print(f"Rendered image: {img_width}x{img_height}")

            # Pixel -> PDF-point factors. Use the actual rendered ratio rather
            # than assuming img == page * RENDER_SCALE exactly (pixmap sizes
            # are rounded to whole pixels).
            scale_x = page_width / img_width
            scale_y = page_height / img_height

            page_results = []

            if split_wide and img_width > MAX_WIDTH:
                print(f"Page is wide ({img_width}px), splitting into chunks...")

                stride = MAX_WIDTH - OVERLAP
                num_chunks = (img_width + stride - 1) // stride

                for chunk_idx in range(num_chunks):
                    start_x = chunk_idx * stride
                    end_x = min(start_x + MAX_WIDTH, img_width)

                    chunk = full_image.crop((start_x, 0, end_x, img_height))
                    print(f"  Processing chunk {chunk_idx + 1}/{num_chunks}: x={start_x}-{end_x}")

                    # BUG FIX: the offset must be in the same units as the
                    # chunk-local pixel coordinates it is added to. The old
                    # code passed start_x / RENDER_SCALE and then divided the
                    # summed coordinate by RENDER_SCALE again, so every
                    # chunked box was shifted toward the left edge.
                    chunk_results = process_image_chunk(chunk, offset_x=start_x, offset_y=0)

                    # Convert full-image pixel coordinates to PDF points.
                    for result in chunk_results:
                        bbox = result['bbox']
                        bbox['x'] *= scale_x
                        bbox['y'] *= scale_y
                        bbox['width'] *= scale_x
                        bbox['height'] *= scale_y

                    page_results.extend(chunk_results)

                print(f"  Total extractions from all chunks: {len(page_results)}")
            else:
                # Narrow page: process in one shot, then convert to PDF points.
                page_results = process_image_chunk(full_image, 0, 0)
                for result in page_results:
                    bbox = result['bbox']
                    bbox['x'] *= scale_x
                    bbox['y'] *= scale_y
                    bbox['width'] *= scale_x
                    bbox['height'] *= scale_y

            # Remove duplicates produced by the overlap between chunks.
            unique_results = []
            seen_boxes = set()
            for result in page_results:
                bbox = result['bbox']
                box_tuple = (round(bbox['x']), round(bbox['y']),
                             round(bbox['width']), round(bbox['height']))
                if box_tuple not in seen_boxes:
                    seen_boxes.add(box_tuple)
                    unique_results.append(result)

            print(f"  After deduplication: {len(unique_results)} unique extractions")

            all_results.append({
                "page": page_num + 1,
                "page_dimensions": {
                    "width": page_width,
                    "height": page_height
                },
                "rotation": page.rotation,
                "extractions": unique_results
            })
    finally:
        # ROBUSTNESS FIX: previously the document handle and the temp file
        # leaked whenever any page raised; clean up unconditionally.
        pdf_document.close()
        os.unlink(tmp_path)

    return {
        "document_type": "pdf",
        "total_pages": len(all_results),
        "pages": all_results
    }
275
+
276
+ def process_image(image_bytes):
277
+ """Process single image"""
278
+ image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
279
+ img_width, img_height = image.size
280
+
281
+ results = process_image_chunk(image, 0, 0)
282
+
283
  return {
284
  "document_type": "image",
285
  "image_dimensions": {