Alfonso Velasco committed on
Commit
c9e5fd6
·
1 Parent(s): 179cb76

fix chunk

Browse files
Files changed (1) hide show
  1. app.py +211 -135
app.py CHANGED
@@ -68,6 +68,9 @@ async def extract_document(request: DocumentRequest):
68
  return process_image(file_bytes)
69
 
70
  except Exception as e:
 
 
 
71
  raise HTTPException(status_code=500, detail=str(e))
72
 
73
  def process_image_chunk(image: Image.Image) -> List[Dict]:
@@ -77,25 +80,34 @@ def process_image_chunk(image: Image.Image) -> List[Dict]:
77
  """
78
  img_width, img_height = image.size
79
 
 
 
 
 
 
80
  try:
81
  encoding = processor(
82
  image,
83
  truncation=True,
84
  padding="max_length",
85
- max_length=1024,
86
  return_tensors="pt"
87
  )
88
  except Exception as e:
89
  print(f"OCR failed: {e}, using fallback")
90
- encoding = processor(
91
- image,
92
- text=[""] * 1024,
93
- boxes=[[0, 0, 0, 0]] * 1024,
94
- truncation=True,
95
- padding="max_length",
96
- max_length=1024,
97
- return_tensors="pt"
98
- )
 
 
 
 
99
 
100
  # Move to device and ensure bbox is clamped to valid range
101
  encoding_device = {}
@@ -116,59 +128,73 @@ def process_image_chunk(image: Image.Image) -> List[Dict]:
116
  print(f"CUDA error encountered: {e}")
117
  print("Falling back to CPU...")
118
  # Move everything to CPU
119
- encoding = {k: v.cpu() for k, v in encoding.items()}
120
  model.cpu()
121
  with torch.no_grad():
122
  outputs = model(**encoding)
123
  # Move model back to original device
124
  model.to(device)
 
 
 
125
  else:
126
  raise
 
 
 
127
 
128
- tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
129
- boxes = encoding["bbox"][0].tolist()
 
 
 
 
130
 
131
  results = []
132
  processed_boxes = set()
133
 
134
- for token, box in zip(tokens, boxes):
135
- if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>']:
136
- x_norm = box[0]
137
- y_norm = box[1]
138
- x2_norm = box[2]
139
- y2_norm = box[3]
140
-
141
- if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
142
- continue
143
-
144
- # Convert normalized coordinates to chunk pixel coordinates
145
- x = (x_norm / 1000.0) * img_width
146
- y = (y_norm / 1000.0) * img_height
147
- x2 = (x2_norm / 1000.0) * img_width
148
- y2 = (y2_norm / 1000.0) * img_height
149
-
150
- width = x2 - x
151
- height = y2 - y
152
-
153
- if width < 1 or height < 1:
154
- continue
155
-
156
- box_tuple = (round(x), round(y), round(width), round(height))
157
- if box_tuple in processed_boxes:
158
- continue
159
- processed_boxes.add(box_tuple)
160
-
161
- clean_token = token.replace('##', '')
162
-
163
- results.append({
164
- "text": clean_token,
165
- "bbox": {
166
- "x": x,
167
- "y": y,
168
- "width": width,
169
- "height": height
170
- }
171
- })
 
 
 
 
172
 
173
  return results
174
 
@@ -180,112 +206,162 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
180
  tmp_file.write(pdf_bytes)
181
  tmp_file.flush()
182
 
183
- pdf_document = fitz.open(tmp_file.name)
 
 
 
 
184
 
185
  RENDER_SCALE = 2.0
186
- MAX_WIDTH = 2000 # Maximum width before splitting (in pixels after rendering)
187
- OVERLAP = 200 # Overlap between chunks to avoid missing text at boundaries
188
 
189
  for page_num in range(len(pdf_document)):
190
- page = pdf_document[page_num]
191
- page_rect = page.rect
192
- page_width = page_rect.width
193
- page_height = page_rect.height
194
-
195
- print(f"Page {page_num + 1}: {page_width}x{page_height}, rotation={page.rotation}°")
196
-
197
- # Render page
198
- mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
199
- pix = page.get_pixmap(matrix=mat)
200
- img_data = pix.tobytes("png")
201
- full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
202
- img_width, img_height = full_image.size
203
-
204
- print(f"Rendered image: {img_width}x{img_height}")
205
-
206
- page_results = []
207
-
208
- # Check if page is too wide and should be split
209
- if split_wide and img_width > MAX_WIDTH:
210
- print(f"Page is wide ({img_width}px), splitting into chunks...")
211
 
212
- num_chunks = (img_width + MAX_WIDTH - OVERLAP - 1) // (MAX_WIDTH - OVERLAP)
213
- chunk_width = MAX_WIDTH
214
 
215
- for chunk_idx in range(num_chunks):
216
- # Calculate chunk boundaries in rendered image pixels
217
- start_x = chunk_idx * (chunk_width - OVERLAP)
218
- end_x = min(start_x + chunk_width, img_width)
219
 
220
- # Crop chunk from rendered image
221
- chunk = full_image.crop((start_x, 0, end_x, img_height))
 
 
222
 
223
- print(f" Processing chunk {chunk_idx + 1}/{num_chunks}: x={start_x}-{end_x}")
224
 
225
- # Process chunk (returns coordinates relative to chunk)
226
- chunk_results = process_image_chunk(chunk)
227
 
228
- # Transform chunk-relative coordinates to full page coordinates
229
- for result in chunk_results:
230
- bbox = result['bbox']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
- # Add chunk offset (in rendered image pixels)
233
- bbox['x'] += start_x
234
- # y stays the same (no vertical splitting)
235
- # bbox['y'] is already correct
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- # Now scale from rendered image pixels to PDF points
 
 
 
 
 
 
 
 
 
238
  bbox['x'] = bbox['x'] / RENDER_SCALE
239
  bbox['y'] = bbox['y'] / RENDER_SCALE
240
  bbox['width'] = bbox['width'] / RENDER_SCALE
241
  bbox['height'] = bbox['height'] / RENDER_SCALE
242
 
243
- page_results.extend(chunk_results)
244
-
245
- print(f" Total extractions from all chunks: {len(page_results)}")
 
 
246
 
247
- else:
248
- # Process full page (no splitting needed)
249
- chunk_results = process_image_chunk(full_image)
250
 
251
- # Scale coordinates from rendered image pixels to PDF points
252
- for result in chunk_results:
253
  bbox = result['bbox']
254
- bbox['x'] = bbox['x'] / RENDER_SCALE
255
- bbox['y'] = bbox['y'] / RENDER_SCALE
256
- bbox['width'] = bbox['width'] / RENDER_SCALE
257
- bbox['height'] = bbox['height'] / RENDER_SCALE
 
 
 
 
 
 
258
 
259
- page_results = chunk_results
260
-
261
- # Remove duplicates from overlapping chunks
262
- unique_results = []
263
- seen_boxes = set()
264
-
265
- for result in page_results:
266
- bbox = result['bbox']
267
- box_tuple = (
268
- round(bbox['x']),
269
- round(bbox['y']),
270
- round(bbox['width']),
271
- round(bbox['height'])
272
- )
273
 
274
- if box_tuple not in seen_boxes:
275
- seen_boxes.add(box_tuple)
276
- unique_results.append(result)
277
-
278
- print(f" After deduplication: {len(unique_results)} unique extractions")
279
-
280
- all_results.append({
281
- "page": page_num + 1,
282
- "page_dimensions": {
283
- "width": page_width,
284
- "height": page_height
285
- },
286
- "rotation": page.rotation,
287
- "extractions": unique_results
288
- })
 
 
 
 
 
 
 
289
 
290
  pdf_document.close()
291
  os.unlink(tmp_file.name)
 
68
  return process_image(file_bytes)
69
 
70
  except Exception as e:
71
+ import traceback
72
+ error_details = traceback.format_exc()
73
+ print(f"Error in extract_document: {error_details}")
74
  raise HTTPException(status_code=500, detail=str(e))
75
 
76
  def process_image_chunk(image: Image.Image) -> List[Dict]:
 
80
  """
81
  img_width, img_height = image.size
82
 
83
+ # Validate image dimensions
84
+ if img_width < 1 or img_height < 1:
85
+ print(f"Invalid image dimensions: {img_width}x{img_height}")
86
+ return []
87
+
88
  try:
89
  encoding = processor(
90
  image,
91
  truncation=True,
92
  padding="max_length",
93
+ max_length=512, # Reduced from 1024 for better stability
94
  return_tensors="pt"
95
  )
96
  except Exception as e:
97
  print(f"OCR failed: {e}, using fallback")
98
+ try:
99
+ encoding = processor(
100
+ image,
101
+ text=[""] * 512,
102
+ boxes=[[0, 0, 0, 0]] * 512,
103
+ truncation=True,
104
+ padding="max_length",
105
+ max_length=512,
106
+ return_tensors="pt"
107
+ )
108
+ except Exception as e2:
109
+ print(f"Fallback also failed: {e2}")
110
+ return []
111
 
112
  # Move to device and ensure bbox is clamped to valid range
113
  encoding_device = {}
 
128
  print(f"CUDA error encountered: {e}")
129
  print("Falling back to CPU...")
130
  # Move everything to CPU
131
+ encoding = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in encoding.items()}
132
  model.cpu()
133
  with torch.no_grad():
134
  outputs = model(**encoding)
135
  # Move model back to original device
136
  model.to(device)
137
+ elif "index out of range" in str(e):
138
+ print(f"Index error in model processing: {e}")
139
+ return []
140
  else:
141
  raise
142
+ except Exception as e:
143
+ print(f"Unexpected error in model processing: {e}")
144
+ return []
145
 
146
+ try:
147
+ tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
148
+ boxes = encoding["bbox"][0].tolist()
149
+ except Exception as e:
150
+ print(f"Error extracting tokens/boxes: {e}")
151
+ return []
152
 
153
  results = []
154
  processed_boxes = set()
155
 
156
+ for idx, (token, box) in enumerate(zip(tokens, boxes)):
157
+ try:
158
+ if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>']:
159
+ x_norm = box[0]
160
+ y_norm = box[1]
161
+ x2_norm = box[2]
162
+ y2_norm = box[3]
163
+
164
+ if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
165
+ continue
166
+
167
+ # Convert normalized coordinates to chunk pixel coordinates
168
+ x = (x_norm / 1000.0) * img_width
169
+ y = (y_norm / 1000.0) * img_height
170
+ x2 = (x2_norm / 1000.0) * img_width
171
+ y2 = (y2_norm / 1000.0) * img_height
172
+
173
+ width = x2 - x
174
+ height = y2 - y
175
+
176
+ if width < 1 or height < 1:
177
+ continue
178
+
179
+ box_tuple = (round(x), round(y), round(width), round(height))
180
+ if box_tuple in processed_boxes:
181
+ continue
182
+ processed_boxes.add(box_tuple)
183
+
184
+ clean_token = token.replace('##', '')
185
+
186
+ results.append({
187
+ "text": clean_token,
188
+ "bbox": {
189
+ "x": x,
190
+ "y": y,
191
+ "width": width,
192
+ "height": height
193
+ }
194
+ })
195
+ except Exception as e:
196
+ print(f"Error processing token at index {idx}: {e}")
197
+ continue
198
 
199
  return results
200
 
 
206
  tmp_file.write(pdf_bytes)
207
  tmp_file.flush()
208
 
209
+ try:
210
+ pdf_document = fitz.open(tmp_file.name)
211
+ except Exception as e:
212
+ os.unlink(tmp_file.name)
213
+ raise HTTPException(status_code=400, detail=f"Failed to open PDF: {str(e)}")
214
 
215
  RENDER_SCALE = 2.0
216
+ MAX_WIDTH = 1800 # Reduced from 2000 for better stability
217
+ OVERLAP = 150 # Reduced overlap
218
 
219
  for page_num in range(len(pdf_document)):
220
+ try:
221
+ page = pdf_document[page_num]
222
+ page_rect = page.rect
223
+ page_width = page_rect.width
224
+ page_height = page_rect.height
225
+
226
+ print(f"Page {page_num + 1}: {page_width}x{page_height}, rotation={page.rotation}°")
227
+
228
+ # Render page
229
+ mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
230
+ pix = page.get_pixmap(matrix=mat)
231
+ img_data = pix.tobytes("png")
232
+ full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
233
+ img_width, img_height = full_image.size
234
+
235
+ print(f"Rendered image: {img_width}x{img_height}")
 
 
 
 
 
236
 
237
+ page_results = []
 
238
 
239
+ # Check if page is too wide and should be split
240
+ if split_wide and img_width > MAX_WIDTH:
241
+ print(f"Page is wide ({img_width}px), splitting into chunks...")
 
242
 
243
+ # Calculate proper number of chunks with safer logic
244
+ step_size = MAX_WIDTH - OVERLAP
245
+ if step_size <= 0:
246
+ step_size = MAX_WIDTH // 2 # Fallback
247
 
248
+ num_chunks = max(1, ((img_width - OVERLAP) + step_size - 1) // step_size)
249
 
250
+ print(f"Will create {num_chunks} chunks with step size {step_size}")
 
251
 
252
+ for chunk_idx in range(num_chunks):
253
+ # Calculate chunk boundaries in rendered image pixels
254
+ start_x = chunk_idx * step_size
255
+ end_x = min(start_x + MAX_WIDTH, img_width)
256
+
257
+ # Ensure chunk has valid dimensions
258
+ if end_x <= start_x:
259
+ print(f" Skipping invalid chunk {chunk_idx + 1}: start_x={start_x}, end_x={end_x}")
260
+ continue
261
+
262
+ chunk_actual_width = end_x - start_x
263
+
264
+ # Skip chunks that are too narrow
265
+ if chunk_actual_width < 100:
266
+ print(f" Skipping narrow chunk {chunk_idx + 1}: width={chunk_actual_width}")
267
+ continue
268
+
269
+ print(f" Processing chunk {chunk_idx + 1}/{num_chunks}: x={start_x}-{end_x} (width={chunk_actual_width})")
270
 
271
+ try:
272
+ # Crop chunk from rendered image
273
+ chunk = full_image.crop((start_x, 0, end_x, img_height))
274
+
275
+ # Verify chunk dimensions
276
+ verify_width, verify_height = chunk.size
277
+ print(f" Chunk actual size: {verify_width}x{verify_height}")
278
+
279
+ # Process chunk (returns coordinates relative to chunk)
280
+ chunk_results = process_image_chunk(chunk)
281
+ print(f" Extracted {len(chunk_results)} items from chunk")
282
+
283
+ # Transform chunk-relative coordinates to full page coordinates
284
+ for result in chunk_results:
285
+ bbox = result['bbox']
286
+
287
+ # Add chunk offset (in rendered image pixels)
288
+ bbox['x'] += start_x
289
+ # y stays the same (no vertical splitting)
290
+
291
+ # Now scale from rendered image pixels to PDF points
292
+ bbox['x'] = bbox['x'] / RENDER_SCALE
293
+ bbox['y'] = bbox['y'] / RENDER_SCALE
294
+ bbox['width'] = bbox['width'] / RENDER_SCALE
295
+ bbox['height'] = bbox['height'] / RENDER_SCALE
296
+
297
+ page_results.extend(chunk_results)
298
+
299
+ except Exception as e:
300
+ print(f" Error processing chunk {chunk_idx + 1}: {e}")
301
+ import traceback
302
+ traceback.print_exc()
303
+ continue
304
 
305
+ print(f" Total extractions from all chunks: {len(page_results)}")
306
+
307
+ else:
308
+ # Process full page (no splitting needed)
309
+ print("Processing full page without splitting")
310
+ chunk_results = process_image_chunk(full_image)
311
+
312
+ # Scale coordinates from rendered image pixels to PDF points
313
+ for result in chunk_results:
314
+ bbox = result['bbox']
315
  bbox['x'] = bbox['x'] / RENDER_SCALE
316
  bbox['y'] = bbox['y'] / RENDER_SCALE
317
  bbox['width'] = bbox['width'] / RENDER_SCALE
318
  bbox['height'] = bbox['height'] / RENDER_SCALE
319
 
320
+ page_results = chunk_results
321
+
322
+ # Remove duplicates from overlapping chunks
323
+ unique_results = []
324
+ seen_boxes = set()
325
 
326
+ DEDUP_TOLERANCE = 5 # pixels tolerance for deduplication
 
 
327
 
328
+ for result in page_results:
 
329
  bbox = result['bbox']
330
+ box_tuple = (
331
+ round(bbox['x'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE,
332
+ round(bbox['y'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE,
333
+ round(bbox['width'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE,
334
+ round(bbox['height'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE
335
+ )
336
+
337
+ if box_tuple not in seen_boxes:
338
+ seen_boxes.add(box_tuple)
339
+ unique_results.append(result)
340
 
341
+ print(f" After deduplication: {len(unique_results)} unique extractions")
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
+ all_results.append({
344
+ "page": page_num + 1,
345
+ "page_dimensions": {
346
+ "width": page_width,
347
+ "height": page_height
348
+ },
349
+ "rotation": page.rotation,
350
+ "extractions": unique_results
351
+ })
352
+
353
+ except Exception as e:
354
+ print(f"Error processing page {page_num + 1}: {e}")
355
+ import traceback
356
+ traceback.print_exc()
357
+ # Add empty page result to maintain page numbering
358
+ all_results.append({
359
+ "page": page_num + 1,
360
+ "page_dimensions": {"width": 0, "height": 0},
361
+ "rotation": 0,
362
+ "extractions": [],
363
+ "error": str(e)
364
+ })
365
 
366
  pdf_document.close()
367
  os.unlink(tmp_file.name)