File size: 3,039 Bytes
5abe5ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# file: pdf_parallel_parser.py

import fitz  # PyMuPDF
from PIL import Image
import io
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

# Import the specialized parsers from our other module
from complex_parser import process_table_element, process_image_element

def _is_bbox_contained(inner_bbox, outer_bbox):
    """Return True when inner_bbox lies entirely within outer_bbox.

    Both arguments are indexed at positions 0-3. Positions 0 and 1 of the
    inner box must be >= those of the outer box, and positions 2 and 3 must
    be <= those of the outer box. Touching edges count as contained.
    """
    # The starting corner (indices 0, 1) must not fall before the outer box.
    if not (inner_bbox[0] >= outer_bbox[0] and inner_bbox[1] >= outer_bbox[1]):
        return False
    # The ending corner (indices 2, 3) must not extend past the outer box.
    return inner_bbox[2] <= outer_bbox[2] and inner_bbox[3] <= outer_bbox[3]

def _process_page(page: fitz.Page) -> str:
    """
    Extract the content of a single PDF page as one newline-joined string.

    Tables detected by ``page.find_tables()`` are cropped out of a 200 dpi
    rendering of the page and converted with ``process_table_element``; their
    output is emitted first. Text blocks follow, skipping every block whose
    bounding box lies fully inside a table that was converted successfully.
    A table whose conversion fails is logged and skipped, and its text is
    then kept as plain text so the content is not lost.

    Args:
        page: The PyMuPDF page to process.

    Returns:
        The converted tables followed by the remaining text blocks, joined
        with newlines.
    """
    render_dpi = 200
    # Table bounding boxes are expressed in PDF points (72 per inch) while the
    # pixmap is rendered at render_dpi, so the coordinates must be scaled
    # before they can be used to crop the rendered image.
    points_to_pixels = render_dpi / 72

    page_content = []
    table_bboxes = []

    # 1. Find tables and, only if there are any, render the page once.
    tables = []
    page_image = None
    try:
        tables = page.find_tables().tables
        print(f"Page {page.number}: Found {len(tables)} potential tables.")
        if tables:
            pix = page.get_pixmap(dpi=render_dpi)
            page_image = Image.open(io.BytesIO(pix.tobytes("png")))
    except Exception as e:
        # Best effort: fall back to plain-text extraction for the whole page.
        print(f"Could not process tables on page {page.number}: {e}")
        tables = []

    # 2. Convert each table independently so one failure does not abort the rest.
    for i, table in enumerate(tables):
        try:
            crop_box = tuple(round(coord * points_to_pixels) for coord in table.bbox)
            table_image = page_image.crop(crop_box)
            markdown_table = process_table_element(table_image)
        except Exception as e:
            print(f"Could not process table {i} on page {page.number}: {e}")
            continue
        page_content.append(markdown_table)
        # Only successfully converted tables suppress their underlying text.
        table_bboxes.append(table.bbox)

    # 3. Extract text blocks, excluding those within converted table bounding boxes
    for block in page.get_text("blocks"):
        block_bbox = block[:4]
        is_in_table = any(
            _is_bbox_contained(block_bbox, table_bbox) for table_bbox in table_bboxes
        )
        if not is_in_table:
            page_content.append(block[4].strip())

    # Note: Image extraction can be added here if needed, similar to table extraction.

    return "\n".join(page_content)

def process_pdf_with_hybrid_parallel_sync(file_path: Path) -> str:
    """
    Process every page of a PDF with ``_process_page`` using a thread pool.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The per-page results in page order, joined by a
        "--- Page Break ---" separator. A page whose processing raised an
        exception is logged and contributes an empty string.
    """
    print(f"Processing PDF '{file_path.name}' with parallel page-by-page strategy...")

    # Open the document as a context manager so it is closed even on errors.
    with fitz.open(file_path) as doc:
        # Pre-sized so results can be stored by page number as they complete.
        results = [""] * len(doc)

        # NOTE(review): PyMuPDF's documentation states that multithreaded use
        # is not supported; pages of a single document are processed
        # concurrently here. Confirm this is safe for the workload or move the
        # PyMuPDF calls out of the worker threads.
        with ThreadPoolExecutor(max_workers=os.cpu_count() or 4) as executor:
            futures = {executor.submit(_process_page, page): page.number for page in doc}

            for future in as_completed(futures):
                page_num = futures[future]
                try:
                    results[page_num] = future.result()
                except Exception as e:
                    # Best effort: keep the other pages and leave this one empty.
                    print(f"Error processing page {page_num}: {e}")

    return "\n\n--- Page Break ---\n\n".join(results)