File size: 4,104 Bytes
ee4c8aa
 
608a057
2ae2f88
4968aaa
 
 
 
 
a50fdc7
27937fa
608a057
c842ecc
1296a46
 
 
c842ecc
 
27937fa
608a057
ab6b7bb
ee4c8aa
2ae2f88
 
 
 
4968aaa
ee4c8aa
 
 
4968aaa
 
 
27937fa
ee4c8aa
 
 
 
4968aaa
ee4c8aa
 
4968aaa
ab6b7bb
c842ecc
 
 
ee4c8aa
 
4968aaa
 
0a21337
 
 
 
c842ecc
0a21337
 
 
c842ecc
 
 
 
 
 
 
 
 
0a21337
c842ecc
0a21337
 
c842ecc
27937fa
1296a46
c842ecc
 
 
 
 
 
1296a46
c842ecc
 
 
 
 
 
ee4c8aa
 
 
 
eb147e0
ee4c8aa
4968aaa
27937fa
eb147e0
4968aaa
27937fa
2ae2f88
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import logging
import os
import shutil
import tempfile

import easyocr
import fitz  # PyMuPDF

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
    """
    Extract text, words, and normalized bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.

    NOTE: despite the legacy name, OCR is performed with EasyOCR here — not
    Tesseract or LayoutLM. The name is kept for caller compatibility.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: One dict per page containing:
              'text' (str): concatenated OCR text (or a "no text" placeholder),
              'words' (list of str): individual OCR words,
              'bbox' (list of [x0, y0, x1, y1]): per-word boxes normalized to 0-1000,
              'image_dims' ([width, height]): rendered page size in pixels.
              Returns an empty list if extraction fails.
    """
    temp_path = None  # bound before the try so the finally block can never hit a NameError
    doc = None
    try:
        # Work on a private copy so the caller's file is never held open by fitz.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            # Record the name first so cleanup still works if the copy below fails.
            temp_path = tmp.name
            with open(pdf_path, 'rb') as f:
                shutil.copyfileobj(f, tmp)  # stream copy; avoids reading the whole PDF into memory
        logger.info(f"Temporary PDF created at: {temp_path}")

        # Convert PDF pages to images using PyMuPDF.
        doc = fitz.open(temp_path)
        if not doc.page_count:
            logger.error(f"PDF is empty or unreadable: {pdf_path}")
            return []

        all_pages = []
        # One Reader per call: initialization is costly but keeps the function self-contained.
        reader = easyocr.Reader(['en'], gpu=False)  # adjust languages as needed

        for page_num in range(len(doc)):
            page = doc[page_num]
            # Render at 400 DPI (base is 72) for better small-text detection.
            pix = page.get_pixmap(matrix=fitz.Matrix(400 / 72, 400 / 72))
            img_path = f"{temp_path}_page_{page_num}.png"
            pix.save(img_path)
            logger.info(f"Image generated at: {img_path}, Dimensions: {pix.width}x{pix.height}")

            image_width, image_height = pix.width, pix.height

            try:
                # Perform OCR using EasyOCR.
                results = reader.readtext(img_path)
                if not results:
                    logger.warning(f"No text detected on page {page_num + 1}")

                text = " ".join(res[1] for res in results)  # concatenated text for compatibility
                words = []
                bboxes = []

                # Split each OCR segment into words; every word inherits its segment's box.
                for res in results:
                    segment_text = res[1]
                    # EasyOCR returns a 4-point polygon; take the top-left and
                    # bottom-right corners as an axis-aligned [x0, y0, x1, y1] box.
                    segment_bbox = [res[0][0][0], res[0][0][1], res[0][2][0], res[0][2][1]]
                    # Normalize to the 0-1000 range, clamping in case the polygon
                    # extends slightly past the image edge.
                    normalized_bbox = [
                        max(0, min(1000, int((segment_bbox[0] / image_width) * 1000))),
                        max(0, min(1000, int((segment_bbox[1] / image_height) * 1000))),
                        max(0, min(1000, int((segment_bbox[2] / image_width) * 1000))),
                        max(0, min(1000, int((segment_bbox[3] / image_height) * 1000))),
                    ]
                    # Each word in the segment shares the segment's normalized box.
                    for word in segment_text.split():
                        words.append(word)
                        bboxes.append(normalized_bbox)

                if text.strip():
                    all_pages.append({
                        "text": text,
                        "words": words,
                        "bbox": bboxes,
                        "image_dims": [image_width, image_height]
                    })
                else:
                    all_pages.append({
                        "text": f"Page {page_num + 1}: No text detected",
                        "words": [],
                        "bbox": [],
                        "image_dims": [image_width, image_height]
                    })
            finally:
                # Always remove the per-page image, even if OCR raised.
                if os.path.exists(img_path):
                    os.unlink(img_path)

        logger.info(f"Extracted data from {len(all_pages)} pages")
        return all_pages
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error would drop.
        logger.exception(f"OCR failed: {e}")
        return []
    finally:
        # Close the document and remove the temporary PDF regardless of outcome.
        if doc is not None:
            doc.close()
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)