Spaces:
Sleeping
Sleeping
File size: 4,104 Bytes
ee4c8aa 608a057 2ae2f88 4968aaa a50fdc7 27937fa 608a057 c842ecc 1296a46 c842ecc 27937fa 608a057 ab6b7bb ee4c8aa 2ae2f88 4968aaa ee4c8aa 4968aaa 27937fa ee4c8aa 4968aaa ee4c8aa 4968aaa ab6b7bb c842ecc ee4c8aa 4968aaa 0a21337 c842ecc 0a21337 c842ecc 0a21337 c842ecc 0a21337 c842ecc 27937fa 1296a46 c842ecc 1296a46 c842ecc ee4c8aa eb147e0 ee4c8aa 4968aaa 27937fa eb147e0 4968aaa 27937fa 2ae2f88 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
import fitz # PyMuPDF
import easyocr
import os
import tempfile
import logging
# Set up logging
# basicConfig is a no-op if the root logger is already configured by the host app.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
    """
    Extract text, words, and normalized bounding boxes from a scanned PDF
    using PyMuPDF (rasterization) and EasyOCR (recognition).

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: One dict per page with keys:
            'text' (str): concatenated OCR text for the page,
            'words' (list of str): individual words,
            'bbox' (list of [x0, y0, x1, y1]): per-word boxes normalized to 0-1000,
            'image_dims' ([width, height]): dimensions of the rendered page image.
        Returns an empty list if extraction fails.
    """
    # Initialize before the try so the finally block never hits a NameError
    # (the original code crashed in finally if temp-file creation raised).
    temp_path = None
    doc = None
    try:
        # Copy the input PDF to a temporary file so downstream processing
        # never touches the caller's original file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            with open(pdf_path, 'rb') as f:
                tmp.write(f.read())
            temp_path = tmp.name
        logger.info(f"Temporary PDF created at: {temp_path}")
        # Convert PDF to images using PyMuPDF
        doc = fitz.open(temp_path)
        if not doc.page_count:
            logger.error(f"PDF is empty or unreadable: {pdf_path}")
            return []
        all_pages = []
        # NOTE: the reader is rebuilt on every call, which is expensive;
        # hoist to module level if this function is called frequently.
        # Languages are hard-coded to English — adjust as needed.
        reader = easyocr.Reader(['en'], gpu=False)
        for page_num in range(len(doc)):
            page = doc[page_num]
            # Render at 400 DPI (PDF default is 72) for better OCR detection.
            pix = page.get_pixmap(matrix=fitz.Matrix(400/72, 400/72))
            img_path = f"{temp_path}_page_{page_num}.png"
            try:
                pix.save(img_path)
                logger.info(f"Image generated at: {img_path}, Dimensions: {pix.width}x{pix.height}")
                image_width, image_height = pix.width, pix.height
                # Perform OCR using EasyOCR
                results = reader.readtext(img_path)
                if not results:
                    logger.warning(f"No text detected on page {page_num + 1}")
                # Concatenated text for compatibility with downstream consumers.
                text = " ".join([res[1] for res in results])
                words = []
                bboxes = []
                for res in results:
                    segment_text = res[1]
                    # EasyOCR returns a 4-point quadrilateral; take the top-left
                    # and bottom-right corners as an axis-aligned [x0, y0, x1, y1].
                    segment_bbox = [res[0][0][0], res[0][0][1], res[0][2][0], res[0][2][1]]
                    # Normalize to the 0-1000 range expected by LayoutLM-style models.
                    normalized_bbox = [
                        int((segment_bbox[0] / image_width) * 1000),
                        int((segment_bbox[1] / image_height) * 1000),
                        int((segment_bbox[2] / image_width) * 1000),
                        int((segment_bbox[3] / image_height) * 1000)
                    ]
                    # Clamp coordinates into 0-1000 in case of rounding/overflow.
                    normalized_bbox = [max(0, min(1000, coord)) for coord in normalized_bbox]
                    # EasyOCR gives no per-word geometry, so every word in a
                    # segment shares the segment's bounding box.
                    for word in segment_text.split():
                        words.append(word)
                        bboxes.append(normalized_bbox)
                if text.strip():
                    all_pages.append({
                        "text": text,
                        "words": words,
                        "bbox": bboxes,
                        "image_dims": [image_width, image_height]
                    })
                else:
                    # Keep a placeholder entry so page numbering stays aligned.
                    all_pages.append({
                        "text": f"Page {page_num + 1}: No text detected",
                        "words": [],
                        "bbox": [],
                        "image_dims": [image_width, image_height]
                    })
            finally:
                # Always remove the per-page image, even if OCR raised
                # (the original leaked PNGs on mid-page failure).
                if os.path.exists(img_path):
                    os.unlink(img_path)
        logger.info(f"Extracted data from {len(all_pages)} pages")
        return all_pages
    except Exception as e:
        logger.error(f"OCR failed: {str(e)}")
        return []
    finally:
        # Close the document handle on every path (the original only closed
        # it on success, leaking the handle when an exception occurred).
        if doc is not None:
            doc.close()
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)