# Source: SmartContractMigrator / ocr_utils.py
# Last change: "Update ocr_utils.py" by pavansuresh (commit 4968aaa, verified)
import fitz # PyMuPDF
import easyocr
import os
import tempfile
import logging
# Module-wide logger, named after this module; the root logger is asked to
# emit INFO and above (basicConfig only takes effect if root has no handlers yet).
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
def _normalize_quad(quad, image_width, image_height):
    """Turn an OCR quadrilateral into a clamped [x0, y0, x1, y1] box on a 0-1000 scale.

    Args:
        quad: Four (x, y) corner points in image pixel coordinates.
        image_width: Width of the rendered page image in pixels.
        image_height: Height of the rendered page image in pixels.

    Returns:
        list: [x0, y0, x1, y1] integers, each clamped to the range 0-1000.
    """
    xs = [point[0] for point in quad]
    ys = [point[1] for point in quad]
    # Use min/max over all four corners rather than corners 0 and 2 only, so a
    # rotated or skewed quad still yields x0 <= x1 and y0 <= y1.
    corners = (
        (min(xs), image_width),
        (min(ys), image_height),
        (max(xs), image_width),
        (max(ys), image_height),
    )
    return [
        max(0, min(1000, int((value / extent) * 1000)))
        for value, extent in corners
    ]


def _build_page_record(results, image_width, image_height, page_number):
    """Assemble the per-page dictionary from raw OCR results.

    Args:
        results: Sequence of OCR detections where item[0] is the corner quad
            and item[1] is the recognized text.
        image_width: Width of the rendered page image in pixels.
        image_height: Height of the rendered page image in pixels.
        page_number: 1-based page number, used in the "no text" placeholder.

    Returns:
        dict: Keys 'text', 'words', 'bbox' and 'image_dims'.
    """
    image_dims = [image_width, image_height]
    text = " ".join(res[1] for res in results)
    if not text.strip():
        return {
            "text": f"Page {page_number}: No text detected",
            "words": [],
            "bbox": [],
            "image_dims": image_dims,
        }

    words = []
    bboxes = []
    for res in results:
        segment_words = res[1].split()
        if not segment_words:
            continue
        bbox = _normalize_quad(res[0], image_width, image_height)
        words.extend(segment_words)
        # OCR works at segment level, so every word of a segment gets the
        # segment's box; each word receives its own copy to avoid aliasing.
        bboxes.extend(list(bbox) for _ in segment_words)

    return {
        "text": text,
        "words": words,
        "bbox": bboxes,
        "image_dims": image_dims,
    }


def extract_text_from_pdf_with_tesseract_or_layoutlm(
    pdf_path: str, *, dpi: int = 400, languages=("en",)
) -> list:
    """
    Extract text, words, and normalized bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.

    Despite its name, this function renders each page with PyMuPDF and runs
    EasyOCR (CPU mode) on the rendered image.

    Args:
        pdf_path (str): Path to the PDF file.
        dpi (int): Resolution used to rasterize each page. Defaults to 400.
        languages: Language codes passed to EasyOCR. Defaults to ("en",).

    Returns:
        list: List of dictionaries, one per page, each containing 'text' (str),
        'words' (list of str), 'bbox' (list of [x0, y0, x1, y1] normalized to
        0-1000), and 'image_dims' ([width, height]). A page without text gets a
        placeholder 'text' and empty 'words'/'bbox'.
        Returns an empty list if the PDF has no pages or anything fails.
    """
    try:
        # All scratch files live in one temporary directory, so they are
        # removed on every exit path (success, early return, or exception).
        with tempfile.TemporaryDirectory() as work_dir:
            temp_path = os.path.join(work_dir, "input.pdf")
            with open(pdf_path, "rb") as src, open(temp_path, "wb") as dst:
                dst.write(src.read())
            logger.info(f"Temporary PDF created at: {temp_path}")

            doc = fitz.open(temp_path)
            try:
                if not doc.page_count:
                    logger.error(f"PDF is empty or unreadable: {pdf_path}")
                    return []

                reader = easyocr.Reader(list(languages), gpu=False)
                zoom = dpi / 72  # PDF user space is 72 units per inch
                all_pages = []
                for page_num, page in enumerate(doc):
                    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                    img_path = os.path.join(work_dir, f"page_{page_num}.png")
                    pix.save(img_path)
                    logger.info(
                        f"Image generated at: {img_path}, Dimensions: {pix.width}x{pix.height}"
                    )

                    results = reader.readtext(img_path)
                    # Drop the page image right away so disk usage stays at
                    # one page even for long documents.
                    os.unlink(img_path)
                    if not results:
                        logger.warning(f"No text detected on page {page_num + 1}")

                    all_pages.append(
                        _build_page_record(results, pix.width, pix.height, page_num + 1)
                    )
            finally:
                # Close before the directory is removed so the PDF copy is not
                # still held open during cleanup.
                doc.close()

        logger.info(f"Extracted data from {len(all_pages)} pages")
        return all_pages
    except Exception as e:
        # Broad catch is deliberate: the documented contract is "empty list on failure".
        logger.error(f"OCR failed: {str(e)}")
        return []