# NOTE: this file was recovered from a Hugging Face Space page scrape
# (the "Spaces / Sleeping" status text and pipe-table wrappers are artifacts).
# Standard library
import logging
import os
import tempfile

# Third-party
import easyocr
import fitz  # PyMuPDF

# Module-level logger configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
    """
    Extract text, words, and normalized bounding boxes from a scanned PDF
    using PyMuPDF (page rendering) and EasyOCR (text recognition).

    NOTE: despite the legacy name, this function uses EasyOCR — not
    Tesseract or LayoutLM. The name is kept so existing callers keep
    working; the 0-1000 bbox normalization follows the LayoutLM convention.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: One dict per page with keys:
            'text' (str): all OCR segments joined with spaces,
            'words' (list[str]): individual whitespace-split words,
            'bbox' (list[list[int]]): per-word [x0, y0, x1, y1] boxes,
                normalized to 0-1000 (each word carries its segment's box),
            'image_dims' (list[int]): rendered page [width, height] in px.
        Returns an empty list if the PDF is empty/unreadable or any
        exception occurs.
    """
    # Pre-bind so the finally block is safe even if setup raises before
    # these are assigned (the original code could NameError on temp_path).
    temp_path = None
    doc = None
    try:
        # Work on a private copy of the PDF so the source file is never
        # touched by downstream tooling.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            with open(pdf_path, "rb") as f:
                tmp.write(f.read())
            temp_path = tmp.name
        logger.info(f"Temporary PDF created at: {temp_path}")

        doc = fitz.open(temp_path)
        if not doc.page_count:
            logger.error(f"PDF is empty or unreadable: {pdf_path}")
            return []

        all_pages = []
        # Initialize EasyOCR once per call (model load is expensive);
        # adjust languages as needed.
        reader = easyocr.Reader(['en'], gpu=False)
        for page_num in range(len(doc)):
            page = doc[page_num]
            # Render at ~400 DPI (PDF native resolution is 72) for better
            # small-text detection.
            pix = page.get_pixmap(matrix=fitz.Matrix(400 / 72, 400 / 72))
            img_path = f"{temp_path}_page_{page_num}.png"
            pix.save(img_path)
            logger.info(f"Image generated at: {img_path}, Dimensions: {pix.width}x{pix.height}")
            image_width, image_height = pix.width, pix.height

            # Perform OCR; always delete the per-page image afterwards,
            # even if readtext raises (the original leaked it on error).
            try:
                results = reader.readtext(img_path)
            finally:
                if os.path.exists(img_path):
                    os.unlink(img_path)

            if not results:
                logger.warning(f"No text detected on page {page_num + 1}")

            # Concatenated text kept for compatibility with existing callers.
            text = " ".join(res[1] for res in results)
            words = []
            bboxes = []
            for res in results:
                segment_text = res[1]
                # EasyOCR returns a 4-point quadrilateral; take the min/max
                # over all points so rotated/skewed quads still yield a
                # valid axis-aligned [x0, y0, x1, y1] (corner indices 0 and
                # 2 alone are only correct for axis-aligned boxes).
                quad_xs = [point[0] for point in res[0]]
                quad_ys = [point[1] for point in res[0]]
                normalized_bbox = [
                    int((min(quad_xs) / image_width) * 1000),
                    int((min(quad_ys) / image_height) * 1000),
                    int((max(quad_xs) / image_width) * 1000),
                    int((max(quad_ys) / image_height) * 1000),
                ]
                # Clamp into the 0-1000 range expected downstream.
                normalized_bbox = [max(0, min(1000, coord)) for coord in normalized_bbox]
                # Every word in the segment shares the segment's box —
                # EasyOCR does not give per-word geometry.
                for word in segment_text.split():
                    words.append(word)
                    bboxes.append(normalized_bbox)

            if text.strip():
                all_pages.append({
                    "text": text,
                    "words": words,
                    "bbox": bboxes,
                    "image_dims": [image_width, image_height],
                })
            else:
                # Placeholder entry keeps page indices aligned for callers.
                all_pages.append({
                    "text": f"Page {page_num + 1}: No text detected",
                    "words": [],
                    "bbox": [],
                    "image_dims": [image_width, image_height],
                })

        logger.info(f"Extracted data from {len(all_pages)} pages")
        return all_pages
    except Exception as e:
        logger.error(f"OCR failed: {str(e)}")
        return []
    finally:
        # Close the document even on error (the original leaked the
        # handle), then remove the temporary PDF copy.
        if doc is not None:
            doc.close()
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)