Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

paddleocr-processor / paddle_ocr_standalone.py

mbuckle

New standalone file

ce717af 9 months ago

raw

history blame contribute delete

11.3 kB

	#!/usr/bin/env python3
	# paddle_ocr_standalone.py - Robust version with comprehensive error handling

	import sys
	import os
	import json
	import tempfile
	import traceback

	def safe_print_stderr(message):
	    """Write *message* to stderr and flush it, without raising on I/O failure.

	    Diagnostics are best-effort: a closed or broken stderr must not abort
	    the run, so ordinary exceptions raised by the write are dropped.
	    Interpreter-level signals (``KeyboardInterrupt``, ``SystemExit``) are
	    deliberately NOT swallowed, so the process can still be interrupted.

	    Args:
	        message: Any object; it is rendered with ``print``.
	    """
	    try:
	        print(message, file=sys.stderr, flush=True)
	    except Exception:
	        # Nothing useful can be done if the diagnostic channel itself fails.
	        pass

	def safe_print_json(data):
	    """Write *data* to stdout as one line of JSON and flush it.

	    If serializing or writing *data* fails, the error is reported on stderr
	    and a fixed failure payload is written instead, so the consumer of
	    stdout still receives parseable JSON. The fallback is flushed like the
	    primary payload and is itself guarded, so this function does not raise
	    ordinary exceptions.

	    Args:
	        data: A JSON-serializable object.
	    """
	    try:
	        print(json.dumps(data), flush=True)
	    except Exception as e:
	        safe_print_stderr(f"Error printing JSON: {e}")
	        try:
	            print('{"success": false, "error": "JSON serialization failed"}', flush=True)
	        except Exception as fallback_error:
	            # stdout itself is unusable; stderr is the only channel left.
	            safe_print_stderr(f"Error printing fallback JSON: {fallback_error}")

	# The document to process is the first command-line argument; without it
	# the script reports its usage as a JSON error and exits with status 1.
	try:
	    file_path = sys.argv[1]
	except IndexError:
	    safe_print_json({"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"})
	    sys.exit(1)

	# Paths of temporary page images; filled in once a PDF has been rendered
	# so that they can be deleted on both the success and the failure path.
	temp_files = []

	try:
	safe_print_stderr(f"=== Starting OCR processing for: {os.path.basename(file_path)} ===")

	# Reject a missing or unreadable input up front, with a message that
	# says which of the two conditions failed.
	input_problem = None
	if not os.path.exists(file_path):
	    input_problem = f"File does not exist: {file_path}"
	elif not os.access(file_path, os.R_OK):
	    input_problem = f"File is not readable: {file_path}"
	if input_problem is not None:
	    raise Exception(input_problem)

	file_size = os.path.getsize(file_path)
	safe_print_stderr(f"File size: {file_size} bytes")

	# Import dependencies one at a time so that a failure names the package
	# that is missing or broken.
	safe_print_stderr("Importing PyMuPDF...")
	try:
	    import fitz
	except Exception as e:
	    raise Exception(f"Failed to import PyMuPDF: {e}") from e
	safe_print_stderr("✓ PyMuPDF imported successfully")

	# Provide the camelCase names (pageCount, getPixmap, getText) on PyMuPDF
	# builds that only expose the snake_case ones. Each alias is added only
	# when it is missing and simply forwards to its snake_case counterpart.
	# NOTE(review): presumably needed by code that still calls the legacy
	# names (this script's own PDF rendering does) - confirm which callers.
	safe_print_stderr("Applying PyMuPDF compatibility patches...")
	try:
	    if not hasattr(fitz.Document, 'pageCount'):
	        def pageCount_property(self):
	            return self.page_count
	        fitz.Document.pageCount = property(pageCount_property)
	        safe_print_stderr("✓ Added pageCount property")

	    if not hasattr(fitz.Page, 'getPixmap'):
	        def getPixmap(self, matrix=None, alpha=False, **kwargs):
	            # alpha defaults to False, matching get_pixmap, so an alias
	            # call without an explicit alpha yields an opaque render.
	            # Any other keyword argument is forwarded untouched.
	            if matrix is not None:
	                kwargs["matrix"] = matrix
	            return self.get_pixmap(alpha=alpha, **kwargs)
	        fitz.Page.getPixmap = getPixmap
	        safe_print_stderr("✓ Added getPixmap method")

	    if not hasattr(fitz.Page, 'getText'):
	        def getText(self, option="text", **kwargs):
	            return self.get_text(option, **kwargs)
	        fitz.Page.getText = getText
	        safe_print_stderr("✓ Added getText method")

	except Exception as e:
	    # The aliases are a convenience; failing to install them is not fatal.
	    safe_print_stderr(f"Warning: Monkey patch failed: {e}")

	# Open the input once before loading the OCR engine, so that a file
	# PyMuPDF cannot open is reported immediately. This probe runs for every
	# input; the PDF-versus-image decision is only made later in the script.
	safe_print_stderr("Testing PDF opening...")
	try:
	    test_doc = fitz.open(file_path)
	    try:
	        page_count = len(test_doc)
	        safe_print_stderr(f"✓ PDF opened successfully, {page_count} pages detected")
	    finally:
	        # Release the handle even if counting or logging fails.
	        test_doc.close()
	except Exception as e:
	    raise Exception(f"Failed to open PDF: {e}") from e

	# Load the OCR package; a failure is re-raised with the package named.
	safe_print_stderr("Importing PaddleOCR...")
	try:
	    from paddleocr import PaddleOCR
	except Exception as import_error:
	    raise Exception(f"Failed to import PaddleOCR: {import_error}")
	else:
	    safe_print_stderr("✓ PaddleOCR imported successfully")

	# Build the OCR engine: English, CPU only, angle classifier enabled,
	# library logging switched off.
	safe_print_stderr("Initializing PaddleOCR...")
	ocr_options = {
	    "use_angle_cls": True,
	    "lang": 'en',
	    "show_log": False,
	    "use_gpu": False,
	}
	try:
	    ocr = PaddleOCR(**ocr_options)
	except Exception as init_error:
	    raise Exception(f"Failed to initialize PaddleOCR: {init_error}")
	else:
	    safe_print_stderr("✓ PaddleOCR initialized successfully")

	def pdf_to_images(pdf_path, dpi=150):
	    """Render every page of a PDF to its own temporary PNG file.

	    Args:
	        pdf_path: Path of the PDF document to render.
	        dpi: Render resolution. The page is scaled by ``dpi / 72`` on both
	            axes, so 72 gives a zoom factor of 1.

	    Returns:
	        Paths of the PNG files written, in page order. A page that fails
	        to render is logged and skipped. If the document as a whole cannot
	        be processed, an empty list is returned and any images already
	        written are deleted, because the caller would otherwise never
	        learn their paths and could not clean them up.
	    """
	    def _discard(path):
	        # Best-effort removal of an empty or orphaned image file.
	        try:
	            os.unlink(path)
	        except OSError:
	            pass

	    image_paths = []
	    doc = None
	    try:
	        safe_print_stderr(f"Converting PDF to images (DPI: {dpi})...")
	        doc = fitz.open(pdf_path)

	        total_pages = len(doc)
	        safe_print_stderr(f"PDF has {total_pages} pages")

	        # The zoom is the same for every page, so build the matrix once.
	        mat = fitz.Matrix(dpi / 72, dpi / 72)

	        for page_num in range(total_pages):
	            temp_img_path = None
	            try:
	                safe_print_stderr(f"Converting page {page_num + 1}...")
	                page = doc[page_num]

	                # Use the snake_case API when present and fall back to the
	                # legacy camelCase name otherwise. alpha=False requests a
	                # render without an alpha channel (opaque background).
	                render = getattr(page, 'get_pixmap', None) or page.getPixmap
	                pix = render(matrix=mat, alpha=False)

	                # mkstemp creates a uniquely named file in the platform's
	                # temp directory instead of a predictable name in /tmp.
	                fd, temp_img_path = tempfile.mkstemp(
	                    prefix=f"ocr_page_{page_num}_", suffix=".png"
	                )
	                os.close(fd)
	                pix.save(temp_img_path)

	                # mkstemp already created the file, so a successful save is
	                # recognised by the file having content.
	                file_size = os.path.getsize(temp_img_path)
	                if file_size > 0:
	                    image_paths.append(temp_img_path)
	                    safe_print_stderr(f"✓ Page {page_num + 1} converted: {temp_img_path} (size: {file_size} bytes, {pix.width}x{pix.height})")
	                else:
	                    safe_print_stderr(f"✗ Failed to create image: {temp_img_path}")
	                    _discard(temp_img_path)

	            except Exception as page_error:
	                safe_print_stderr(f"✗ Error converting page {page_num + 1}: {page_error}")
	                # Do not leave a half-written file behind for a skipped page.
	                if temp_img_path is not None and temp_img_path not in image_paths:
	                    _discard(temp_img_path)
	                continue

	        safe_print_stderr(f"✓ Successfully converted {len(image_paths)}/{total_pages} pages")
	        return image_paths

	    except Exception as e:
	        safe_print_stderr(f"✗ PDF conversion failed: {e}")
	        traceback.print_exc(file=sys.stderr)
	        for orphan in image_paths:
	            _discard(orphan)
	        return []

	    finally:
	        # Close the document on every path, including failures.
	        if doc is not None:
	            try:
	                doc.close()
	            except Exception as close_error:
	                safe_print_stderr(f"Warning: Could not close PDF: {close_error}")

	def cleanup_temp_files(file_paths):
	    """Delete every existing path in *file_paths*, logging each outcome.

	    Paths that do not exist are skipped silently. A path that cannot be
	    deleted produces a warning on stderr and does not stop the remaining
	    deletions.

	    Args:
	        file_paths: Iterable of filesystem paths to remove.
	    """
	    for temp_path in file_paths:
	        try:
	            if not os.path.exists(temp_path):
	                continue
	            os.unlink(temp_path)
	            safe_print_stderr(f"✓ Cleaned up: {temp_path}")
	        except Exception as cleanup_error:
	            safe_print_stderr(f"Warning: Could not clean up {temp_path}: {cleanup_error}")

	# Decide by extension: a PDF is rendered to one image per page, anything
	# else is handed to OCR as a single image.
	is_pdf = file_path.lower().endswith('.pdf')

	if not is_pdf:
	    safe_print_stderr("Processing image file...")
	    image_paths = [file_path]
	    total_pages = 1
	else:
	    safe_print_stderr("Processing PDF file...")
	    image_paths = pdf_to_images(file_path)
	    # The rendered images are temporary and must be deleted afterwards.
	    temp_files = image_paths

	    if not image_paths:
	        raise Exception("PDF conversion produced no images")

	    total_pages = len(image_paths)
	    safe_print_stderr(f"Will process {total_pages} images")

	# Progress marker written to stderr in a fixed KEY:value form.
	safe_print_stderr(f"TOTAL_PAGES:{total_pages}")

	# Run OCR page by page. A failure on one page is logged and the loop
	# moves on, so a single bad page does not abort the whole document.
	safe_print_stderr("Starting OCR processing...")
	extracted_text = ""
	pages_processed = 0

	for current_page, img_path in enumerate(image_paths, start=1):
	    try:
	        safe_print_stderr(f"CURRENT_PAGE:{current_page}")
	        safe_print_stderr(f"Processing image: {img_path}")

	        # Skip pages whose image is missing.
	        if not os.path.exists(img_path):
	            safe_print_stderr(f"✗ Image file does not exist: {img_path}")
	            continue

	        img_size = os.path.getsize(img_path)
	        safe_print_stderr(f"Image size: {img_size} bytes")

	        safe_print_stderr(f"Running OCR on page {current_page}...")
	        result = ocr.ocr(img_path, cls=True)
	        safe_print_stderr(f"OCR result type: {type(result)}")

	        if not result:
	            safe_print_stderr(f"Page {current_page}: OCR returned None")
	            continue
	        safe_print_stderr(f"OCR result length: {len(result)}")

	        # Only the first element of the result is read for this image.
	        regions = result[0]
	        if not regions:
	            safe_print_stderr(f"Page {current_page}: OCR returned empty result")
	            safe_print_stderr(f"Page {current_page}: No OCR results")
	            continue
	        safe_print_stderr(f"Page {current_page} has {len(regions)} text regions detected")

	        # A page counts as processed once it has at least one region,
	        # even if every region is later dropped by the confidence filter.
	        pages_processed += 1
	        kept_lines = []

	        for line_idx, line in enumerate(regions):
	            try:
	                if len(line) < 2:
	                    continue

	                # Index 1 holds the recognized text, either as a
	                # (text, confidence) sequence or as a bare value; a
	                # missing confidence is treated as 1.0.
	                recognized = line[1]
	                if isinstance(recognized, (list, tuple)):
	                    text_content = str(recognized[0])
	                    confidence = float(recognized[1]) if len(recognized) > 1 else 1.0
	                else:
	                    text_content = str(recognized)
	                    confidence = 1.0

	                safe_print_stderr(f"Line {line_idx}: '{text_content}' (confidence: {confidence:.2f})")

	                # Keep only lines above the 0.3 confidence threshold.
	                if confidence > 0.3:
	                    kept_lines.append(text_content + "\n")
	            except Exception as line_error:
	                safe_print_stderr(f"Error processing line {line_idx}: {line_error}")

	        page_text = "".join(kept_lines)
	        if not page_text.strip():
	            safe_print_stderr(f"Page {current_page}: No text above confidence threshold")
	            continue

	        extracted_text += f"\n--- Page {current_page} ---\n" + page_text
	        safe_print_stderr(f"✓ Page {current_page}: Added {len(page_text)} characters of text")

	    except Exception as page_error:
	        safe_print_stderr(f"✗ Error processing page {current_page}: {page_error}")
	        traceback.print_exc(file=sys.stderr)

	# Delete the rendered page images; the list is empty for image input.
	if temp_files:
	    safe_print_stderr("Cleaning up temporary files...")
	    cleanup_temp_files(temp_files)

	safe_print_stderr(f"=== OCR Complete: {pages_processed}/{total_pages} pages processed ===")
	safe_print_stderr(f"Total text length: {len(extracted_text)} characters")

	# The JSON document on stdout is the script's result; everything written
	# before it went to stderr.
	ocr_method = "direct_image"
	if is_pdf:
	    ocr_method = "pdf_to_images"

	result_data = dict(
	    success=True,
	    text=extracted_text,
	    total_pages=total_pages,
	    pages_processed=pages_processed,
	    method=ocr_method,
	)
	safe_print_json(result_data)

	except Exception as e:
	# Clean up on error
	if temp_files:
	try:
	cleanup_temp_files(temp_files)
	except:
	pass

	safe_print_stderr(f"=== FATAL ERROR ===")
	safe_print_stderr(f"Error: {e}")
	traceback.print_exc(file=sys.stderr)

	error_data = {
	"success": False,
	"error": str(e)
	}
	safe_print_json(error_data)
	sys.exit(1)