Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

paddleocr-processor / archive /minimal_test_paddle.py

mbuckle

Enhanced paddle test

78b142a 9 months ago

raw

history blame contribute delete

3.5 kB

	#!/usr/bin/env python3
	# minimal_test_paddle.py - Minimal test to isolate the OCR issue

	import sys
	import os
	import json
	import fitz
	from paddleocr import PaddleOCR

	def _render_first_page(pdf_path, image_path, dpi=150):
	    """Render page 0 of *pdf_path* to a PNG file at *image_path*.

	    Progress messages go to stderr. The document is always closed, even
	    when rendering or saving fails.

	    Args:
	        pdf_path: Path of the PDF to open with ``fitz``.
	        image_path: Destination path for the rendered PNG.
	        dpi: Target resolution; the page is scaled by ``dpi / 72``.

	    Returns:
	        A ``(width, height)`` tuple with the pixmap size in pixels.
	    """
	    doc = fitz.open(pdf_path)
	    try:
	        print(f"PDF has {len(doc)} pages", file=sys.stderr)

	        print("Converting first page to image...", file=sys.stderr)
	        zoom = dpi / 72
	        pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
	        pix.save(image_path)
	        return pix.width, pix.height
	    finally:
	        # Release the document on every path, not only on success.
	        doc.close()


	def _detection_text_and_conf(detection):
	    """Return ``(text, confidence)`` for one OCR detection entry.

	    ``detection[1]`` is either a ``(text, confidence)`` sequence or a bare
	    value. The text is always converted to ``str``; the confidence falls
	    back to ``1.0`` when the entry does not carry one.
	    """
	    payload = detection[1]
	    if isinstance(payload, (list, tuple)):
	        confidence = payload[1] if len(payload) > 1 else 1.0
	        return str(payload[0]), confidence
	    return str(payload), 1.0


	def test_ocr():
	    """Run a minimal PDF -> image -> OCR round trip on ``sys.argv[1]``.

	    Diagnostics are written to stderr; exactly one JSON object is written
	    to stdout on every exit path:

	    * ``{"error": ...}`` when no file path argument was given,
	    * ``{"success": True, "text": ..., "detections": ...}`` on success,
	    * ``{"success": False, "error": ...}`` on any failure.

	    The rendered page image lives in a private temporary directory that is
	    removed when the function finishes, whether OCR succeeded or raised.
	    """
	    if len(sys.argv) < 2:
	        print(json.dumps({"error": "No file path provided"}))
	        return

	    # Function-scope imports keep this block independent of the file header.
	    import tempfile
	    import traceback

	    file_path = sys.argv[1]

	    try:
	        print(f"Testing OCR on: {file_path}", file=sys.stderr)

	        # A per-run directory avoids clashes on a fixed /tmp name and is
	        # cleaned up automatically, including when OCR raises.
	        with tempfile.TemporaryDirectory() as work_dir:
	            temp_img = os.path.join(work_dir, "test_page.png")

	            # Test 1 + 2: open the PDF and convert the first page to an image
	            print("Opening PDF...", file=sys.stderr)
	            width, height = _render_first_page(file_path, temp_img)

	            if not os.path.exists(temp_img):
	                print("Failed to create image", file=sys.stderr)
	                # Still emit JSON so stdout consumers always get an object.
	                print(json.dumps({"success": False, "error": "Failed to create image"}))
	                return

	            img_size = os.path.getsize(temp_img)
	            print(f"Image created: {temp_img} (size: {img_size} bytes, {width}x{height})", file=sys.stderr)

	            # Test 3: Initialize OCR
	            print("Initializing OCR...", file=sys.stderr)
	            ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
	            print("OCR initialized", file=sys.stderr)

	            # Test 4: Run OCR on the image (must happen while the file exists)
	            print("Running OCR...", file=sys.stderr)
	            result = ocr.ocr(temp_img, cls=True)

	        print(f"OCR result type: {type(result)}", file=sys.stderr)

	        texts = []
	        first_page = result[0] if result else None
	        if result:
	            print(f"Result length: {len(result)}", file=sys.stderr)
	            if first_page:
	                print(f"First page has {len(first_page)} detections", file=sys.stderr)

	                # Log every detection and collect its text in one pass.
	                for i, detection in enumerate(first_page):
	                    if len(detection) >= 2:
	                        text, conf = _detection_text_and_conf(detection)
	                        texts.append(text)
	                        print(f"Detection {i}: '{text}' (confidence: {conf})", file=sys.stderr)
	            else:
	                print("First page result is empty", file=sys.stderr)
	        else:
	            print("OCR returned None", file=sys.stderr)

	        # Return simple result: one detected text per line.
	        print(json.dumps({
	            "success": True,
	            "text": "".join(f"{text}\n" for text in texts),
	            # Counts every entry on the page, including ones without text.
	            "detections": len(first_page) if first_page else 0,
	        }))

	    except Exception as e:
	        # Top-level boundary of a diagnostic script: report and exit cleanly.
	        print(f"Error: {e}", file=sys.stderr)
	        traceback.print_exc(file=sys.stderr)
	        print(json.dumps({"success": False, "error": str(e)}))

	# Script entry point: run the diagnostic only when executed directly,
	# never as a side effect of importing this module.
	if __name__ == "__main__":
	    test_ocr()