Spaces:
Sleeping
Sleeping
PDF to image conversion fix
Browse files- paddle_ocr_standalone.py +92 -30
paddle_ocr_standalone.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
-
# paddle_ocr_standalone.py -
|
| 3 |
|
| 4 |
import sys
|
| 5 |
import os
|
| 6 |
import json
|
|
|
|
| 7 |
|
| 8 |
# Apply monkey patch for PyMuPDF compatibility BEFORE importing anything
|
| 9 |
import fitz # PyMuPDF for PDF page counting
|
|
@@ -26,6 +27,48 @@ if not hasattr(fitz.Page, 'getText'):
|
|
| 26 |
# NOW import PaddleOCR after applying the patches
|
| 27 |
from paddleocr import PaddleOCR
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
# Check if file path was provided
|
| 30 |
if len(sys.argv) < 2:
|
| 31 |
result = {"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"}
|
|
@@ -33,71 +76,90 @@ if len(sys.argv) < 2:
|
|
| 33 |
sys.exit(1)
|
| 34 |
|
| 35 |
file_path = sys.argv[1]
|
|
|
|
| 36 |
|
| 37 |
try:
|
| 38 |
# Print progress to stderr (like your local implementation)
|
| 39 |
print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
|
| 40 |
|
| 41 |
# Initialize PaddleOCR - exactly like your local implementation
|
|
|
|
| 42 |
ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
|
| 43 |
print("PaddleOCR initialized successfully", file=sys.stderr)
|
| 44 |
|
| 45 |
-
#
|
| 46 |
-
|
| 47 |
-
try:
|
| 48 |
-
if file_path.lower().endswith('.pdf'):
|
| 49 |
-
doc = fitz.open(file_path)
|
| 50 |
-
page_count = len(doc)
|
| 51 |
-
doc.close()
|
| 52 |
-
return page_count
|
| 53 |
-
else:
|
| 54 |
-
return 1 # Images are considered as 1 page
|
| 55 |
-
except Exception as e:
|
| 56 |
-
print(f"Error counting pages: {e}", file=sys.stderr)
|
| 57 |
-
return 1 # Default to 1 if we can't determine
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
-
|
| 64 |
-
print(f"Running OCR on file: {file_path}", file=sys.stderr)
|
| 65 |
-
result = ocr.ocr(file_path, cls=True)
|
| 66 |
-
print("OCR processing completed", file=sys.stderr)
|
| 67 |
|
| 68 |
-
#
|
| 69 |
extracted_text = ""
|
| 70 |
pages_processed = 0
|
| 71 |
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
current_page = page_idx + 1
|
| 76 |
print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
if
|
| 79 |
pages_processed += 1
|
| 80 |
page_text = ""
|
| 81 |
-
|
| 82 |
-
|
|
|
|
| 83 |
page_text += line[1][0] + "\n"
|
| 84 |
|
| 85 |
if page_text.strip():
|
| 86 |
extracted_text += f"\n--- Page {current_page} ---\n"
|
| 87 |
extracted_text += page_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
# Output the final result as JSON to stdout
|
| 90 |
result_data = {
|
| 91 |
"success": True,
|
| 92 |
"text": extracted_text,
|
| 93 |
"total_pages": total_pages,
|
| 94 |
-
"pages_processed": pages_processed
|
|
|
|
| 95 |
}
|
| 96 |
|
| 97 |
print(json.dumps(result_data))
|
| 98 |
print(f"Successfully processed {pages_processed}/{total_pages} pages", file=sys.stderr)
|
| 99 |
|
| 100 |
except Exception as e:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
print(f"Error during OCR processing: {e}", file=sys.stderr)
|
| 102 |
import traceback
|
| 103 |
traceback.print_exc(file=sys.stderr)
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
+
# paddle_ocr_standalone.py - Fixed version with PDF to image conversion
|
| 3 |
|
| 4 |
import sys
|
| 5 |
import os
|
| 6 |
import json
|
| 7 |
+
import tempfile
|
| 8 |
|
| 9 |
# Apply monkey patch for PyMuPDF compatibility BEFORE importing anything
|
| 10 |
import fitz # PyMuPDF for PDF page counting
|
|
|
|
| 27 |
# NOW import PaddleOCR after applying the patches
|
| 28 |
from paddleocr import PaddleOCR
|
| 29 |
|
| 30 |
+
def pdf_to_images(pdf_path, dpi=200):
    """Render every page of a PDF to its own temporary PNG file.

    The OCR step is fed one image per page, so each page is rasterised with
    PyMuPDF and written to a uniquely named temp file.

    Args:
        pdf_path: Path of the PDF document to render.
        dpi: Render resolution. PDF user space is 72 units per inch, so the
            zoom factor applied to each page is ``dpi / 72``.

    Returns:
        A list of PNG file paths in page order. The caller owns these files
        and must delete them when done. An empty list is returned if
        anything fails; images written before the failure are removed first
        so that nothing is left behind.
    """
    image_paths = []
    try:
        doc = fitz.open(pdf_path)
        try:
            # The zoom matrix is identical for every page, so build it once.
            zoom = dpi / 72
            mat = fitz.Matrix(zoom, zoom)

            # Index-based access is kept on purpose: it is what the original
            # code relied on and works across old and new PyMuPDF releases.
            for page_num in range(len(doc)):
                page = doc[page_num]

                # Prefer the current snake_case API and fall back to the
                # legacy camelCase name on old PyMuPDF releases.
                render = getattr(page, 'get_pixmap', None) or page.getPixmap
                pix = render(matrix=mat)

                # mkstemp gives a unique, private file in the platform's temp
                # directory instead of a predictable name under /tmp.
                fd, temp_img_path = tempfile.mkstemp(
                    prefix=f"ocr_page_{page_num}_", suffix=".png"
                )
                os.close(fd)
                # Record the path before saving so a failed save is still
                # cleaned up by the error handler below.
                image_paths.append(temp_img_path)
                pix.save(temp_img_path)

                print(f"Converted page {page_num + 1} to: {temp_img_path}", file=sys.stderr)
        finally:
            # Release the document handle even when rendering fails.
            doc.close()

        return image_paths

    except Exception as e:
        print(f"Error converting PDF to images: {e}", file=sys.stderr)
        # The caller only receives [], so remove partial output here.
        for leftover in image_paths:
            try:
                os.unlink(leftover)
            except OSError as cleanup_error:
                print(f"Warning: Could not clean up {leftover}: {cleanup_error}", file=sys.stderr)
        return []
|
| 61 |
+
|
| 62 |
+
def cleanup_temp_files(file_paths):
    """Delete the given temporary files, reporting each outcome on stderr.

    Paths that do not exist are skipped silently. A failure on one path is
    logged as a warning and does not stop the remaining paths from being
    processed.

    Args:
        file_paths: Iterable of file paths to remove.
    """
    for path in file_paths:
        try:
            # Nothing to do (and nothing to log) for paths already gone.
            if not os.path.exists(path):
                continue
            os.unlink(path)
            print(f"Cleaned up: {path}", file=sys.stderr)
        except Exception as exc:
            print(f"Warning: Could not clean up {path}: {exc}", file=sys.stderr)
|
| 71 |
+
|
| 72 |
# Check if file path was provided
|
| 73 |
if len(sys.argv) < 2:
|
| 74 |
result = {"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"}
|
|
|
|
| 76 |
sys.exit(1)
|
| 77 |
|
| 78 |
file_path = sys.argv[1]
|
| 79 |
+
temp_files = []
|
| 80 |
|
| 81 |
try:
|
| 82 |
# Print progress to stderr (like your local implementation)
|
| 83 |
print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
|
| 84 |
|
| 85 |
# Initialize PaddleOCR - exactly like your local implementation
|
| 86 |
+
# NOTE: no stdout redirection happens here; show_log=False is the only measure taken to keep PaddleOCR output out of the JSON on stdout
|
| 87 |
ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
|
| 88 |
print("PaddleOCR initialized successfully", file=sys.stderr)
|
| 89 |
|
| 90 |
+
# Check if it's a PDF or image
|
| 91 |
+
is_pdf = file_path.lower().endswith('.pdf')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
+
if is_pdf:
|
| 94 |
+
print("Converting PDF to images for OCR processing...", file=sys.stderr)
|
| 95 |
+
image_paths = pdf_to_images(file_path)
|
| 96 |
+
temp_files = image_paths
|
| 97 |
+
|
| 98 |
+
if not image_paths:
|
| 99 |
+
raise Exception("Failed to convert PDF to images")
|
| 100 |
+
|
| 101 |
+
total_pages = len(image_paths)
|
| 102 |
+
else:
|
| 103 |
+
# For image files, use directly
|
| 104 |
+
image_paths = [file_path]
|
| 105 |
+
total_pages = 1
|
| 106 |
|
| 107 |
+
print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr)
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
+
# Process each image with OCR
|
| 110 |
extracted_text = ""
|
| 111 |
pages_processed = 0
|
| 112 |
|
| 113 |
+
for i, img_path in enumerate(image_paths):
|
| 114 |
+
try:
|
| 115 |
+
current_page = i + 1
|
|
|
|
| 116 |
print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
|
| 117 |
+
print(f"Processing image: {img_path}", file=sys.stderr)
|
| 118 |
+
|
| 119 |
+
# Run OCR on the image
|
| 120 |
+
result = ocr.ocr(img_path, cls=True)
|
| 121 |
|
| 122 |
+
if result and result[0]: # result is a list of pages, we have one page per image
|
| 123 |
pages_processed += 1
|
| 124 |
page_text = ""
|
| 125 |
+
|
| 126 |
+
for line in result[0]:
|
| 127 |
+
if len(line) >= 2 and line[1][1] > 0.5: # confidence threshold
|
| 128 |
page_text += line[1][0] + "\n"
|
| 129 |
|
| 130 |
if page_text.strip():
|
| 131 |
extracted_text += f"\n--- Page {current_page} ---\n"
|
| 132 |
extracted_text += page_text
|
| 133 |
+
|
| 134 |
+
print(f"Page {current_page} processed successfully", file=sys.stderr)
|
| 135 |
+
else:
|
| 136 |
+
print(f"No text found on page {current_page}", file=sys.stderr)
|
| 137 |
+
|
| 138 |
+
except Exception as page_error:
|
| 139 |
+
print(f"Error processing page {current_page}: {page_error}", file=sys.stderr)
|
| 140 |
+
continue
|
| 141 |
+
|
| 142 |
+
# Clean up temporary files
|
| 143 |
+
if temp_files:
|
| 144 |
+
cleanup_temp_files(temp_files)
|
| 145 |
|
| 146 |
# Output the final result as JSON to stdout
|
| 147 |
result_data = {
|
| 148 |
"success": True,
|
| 149 |
"text": extracted_text,
|
| 150 |
"total_pages": total_pages,
|
| 151 |
+
"pages_processed": pages_processed,
|
| 152 |
+
"method": "pdf_to_images" if is_pdf else "direct_image"
|
| 153 |
}
|
| 154 |
|
| 155 |
print(json.dumps(result_data))
|
| 156 |
print(f"Successfully processed {pages_processed}/{total_pages} pages", file=sys.stderr)
|
| 157 |
|
| 158 |
except Exception as e:
|
| 159 |
+
# Clean up on error
|
| 160 |
+
if temp_files:
|
| 161 |
+
cleanup_temp_files(temp_files)
|
| 162 |
+
|
| 163 |
print(f"Error during OCR processing: {e}", file=sys.stderr)
|
| 164 |
import traceback
|
| 165 |
traceback.print_exc(file=sys.stderr)
|