Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

mbuckle committed on Jun 3, 2025

Commit

5745364

1 Parent(s): 6dc8e9d

PDF to image conversion fix

Browse files

Files changed (1) hide show

paddle_ocr_standalone.py +92 -30

paddle_ocr_standalone.py CHANGED Viewed

@@ -1,9 +1,10 @@
 #!/usr/bin/env python3
-# paddle_ocr_standalone.py - Standalone script that mirrors your local implementation
 import sys
 import os
 import json
 # Apply monkey patch for PyMuPDF compatibility BEFORE importing anything
 import fitz  # PyMuPDF for PDF page counting
@@ -26,6 +27,48 @@ if not hasattr(fitz.Page, 'getText'):
 # NOW import PaddleOCR after applying the patches
 from paddleocr import PaddleOCR
 # Check if file path was provided
 if len(sys.argv) < 2:
     result = {"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"}
@@ -33,71 +76,90 @@ if len(sys.argv) < 2:
     sys.exit(1)
 file_path = sys.argv[1]
 try:
     # Print progress to stderr (like your local implementation)
     print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
     # Initialize PaddleOCR - exactly like your local implementation
     ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
     print("PaddleOCR initialized successfully", file=sys.stderr)
-    # Count total pages if it's a PDF
-    def count_pdf_pages(file_path):
-        try:
-            if file_path.lower().endswith('.pdf'):
-                doc = fitz.open(file_path)
-                page_count = len(doc)
-                doc.close()
-                return page_count
-            else:
-                return 1  # Images are considered as 1 page
-        except Exception as e:
-            print(f"Error counting pages: {e}", file=sys.stderr)
-            return 1  # Default to 1 if we can't determine
-    # Get total pages
-    total_pages = count_pdf_pages(file_path)
-    print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr)
-    # Process the file - exactly like your local implementation
-    print(f"Running OCR on file: {file_path}", file=sys.stderr)
-    result = ocr.ocr(file_path, cls=True)
-    print("OCR processing completed", file=sys.stderr)
-    # Extract text and output results
     extracted_text = ""
     pages_processed = 0
-    if result:
-        # Print recognized text with page information
-        for page_idx, page_result in enumerate(result):
-            current_page = page_idx + 1
             print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
-            if page_result:
                 pages_processed += 1
                 page_text = ""
-                for line in page_result:
-                    if len(line) >= 2:
                         page_text += line[1][0] + "\n"
                 if page_text.strip():
                     extracted_text += f"\n--- Page {current_page} ---\n"
                     extracted_text += page_text
     # Output the final result as JSON to stdout
     result_data = {
         "success": True,
         "text": extracted_text,
         "total_pages": total_pages,
-        "pages_processed": pages_processed
     }
     print(json.dumps(result_data))
     print(f"Successfully processed {pages_processed}/{total_pages} pages", file=sys.stderr)
 except Exception as e:
     print(f"Error during OCR processing: {e}", file=sys.stderr)
     import traceback
     traceback.print_exc(file=sys.stderr)

 #!/usr/bin/env python3
+# paddle_ocr_standalone.py - Fixed version with PDF to image conversion
 import sys
 import os
 import json
+import tempfile
 # Apply monkey patch for PyMuPDF compatibility BEFORE importing anything
 import fitz  # PyMuPDF for PDF page counting
 # NOW import PaddleOCR after applying the patches
 from paddleocr import PaddleOCR
def pdf_to_images(pdf_path, dpi=200):
    """Render every page of a PDF to a temporary PNG file for OCR.

    The caller owns the returned files and is responsible for deleting
    them once OCR has finished.

    Args:
        pdf_path: Path of the PDF document to rasterise.
        dpi: Render resolution. PDF user space is 72 units per inch, so
            each page is scaled by ``dpi / 72``.

    Returns:
        Paths of the generated PNG files in page order, or an empty list
        if the document could not be opened or any page failed to render.
        On failure, pages that were already written are deleted so that
        nothing is left behind for a caller that only sees ``[]``.
    """
    image_paths = []
    try:
        doc = fitz.open(pdf_path)
        try:
            # The scale factor is identical for every page; build it once.
            zoom = dpi / 72
            mat = fitz.Matrix(zoom, zoom)
            for page_num in range(len(doc)):
                page = doc[page_num]
                # Prefer the legacy camelCase method when the installed
                # PyMuPDF still exposes it, otherwise use the current name.
                if hasattr(page, 'getPixmap'):
                    pix = page.getPixmap(matrix=mat)
                else:
                    pix = page.get_pixmap(matrix=mat)
                # mkstemp picks the platform temp directory and creates a
                # uniquely named file atomically, instead of writing to a
                # predictable path under a hard-coded /tmp.
                fd, temp_img_path = tempfile.mkstemp(
                    prefix=f"ocr_page_{page_num}_{os.getpid()}_",
                    suffix=".png",
                )
                os.close(fd)
                # Record the path before saving so that a failed save is
                # still removed by the error handler below.
                image_paths.append(temp_img_path)
                pix.save(temp_img_path)
                print(f"Converted page {page_num + 1} to: {temp_img_path}", file=sys.stderr)
        finally:
            # Release the document handle even when a page fails to render.
            doc.close()
        return image_paths
    except Exception as e:
        print(f"Error converting PDF to images: {e}", file=sys.stderr)
        # The caller receives [] and therefore cannot clean up; remove any
        # pages that were written before the failure.
        for leftover in image_paths:
            try:
                os.unlink(leftover)
            except OSError as cleanup_error:
                print(f"Warning: Could not clean up {leftover}: {cleanup_error}", file=sys.stderr)
        return []
def cleanup_temp_files(file_paths):
    """Delete temporary image files on a best-effort basis.

    This function never raises: a path that cannot be removed is reported
    on stderr and the remaining paths are still processed.

    Args:
        file_paths: Iterable of file paths to delete. ``None`` or an empty
            iterable is a no-op.
    """
    for file_path in file_paths or ():
        try:
            # Unlink directly rather than checking for existence first; the
            # check-then-delete form stats twice and can race with another
            # process removing the file in between.
            os.unlink(file_path)
        except FileNotFoundError:
            # Already gone: nothing to do and nothing worth reporting.
            continue
        except Exception as e:
            # Deliberately broad: cleanup must not fail the surrounding run.
            print(f"Warning: Could not clean up {file_path}: {e}", file=sys.stderr)
        else:
            print(f"Cleaned up: {file_path}", file=sys.stderr)
 # Check if file path was provided
 if len(sys.argv) < 2:
     result = {"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"}
     sys.exit(1)
 file_path = sys.argv[1]
+temp_files = []
 try:
     # Print progress to stderr (like your local implementation)
     print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
     # Initialize PaddleOCR - exactly like your local implementation
+    # NOTE: nothing is redirected here; only show_log=False is passed, presumably to keep PaddleOCR's log output from polluting the JSON on stdout
     ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
     print("PaddleOCR initialized successfully", file=sys.stderr)
+    # Check if it's a PDF or image
+    is_pdf = file_path.lower().endswith('.pdf')
+    if is_pdf:
+        print("Converting PDF to images for OCR processing...", file=sys.stderr)
+        image_paths = pdf_to_images(file_path)
+        temp_files = image_paths
+        if not image_paths:
+            raise Exception("Failed to convert PDF to images")
+        total_pages = len(image_paths)
+    else:
+        # For image files, use directly
+        image_paths = [file_path]
+        total_pages = 1
+    print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr)
+    # Process each image with OCR
     extracted_text = ""
     pages_processed = 0
+    for i, img_path in enumerate(image_paths):
+        try:
+            current_page = i + 1
             print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
+            print(f"Processing image: {img_path}", file=sys.stderr)
+            # Run OCR on the image
+            result = ocr.ocr(img_path, cls=True)
+            if result and result[0]:  # result is a list of pages, we have one page per image
                 pages_processed += 1
                 page_text = ""
+                for line in result[0]:
+                    if len(line) >= 2 and line[1][1] > 0.5:  # confidence threshold
                         page_text += line[1][0] + "\n"
                 if page_text.strip():
                     extracted_text += f"\n--- Page {current_page} ---\n"
                     extracted_text += page_text
+                print(f"Page {current_page} processed successfully", file=sys.stderr)
+            else:
+                print(f"No text found on page {current_page}", file=sys.stderr)
+        except Exception as page_error:
+            print(f"Error processing page {current_page}: {page_error}", file=sys.stderr)
+            continue
+    # Clean up temporary files
+    if temp_files:
+        cleanup_temp_files(temp_files)
     # Output the final result as JSON to stdout
     result_data = {
         "success": True,
         "text": extracted_text,
         "total_pages": total_pages,
+        "pages_processed": pages_processed,
+        "method": "pdf_to_images" if is_pdf else "direct_image"
     }
     print(json.dumps(result_data))
     print(f"Successfully processed {pages_processed}/{total_pages} pages", file=sys.stderr)
 except Exception as e:
+    # Clean up on error
+    if temp_files:
+        cleanup_temp_files(temp_files)
     print(f"Error during OCR processing: {e}", file=sys.stderr)
     import traceback
     traceback.print_exc(file=sys.stderr)