Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

mbuckle committed on Jun 3, 2025

Commit

6dc8e9d

1 Parent(s): b7aa35b

Standalone script attempt #2

Browse files

Files changed (1) hide show

paddle_ocr_standalone.py +35 -18

paddle_ocr_standalone.py CHANGED Viewed

@@ -4,10 +4,10 @@
 import sys
 import os
 import json
-from paddleocr import PaddleOCR
 import fitz  # PyMuPDF for PDF page counting
-# Apply monkey patch for PyMuPDF compatibility
 if not hasattr(fitz.Document, 'pageCount'):
     def pageCount_property(self):
         return self.page_count
@@ -23,16 +23,24 @@ if not hasattr(fitz.Page, 'getText'):
         return self.get_text(option)
     fitz.Page.getText = getText
 # Check if file path was provided
 if len(sys.argv) < 2:
-    print(json.dumps({"error": "Usage: python paddle_ocr_standalone.py <file_path>"}))
     sys.exit(1)
 file_path = sys.argv[1]
 try:
     # Initialize PaddleOCR - exactly like your local implementation
     ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
     # Count total pages if it's a PDF
     def count_pdf_pages(file_path):
@@ -44,7 +52,8 @@ try:
                 return page_count
             else:
                 return 1  # Images are considered as 1 page
-        except:
             return 1  # Default to 1 if we can't determine
     # Get total pages
@@ -52,27 +61,30 @@ try:
     print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr)
     # Process the file - exactly like your local implementation
     result = ocr.ocr(file_path, cls=True)
     # Extract text and output results
     extracted_text = ""
     pages_processed = 0
-    # Print recognized text with page information
-    for page_idx, page_result in enumerate(result):
-        current_page = page_idx + 1
-        print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
-        if page_result:
-            pages_processed += 1
-            page_text = ""
-            for line in page_result:
-                if len(line) >= 2:
-                    page_text += line[1][0] + "\n"
-            if page_text.strip():
-                extracted_text += f"\n--- Page {current_page} ---\n"
-                extracted_text += page_text
     # Output the final result as JSON to stdout
     result_data = {
@@ -83,8 +95,13 @@ try:
     }
     print(json.dumps(result_data))
 except Exception as e:
     error_data = {
         "success": False,
         "error": str(e)

 import sys
 import os
 import json
+# Apply monkey patch for PyMuPDF compatibility BEFORE importing PaddleOCR
 import fitz  # PyMuPDF for PDF page counting
 if not hasattr(fitz.Document, 'pageCount'):
     def pageCount_property(self):
         return self.page_count
         return self.get_text(option)
     fitz.Page.getText = getText
+# NOW import PaddleOCR after applying the patches
+from paddleocr import PaddleOCR
 # Check if file path was provided
 if len(sys.argv) < 2:
+    result = {"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"}
+    print(json.dumps(result))
     sys.exit(1)
 file_path = sys.argv[1]
 try:
+    # Print progress to stderr (like your local implementation)
+    print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
     # Initialize PaddleOCR - exactly like your local implementation
     ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
+    print("PaddleOCR initialized successfully", file=sys.stderr)
     # Count total pages if it's a PDF
     def count_pdf_pages(file_path):
                 return page_count
             else:
                 return 1  # Images are considered as 1 page
+        except Exception as e:
+            print(f"Error counting pages: {e}", file=sys.stderr)
             return 1  # Default to 1 if we can't determine
     # Get total pages
     print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr)
     # Process the file - exactly like your local implementation
+    print(f"Running OCR on file: {file_path}", file=sys.stderr)
     result = ocr.ocr(file_path, cls=True)
+    print("OCR processing completed", file=sys.stderr)
     # Extract text and output results
     extracted_text = ""
     pages_processed = 0
+    if result:
+        # Print recognized text with page information
+        for page_idx, page_result in enumerate(result):
+            current_page = page_idx + 1
+            print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
+            if page_result:
+                pages_processed += 1
+                page_text = ""
+                for line in page_result:
+                    if len(line) >= 2:
+                        page_text += line[1][0] + "\n"
+                if page_text.strip():
+                    extracted_text += f"\n--- Page {current_page} ---\n"
+                    extracted_text += page_text
     # Output the final result as JSON to stdout
     result_data = {
     }
     print(json.dumps(result_data))
+    print(f"Successfully processed {pages_processed}/{total_pages} pages", file=sys.stderr)
 except Exception as e:
+    print(f"Error during OCR processing: {e}", file=sys.stderr)
+    import traceback
+    traceback.print_exc(file=sys.stderr)
     error_data = {
         "success": False,
         "error": str(e)