mbuckle committed on
Commit
6dc8e9d
·
1 Parent(s): b7aa35b

Standalone script attempt #2

Browse files
Files changed (1) hide show
  1. paddle_ocr_standalone.py +35 -18
paddle_ocr_standalone.py CHANGED
@@ -4,10 +4,10 @@
4
  import sys
5
  import os
6
  import json
7
- from paddleocr import PaddleOCR
 
8
  import fitz # PyMuPDF for PDF page counting
9
 
10
- # Apply monkey patch for PyMuPDF compatibility
11
  if not hasattr(fitz.Document, 'pageCount'):
12
  def pageCount_property(self):
13
  return self.page_count
@@ -23,16 +23,24 @@ if not hasattr(fitz.Page, 'getText'):
23
  return self.get_text(option)
24
  fitz.Page.getText = getText
25
 
 
 
 
26
  # Check if file path was provided
27
  if len(sys.argv) < 2:
28
- print(json.dumps({"error": "Usage: python paddle_ocr_standalone.py <file_path>"}))
 
29
  sys.exit(1)
30
 
31
  file_path = sys.argv[1]
32
 
33
  try:
 
 
 
34
  # Initialize PaddleOCR - exactly like your local implementation
35
  ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
 
36
 
37
  # Count total pages if it's a PDF
38
  def count_pdf_pages(file_path):
@@ -44,7 +52,8 @@ try:
44
  return page_count
45
  else:
46
  return 1 # Images are considered as 1 page
47
- except:
 
48
  return 1 # Default to 1 if we can't determine
49
 
50
  # Get total pages
@@ -52,27 +61,30 @@ try:
52
  print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr)
53
 
54
  # Process the file - exactly like your local implementation
 
55
  result = ocr.ocr(file_path, cls=True)
 
56
 
57
  # Extract text and output results
58
  extracted_text = ""
59
  pages_processed = 0
60
 
61
- # Print recognized text with page information
62
- for page_idx, page_result in enumerate(result):
63
- current_page = page_idx + 1
64
- print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
65
-
66
- if page_result:
67
- pages_processed += 1
68
- page_text = ""
69
- for line in page_result:
70
- if len(line) >= 2:
71
- page_text += line[1][0] + "\n"
72
 
73
- if page_text.strip():
74
- extracted_text += f"\n--- Page {current_page} ---\n"
75
- extracted_text += page_text
 
 
 
 
 
 
 
76
 
77
  # Output the final result as JSON to stdout
78
  result_data = {
@@ -83,8 +95,13 @@ try:
83
  }
84
 
85
  print(json.dumps(result_data))
 
86
 
87
  except Exception as e:
 
 
 
 
88
  error_data = {
89
  "success": False,
90
  "error": str(e)
 
4
  import sys
5
  import os
6
  import json
7
+
8
+ # Apply monkey patches for PyMuPDF compatibility BEFORE importing PaddleOCR
9
  import fitz # PyMuPDF for PDF page counting
10
 
 
11
  if not hasattr(fitz.Document, 'pageCount'):
12
  def pageCount_property(self):
13
  return self.page_count
 
23
  return self.get_text(option)
24
  fitz.Page.getText = getText
25
 
26
+ # NOW import PaddleOCR after applying the patches
27
+ from paddleocr import PaddleOCR
28
+
29
  # Check if file path was provided
30
  if len(sys.argv) < 2:
31
+ result = {"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"}
32
+ print(json.dumps(result))
33
  sys.exit(1)
34
 
35
  file_path = sys.argv[1]
36
 
37
  try:
38
+ # Print progress to stderr (like your local implementation)
39
+ print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
40
+
41
  # Initialize PaddleOCR - exactly like your local implementation
42
  ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
43
+ print("PaddleOCR initialized successfully", file=sys.stderr)
44
 
45
  # Count total pages if it's a PDF
46
  def count_pdf_pages(file_path):
 
52
  return page_count
53
  else:
54
  return 1 # Images are considered as 1 page
55
+ except Exception as e:
56
+ print(f"Error counting pages: {e}", file=sys.stderr)
57
  return 1 # Default to 1 if we can't determine
58
 
59
  # Get total pages
 
61
  print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr)
62
 
63
  # Process the file - exactly like your local implementation
64
+ print(f"Running OCR on file: {file_path}", file=sys.stderr)
65
  result = ocr.ocr(file_path, cls=True)
66
+ print("OCR processing completed", file=sys.stderr)
67
 
68
  # Extract text and output results
69
  extracted_text = ""
70
  pages_processed = 0
71
 
72
+ if result:
73
+ # Print recognized text with page information
74
+ for page_idx, page_result in enumerate(result):
75
+ current_page = page_idx + 1
76
+ print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
 
 
 
 
 
 
77
 
78
+ if page_result:
79
+ pages_processed += 1
80
+ page_text = ""
81
+ for line in page_result:
82
+ if len(line) >= 2:
83
+ page_text += line[1][0] + "\n"
84
+
85
+ if page_text.strip():
86
+ extracted_text += f"\n--- Page {current_page} ---\n"
87
+ extracted_text += page_text
88
 
89
  # Output the final result as JSON to stdout
90
  result_data = {
 
95
  }
96
 
97
  print(json.dumps(result_data))
98
+ print(f"Successfully processed {pages_processed}/{total_pages} pages", file=sys.stderr)
99
 
100
  except Exception as e:
101
+ print(f"Error during OCR processing: {e}", file=sys.stderr)
102
+ import traceback
103
+ traceback.print_exc(file=sys.stderr)
104
+
105
  error_data = {
106
  "success": False,
107
  "error": str(e)