mbuckle committed on
Commit
b7aa35b
·
1 Parent(s): 728c43f

Add standalone script

Browse files
Files changed (2) hide show
  1. app.py +102 -189
  2. paddle_ocr_standalone.py +93 -0
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py - Simple fix mirroring your local implementation
2
 
3
  import os
4
  import subprocess
@@ -8,143 +8,83 @@ import time
8
  import base64
9
  import json
10
 
11
- # SSL fix function (keep as is)
12
- def fix_ssl_library():
13
- """Download and install libssl1.1 if not present"""
 
 
14
  try:
15
- if os.path.exists('/usr/lib/x86_64-linux-gnu/libssl.so.1.1'):
16
- print("libssl.so.1.1 already exists")
17
- return True
18
-
19
- print("Attempting to install libssl1.1...")
20
 
21
- subprocess.run([
22
- 'wget', '-q',
23
- 'http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb',
24
- '-O', '/tmp/libssl1.1.deb'
25
- ], check=True)
26
 
27
- result = subprocess.run([
28
- 'dpkg', '-i', '/tmp/libssl1.1.deb'
29
- ], capture_output=True, text=True)
30
 
31
- if result.returncode != 0:
32
- print("dpkg install failed, trying manual extraction...")
33
- subprocess.run([
34
- 'dpkg', '-x', '/tmp/libssl1.1.deb', '/tmp/ssl_extract'
35
- ], check=True)
36
-
37
- lib_path = '/tmp/ssl_extract/usr/lib/x86_64-linux-gnu'
38
- current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
39
- if current_ld_path:
40
- os.environ['LD_LIBRARY_PATH'] = f"{lib_path}:{current_ld_path}"
41
- else:
42
- os.environ['LD_LIBRARY_PATH'] = lib_path
43
- print(f"Set LD_LIBRARY_PATH to: {os.environ['LD_LIBRARY_PATH']}")
44
 
45
- return True
 
 
 
 
 
 
 
46
 
47
- except Exception as e:
48
- print(f"Failed to install libssl1.1: {e}")
49
- return False
50
-
51
- def try_paddle_import():
52
- """Try different approaches to import PaddleOCR"""
53
-
54
- # First try the SSL fix
55
- fix_ssl_library()
56
-
57
- # Try importing with different environment variables
58
- os.environ['PADDLE_GIT_DISABLE'] = '1'
59
-
60
- try:
61
- from paddleocr import PaddleOCR
62
- return PaddleOCR
63
- except ImportError as e:
64
- if 'libssl.so.1.1' in str(e):
65
- print("Still having SSL issues, trying alternative PaddlePaddle version...")
66
 
67
- try:
68
- subprocess.run([sys.executable, '-m', 'pip', 'uninstall', 'paddlepaddle', '-y'],
69
- capture_output=True)
70
- subprocess.run([sys.executable, '-m', 'pip', 'install', 'paddlepaddle==2.4.2'],
71
- check=True)
72
- from paddleocr import PaddleOCR
73
- return PaddleOCR
74
- except Exception as inner_e:
75
- print(f"Failed to install alternative version: {inner_e}")
76
 
77
- print(f"PaddleOCR import failed: {e}")
78
- raise e
79
-
80
- # Import Gradio
81
- import gradio as gr
82
-
83
- # CRITICAL: Apply PyMuPDF compatibility patch BEFORE importing PaddleOCR
84
- print("Applying PyMuPDF compatibility patches...")
85
- import fitz
86
-
87
- # Add pageCount property to Document class if it doesn't exist
88
- if not hasattr(fitz.Document, 'pageCount'):
89
- def pageCount_property(self):
90
- return self.page_count
91
-
92
- fitz.Document.pageCount = property(pageCount_property)
93
- print("✓ Added pageCount compatibility property to PyMuPDF Document class")
94
- else:
95
- print(" pageCount already exists")
96
-
97
- # Add getPixmap method to Page class if it doesn't exist
98
- if not hasattr(fitz.Page, 'getPixmap'):
99
- def getPixmap(self, matrix=None, alpha=True):
100
- return self.get_pixmap(matrix=matrix, alpha=alpha)
101
-
102
- fitz.Page.getPixmap = getPixmap
103
- print("✓ Added getPixmap compatibility method to PyMuPDF Page class")
104
- else:
105
- print("✓ getPixmap already exists")
106
-
107
- # Add getText method if it doesn't exist
108
- if not hasattr(fitz.Page, 'getText'):
109
- def getText(self, option="text"):
110
- return self.get_text(option)
111
-
112
- fitz.Page.getText = getText
113
- print("✓ Added getText compatibility method to PyMuPDF Page class")
114
- else:
115
- print("✓ getText already exists")
116
-
117
- print("✓ PyMuPDF compatibility patches applied successfully")
118
-
119
- # NOW import PaddleOCR after the patches are applied
120
- print("Attempting to import PaddleOCR...")
121
- try:
122
- PaddleOCR = try_paddle_import()
123
- print("Loading PaddleOCR models...")
124
- # Use the same settings as your local implementation
125
- ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
126
- print("PaddleOCR models loaded successfully!")
127
- except Exception as e:
128
- print(f"Failed to load PaddleOCR: {e}")
129
- print("Application will exit - compatibility issue not resolved")
130
- sys.exit(1)
131
-
132
- def count_pdf_pages(file_path):
133
- """Count pages in PDF - mirrors your local implementation"""
134
- try:
135
- if file_path.lower().endswith('.pdf'):
136
- doc = fitz.open(file_path)
137
- page_count = len(doc)
138
- doc.close()
139
- return page_count
140
- else:
141
- return 1 # Images are considered as 1 page
142
  except Exception as e:
143
- print(f"Error counting PDF pages: {e}")
144
- return 1 # Default to 1 if we can't determine
 
 
 
145
 
146
  def process_document(file):
147
- """Process uploaded document with PaddleOCR - mirrors your local implementation"""
148
  if file is None:
149
  return "No file uploaded", "", ""
150
 
@@ -157,49 +97,27 @@ def process_document(file):
157
  file_path = file.name
158
  print(f"File path: {file_path}")
159
 
160
- # Count total pages - exactly like your local implementation
161
- total_pages = count_pdf_pages(file_path)
162
- print(f"Total pages detected: {total_pages}")
163
-
164
- # Run OCR directly on the file path - just like your local version
165
- print(f"Running OCR on: {file_path}")
166
-
167
- # This is the exact same call as in your paddle_ocr.py
168
- result = ocr.ocr(file_path, cls=True)
169
 
170
- # Extract text - same logic as your local implementation
171
- extracted_text = ""
172
- pages_processed = 0
173
 
174
- if result:
175
- for page_idx, page_result in enumerate(result):
176
- current_page = page_idx + 1
177
- print(f"Processing page {current_page} of {total_pages}")
178
-
179
- if page_result:
180
- pages_processed += 1
181
- page_text = ""
182
-
183
- for line in page_result:
184
- if len(line) >= 2:
185
- # Add confidence check like your local version might have
186
- confidence = line[1][1] if len(line[1]) > 1 else 1.0
187
- if confidence > 0.5: # Only include high-confidence text
188
- page_text += line[1][0] + "\n"
189
-
190
- if page_text.strip():
191
- extracted_text += f"\n--- Page {current_page} ---\n"
192
- extracted_text += page_text
193
 
194
  processing_time = time.time() - start_time
195
- print(f"Completed processing {total_pages} pages in {processing_time:.2f}s")
196
 
197
  summary = f"""
198
  📄 **File**: {filename}
199
  📊 **Pages Processed**: {pages_processed}/{total_pages}
200
  ⏱️ **Processing Time**: {processing_time:.2f} seconds
201
  📝 **Text Length**: {len(extracted_text)} characters
202
- 🔧 **OCR Engine**: PaddleOCR (Direct PDF Processing)
 
203
  """
204
 
205
  api_response = json.dumps({
@@ -209,13 +127,14 @@ def process_document(file):
209
  "pages_processed": pages_processed,
210
  "total_pages": total_pages,
211
  "processing_time": processing_time,
212
- "ocr_engine": "PaddleOCR"
 
213
  }, indent=2)
214
 
215
  return summary, extracted_text, api_response
216
 
217
  except Exception as e:
218
- error_msg = f"Error processing file: {str(e)}"
219
  print(f"Full error: {e}")
220
  import traceback
221
  traceback.print_exc()
@@ -239,32 +158,21 @@ def process_api_request(api_data):
239
  tmp_file_path = tmp_file.name
240
 
241
  try:
242
- # Count pages
243
- total_pages = count_pdf_pages(tmp_file_path)
244
 
245
- # Run OCR - same as your local implementation
246
- result = ocr.ocr(tmp_file_path, cls=True)
247
-
248
- # Extract text
249
- text = ""
250
- pages_processed = 0
251
-
252
- if result:
253
- for page_idx, page_result in enumerate(result):
254
- if page_result:
255
- pages_processed += 1
256
- for line in page_result:
257
- if len(line) >= 2:
258
- text += line[1][0] + "\n"
259
-
260
- return json.dumps({
261
- "success": True,
262
- "text": text,
263
- "filename": filename,
264
- "pages_processed": pages_processed,
265
- "total_pages": total_pages,
266
- "ocr_engine": "PaddleOCR"
267
- })
268
 
269
  finally:
270
  os.unlink(tmp_file_path)
@@ -331,7 +239,8 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
331
  "success": true,
332
  "text": "Extracted text content...",
333
  "filename": "lab_report.pdf",
334
- "ocr_engine": "PaddleOCR"
 
335
  }
336
  ]
337
  }
@@ -370,7 +279,7 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
370
 
371
  ### 🚀 Features
372
  - High accuracy OCR with PaddleOCR
373
- - Direct PDF processing (like your local implementation)
374
  - Medical document optimization
375
  - Multi-page PDF support
376
  - RESTful API integration
@@ -378,6 +287,10 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
378
 
379
  ### 🔗 Integration URL
380
  `https://mbuck17-paddleocr-processor.hf.space/api/predict`
 
 
 
 
381
  """)
382
 
383
  # Launch the app
 
1
+ # app.py - Using subprocess approach like your local Node.js implementation
2
 
3
  import os
4
  import subprocess
 
8
  import base64
9
  import json
10
 
11
+ # Import Gradio
12
+ import gradio as gr
13
+
14
def _progress_value(line, prefix):
    """Return the integer that follows *prefix* in *line*, or None.

    None is returned both when the line does not start with *prefix* and
    when the remainder is not a valid integer, so a malformed progress line
    is simply ignored instead of aborting the whole OCR run.
    """
    if not line.startswith(prefix):
        return None
    try:
        return int(line[len(prefix):].strip())
    except ValueError:
        return None


def _parse_ocr_payload(stdout):
    """Decode the JSON document the OCR script wrote to stdout.

    The whole output is tried first. If that fails and the output has
    several lines, the last line is tried on its own, because the script
    prints its JSON result last and anything before it is stray output.

    Raises:
        json.JSONDecodeError: if neither attempt yields valid JSON.
    """
    text = stdout.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        lines = text.splitlines()
        if len(lines) < 2:
            raise
        return json.loads(lines[-1])


def run_paddle_ocr_subprocess(file_path):
    """Run paddle_ocr_standalone.py on *file_path* and return its result.

    The script is started with the current interpreter. Its stderr is read
    line by line for ``TOTAL_PAGES:<n>`` / ``CURRENT_PAGE:<n>`` progress
    markers, which are echoed to this process's stdout. Its stdout is
    expected to hold a single JSON document.

    Args:
        file_path: Path of the document to pass to the OCR script.

    Returns:
        The dict decoded from the script's JSON output on success, or
        ``{"success": False, "error": <message>}`` when the script exits
        with a non-zero status, its output cannot be decoded, or any other
        exception occurs. This function does not raise.
    """
    # Local import: only part of the file's import list is visible here.
    import tempfile

    try:
        # The standalone OCR script lives next to this file.
        script_path = os.path.join(os.path.dirname(__file__), 'paddle_ocr_standalone.py')
        command = [sys.executable, script_path, file_path]

        print(f"Running command: {' '.join(command)}")

        total_pages = 1
        stderr_lines = []

        # stdout is sent to a temporary file rather than a pipe. With two
        # pipes, reading stderr to EOF while nobody drains stdout deadlocks
        # as soon as the child's JSON result exceeds the pipe buffer.
        with tempfile.TemporaryFile(mode='w+') as stdout_file:
            with subprocess.Popen(
                command,
                stdout=stdout_file,
                stderr=subprocess.PIPE,
                text=True,
                bufsize=1,
            ) as process:
                for stderr_line in process.stderr:
                    stderr_lines.append(stderr_line)

                    pages = _progress_value(stderr_line, 'TOTAL_PAGES:')
                    if pages is not None:
                        total_pages = pages
                        print(f"Processing document with {total_pages} pages")
                        continue

                    page = _progress_value(stderr_line, 'CURRENT_PAGE:')
                    if page is not None:
                        print(f"Processing page {page} of {total_pages}")

                return_code = process.wait()

            # The child shares the file offset, so rewind before reading.
            stdout_file.seek(0)
            stdout = stdout_file.read()

        stderr_output = "".join(stderr_lines)

        if return_code != 0:
            print(f"OCR process failed with return code {return_code}")
            print(f"stderr: {stderr_output}")

            # The script reports its own failures as JSON on stdout; prefer
            # that message over raw stderr when it is available.
            child_error = None
            try:
                payload = _parse_ocr_payload(stdout)
            except json.JSONDecodeError:
                payload = None
            if isinstance(payload, dict) and payload.get("error"):
                child_error = str(payload["error"])

            return {
                "success": False,
                "error": f"OCR process failed: {child_error or stderr_output}"
            }

        try:
            result = _parse_ocr_payload(stdout)
        except json.JSONDecodeError as e:
            print(f"Failed to parse OCR result: {e}")
            print(f"stdout: {stdout}")
            return {
                "success": False,
                "error": f"Failed to parse OCR result: {str(e)}"
            }

        if not isinstance(result, dict):
            # Callers use result.get(...); anything but an object is unusable.
            return {
                "success": False,
                "error": f"Failed to parse OCR result: expected a JSON object, got {type(result).__name__}"
            }

        print(f"OCR completed successfully: {result.get('pages_processed', 0)}/{result.get('total_pages', 0)} pages")
        return result

    except Exception as e:
        # Top-level boundary: callers expect a result dict, never an exception.
        print(f"Error running OCR subprocess: {e}")
        return {
            "success": False,
            "error": str(e)
        }
85
 
86
  def process_document(file):
87
+ """Process uploaded document using subprocess OCR"""
88
  if file is None:
89
  return "No file uploaded", "", ""
90
 
 
97
  file_path = file.name
98
  print(f"File path: {file_path}")
99
 
100
+ # Run OCR using subprocess (like your Node.js implementation)
101
+ ocr_result = run_paddle_ocr_subprocess(file_path)
 
 
 
 
 
 
 
102
 
103
+ if not ocr_result.get("success", False):
104
+ error_msg = f"❌ OCR failed: {ocr_result.get('error', 'Unknown error')}"
105
+ return error_msg, "", json.dumps(ocr_result)
106
 
107
+ # Extract results
108
+ extracted_text = ocr_result.get("text", "")
109
+ pages_processed = ocr_result.get("pages_processed", 0)
110
+ total_pages = ocr_result.get("total_pages", 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  processing_time = time.time() - start_time
 
113
 
114
  summary = f"""
115
  📄 **File**: {filename}
116
  📊 **Pages Processed**: {pages_processed}/{total_pages}
117
  ⏱️ **Processing Time**: {processing_time:.2f} seconds
118
  📝 **Text Length**: {len(extracted_text)} characters
119
+ 🔧 **OCR Engine**: PaddleOCR (Subprocess)
120
+ ✅ **Method**: Subprocess execution (like your local Node.js implementation)
121
  """
122
 
123
  api_response = json.dumps({
 
127
  "pages_processed": pages_processed,
128
  "total_pages": total_pages,
129
  "processing_time": processing_time,
130
+ "ocr_engine": "PaddleOCR",
131
+ "method": "subprocess"
132
  }, indent=2)
133
 
134
  return summary, extracted_text, api_response
135
 
136
  except Exception as e:
137
+ error_msg = f"Error processing file: {str(e)}"
138
  print(f"Full error: {e}")
139
  import traceback
140
  traceback.print_exc()
 
158
  tmp_file_path = tmp_file.name
159
 
160
  try:
161
+ # Run OCR using subprocess
162
+ ocr_result = run_paddle_ocr_subprocess(tmp_file_path)
163
 
164
+ if ocr_result.get("success", False):
165
+ return json.dumps({
166
+ "success": True,
167
+ "text": ocr_result.get("text", ""),
168
+ "filename": filename,
169
+ "pages_processed": ocr_result.get("pages_processed", 0),
170
+ "total_pages": ocr_result.get("total_pages", 1),
171
+ "ocr_engine": "PaddleOCR",
172
+ "method": "subprocess"
173
+ })
174
+ else:
175
+ return json.dumps(ocr_result)
 
 
 
 
 
 
 
 
 
 
 
176
 
177
  finally:
178
  os.unlink(tmp_file_path)
 
239
  "success": true,
240
  "text": "Extracted text content...",
241
  "filename": "lab_report.pdf",
242
+ "ocr_engine": "PaddleOCR",
243
+ "method": "subprocess"
244
  }
245
  ]
246
  }
 
279
 
280
  ### 🚀 Features
281
  - High accuracy OCR with PaddleOCR
282
+ - Subprocess execution (mirrors your local Node.js implementation)
283
  - Medical document optimization
284
  - Multi-page PDF support
285
  - RESTful API integration
 
287
 
288
  ### 🔗 Integration URL
289
  `https://mbuck17-paddleocr-processor.hf.space/api/predict`
290
+
291
+ ### ⚙️ Architecture
292
+ This implementation uses subprocess execution just like your local Node.js version,
293
+ ensuring maximum compatibility with PaddleOCR's PDF processing capabilities.
294
  """)
295
 
296
  # Launch the app
paddle_ocr_standalone.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # paddle_ocr_standalone.py - Standalone script that mirrors your local implementation
3
+
4
+ import sys
5
+ import os
6
+ import json
7
+ from paddleocr import PaddleOCR
8
+ import fitz # PyMuPDF for PDF page counting
9
+
10
# PyMuPDF compatibility shims: attach the legacy camelCase names
# (pageCount, getPixmap, getText) when the installed fitz does not already
# provide them. Each shim simply delegates to the snake_case equivalent.
def pageCount_property(self):
    return self.page_count


def getPixmap(self, matrix=None, alpha=True):
    return self.get_pixmap(matrix=matrix, alpha=alpha)


def getText(self, option="text"):
    return self.get_text(option)


for _target, _legacy_name, _shim in (
    (fitz.Document, 'pageCount', property(pageCount_property)),
    (fitz.Page, 'getPixmap', getPixmap),
    (fitz.Page, 'getText', getText),
):
    # Only fill in names that are missing; never override a real attribute.
    if not hasattr(_target, _legacy_name):
        setattr(_target, _legacy_name, _shim)
25
+
26
def count_pdf_pages(file_path):
    """Return the number of pages in *file_path*.

    Paths that do not end in ``.pdf`` (case-insensitive) count as one page.
    If the PDF cannot be opened or measured, 1 is returned as a best-effort
    default so that page counting never aborts the OCR run.
    """
    if not file_path.lower().endswith('.pdf'):
        return 1  # Images are considered as 1 page
    try:
        doc = fitz.open(file_path)
        try:
            return len(doc)
        finally:
            doc.close()
    except Exception:
        return 1  # Default to 1 if we can't determine


def extract_text(result):
    """Flatten an OCR result into ``(text, pages_processed)``.

    *result* is treated as a sequence with one entry per page, each entry
    being either falsy (nothing recognised) or a sequence of lines where
    ``line[1][0]`` is the recognised string. ``None`` is accepted and
    treated as "no pages". A ``CURRENT_PAGE:<n>`` marker is written to
    stderr for every page so the parent process can report progress.

    Returns:
        A tuple of the concatenated text, with a ``--- Page n ---`` header
        before each page that produced text, and the number of pages that
        had a non-empty result.
    """
    parts = []
    pages_processed = 0

    for page_idx, page_result in enumerate(result or []):
        current_page = page_idx + 1
        print(f"CURRENT_PAGE:{current_page}", file=sys.stderr, flush=True)

        if not page_result:
            continue

        pages_processed += 1
        page_text = "".join(
            line[1][0] + "\n" for line in page_result if len(line) >= 2
        )

        if page_text.strip():
            parts.append(f"\n--- Page {current_page} ---\n")
            parts.append(page_text)

    return "".join(parts), pages_processed


def main(argv=None):
    """Run OCR on the file named in *argv* and print a JSON result to stdout.

    Args:
        argv: Argument list in ``sys.argv`` form (program name first).
            Defaults to ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 1 on a usage error or when OCR
        raises. In every case exactly one JSON object carrying a
        ``"success"`` flag is printed to stdout.
    """
    argv = sys.argv if argv is None else argv

    # Check if file path was provided
    if len(argv) < 2:
        print(json.dumps({
            "success": False,
            "error": "Usage: python paddle_ocr_standalone.py <file_path>"
        }))
        return 1

    file_path = argv[1]

    try:
        ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)

        total_pages = count_pdf_pages(file_path)
        print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr, flush=True)

        result = ocr.ocr(file_path, cls=True)
        extracted_text, pages_processed = extract_text(result)

        # Output the final result as JSON to stdout
        print(json.dumps({
            "success": True,
            "text": extracted_text,
            "total_pages": total_pages,
            "pages_processed": pages_processed
        }))
        return 0

    except Exception as e:
        # Top-level boundary: report any failure as JSON for the caller.
        print(json.dumps({
            "success": False,
            "error": str(e)
        }))
        return 1


if __name__ == "__main__":
    sys.exit(main())