mbuckle committed on
Commit
9ae0d8b
·
1 Parent(s): 4fd5951

Version 6

Browse files
Files changed (1) hide show
  1. paddle_ocr_standalone.py +175 -304
paddle_ocr_standalone.py CHANGED
@@ -1,329 +1,200 @@
1
- # app.py - Using subprocess approach like your local Node.js implementation
 
2
 
3
- import os
4
- import subprocess
5
  import sys
6
- import tempfile
7
- import time
8
- import base64
9
  import json
 
 
 
 
10
 
11
- # Import Gradio
12
- import gradio as gr
 
 
13
 
14
def _parse_progress_value(stderr_line):
    """Return the integer that follows the first ':' in a progress line.

    Returns None when the value is missing or not an integer, so one
    malformed progress line cannot abort an otherwise successful OCR run.
    """
    try:
        return int(stderr_line.split(':', 1)[1].strip())
    except (IndexError, ValueError):
        return None


def _find_json_line(stdout):
    """Return the last stdout line that parses as a JSON object, or None.

    Lines are scanned from the end because the result is expected to be
    printed last, after any non-JSON noise written to stdout.
    """
    for line in reversed(stdout.strip().split('\n')):
        line = line.strip()
        if line.startswith('{') and line.endswith('}'):
            try:
                return json.loads(line)
            except json.JSONDecodeError:
                continue
    return None


def run_paddle_ocr_subprocess(file_path):
    """Run paddle_ocr_standalone.py on *file_path* and return its JSON result.

    The script is looked up next to this module and started with the current
    interpreter. Its stderr is read line by line for ``TOTAL_PAGES:`` and
    ``CURRENT_PAGE:`` progress markers; its stdout is expected to contain one
    JSON object, possibly mixed with other output.

    Args:
        file_path: Path of the document to hand to the OCR script.

    Returns:
        The decoded JSON result on success. On any failure (non-zero exit
        status, unparseable output, or an exception while running) a dict of
        the form ``{"success": False, "error": <message>}``.
    """
    import re
    import threading

    try:
        # Get the path to our standalone OCR script
        script_path = os.path.join(os.path.dirname(__file__), 'paddle_ocr_standalone.py')
        command = [sys.executable, script_path, file_path]

        print(f"Running command: {' '.join(command)}")

        # Track progress
        total_pages = 1
        stderr_lines = []
        stdout_parts = []

        with subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,
        ) as process:
            # Drain stdout on a helper thread while stderr is streamed below.
            # Reading stderr to EOF first would deadlock as soon as the child
            # filled the stdout pipe, because it would block on write and
            # never close stderr.
            stdout_reader = threading.Thread(
                target=lambda: stdout_parts.append(process.stdout.read()),
                daemon=True,
            )
            stdout_reader.start()

            # Read stderr for progress updates
            for stderr_line in process.stderr:
                stderr_lines.append(stderr_line)

                if stderr_line.startswith('TOTAL_PAGES:'):
                    value = _parse_progress_value(stderr_line)
                    if value is not None:
                        total_pages = value
                        print(f"Processing document with {total_pages} pages")

                elif stderr_line.startswith('CURRENT_PAGE:'):
                    value = _parse_progress_value(stderr_line)
                    if value is not None:
                        print(f"Processing page {value} of {total_pages}")

            stdout_reader.join()
            process.wait()

        stdout = "".join(stdout_parts)
        stderr_output = "".join(stderr_lines)

        if process.returncode != 0:
            print(f"OCR process failed with return code {process.returncode}")
            print(f"stderr: {stderr_output}")
            return {
                "success": False,
                "error": f"OCR process failed: {stderr_output}"
            }

        # Parse the JSON result from stdout - handle mixed output
        try:
            json_result = _find_json_line(stdout)
            if json_result is None:
                # If no valid JSON line was found, try the entire stdout
                json_result = json.loads(stdout.strip())

            print(f"OCR completed successfully: {json_result.get('pages_processed', 0)}/{json_result.get('total_pages', 0)} pages")
            return json_result

        except json.JSONDecodeError as e:
            print(f"Failed to parse OCR result: {e}")
            print(f"stdout: {stdout}")
            print(f"Trying to extract JSON from mixed output...")

            # Last resort: a greedy match from the first '{' to the last '}'
            # that contains a "success" key.
            json_match = re.search(r'\{.*"success".*\}', stdout, re.DOTALL)
            if json_match:
                try:
                    result = json.loads(json_match.group())
                    print("Successfully extracted JSON from mixed output")
                    return result
                except json.JSONDecodeError:
                    pass

            return {
                "success": False,
                "error": f"Failed to parse OCR result: {str(e)}"
            }

    except Exception as e:
        # Top-level boundary: callers rely on always getting a result dict.
        print(f"Error running OCR subprocess: {e}")
        return {
            "success": False,
            "error": str(e)
        }
116
-
117
def process_document(file):
    """Run OCR on an uploaded document and build the three UI outputs.

    Args:
        file: The uploaded file, either an object exposing a ``.name`` path
            attribute or a plain path (``str`` / ``os.PathLike``). ``None``
            means nothing was uploaded.

    Returns:
        A ``(summary, extracted_text, api_response)`` tuple of strings:
        a Markdown summary, the recognised text, and a JSON document.
        When nothing was uploaded the tuple is
        ``("No file uploaded", "", "")``; on failure the first element is an
        error message, the text is empty and the JSON carries
        ``"success": false``.
    """
    if file is None:
        return "No file uploaded", "", ""

    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()

    try:
        # Accept a plain path as well as an object exposing ``.name``.
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fspath(file)
        else:
            file_path = file.name

        filename = os.path.basename(file_path)
        print(f"Processing: {filename}")
        print(f"File path: {file_path}")

        # Run OCR in a subprocess
        ocr_result = run_paddle_ocr_subprocess(file_path)

        if not ocr_result.get("success", False):
            error_msg = f"❌ OCR failed: {ocr_result.get('error', 'Unknown error')}"
            return error_msg, "", json.dumps(ocr_result)

        # Extract results
        extracted_text = ocr_result.get("text", "")
        pages_processed = ocr_result.get("pages_processed", 0)
        total_pages = ocr_result.get("total_pages", 1)

        processing_time = time.perf_counter() - start_time

        summary = f"""
📄 **File**: {filename}
📊 **Pages Processed**: {pages_processed}/{total_pages}
⏱️ **Processing Time**: {processing_time:.2f} seconds
📝 **Text Length**: {len(extracted_text)} characters
🔧 **OCR Engine**: PaddleOCR (Subprocess)
✅ **Method**: Subprocess execution (like your local Node.js implementation)
"""

        api_response = json.dumps({
            "success": True,
            "text": extracted_text,
            "filename": filename,
            "pages_processed": pages_processed,
            "total_pages": total_pages,
            "processing_time": processing_time,
            "ocr_engine": "PaddleOCR",
            "method": "subprocess"
        }, indent=2)

        return summary, extracted_text, api_response

    except Exception as e:
        # UI boundary: report the failure in the outputs instead of raising.
        error_msg = f"Error processing file: {str(e)}"
        print(f"Full error: {e}")
        import traceback
        traceback.print_exc()
        return error_msg, "", json.dumps({"success": False, "error": str(e)})
173
 
174
def process_api_request(api_data):
    """Handle an API-style OCR request and return the response as JSON text.

    Args:
        api_data: JSON text of an object with a base64-encoded ``"file"``
            entry and an optional ``"filename"`` (default ``"unknown.pdf"``),
            whose extension becomes the suffix of the temporary file.

    Returns:
        A JSON string. On success it contains ``success``, ``text``,
        ``filename``, ``pages_processed``, ``total_pages``, ``ocr_engine``
        and ``method``. If OCR reports failure, its result is returned
        unchanged. Any other problem yields
        ``{"success": false, "error": <message>}``.
    """
    tmp_file_path = None
    try:
        data = json.loads(api_data)

        # A payload that is valid JSON but not an object (e.g. a number)
        # carries no file either; report that instead of a TypeError message.
        if not isinstance(data, dict) or 'file' not in data:
            return json.dumps({"success": False, "error": "No file data provided"})

        # Decode base64 file
        file_data = base64.b64decode(data['file'])
        filename = data.get('filename', 'unknown.pdf')

        # Save to temp file. The path is recorded before writing so the
        # finally clause removes the file even if the write fails.
        suffix = os.path.splitext(filename)[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_file_path = tmp_file.name
            tmp_file.write(file_data)

        # Run OCR using subprocess
        ocr_result = run_paddle_ocr_subprocess(tmp_file_path)

        if not ocr_result.get("success", False):
            return json.dumps(ocr_result)

        return json.dumps({
            "success": True,
            "text": ocr_result.get("text", ""),
            "filename": filename,
            "pages_processed": ocr_result.get("pages_processed", 0),
            "total_pages": ocr_result.get("total_pages", 1),
            "ocr_engine": "PaddleOCR",
            "method": "subprocess"
        })

    except Exception as e:
        # API boundary: every failure is reported in the JSON response.
        return json.dumps({"success": False, "error": str(e)})

    finally:
        if tmp_file_path is not None:
            try:
                os.unlink(tmp_file_path)
            except OSError as cleanup_error:
                # Best-effort cleanup must not replace the OCR response.
                print(f"Warning: could not remove {tmp_file_path}: {cleanup_error}")
213
-
214
# Create Gradio interface: three tabs (file upload, API test, about).
# Multi-line strings are kept flush-left so their content does not depend on
# the nesting depth of the surrounding ``with`` blocks.
with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
    gr.Markdown("# 🏥 PaddleOCR Medical Document Processor")
    gr.Markdown("Upload medical documents (PDF/images) to extract text using PaddleOCR")

    with gr.Tab("📄 File Upload"):
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload Document (PDF, JPG, PNG)",
                    file_types=[".pdf", ".jpg", ".jpeg", ".png"]
                )
                process_btn = gr.Button("🔍 Process Document", variant="primary")

            with gr.Column():
                summary_output = gr.Markdown(label="📊 Processing Summary")

        with gr.Row():
            text_output = gr.Textbox(
                label="📝 Extracted Text",
                lines=15,
                max_lines=20
            )

        # process_document returns three values; the third (the JSON
        # response) goes to a component that is not shown.
        hidden_json_output = gr.Textbox(visible=False)

        process_btn.click(
            fn=process_document,
            inputs=[file_input],
            outputs=[summary_output, text_output, hidden_json_output]
        )

    with gr.Tab("🔌 API Integration"):
        gr.Markdown("### For integration with your Vercel app:")
        gr.Markdown("**Endpoint**: `https://mbuck17-paddleocr-processor.hf.space/api/predict`")
        gr.Markdown("**Method**: POST")
        gr.Markdown("**Headers**: `Content-Type: application/json`")

        with gr.Row():
            with gr.Column():
                gr.Markdown("**Sample Request:**")
                gr.Code('''
{
  "data": [
    {
      "file": "base64_encoded_file_data_here",
      "filename": "lab_report.pdf"
    }
  ]
}
''', language="json")

            with gr.Column():
                gr.Markdown("**Sample Response:**")
                gr.Code('''
{
  "data": [
    {
      "success": true,
      "text": "Extracted text content...",
      "filename": "lab_report.pdf",
      "ocr_engine": "PaddleOCR",
      "method": "subprocess"
    }
  ]
}
''', language="json")

        gr.Markdown("### Test API Request:")
        api_input = gr.Textbox(
            label="API Request (JSON)",
            placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
            lines=5
        )
        api_btn = gr.Button("🧪 Test API Request")
        api_output = gr.Textbox(
            label="API Response (JSON)",
            lines=10
        )

        api_btn.click(
            fn=process_api_request,
            inputs=[api_input],
            outputs=[api_output]
        )

    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
### 🎯 Purpose
This service extracts text from medical documents using PaddleOCR, specifically designed for lab reports and medical forms.

### 🔧 Integration
This Hugging Face Space can be integrated with your Vercel app as an external OCR service.

### 📚 Supported Formats
- PDF documents (multi-page)
- JPEG/JPG images
- PNG images

### 🚀 Features
- High accuracy OCR with PaddleOCR
- Subprocess execution (mirrors your local Node.js implementation)
- Medical document optimization
- Multi-page PDF support
- RESTful API integration
- Free hosting on Hugging Face

### 🔗 Integration URL
`https://mbuck17-paddleocr-processor.hf.space/api/predict`

### ⚙️ Architecture
This implementation uses subprocess execution just like your local Node.js version,
ensuring maximum compatibility with PaddleOCR's PDF processing capabilities.
""")

# Launch the app, listening on all interfaces at port 7860
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
+ #!/usr/bin/env python3
2
+ # paddle_ocr_standalone.py - Fixed version with PDF to image conversion
3
 
 
 
4
  import sys
5
+ import os
 
 
6
  import json
7
+ import tempfile
8
+
9
+ # Apply monkey patch for PyMuPDF compatibility BEFORE importing anything
10
+ import fitz # PyMuPDF for PDF page counting
11
 
12
# Compatibility shims: add the camelCase PyMuPDF names (pageCount, getPixmap,
# getText) when the installed PyMuPDF only provides the snake_case ones, so
# code written against the old names keeps working. Each shim is installed
# only if the attribute is missing.

if not hasattr(fitz.Document, 'pageCount'):
    def pageCount_property(self):
        """Expose ``page_count`` under the camelCase name ``pageCount``."""
        return self.page_count
    fitz.Document.pageCount = property(pageCount_property)

if not hasattr(fitz.Page, 'getPixmap'):
    def getPixmap(self, matrix=None, alpha=False, **kwargs):
        """Forward to ``get_pixmap``.

        ``matrix`` is only forwarded when given, so ``get_pixmap`` applies
        its own default instead of receiving ``None``. Extra keyword
        arguments are passed through unchanged.
        """
        # NOTE(review): alpha defaults to False to match get_pixmap's own
        # default (opaque rendering) - confirm against the PyMuPDF docs.
        if matrix is not None:
            kwargs["matrix"] = matrix
        return self.get_pixmap(alpha=alpha, **kwargs)
    fitz.Page.getPixmap = getPixmap

if not hasattr(fitz.Page, 'getText'):
    def getText(self, option="text", **kwargs):
        """Forward to ``get_text``, passing extra keyword arguments through."""
        return self.get_text(option, **kwargs)
    fitz.Page.getText = getText
26
+
27
+ # NOW import PaddleOCR after applying the patches
28
+ from paddleocr import PaddleOCR
29
+
30
def pdf_to_images(pdf_path, dpi=200):
    """Render every page of a PDF to a temporary PNG file.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        dpi: Target resolution; pages are scaled by ``dpi / 72``.

    Returns:
        A list with the paths of the PNG files that were written, in page
        order. The caller owns these files and must delete them. Pages whose
        image comes out empty are skipped, so the list can be shorter than
        the page count. An empty list is returned if conversion fails; in
        that case every file created so far is removed first.
    """
    import traceback

    created_paths = []   # every temp file made, for cleanup on failure
    image_paths = []     # the usable images handed back to the caller
    doc = None

    try:
        doc = fitz.open(pdf_path)
        print(f"PDF has {len(doc)} pages", file=sys.stderr)

        # The scaling matrix is the same for every page, so build it once.
        zoom = dpi / 72
        mat = fitz.Matrix(zoom, zoom)

        for page_num in range(len(doc)):
            page = doc[page_num]

            # Prefer the current snake_case API and fall back to the legacy
            # name. alpha=False requests an opaque image.
            render = getattr(page, 'get_pixmap', None) or page.getPixmap
            pix = render(matrix=mat, alpha=False)

            # mkstemp picks the platform temp directory and an unpredictable
            # name, instead of a guessable path under a hard-coded /tmp.
            fd, temp_img_path = tempfile.mkstemp(
                prefix=f"ocr_page_{page_num}_{os.getpid()}_",
                suffix=".png",
            )
            os.close(fd)
            created_paths.append(temp_img_path)

            pix.save(temp_img_path)

            # mkstemp already created the file, so an empty file (rather
            # than a missing one) is what signals a failed save.
            file_size = os.path.getsize(temp_img_path)
            if file_size == 0:
                print(f"Failed to create image file: {temp_img_path}", file=sys.stderr)
                os.unlink(temp_img_path)
                created_paths.remove(temp_img_path)
                continue

            print(f"Converted page {page_num + 1} to: {temp_img_path} (size: {file_size} bytes, dimensions: {pix.width}x{pix.height})", file=sys.stderr)
            image_paths.append(temp_img_path)

        print(f"Successfully converted {len(image_paths)} pages to images", file=sys.stderr)
        return image_paths

    except Exception as e:
        print(f"Error converting PDF to images: {e}", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)

        # The caller only receives [], so it cannot clean these up itself.
        for leftover in created_paths:
            try:
                os.unlink(leftover)
            except OSError:
                pass
        return []

    finally:
        if doc is not None:
            doc.close()
73
 
74
def cleanup_temp_files(file_paths):
    """Delete temporary files, reporting progress and problems on stderr.

    Cleanup is best-effort: a file that is already gone is ignored and any
    other failure is logged as a warning without stopping the loop.

    Args:
        file_paths: Iterable of paths to remove; ``None`` is treated as empty.
    """
    for file_path in file_paths or ():
        try:
            # Delete directly instead of checking existence first, which
            # would leave a window for the file to vanish in between.
            os.unlink(file_path)
        except FileNotFoundError:
            continue
        except (OSError, TypeError, ValueError) as e:
            print(f"Warning: Could not clean up {file_path}: {e}", file=sys.stderr)
        else:
            print(f"Cleaned up: {file_path}", file=sys.stderr)
83
+
84
+ # Check if file path was provided
85
+ if len(sys.argv) < 2:
86
+ result = {"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"}
87
+ print(json.dumps(result))
88
+ sys.exit(1)
89
+
90
+ file_path = sys.argv[1]
91
+ temp_files = []
92
+
93
+ try:
94
+ # Print progress to stderr (like your local implementation)
95
+ print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
96
+
97
+ # Initialize PaddleOCR - exactly like your local implementation
98
+ # Redirect PaddleOCR's stdout to stderr to avoid JSON pollution
99
+ ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
100
+ print("PaddleOCR initialized successfully", file=sys.stderr)
101
+
102
+ # Check if it's a PDF or image
103
+ is_pdf = file_path.lower().endswith('.pdf')
104
+
105
+ if is_pdf:
106
+ print("Converting PDF to images for OCR processing...", file=sys.stderr)
107
+ image_paths = pdf_to_images(file_path)
108
+ temp_files = image_paths
109
+
110
+ if not image_paths:
111
+ raise Exception("Failed to convert PDF to images")
112
+
113
+ total_pages = len(image_paths)
114
+ else:
115
+ # For image files, use directly
116
+ image_paths = [file_path]
117
+ total_pages = 1
118
+
119
+ print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr)
120
+
121
+ # Process each image with OCR
122
+ extracted_text = ""
123
+ pages_processed = 0
124
+
125
+ for i, img_path in enumerate(image_paths):
126
  try:
127
+ current_page = i + 1
128
+ print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
129
+ print(f"Processing image: {img_path}", file=sys.stderr)
130
 
131
+ # Run OCR on the image
132
+ result = ocr.ocr(img_path, cls=True)
 
 
 
 
 
 
 
 
 
 
133
 
134
+ # Debug: print what OCR returns
135
+ print(f"OCR result for page {current_page}: {type(result)}, length: {len(result) if result else 'None'}", file=sys.stderr)
136
 
137
+ if result and result[0]: # result is a list of pages, we have one page per image
138
+ print(f"Page {current_page} has {len(result[0])} text lines detected", file=sys.stderr)
139
+
140
+ pages_processed += 1
141
+ page_text = ""
142
+
143
+ for line_idx, line in enumerate(result[0]):
144
+ if len(line) >= 2:
145
+ text_content = line[1][0] if isinstance(line[1], (list, tuple)) else str(line[1])
146
+ confidence = line[1][1] if isinstance(line[1], (list, tuple)) and len(line[1]) > 1 else 1.0
147
+
148
+ print(f"Line {line_idx}: '{text_content}' (confidence: {confidence})", file=sys.stderr)
149
+
150
+ if confidence > 0.3: # Lower confidence threshold for debugging
151
+ page_text += text_content + "\n"
152
+
153
+ if page_text.strip():
154
+ extracted_text += f"\n--- Page {current_page} ---\n"
155
+ extracted_text += page_text
156
+ print(f"Page {current_page} text added: {len(page_text)} characters", file=sys.stderr)
157
+ else:
158
+ print(f"Page {current_page}: No text above confidence threshold", file=sys.stderr)
159
+
160
+ print(f"Page {current_page} processed successfully", file=sys.stderr)
161
+ else:
162
+ print(f"No OCR results returned for page {current_page}", file=sys.stderr)
163
+ if result:
164
+ print(f"Result structure: {result}", file=sys.stderr)
165
+
166
+ except Exception as page_error:
167
+ print(f"Error processing page {current_page}: {page_error}", file=sys.stderr)
168
+ continue
169
 
170
+ # Clean up temporary files
171
+ if temp_files:
172
+ cleanup_temp_files(temp_files)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
+ # Output the final result as JSON to stdout
175
+ result_data = {
176
+ "success": True,
177
+ "text": extracted_text,
178
+ "total_pages": total_pages,
179
+ "pages_processed": pages_processed,
180
+ "method": "pdf_to_images" if is_pdf else "direct_image"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
+ print(json.dumps(result_data))
184
+ print(f"Successfully processed {pages_processed}/{total_pages} pages", file=sys.stderr)
185
+
186
+ except Exception as e:
187
+ # Clean up on error
188
+ if temp_files:
189
+ cleanup_temp_files(temp_files)
190
+
191
+ print(f"Error during OCR processing: {e}", file=sys.stderr)
192
+ import traceback
193
+ traceback.print_exc(file=sys.stderr)
194
+
195
+ error_data = {
196
+ "success": False,
197
+ "error": str(e)
198
+ }
199
+ print(json.dumps(error_data))
200
+ sys.exit(1)