mbuckle committed on
Commit
4fd5951
·
1 Parent(s): 5745364

Fixed PDF to Image conversion

Browse files
Files changed (1) hide show
  1. paddle_ocr_standalone.py +307 -150
paddle_ocr_standalone.py CHANGED
@@ -1,172 +1,329 @@
1
- #!/usr/bin/env python3
2
- # paddle_ocr_standalone.py - Fixed version with PDF to image conversion
3
 
4
- import sys
5
  import os
6
- import json
 
7
  import tempfile
 
 
 
8
 
9
- # Apply monkey patch for PyMuPDF compatibility BEFORE importing anything
10
- import fitz # PyMuPDF for PDF page counting
11
-
12
- if not hasattr(fitz.Document, 'pageCount'):
13
- def pageCount_property(self):
14
- return self.page_count
15
- fitz.Document.pageCount = property(pageCount_property)
16
-
17
- if not hasattr(fitz.Page, 'getPixmap'):
18
- def getPixmap(self, matrix=None, alpha=True):
19
- return self.get_pixmap(matrix=matrix, alpha=alpha)
20
- fitz.Page.getPixmap = getPixmap
21
-
22
- if not hasattr(fitz.Page, 'getText'):
23
- def getText(self, option="text"):
24
- return self.get_text(option)
25
- fitz.Page.getText = getText
26
-
27
- # NOW import PaddleOCR after applying the patches
28
- from paddleocr import PaddleOCR
29
 
30
- def pdf_to_images(pdf_path, dpi=200):
31
- """Convert PDF pages to images since PaddleOCR can't read PDFs directly"""
32
  try:
33
- doc = fitz.open(pdf_path)
34
- image_paths = []
 
 
 
 
 
35
 
36
- for page_num in range(len(doc)):
37
- page = doc[page_num]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- # Create a transformation matrix for higher DPI
40
- mat = fitz.Matrix(dpi/72, dpi/72) # 200 DPI for better OCR accuracy
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- # Render page to pixmap
43
- if hasattr(page, 'getPixmap'):
44
- pix = page.getPixmap(matrix=mat)
45
- else:
46
- pix = page.get_pixmap(matrix=mat)
 
 
 
 
47
 
48
- # Save to temporary file
49
- temp_img_path = f"/tmp/ocr_page_{page_num}_{os.getpid()}.png"
50
- pix.save(temp_img_path)
51
- image_paths.append(temp_img_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
- print(f"Converted page {page_num + 1} to: {temp_img_path}", file=sys.stderr)
54
-
55
- doc.close()
56
- return image_paths
57
-
58
  except Exception as e:
59
- print(f"Error converting PDF to images: {e}", file=sys.stderr)
60
- return []
61
-
62
- def cleanup_temp_files(file_paths):
63
- """Clean up temporary image files"""
64
- for file_path in file_paths:
65
- try:
66
- if os.path.exists(file_path):
67
- os.unlink(file_path)
68
- print(f"Cleaned up: {file_path}", file=sys.stderr)
69
- except Exception as e:
70
- print(f"Warning: Could not clean up {file_path}: {e}", file=sys.stderr)
71
-
72
- # Check if file path was provided
73
- if len(sys.argv) < 2:
74
- result = {"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"}
75
- print(json.dumps(result))
76
- sys.exit(1)
77
-
78
- file_path = sys.argv[1]
79
- temp_files = []
80
 
81
- try:
82
- # Print progress to stderr (like your local implementation)
83
- print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
 
84
 
85
- # Initialize PaddleOCR - exactly like your local implementation
86
- # Redirect PaddleOCR's stdout to stderr to avoid JSON pollution
87
- ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
88
- print("PaddleOCR initialized successfully", file=sys.stderr)
89
 
90
- # Check if it's a PDF or image
91
- is_pdf = file_path.lower().endswith('.pdf')
92
-
93
- if is_pdf:
94
- print("Converting PDF to images for OCR processing...", file=sys.stderr)
95
- image_paths = pdf_to_images(file_path)
96
- temp_files = image_paths
97
-
98
- if not image_paths:
99
- raise Exception("Failed to convert PDF to images")
100
-
101
- total_pages = len(image_paths)
102
- else:
103
- # For image files, use directly
104
- image_paths = [file_path]
105
- total_pages = 1
106
-
107
- print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr)
108
-
109
- # Process each image with OCR
110
- extracted_text = ""
111
- pages_processed = 0
112
-
113
- for i, img_path in enumerate(image_paths):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  try:
115
- current_page = i + 1
116
- print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
117
- print(f"Processing image: {img_path}", file=sys.stderr)
118
 
119
- # Run OCR on the image
120
- result = ocr.ocr(img_path, cls=True)
121
-
122
- if result and result[0]: # result is a list of pages, we have one page per image
123
- pages_processed += 1
124
- page_text = ""
125
-
126
- for line in result[0]:
127
- if len(line) >= 2 and line[1][1] > 0.5: # confidence threshold
128
- page_text += line[1][0] + "\n"
129
-
130
- if page_text.strip():
131
- extracted_text += f"\n--- Page {current_page} ---\n"
132
- extracted_text += page_text
133
-
134
- print(f"Page {current_page} processed successfully", file=sys.stderr)
135
  else:
136
- print(f"No text found on page {current_page}", file=sys.stderr)
137
-
138
- except Exception as page_error:
139
- print(f"Error processing page {current_page}: {page_error}", file=sys.stderr)
140
- continue
 
 
 
 
 
 
 
141
 
142
- # Clean up temporary files
143
- if temp_files:
144
- cleanup_temp_files(temp_files)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
- # Output the final result as JSON to stdout
147
- result_data = {
148
- "success": True,
149
- "text": extracted_text,
150
- "total_pages": total_pages,
151
- "pages_processed": pages_processed,
152
- "method": "pdf_to_images" if is_pdf else "direct_image"
 
 
 
 
 
 
 
 
153
  }
154
-
155
- print(json.dumps(result_data))
156
- print(f"Successfully processed {pages_processed}/{total_pages} pages", file=sys.stderr)
157
-
158
- except Exception as e:
159
- # Clean up on error
160
- if temp_files:
161
- cleanup_temp_files(temp_files)
162
-
163
- print(f"Error during OCR processing: {e}", file=sys.stderr)
164
- import traceback
165
- traceback.print_exc(file=sys.stderr)
166
-
167
- error_data = {
168
- "success": False,
169
- "error": str(e)
170
  }
171
- print(json.dumps(error_data))
172
- sys.exit(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py - Using subprocess approach like your local Node.js implementation
 
2
 
 
3
  import os
4
+ import subprocess
5
+ import sys
6
  import tempfile
7
+ import time
8
+ import base64
9
+ import json
10
 
11
+ # Import Gradio
12
+ import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
+ def run_paddle_ocr_subprocess(file_path):
15
+ """Run PaddleOCR as a subprocess - mirrors your local Node.js approach"""
16
  try:
17
+ # Get the path to our standalone OCR script
18
+ script_path = os.path.join(os.path.dirname(__file__), 'paddle_ocr_standalone.py')
19
+
20
+ # Run the subprocess - exactly like your Node.js implementation
21
+ command = [sys.executable, script_path, file_path]
22
+
23
+ print(f"Running command: {' '.join(command)}")
24
 
25
+ # Track progress
26
+ total_pages = 1
27
+ current_page = 0
28
+
29
+ process = subprocess.Popen(
30
+ command,
31
+ stdout=subprocess.PIPE,
32
+ stderr=subprocess.PIPE,
33
+ text=True,
34
+ bufsize=1,
35
+ universal_newlines=True
36
+ )
37
+
38
+ # Read stderr for progress updates (like your Node.js implementation)
39
+ stderr_output = ""
40
+ while True:
41
+ stderr_line = process.stderr.readline()
42
+ if not stderr_line:
43
+ break
44
+
45
+ stderr_output += stderr_line
46
 
47
+ if stderr_line.startswith('TOTAL_PAGES:'):
48
+ total_pages = int(stderr_line.split(':')[1].strip())
49
+ print(f"Processing document with {total_pages} pages")
50
+
51
+ elif stderr_line.startswith('CURRENT_PAGE:'):
52
+ current_page = int(stderr_line.split(':')[1].strip())
53
+ print(f"Processing page {current_page} of {total_pages}")
54
+
55
+ # Wait for process to complete and get stdout
56
+ stdout, remaining_stderr = process.communicate()
57
+
58
+ if process.returncode != 0:
59
+ print(f"OCR process failed with return code {process.returncode}")
60
+ print(f"stderr: {stderr_output + remaining_stderr}")
61
+ return {
62
+ "success": False,
63
+ "error": f"OCR process failed: {stderr_output + remaining_stderr}"
64
+ }
65
+
66
+ # Parse the JSON result from stdout - handle mixed output
67
+ try:
68
+ # PaddleOCR might output download messages to stdout, find the JSON
69
+ stdout_lines = stdout.strip().split('\n')
70
+ json_result = None
71
 
72
+ # Look for the JSON result (usually the last line that starts with {)
73
+ for line in reversed(stdout_lines):
74
+ line = line.strip()
75
+ if line.startswith('{') and line.endswith('}'):
76
+ try:
77
+ json_result = json.loads(line)
78
+ break
79
+ except json.JSONDecodeError:
80
+ continue
81
 
82
+ if json_result is None:
83
+ # If no valid JSON found, try the entire stdout
84
+ json_result = json.loads(stdout.strip())
85
+
86
+ print(f"OCR completed successfully: {json_result.get('pages_processed', 0)}/{json_result.get('total_pages', 0)} pages")
87
+ return json_result
88
+
89
+ except json.JSONDecodeError as e:
90
+ print(f"Failed to parse OCR result: {e}")
91
+ print(f"stdout: {stdout}")
92
+ print(f"Trying to extract JSON from mixed output...")
93
+
94
+ # Try to find JSON in the mixed output
95
+ import re
96
+ json_match = re.search(r'\{.*"success".*\}', stdout, re.DOTALL)
97
+ if json_match:
98
+ try:
99
+ result = json.loads(json_match.group())
100
+ print("Successfully extracted JSON from mixed output")
101
+ return result
102
+ except json.JSONDecodeError:
103
+ pass
104
+
105
+ return {
106
+ "success": False,
107
+ "error": f"Failed to parse OCR result: {str(e)}"
108
+ }
109
 
 
 
 
 
 
110
  except Exception as e:
111
+ print(f"Error running OCR subprocess: {e}")
112
+ return {
113
+ "success": False,
114
+ "error": str(e)
115
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
+ def process_document(file):
118
+ """Process uploaded document using subprocess OCR"""
119
+ if file is None:
120
+ return "No file uploaded", "", ""
121
 
122
+ start_time = time.time()
 
 
 
123
 
124
+ try:
125
+ filename = os.path.basename(file.name)
126
+ print(f"Processing: {filename}")
127
+
128
+ file_path = file.name
129
+ print(f"File path: {file_path}")
130
+
131
+ # Run OCR using subprocess (like your Node.js implementation)
132
+ ocr_result = run_paddle_ocr_subprocess(file_path)
133
+
134
+ if not ocr_result.get("success", False):
135
+ error_msg = f"❌ OCR failed: {ocr_result.get('error', 'Unknown error')}"
136
+ return error_msg, "", json.dumps(ocr_result)
137
+
138
+ # Extract results
139
+ extracted_text = ocr_result.get("text", "")
140
+ pages_processed = ocr_result.get("pages_processed", 0)
141
+ total_pages = ocr_result.get("total_pages", 1)
142
+
143
+ processing_time = time.time() - start_time
144
+
145
+ summary = f"""
146
+ 📄 **File**: {filename}
147
+ 📊 **Pages Processed**: {pages_processed}/{total_pages}
148
+ ⏱️ **Processing Time**: {processing_time:.2f} seconds
149
+ 📝 **Text Length**: {len(extracted_text)} characters
150
+ 🔧 **OCR Engine**: PaddleOCR (Subprocess)
151
+ ✅ **Method**: Subprocess execution (like your local Node.js implementation)
152
+ """
153
+
154
+ api_response = json.dumps({
155
+ "success": True,
156
+ "text": extracted_text,
157
+ "filename": filename,
158
+ "pages_processed": pages_processed,
159
+ "total_pages": total_pages,
160
+ "processing_time": processing_time,
161
+ "ocr_engine": "PaddleOCR",
162
+ "method": "subprocess"
163
+ }, indent=2)
164
+
165
+ return summary, extracted_text, api_response
166
+
167
+ except Exception as e:
168
+ error_msg = f"❌ Error processing file: {str(e)}"
169
+ print(f"Full error: {e}")
170
+ import traceback
171
+ traceback.print_exc()
172
+ return error_msg, "", json.dumps({"success": False, "error": str(e)})
173
+
174
+ def process_api_request(api_data):
175
+ """Process API-style requests (for integration with your Vercel app)"""
176
+ try:
177
+ data = json.loads(api_data)
178
+
179
+ if 'file' not in data:
180
+ return json.dumps({"success": False, "error": "No file data provided"})
181
+
182
+ # Decode base64 file
183
+ file_data = base64.b64decode(data['file'])
184
+ filename = data.get('filename', 'unknown.pdf')
185
+
186
+ # Save to temp file
187
+ with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as tmp_file:
188
+ tmp_file.write(file_data)
189
+ tmp_file_path = tmp_file.name
190
+
191
  try:
192
+ # Run OCR using subprocess
193
+ ocr_result = run_paddle_ocr_subprocess(tmp_file_path)
 
194
 
195
+ if ocr_result.get("success", False):
196
+ return json.dumps({
197
+ "success": True,
198
+ "text": ocr_result.get("text", ""),
199
+ "filename": filename,
200
+ "pages_processed": ocr_result.get("pages_processed", 0),
201
+ "total_pages": ocr_result.get("total_pages", 1),
202
+ "ocr_engine": "PaddleOCR",
203
+ "method": "subprocess"
204
+ })
 
 
 
 
 
 
205
  else:
206
+ return json.dumps(ocr_result)
207
+
208
+ finally:
209
+ os.unlink(tmp_file_path)
210
+
211
+ except Exception as e:
212
+ return json.dumps({"success": False, "error": str(e)})
213
+
214
+ # Create Gradio interface
215
+ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
216
+ gr.Markdown("# 🏥 PaddleOCR Medical Document Processor")
217
+ gr.Markdown("Upload medical documents (PDF/images) to extract text using PaddleOCR")
218
 
219
+ with gr.Tab("📄 File Upload"):
220
+ with gr.Row():
221
+ with gr.Column():
222
+ file_input = gr.File(
223
+ label="Upload Document (PDF, JPG, PNG)",
224
+ file_types=[".pdf", ".jpg", ".jpeg", ".png"]
225
+ )
226
+ process_btn = gr.Button("🔍 Process Document", variant="primary")
227
+
228
+ with gr.Column():
229
+ summary_output = gr.Markdown(label="📊 Processing Summary")
230
+
231
+ with gr.Row():
232
+ text_output = gr.Textbox(
233
+ label="📝 Extracted Text",
234
+ lines=15,
235
+ max_lines=20
236
+ )
237
+
238
+ process_btn.click(
239
+ fn=process_document,
240
+ inputs=[file_input],
241
+ outputs=[summary_output, text_output, gr.Textbox(visible=False)]
242
+ )
243
 
244
+ with gr.Tab("🔌 API Integration"):
245
+ gr.Markdown("### For integration with your Vercel app:")
246
+ gr.Markdown("**Endpoint**: `https://mbuck17-paddleocr-processor.hf.space/api/predict`")
247
+ gr.Markdown("**Method**: POST")
248
+ gr.Markdown("**Headers**: `Content-Type: application/json`")
249
+
250
+ with gr.Row():
251
+ with gr.Column():
252
+ gr.Markdown("**Sample Request:**")
253
+ gr.Code('''
254
+ {
255
+ "data": [
256
+ {
257
+ "file": "base64_encoded_file_data_here",
258
+ "filename": "lab_report.pdf"
259
  }
260
+ ]
261
+ }
262
+ ''', language="json")
263
+
264
+ with gr.Column():
265
+ gr.Markdown("**Sample Response:**")
266
+ gr.Code('''
267
+ {
268
+ "data": [
269
+ {
270
+ "success": true,
271
+ "text": "Extracted text content...",
272
+ "filename": "lab_report.pdf",
273
+ "ocr_engine": "PaddleOCR",
274
+ "method": "subprocess"
 
275
  }
276
+ ]
277
+ }
278
+ ''', language="json")
279
+
280
+ gr.Markdown("### Test API Request:")
281
+ api_input = gr.Textbox(
282
+ label="API Request (JSON)",
283
+ placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
284
+ lines=5
285
+ )
286
+ api_btn = gr.Button("🧪 Test API Request")
287
+ api_output = gr.Textbox(
288
+ label="API Response (JSON)",
289
+ lines=10
290
+ )
291
+
292
+ api_btn.click(
293
+ fn=process_api_request,
294
+ inputs=[api_input],
295
+ outputs=[api_output]
296
+ )
297
+
298
+ with gr.Tab("ℹ️ About"):
299
+ gr.Markdown("""
300
+ ### 🎯 Purpose
301
+ This service extracts text from medical documents using PaddleOCR, specifically designed for lab reports and medical forms.
302
+
303
+ ### 🔧 Integration
304
+ This Hugging Face Space can be integrated with your Vercel app as an external OCR service.
305
+
306
+ ### 📚 Supported Formats
307
+ - PDF documents (multi-page)
308
+ - JPEG/JPG images
309
+ - PNG images
310
+
311
+ ### 🚀 Features
312
+ - High accuracy OCR with PaddleOCR
313
+ - Subprocess execution (mirrors your local Node.js implementation)
314
+ - Medical document optimization
315
+ - Multi-page PDF support
316
+ - RESTful API integration
317
+ - Free hosting on Hugging Face
318
+
319
+ ### 🔗 Integration URL
320
+ `https://mbuck17-paddleocr-processor.hf.space/api/predict`
321
+
322
+ ### ⚙️ Architecture
323
+ This implementation uses subprocess execution just like your local Node.js version,
324
+ ensuring maximum compatibility with PaddleOCR's PDF processing capabilities.
325
+ """)
326
+
327
+ # Launch the app
328
+ if __name__ == "__main__":
329
+ demo.launch(server_name="0.0.0.0", server_port=7860)