mbuckle committed on
Commit
cced363
·
1 Parent(s): 3533982

Fixed version with PDF to image conversion

Browse files
Files changed (1) hide show
  1. app.py +169 -52
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py - Correct structure with monkey patch BEFORE any fitz imports
2
 
3
  import os
4
  import subprocess
@@ -159,13 +159,60 @@ except Exception as e:
159
  if test_doc:
160
  test_doc.close()
161
 
162
- # Rest of your app code (process_document, API functions, Gradio interface, etc.)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  def process_document(file):
164
  """Process uploaded document with PaddleOCR"""
165
  if file is None:
166
  return "No file uploaded", "", ""
167
 
168
  start_time = time.time()
 
169
 
170
  try:
171
  filename = os.path.basename(file.name)
@@ -174,56 +221,66 @@ def process_document(file):
174
  file_path = file.name
175
  print(f"File path: {file_path}")
176
 
177
- # Count pages if PDF
 
178
  total_pages = 1
179
- if filename.lower().endswith('.pdf'):
180
- try:
181
- print(f"Opening PDF: {file_path}")
182
- doc = fitz.open(file_path)
183
-
184
- # Test pageCount attribute
185
- print(f"Document has pageCount attribute: {hasattr(doc, 'pageCount')}")
186
- print(f"Document has page_count attribute: {hasattr(doc, 'page_count')}")
187
-
188
- if hasattr(doc, 'pageCount'):
189
- total_pages = doc.pageCount
190
- print(f"Used pageCount: {total_pages}")
191
- elif hasattr(doc, 'page_count'):
192
- total_pages = doc.page_count
193
- print(f"Used page_count: {total_pages}")
194
- else:
195
- total_pages = len(doc)
196
- print(f"Used len(): {total_pages}")
197
-
198
- doc.close()
199
- except Exception as e:
200
- print(f"PDF page counting error: {e}")
201
- total_pages = 1
202
 
203
- # Run OCR
204
- print(f"Running OCR on: {file_path}")
205
- result = ocr.ocr(file_path, cls=True)
 
 
 
 
 
 
 
 
206
 
207
- # Extract text
208
  extracted_text = ""
209
  pages_processed = 0
210
 
211
- if result:
212
- for page_idx, page_result in enumerate(result):
213
- if page_result:
 
 
 
 
 
214
  pages_processed += 1
215
- for line in page_result:
216
- if len(line) >= 2 and line[1][1] > 0.5:
217
- extracted_text += line[1][0] + "\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
  processing_time = time.time() - start_time
220
 
 
 
 
 
221
  summary = f"""
222
  πŸ“„ **File**: {filename}
223
  πŸ“Š **Pages Processed**: {pages_processed}/{total_pages}
224
  ⏱️ **Processing Time**: {processing_time:.2f} seconds
225
  πŸ“ **Text Length**: {len(extracted_text)} characters
226
  πŸ”§ **OCR Engine**: PaddleOCR
 
227
  """
228
 
229
  api_response = json.dumps({
@@ -233,13 +290,18 @@ def process_document(file):
233
  "pages_processed": pages_processed,
234
  "total_pages": total_pages,
235
  "processing_time": processing_time,
236
- "ocr_engine": "PaddleOCR"
 
237
  }, indent=2)
238
 
239
  return summary, extracted_text, api_response
240
 
241
  except Exception as e:
242
- error_msg = f"Error processing file: {str(e)}"
 
 
 
 
243
  print(f"Full error: {e}")
244
  import traceback
245
  traceback.print_exc()
@@ -247,6 +309,8 @@ def process_document(file):
247
 
248
  def process_api_request(api_data):
249
  """Process API-style requests (for integration with your Vercel app)"""
 
 
250
  try:
251
  data = json.loads(api_data)
252
 
@@ -262,29 +326,73 @@ def process_api_request(api_data):
262
  tmp_file.write(file_data)
263
  tmp_file_path = tmp_file.name
264
 
 
 
265
  try:
266
- # Run OCR
267
- result = ocr.ocr(tmp_file_path, cls=True)
 
 
 
 
 
 
 
 
 
 
268
 
269
- # Extract text
270
- text = ""
271
- for page_result in result:
272
- if page_result:
273
- for line in page_result:
274
- if len(line) >= 2:
275
- text += line[1][0] + "\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
 
277
  return json.dumps({
278
  "success": True,
279
- "text": text,
280
  "filename": filename,
281
- "ocr_engine": "PaddleOCR"
 
 
 
282
  })
283
 
284
  finally:
285
- os.unlink(tmp_file_path)
 
 
 
 
 
 
286
 
287
  except Exception as e:
 
 
 
 
 
 
 
288
  return json.dumps({"success": False, "error": str(e)})
289
 
290
  # Create Gradio interface with multiple tabs
@@ -346,7 +454,10 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
346
  "success": true,
347
  "text": "Extracted text content...",
348
  "filename": "lab_report.pdf",
349
- "ocr_engine": "PaddleOCR"
 
 
 
350
  }
351
  ]
352
  }
@@ -379,12 +490,13 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
379
  This Hugging Face Space can be integrated with your Vercel app as an external OCR service.
380
 
381
  ### πŸ“š Supported Formats
382
- - PDF documents (multi-page)
383
  - JPEG/JPG images
384
  - PNG images
385
 
386
  ### πŸš€ Features
387
  - High accuracy OCR with PaddleOCR
 
388
  - Medical document optimization
389
  - Multi-page PDF support
390
  - RESTful API integration
@@ -393,6 +505,11 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
393
 
394
  ### πŸ”— Integration URL
395
  `https://mbuck17-paddleocr-processor.hf.space/api/predict`
 
 
 
 
 
396
  """)
397
 
398
  # Launch the app
 
1
+ # app.py - Fixed version with PDF to image conversion for PaddleOCR
2
 
3
  import os
4
  import subprocess
 
159
  if test_doc:
160
  test_doc.close()
161
 
162
def pdf_to_images(pdf_path, dpi=200):
    """Render every page of a PDF to a temporary PNG file for OCR.

    Args:
        pdf_path: Path of the PDF file to open with ``fitz``.
        dpi: Target render resolution. Pages are scaled by ``dpi / 72``.

    Returns:
        A list of PNG file paths, one per page, in page order. The caller
        owns these files and must delete them when done. An empty list is
        returned if the conversion fails for any reason; images written
        before the failure are removed so nothing is leaked.
    """
    import tempfile  # function-scope import: only this helper needs it

    image_paths = []
    doc = None
    try:
        doc = fitz.open(pdf_path)

        # The matrix is identical for every page, so build it once.
        mat = fitz.Matrix(dpi / 72, dpi / 72)

        for page_num in range(len(doc)):
            page = doc[page_num]

            # Accept either spelling of the render method, as the original
            # code did, preferring the snake_case name when both exist.
            render = getattr(page, "get_pixmap", None) or page.getPixmap
            pix = render(matrix=mat)
            img_data = pix.tobytes("png")

            # mkstemp guarantees a unique name, so concurrent conversions
            # (or two requests in the same second) cannot overwrite each
            # other's page images.
            fd, temp_img_path = tempfile.mkstemp(
                prefix=f"page_{page_num}_", suffix=".png"
            )
            # Record the path before writing so a failed write is cleaned up.
            image_paths.append(temp_img_path)
            with os.fdopen(fd, "wb") as f:
                f.write(img_data)

            print(f"✓ Converted page {page_num + 1} to image: {temp_img_path}")

        return image_paths

    except Exception as e:
        print(f"Error converting PDF to images: {e}")
        # The caller only sees [], so remove partial output here.
        for leftover in image_paths:
            try:
                os.unlink(leftover)
            except OSError:
                pass
        return []

    finally:
        # Close the document on every path, including failures mid-render.
        if doc is not None:
            try:
                doc.close()
            except Exception as close_error:
                print(f"Warning: could not close PDF {pdf_path}: {close_error}")
198
+
199
def cleanup_temp_files(file_paths):
    """Delete temporary image files on a best-effort basis.

    Args:
        file_paths: Iterable of file paths to remove. ``None`` or an empty
            iterable is accepted and treated as nothing to do.

    Returns:
        None. Failures are reported with a printed warning and never raised,
        so cleanup cannot mask the result of the OCR run that preceded it.
    """
    for file_path in file_paths or ():
        try:
            # Attempt the delete directly instead of checking existence
            # first: the file may vanish between the check and the unlink.
            os.unlink(file_path)
        except FileNotFoundError:
            # Already gone: nothing to clean up and nothing to report.
            continue
        except (OSError, TypeError, ValueError) as e:
            print(f"Warning: Could not clean up {file_path}: {e}")
        else:
            print(f"✓ Cleaned up: {file_path}")
208
+
209
  def process_document(file):
210
  """Process uploaded document with PaddleOCR"""
211
  if file is None:
212
  return "No file uploaded", "", ""
213
 
214
  start_time = time.time()
215
+ image_paths = []
216
 
217
  try:
218
  filename = os.path.basename(file.name)
 
221
  file_path = file.name
222
  print(f"File path: {file_path}")
223
 
224
+ # Check if it's a PDF or image
225
+ is_pdf = filename.lower().endswith('.pdf')
226
  total_pages = 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
 
228
+ if is_pdf:
229
+ # Convert PDF to images
230
+ print("Converting PDF to images for OCR processing...")
231
+ image_paths = pdf_to_images(file_path)
232
+ total_pages = len(image_paths)
233
+
234
+ if not image_paths:
235
+ return "❌ Failed to convert PDF to images", "", json.dumps({"success": False, "error": "PDF conversion failed"})
236
+ else:
237
+ # For image files, use directly
238
+ image_paths = [file_path]
239
 
240
+ # Process each image with OCR
241
  extracted_text = ""
242
  pages_processed = 0
243
 
244
+ for i, img_path in enumerate(image_paths):
245
+ try:
246
+ print(f"Running OCR on page {i + 1}/{len(image_paths)}: {img_path}")
247
+
248
+ # Run OCR on the image
249
+ result = ocr.ocr(img_path, cls=True)
250
+
251
+ if result and result[0]: # result is a list of pages, we have one page per image
252
  pages_processed += 1
253
+ page_text = ""
254
+
255
+ for line in result[0]:
256
+ if len(line) >= 2 and line[1][1] > 0.5: # confidence threshold
257
+ page_text += line[1][0] + "\n"
258
+
259
+ if page_text.strip():
260
+ extracted_text += f"\n--- Page {i + 1} ---\n"
261
+ extracted_text += page_text
262
+
263
+ print(f"βœ“ Page {i + 1} processed successfully")
264
+ else:
265
+ print(f"⚠️ No text found on page {i + 1}")
266
+
267
+ except Exception as page_error:
268
+ print(f"❌ Error processing page {i + 1}: {page_error}")
269
+ continue
270
 
271
  processing_time = time.time() - start_time
272
 
273
+ # Clean up temporary files
274
+ if is_pdf:
275
+ cleanup_temp_files(image_paths)
276
+
277
  summary = f"""
278
  πŸ“„ **File**: {filename}
279
  πŸ“Š **Pages Processed**: {pages_processed}/{total_pages}
280
  ⏱️ **Processing Time**: {processing_time:.2f} seconds
281
  πŸ“ **Text Length**: {len(extracted_text)} characters
282
  πŸ”§ **OCR Engine**: PaddleOCR
283
+ πŸ–ΌοΈ **Method**: {"PDF β†’ Images β†’ OCR" if is_pdf else "Direct Image OCR"}
284
  """
285
 
286
  api_response = json.dumps({
 
290
  "pages_processed": pages_processed,
291
  "total_pages": total_pages,
292
  "processing_time": processing_time,
293
+ "ocr_engine": "PaddleOCR",
294
+ "method": "pdf_to_images" if is_pdf else "direct_image"
295
  }, indent=2)
296
 
297
  return summary, extracted_text, api_response
298
 
299
  except Exception as e:
300
+ # Clean up on error
301
+ if image_paths:
302
+ cleanup_temp_files(image_paths)
303
+
304
+ error_msg = f"❌ Error processing file: {str(e)}"
305
  print(f"Full error: {e}")
306
  import traceback
307
  traceback.print_exc()
 
309
 
310
  def process_api_request(api_data):
311
  """Process API-style requests (for integration with your Vercel app)"""
312
+ temp_files = []
313
+
314
  try:
315
  data = json.loads(api_data)
316
 
 
326
  tmp_file.write(file_data)
327
  tmp_file_path = tmp_file.name
328
 
329
+ temp_files.append(tmp_file_path)
330
+
331
  try:
332
+ # Check if it's a PDF
333
+ is_pdf = filename.lower().endswith('.pdf')
334
+
335
+ if is_pdf:
336
+ # Convert PDF to images
337
+ image_paths = pdf_to_images(tmp_file_path)
338
+ temp_files.extend(image_paths)
339
+
340
+ if not image_paths:
341
+ return json.dumps({"success": False, "error": "Failed to convert PDF to images"})
342
+ else:
343
+ image_paths = [tmp_file_path]
344
 
345
+ # Process each image with OCR
346
+ extracted_text = ""
347
+ pages_processed = 0
348
+
349
+ for i, img_path in enumerate(image_paths):
350
+ try:
351
+ result = ocr.ocr(img_path, cls=True)
352
+
353
+ if result and result[0]:
354
+ pages_processed += 1
355
+ page_text = ""
356
+
357
+ for line in result[0]:
358
+ if len(line) >= 2:
359
+ page_text += line[1][0] + "\n"
360
+
361
+ if page_text.strip():
362
+ extracted_text += f"\n--- Page {i + 1} ---\n"
363
+ extracted_text += page_text
364
+
365
+ except Exception as page_error:
366
+ print(f"Error processing page {i + 1}: {page_error}")
367
+ continue
368
 
369
  return json.dumps({
370
  "success": True,
371
+ "text": extracted_text,
372
  "filename": filename,
373
+ "pages_processed": pages_processed,
374
+ "total_pages": len(image_paths),
375
+ "ocr_engine": "PaddleOCR",
376
+ "method": "pdf_to_images" if is_pdf else "direct_image"
377
  })
378
 
379
  finally:
380
+ # Clean up all temp files
381
+ for temp_file in temp_files:
382
+ try:
383
+ if os.path.exists(temp_file):
384
+ os.unlink(temp_file)
385
+ except Exception as cleanup_error:
386
+ print(f"Cleanup error: {cleanup_error}")
387
 
388
  except Exception as e:
389
+ # Clean up on error
390
+ for temp_file in temp_files:
391
+ try:
392
+ if os.path.exists(temp_file):
393
+ os.unlink(temp_file)
394
+ except:
395
+ pass
396
  return json.dumps({"success": False, "error": str(e)})
397
 
398
  # Create Gradio interface with multiple tabs
 
454
  "success": true,
455
  "text": "Extracted text content...",
456
  "filename": "lab_report.pdf",
457
+ "pages_processed": 2,
458
+ "total_pages": 2,
459
+ "ocr_engine": "PaddleOCR",
460
+ "method": "pdf_to_images"
461
  }
462
  ]
463
  }
 
490
  This Hugging Face Space can be integrated with your Vercel app as an external OCR service.
491
 
492
  ### πŸ“š Supported Formats
493
+ - PDF documents (multi-page) - converted to images for processing
494
  - JPEG/JPG images
495
  - PNG images
496
 
497
  ### πŸš€ Features
498
  - High accuracy OCR with PaddleOCR
499
+ - Automatic PDF to image conversion
500
  - Medical document optimization
501
  - Multi-page PDF support
502
  - RESTful API integration
 
505
 
506
  ### πŸ”— Integration URL
507
  `https://mbuck17-paddleocr-processor.hf.space/api/predict`
508
+
509
+ ### πŸ“‹ Processing Method
510
+ - **PDFs**: Converted to high-resolution images (200 DPI) then processed with OCR
511
+ - **Images**: Processed directly with OCR
512
+ - **Multi-page**: Each page processed separately and results combined
513
  """)
514
 
515
  # Launch the app