Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

mbuckle committed on Jun 3, 2025

Commit

cced363

1 Parent(s): 3533982

Fixed version with PDF to image conversion

Browse files

Files changed (1) hide show

app.py +169 -52

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py - Correct structure with monkey patch BEFORE any fitz imports
 import os
 import subprocess
@@ -159,13 +159,60 @@ except Exception as e:
     if test_doc:
         test_doc.close()
-# Rest of your app code (process_document, API functions, Gradio interface, etc.)
 def process_document(file):
     """Process uploaded document with PaddleOCR"""
     if file is None:
         return "No file uploaded", "", ""
     start_time = time.time()
     try:
         filename = os.path.basename(file.name)
@@ -174,56 +221,66 @@ def process_document(file):
         file_path = file.name
         print(f"File path: {file_path}")
-        # Count pages if PDF
         total_pages = 1
-        if filename.lower().endswith('.pdf'):
-            try:
-                print(f"Opening PDF: {file_path}")
-                doc = fitz.open(file_path)
-                # Test pageCount attribute
-                print(f"Document has pageCount attribute: {hasattr(doc, 'pageCount')}")
-                print(f"Document has page_count attribute: {hasattr(doc, 'page_count')}")
-                if hasattr(doc, 'pageCount'):
-                    total_pages = doc.pageCount
-                    print(f"Used pageCount: {total_pages}")
-                elif hasattr(doc, 'page_count'):
-                    total_pages = doc.page_count
-                    print(f"Used page_count: {total_pages}")
-                else:
-                    total_pages = len(doc)
-                    print(f"Used len(): {total_pages}")
-                doc.close()
-            except Exception as e:
-                print(f"PDF page counting error: {e}")
-                total_pages = 1
-        # Run OCR
-        print(f"Running OCR on: {file_path}")
-        result = ocr.ocr(file_path, cls=True)
-        # Extract text
         extracted_text = ""
         pages_processed = 0
-        if result:
-            for page_idx, page_result in enumerate(result):
-                if page_result:
                     pages_processed += 1
-                    for line in page_result:
-                        if len(line) >= 2 and line[1][1] > 0.5:
-                            extracted_text += line[1][0] + "\n"
         processing_time = time.time() - start_time
         summary = f"""
 📄 **File**: {filename}
 📊 **Pages Processed**: {pages_processed}/{total_pages}
 ⏱️ **Processing Time**: {processing_time:.2f} seconds
 📝 **Text Length**: {len(extracted_text)} characters
 🔧 **OCR Engine**: PaddleOCR
         """
         api_response = json.dumps({
@@ -233,13 +290,18 @@ def process_document(file):
             "pages_processed": pages_processed,
             "total_pages": total_pages,
             "processing_time": processing_time,
-            "ocr_engine": "PaddleOCR"
         }, indent=2)
         return summary, extracted_text, api_response
     except Exception as e:
-        error_msg = f"Error processing file: {str(e)}"
         print(f"Full error: {e}")
         import traceback
         traceback.print_exc()
@@ -247,6 +309,8 @@ def process_document(file):
 def process_api_request(api_data):
     """Process API-style requests (for integration with your Vercel app)"""
     try:
         data = json.loads(api_data)
@@ -262,29 +326,73 @@ def process_api_request(api_data):
             tmp_file.write(file_data)
             tmp_file_path = tmp_file.name
         try:
-            # Run OCR
-            result = ocr.ocr(tmp_file_path, cls=True)
-            # Extract text
-            text = ""
-            for page_result in result:
-                if page_result:
-                    for line in page_result:
-                        if len(line) >= 2:
-                            text += line[1][0] + "\n"
             return json.dumps({
                 "success": True,
-                "text": text,
                 "filename": filename,
-                "ocr_engine": "PaddleOCR"
             })
         finally:
-            os.unlink(tmp_file_path)
     except Exception as e:
         return json.dumps({"success": False, "error": str(e)})
 # Create Gradio interface with multiple tabs
@@ -346,7 +454,10 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
       "success": true,
       "text": "Extracted text content...",
       "filename": "lab_report.pdf",
-      "ocr_engine": "PaddleOCR"
     }
   ]
 }
@@ -379,12 +490,13 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
         This Hugging Face Space can be integrated with your Vercel app as an external OCR service.
         ### 📚 Supported Formats
-        - PDF documents (multi-page)
         - JPEG/JPG images
         - PNG images
         ### 🚀 Features
         - High accuracy OCR with PaddleOCR
         - Medical document optimization
         - Multi-page PDF support
         - RESTful API integration
@@ -393,6 +505,11 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
         ### 🔗 Integration URL
         `https://mbuck17-paddleocr-processor.hf.space/api/predict`
         """)
 # Launch the app

+# app.py - Fixed version with PDF to image conversion for PaddleOCR
 import os
 import subprocess
     if test_doc:
         test_doc.close()
+def pdf_to_images(pdf_path, dpi=200):
+    """Render every page of a PDF to a temporary PNG file for OCR.
+
+    Args:
+        pdf_path: Path of the PDF document to open with ``fitz``.
+        dpi: Rendering resolution; each page is scaled by ``dpi / 72``.
+
+    Returns:
+        A list of PNG file paths, one per page, in page order. If anything
+        fails, the error is printed, every PNG written so far is removed and
+        an empty list is returned, so the caller has nothing to clean up.
+    """
+    import tempfile  # local import keeps this function self-contained
+    image_paths = []
+    doc = None
+    try:
+        doc = fitz.open(pdf_path)
+        # The zoom factor is the same for every page, so build it once.
+        mat = fitz.Matrix(dpi / 72, dpi / 72)
+        for page_num in range(len(doc)):
+            page = doc[page_num]
+            # Support both the camelCase and snake_case pixmap method names.
+            if hasattr(page, 'getPixmap'):
+                pix = page.getPixmap(matrix=mat)
+            else:
+                pix = page.get_pixmap(matrix=mat)
+            # Encode the rendered page as PNG bytes.
+            img_data = pix.tobytes("png")
+            # mkstemp guarantees a unique name, so two requests handled in
+            # the same second can no longer overwrite each other's pages.
+            fd, temp_img_path = tempfile.mkstemp(prefix=f"page_{page_num}_", suffix=".png")
+            # Track the path before writing so a failed write is cleaned up too.
+            image_paths.append(temp_img_path)
+            with os.fdopen(fd, "wb") as f:
+                f.write(img_data)
+            print(f"✓ Converted page {page_num + 1} to image: {temp_img_path}")
+        return image_paths
+    except Exception as e:
+        print(f"Error converting PDF to images: {e}")
+        # Do not leak the pages rendered before the failure.
+        cleanup_temp_files(image_paths)
+        return []
+    finally:
+        # Close the document on both the success and the error path.
+        if doc is not None:
+            doc.close()
+def cleanup_temp_files(file_paths):
+    """Delete the given temporary files, best effort.
+
+    Paths that do not exist are skipped silently. A failure to delete one
+    file is reported with a printed warning and does not stop the others
+    from being processed.
+    """
+    for file_path in file_paths:
+        try:
+            # Nothing to do for a path that is already gone.
+            if not os.path.exists(file_path):
+                continue
+            os.unlink(file_path)
+            print(f"✓ Cleaned up: {file_path}")
+        except Exception as e:
+            print(f"Warning: Could not clean up {file_path}: {e}")
 def process_document(file):
     """Process uploaded document with PaddleOCR"""
     if file is None:
         return "No file uploaded", "", ""
     start_time = time.time()
+    image_paths = []
     try:
         filename = os.path.basename(file.name)
         file_path = file.name
         print(f"File path: {file_path}")
+        # Check if it's a PDF or image
+        is_pdf = filename.lower().endswith('.pdf')
         total_pages = 1
+        if is_pdf:
+            # Convert PDF to images
+            print("Converting PDF to images for OCR processing...")
+            image_paths = pdf_to_images(file_path)
+            total_pages = len(image_paths)
+            if not image_paths:
+                return "❌ Failed to convert PDF to images", "", json.dumps({"success": False, "error": "PDF conversion failed"})
+        else:
+            # For image files, use directly
+            image_paths = [file_path]
+        # Process each image with OCR
         extracted_text = ""
         pages_processed = 0
+        for i, img_path in enumerate(image_paths):
+            try:
+                print(f"Running OCR on page {i + 1}/{len(image_paths)}: {img_path}")
+                # Run OCR on the image
+                result = ocr.ocr(img_path, cls=True)
+                if result and result[0]:  # result is a list of pages, we have one page per image
                     pages_processed += 1
+                    page_text = ""
+                    for line in result[0]:
+                        if len(line) >= 2 and line[1][1] > 0.5:  # confidence threshold
+                            page_text += line[1][0] + "\n"
+                    if page_text.strip():
+                        extracted_text += f"\n--- Page {i + 1} ---\n"
+                        extracted_text += page_text
+                    print(f"✓ Page {i + 1} processed successfully")
+                else:
+                    print(f"⚠️ No text found on page {i + 1}")
+            except Exception as page_error:
+                print(f"❌ Error processing page {i + 1}: {page_error}")
+                continue
         processing_time = time.time() - start_time
+        # Clean up temporary files
+        if is_pdf:
+            cleanup_temp_files(image_paths)
         summary = f"""
 📄 **File**: {filename}
 📊 **Pages Processed**: {pages_processed}/{total_pages}
 ⏱️ **Processing Time**: {processing_time:.2f} seconds
 📝 **Text Length**: {len(extracted_text)} characters
 🔧 **OCR Engine**: PaddleOCR
+🖼️ **Method**: {"PDF → Images → OCR" if is_pdf else "Direct Image OCR"}
         """
         api_response = json.dumps({
             "pages_processed": pages_processed,
             "total_pages": total_pages,
             "processing_time": processing_time,
+            "ocr_engine": "PaddleOCR",
+            "method": "pdf_to_images" if is_pdf else "direct_image"
         }, indent=2)
         return summary, extracted_text, api_response
     except Exception as e:
+        # Clean up on error
+        if image_paths:
+            cleanup_temp_files(image_paths)
+        error_msg = f"❌ Error processing file: {str(e)}"
         print(f"Full error: {e}")
         import traceback
         traceback.print_exc()
 def process_api_request(api_data):
     """Process API-style requests (for integration with your Vercel app)"""
+    temp_files = []
     try:
         data = json.loads(api_data)
             tmp_file.write(file_data)
             tmp_file_path = tmp_file.name
+        temp_files.append(tmp_file_path)
         try:
+            # Check if it's a PDF
+            is_pdf = filename.lower().endswith('.pdf')
+            if is_pdf:
+                # Convert PDF to images
+                image_paths = pdf_to_images(tmp_file_path)
+                temp_files.extend(image_paths)
+                if not image_paths:
+                    return json.dumps({"success": False, "error": "Failed to convert PDF to images"})
+            else:
+                image_paths = [tmp_file_path]
+            # Process each image with OCR
+            extracted_text = ""
+            pages_processed = 0
+            for i, img_path in enumerate(image_paths):
+                try:
+                    result = ocr.ocr(img_path, cls=True)
+                    if result and result[0]:
+                        pages_processed += 1
+                        page_text = ""
+                        for line in result[0]:
+                            if len(line) >= 2:
+                                page_text += line[1][0] + "\n"
+                        if page_text.strip():
+                            extracted_text += f"\n--- Page {i + 1} ---\n"
+                            extracted_text += page_text
+                except Exception as page_error:
+                    print(f"Error processing page {i + 1}: {page_error}")
+                    continue
             return json.dumps({
                 "success": True,
+                "text": extracted_text,
                 "filename": filename,
+                "pages_processed": pages_processed,
+                "total_pages": len(image_paths),
+                "ocr_engine": "PaddleOCR",
+                "method": "pdf_to_images" if is_pdf else "direct_image"
             })
         finally:
+            # Clean up all temp files
+            for temp_file in temp_files:
+                try:
+                    if os.path.exists(temp_file):
+                        os.unlink(temp_file)
+                except Exception as cleanup_error:
+                    print(f"Cleanup error: {cleanup_error}")
     except Exception as e:
+        # Clean up on error
+        for temp_file in temp_files:
+            try:
+                if os.path.exists(temp_file):
+                    os.unlink(temp_file)
+            except:
+                pass
         return json.dumps({"success": False, "error": str(e)})
 # Create Gradio interface with multiple tabs
       "success": true,
       "text": "Extracted text content...",
       "filename": "lab_report.pdf",
+      "pages_processed": 2,
+      "total_pages": 2,
+      "ocr_engine": "PaddleOCR",
+      "method": "pdf_to_images"
     }
   ]
 }
         This Hugging Face Space can be integrated with your Vercel app as an external OCR service.
         ### 📚 Supported Formats
+        - PDF documents (multi-page) - converted to images for processing
         - JPEG/JPG images
         - PNG images
         ### 🚀 Features
         - High accuracy OCR with PaddleOCR
+        - Automatic PDF to image conversion
         - Medical document optimization
         - Multi-page PDF support
         - RESTful API integration
         ### 🔗 Integration URL
         `https://mbuck17-paddleocr-processor.hf.space/api/predict`
+        ### 📋 Processing Method
+        - **PDFs**: Converted to high-resolution images (200 DPI) then processed with OCR
+        - **Images**: Processed directly with OCR
+        - **Multi-page**: Each page processed separately and results combined
         """)
 # Launch the app