Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

mbuckle committed on Jun 2, 2025

Commit

8c10e64

1 Parent(s): b313953

Initial PaddleOCR setup

Browse files

Files changed (3) hide show

README.md +22 -7
app.py +195 -0
requirements.txt +5 -0

README.md CHANGED Viewed

@@ -1,14 +1,29 @@
 ---
-title: Paddleocr Processor
-emoji: ⚡
-colorFrom: yellow
-colorTo: pink
 sdk: gradio
-sdk_version: 5.32.0
 app_file: app.py
 pinned: false
 license: mit
-short_description: OCR processor for health tracker app
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+"""
 ---
+title: PaddleOCR Medical Document Processor
+emoji: 🏥
+colorFrom: blue
+colorTo: green
 sdk: gradio
+sdk_version: 4.8.0
 app_file: app.py
 pinned: false
 license: mit
 ---
+# PaddleOCR Medical Document Processor
+This Hugging Face Space provides OCR processing for medical documents using PaddleOCR.
+## Features
+- Extract text from PDFs and images
+- Optimized for medical/lab documents
+- RESTful API for integration
+- Multi-page PDF support
+## API Usage
+Send POST requests to the `/api/predict` endpoint with JSON data containing base64-encoded files.
+## Integration
+This space can be integrated with external applications as an OCR microservice.
+"""

app.py ADDED Viewed

	@@ -0,0 +1,195 @@

+# app.py - Hugging Face Spaces version
+import gradio as gr
+import tempfile
+import os
+import time
+import base64
+import json
+from paddleocr import PaddleOCR
+import fitz  # PyMuPDF
+# Initialize PaddleOCR
+# The engine is built once at import time; both request handlers below reuse
+# this single module-level instance instead of constructing one per request.
+print("Loading PaddleOCR models...")
+# NOTE(review): the arguments presumably select English models, enable the
+# text-angle classifier and silence library logging - confirm against the
+# PaddleOCR documentation for the pinned version.
+ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
+print("PaddleOCR models loaded!")
+def process_document(file):
+    """Process uploaded document with PaddleOCR"""
+    if file is None:
+        return "No file uploaded", "", ""
+    start_time = time.time()
+    try:
+        # Get file info
+        filename = os.path.basename(file.name)
+        print(f"Processing: {filename}")
+        # Count pages if PDF
+        total_pages = 1
+        if filename.lower().endswith('.pdf'):
+            try:
+                doc = fitz.open(file.name)
+                total_pages = len(doc)
+                doc.close()
+            except Exception as e:
+                print(f"Could not count PDF pages: {e}")
+        # Run OCR
+        result = ocr.ocr(file.name, cls=True)
+        # Extract text
+        extracted_text = ""
+        pages_processed = 0
+        for page_idx, page_result in enumerate(result):
+            if page_result:
+                pages_processed += 1
+                for line in page_result:
+                    if len(line) >= 2 and line[1][1] > 0.5:  # Confidence > 50%
+                        extracted_text += line[1][0] + "\n"
+        processing_time = time.time() - start_time
+        # Create summary
+        summary = f"""
+📄 **File**: {filename}
+📊 **Pages Processed**: {pages_processed}/{total_pages}
+⏱️ **Processing Time**: {processing_time:.2f} seconds
+📝 **Text Length**: {len(extracted_text)} characters
+        """
+        # For API compatibility, also return JSON format
+        api_response = json.dumps({
+            "success": True,
+            "text": extracted_text,
+            "filename": filename,
+            "pages_processed": pages_processed,
+            "total_pages": total_pages,
+            "processing_time": processing_time
+        }, indent=2)
+        return summary, extracted_text, api_response
+    except Exception as e:
+        error_msg = f"Error processing file: {str(e)}"
+        return error_msg, "", json.dumps({"success": False, "error": str(e)})
+def process_api_request(api_data):
+    """Process API-style requests (for integration with your Vercel app)"""
+    try:
+        data = json.loads(api_data)
+        if 'file' not in data:
+            return json.dumps({"success": False, "error": "No file data provided"})
+        # Decode base64 file
+        file_data = base64.b64decode(data['file'])
+        filename = data.get('filename', 'unknown.pdf')
+        # Save to temp file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as tmp_file:
+            tmp_file.write(file_data)
+            tmp_file_path = tmp_file.name
+        try:
+            # Run OCR
+            result = ocr.ocr(tmp_file_path, cls=True)
+            # Extract text
+            text = ""
+            for page_result in result:
+                if page_result:
+                    for line in page_result:
+                        if len(line) >= 2:
+                            text += line[1][0] + "\n"
+            return json.dumps({
+                "success": True,
+                "text": text,
+                "filename": filename
+            })
+        finally:
+            os.unlink(tmp_file_path)
+    except Exception as e:
+        return json.dumps({"success": False, "error": str(e)})
+# Create Gradio interface with multiple tabs
+with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
+    gr.Markdown("# 🏥 PaddleOCR Medical Document Processor")
+    gr.Markdown("Upload medical documents (PDF/images) to extract text using PaddleOCR")
+    with gr.Tab("📄 File Upload"):
+        with gr.Row():
+            with gr.Column():
+                file_input = gr.File(
+                    label="Upload Document (PDF, JPG, PNG)",
+                    file_types=[".pdf", ".jpg", ".jpeg", ".png"]
+                )
+                process_btn = gr.Button("🔍 Process Document", variant="primary")
+            with gr.Column():
+                summary_output = gr.Markdown(label="📊 Processing Summary")
+        with gr.Row():
+            text_output = gr.Textbox(
+                label="📝 Extracted Text",
+                lines=15,
+                max_lines=20
+            )
+        process_btn.click(
+            fn=process_document,
+            inputs=[file_input],
+            outputs=[summary_output, text_output, gr.Textbox(visible=False)]
+        )
+    with gr.Tab("🔌 API Integration"):
+        gr.Markdown("### For integration with your Vercel app:")
+        gr.Markdown("**Endpoint**: `https://your-space-name-your-username.hf.space/api/predict`")
+        gr.Markdown("**Method**: POST")
+        gr.Markdown("**Headers**: `Content-Type: application/json`")
+        api_input = gr.Textbox(
+            label="API Request (JSON)",
+            placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
+            lines=5
+        )
+        api_btn = gr.Button("🧪 Test API Request")
+        api_output = gr.Textbox(
+            label="API Response (JSON)",
+            lines=10
+        )
+        api_btn.click(
+            fn=process_api_request,
+            inputs=[api_input],
+            outputs=[api_output]
+        )
+    with gr.Tab("ℹ️ About"):
+        gr.Markdown("""
+        ### 🎯 Purpose
+        This service extracts text from medical documents using PaddleOCR, specifically designed for lab reports and medical forms.
+        ### 🔧 Integration
+        This Hugging Face Space can be integrated with your Vercel app as an external OCR service.
+        ### 📚 Supported Formats
+        - PDF documents (multi-page)
+        - JPEG/JPG images
+        - PNG images
+        ### 🚀 Features
+        - High accuracy OCR with PaddleOCR
+        - Medical document optimization
+        - Multi-page PDF support
+        - RESTful API integration
+        - Free hosting on Hugging Face
+        """)
+# Launch the app
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio==4.8.0
+paddlepaddle==2.5.1
+paddleocr==2.6.1.3
+PyMuPDF==1.23.0
+Pillow==10.0.0