Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

mbuckle committed on Jun 3, 2025

Commit

3ca6417

1 Parent(s): 503febe

Enhanced paddle test version 2

Browse files

Files changed (2) hide show

app.py +14 -17
enhanced_paddle_test.py +95 -83

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ def test_ocr_minimal(file):
         return "No file uploaded", ""
     try:
-        # Run the enhanced test script
         script_path = "/home/user/app/enhanced_paddle_test.py"
         command = [sys.executable, script_path, file.name]
@@ -18,7 +18,7 @@ def test_ocr_minimal(file):
             command,
             capture_output=True,
             text=True,
-            timeout=120
         )
         print(f"Return code: {process.returncode}")
@@ -29,21 +29,18 @@ def test_ocr_minimal(file):
             try:
                 result = json.loads(process.stdout.strip())
-                # Format the enhanced results
                 summary = f"""
-**Results Summary:**
-- **Best Approach:** {result.get('best_approach', 'Unknown')}
-- **Best Detections:** {result.get('detections', 0)}
 - **Text Length:** {len(result.get('text', ''))}
-**All Approaches:**
 """
-                # Add results for each approach
-                all_results = result.get('all_results', {})
-                for approach_name, approach_data in all_results.items():
-                    summary += f"\n- **{approach_name}:** {approach_data.get('detections', 0)} detections"
                 return summary, result.get('text', '')
             except json.JSONDecodeError:
                 return f"JSON parse error. Stdout: {process.stdout}", ""
@@ -54,19 +51,19 @@ def test_ocr_minimal(file):
         return f"Error: {e}", ""
 # Simple Gradio interface for testing
-with gr.Blocks(title="Enhanced OCR Test") as demo:
-    gr.Markdown("# Enhanced OCR Test - Multiple Approaches")
-    gr.Markdown("This will test different DPI settings and OCR configurations to find the best quality match for your local implementation.")
     with gr.Row():
         file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-        test_btn = gr.Button("Test Multiple OCR Approaches")
     with gr.Row():
         summary_output = gr.Markdown(label="Results Summary")
     with gr.Row():
-        text_output = gr.Textbox(label="Best Extracted Text", lines=15)
     test_btn.click(
         fn=test_ocr_minimal,

         return "No file uploaded", ""
     try:
+        # Run the focused high-quality test script
         script_path = "/home/user/app/enhanced_paddle_test.py"
         command = [sys.executable, script_path, file.name]
             command,
             capture_output=True,
             text=True,
+            timeout=180  # Increased to 3 minutes for high-quality processing
         )
         print(f"Return code: {process.returncode}")
             try:
                 result = json.loads(process.stdout.strip())
+                # Format the focused results
                 summary = f"""
+**High-Quality OCR Results:**
+- **Detections Found:** {result.get('detections', 0)}
 - **Text Length:** {len(result.get('text', ''))}
+- **Settings:** {result.get('settings', 'Unknown')}
+**Sample Numbers Found:** {', '.join(result.get('numbers_found', []))}
+**Sample Terms Found:** {', '.join(result.get('terms_found', []))}
 """
                 return summary, result.get('text', '')
             except json.JSONDecodeError:
                 return f"JSON parse error. Stdout: {process.stdout}", ""
         return f"Error: {e}", ""
 # Simple Gradio interface for testing
+with gr.Blocks(title="Focused High-Quality OCR Test") as demo:
+    gr.Markdown("# Focused High-Quality OCR Test")
+    gr.Markdown("This uses optimized settings for medical documents: 300 DPI, medical-specific OCR parameters, and lower confidence thresholds.")
     with gr.Row():
         file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+        test_btn = gr.Button("Run High-Quality OCR Test")
     with gr.Row():
         summary_output = gr.Markdown(label="Results Summary")
     with gr.Row():
+        text_output = gr.Textbox(label="Extracted Text", lines=15)
     test_btn.click(
         fn=test_ocr_minimal,

enhanced_paddle_test.py CHANGED Viewed

@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# enhanced_paddle_test.py - Trying to match your local implementation quality
 import sys
 import os
@@ -7,7 +7,7 @@ import json
 import fitz
 from paddleocr import PaddleOCR
-def test_multiple_approaches():
     if len(sys.argv) < 2:
         print(json.dumps({"error": "No file path provided"}))
         return
@@ -15,105 +15,117 @@ def test_multiple_approaches():
     file_path = sys.argv[1]
     try:
-        print(f"Testing multiple OCR approaches on: {file_path}", file=sys.stderr)
-        # Test different DPI settings and OCR configurations
-        approaches = [
-            {"name": "High DPI (300)", "dpi": 300, "det_limit_side_len": 1960},
-            {"name": "Medium DPI (200)", "dpi": 200, "det_limit_side_len": 1280},
-            {"name": "Low DPI (150)", "dpi": 150, "det_limit_side_len": 960},
-            {"name": "Your Local Settings", "dpi": 200, "det_limit_side_len": None}
-        ]
         doc = fitz.open(file_path)
         print(f"PDF has {len(doc)} pages", file=sys.stderr)
-        all_results = {}
-        for approach in approaches:
-            print(f"\n=== Testing {approach['name']} ===", file=sys.stderr)
-            # Convert first page with specific DPI
-            page = doc[0]
-            mat = fitz.Matrix(approach['dpi']/72, approach['dpi']/72)
-            pix = page.get_pixmap(matrix=mat)
-            temp_img = f"/tmp/test_{approach['dpi']}.png"
-            pix.save(temp_img)
-            if os.path.exists(temp_img):
-                img_size = os.path.getsize(temp_img)
-                print(f"Image: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)
-                # Initialize OCR with specific settings
-                ocr_kwargs = {
-                    'use_angle_cls': True,
-                    'lang': 'en',
-                    'show_log': False,
-                    'use_gpu': False
-                }
-                if approach['det_limit_side_len']:
-                    ocr_kwargs['det_limit_side_len'] = approach['det_limit_side_len']
-                print(f"OCR settings: {ocr_kwargs}", file=sys.stderr)
-                ocr = PaddleOCR(**ocr_kwargs)
-                # Run OCR
-                result = ocr.ocr(temp_img, cls=True)
-                if result and result[0]:
-                    detections = len(result[0])
-                    print(f"Detections: {detections}", file=sys.stderr)
-                    # Extract all text
-                    text_parts = []
-                    for i, detection in enumerate(result[0]):
-                        if len(detection) >= 2:
-                            text = str(detection[1][0]) if isinstance(detection[1], (list, tuple)) else str(detection[1])
-                            conf = float(detection[1][1]) if isinstance(detection[1], (list, tuple)) and len(detection[1]) > 1 else 1.0
-                            if conf > 0.3:  # Lower threshold for testing
-                                text_parts.append(text)
-                                if i < 10:  # Show first 10 detections
-                                    print(f"  {i}: '{text}' ({conf:.2f})", file=sys.stderr)
-                    all_results[approach['name']] = {
-                        'detections': detections,
-                        'text': '\n'.join(text_parts),
-                        'settings': ocr_kwargs
-                    }
-                else:
-                    print("No detections", file=sys.stderr)
-                    all_results[approach['name']] = {'detections': 0, 'text': '', 'settings': ocr_kwargs}
                 # Clean up
                 if os.path.exists(temp_img):
                     os.unlink(temp_img)
             else:
-                print(f"Failed to create image: {temp_img}", file=sys.stderr)
-        doc.close()
-        # Find best result
-        best_approach = max(all_results.keys(), key=lambda k: all_results[k]['detections'])
-        print(f"\nBest approach: {best_approach} with {all_results[best_approach]['detections']} detections", file=sys.stderr)
-        # Return the best result
-        print(json.dumps({
-            "success": True,
-            "best_approach": best_approach,
-            "all_results": all_results,
-            "text": all_results[best_approach]['text'],
-            "detections": all_results[best_approach]['detections']
-        }))
     except Exception as e:
         print(f"Error: {e}", file=sys.stderr)
         import traceback
         traceback.print_exc(file=sys.stderr)
         print(json.dumps({"success": False, "error": str(e)}))
 if __name__ == "__main__":
-    test_multiple_approaches()

 #!/usr/bin/env python3
+# enhanced_paddle_test.py - Quick test focused on high-quality settings
 import sys
 import os
 import fitz
 from paddleocr import PaddleOCR
+def test_high_quality_ocr():
     if len(sys.argv) < 2:
         print(json.dumps({"error": "No file path provided"}))
         return
     file_path = sys.argv[1]
     try:
+        print(f"Testing high-quality OCR on: {file_path}", file=sys.stderr)
+        # Open PDF
         doc = fitz.open(file_path)
         print(f"PDF has {len(doc)} pages", file=sys.stderr)
+        # Convert first page with high quality settings
+        page = doc[0]
+        # Use higher DPI and better quality settings
+        mat = fitz.Matrix(300/72, 300/72)  # 300 DPI like professional scanners
+        pix = page.get_pixmap(matrix=mat, alpha=False)  # No alpha for better OCR
+        temp_img = "/tmp/high_quality_page.png"
+        pix.save(temp_img)
+        if os.path.exists(temp_img):
+            img_size = os.path.getsize(temp_img)
+            print(f"High quality image: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)
+        else:
+            print("Failed to create high quality image", file=sys.stderr)
+            doc.close()
+            return
+        doc.close()
+        # Initialize OCR with optimized settings for medical documents
+        print("Initializing OCR with medical document settings...", file=sys.stderr)
+        ocr = PaddleOCR(
+            use_angle_cls=True,          # Detect text orientation
+            lang='en',                   # English language
+            show_log=False,              # Suppress logs
+            use_gpu=False,               # CPU mode for serverless
+            det_limit_side_len=1960,     # Higher detection limit for high-res images
+            det_limit_type='max',        # Max side length limit
+            rec_batch_num=6,             # Process more text regions at once
+            max_text_length=25,          # Allow longer text detection
+            use_space_char=True,         # Preserve spaces in text
+            drop_score=0.2               # Lower threshold to catch more text
+        )
+        print("OCR initialized with medical settings", file=sys.stderr)
+        # Run OCR with these optimized settings
+        print("Running optimized OCR...", file=sys.stderr)
+        result = ocr.ocr(temp_img, cls=True)
+        print(f"OCR result type: {type(result)}", file=sys.stderr)
+        if result:
+            print(f"Result length: {len(result)}", file=sys.stderr)
+            if result[0]:
+                detections = len(result[0])
+                print(f"High-quality approach found {detections} detections", file=sys.stderr)
+                # Extract text with lower confidence threshold
+                text_parts = []
+                medical_terms = []
+                numbers = []
+                for i, detection in enumerate(result[0]):
+                    if len(detection) >= 2:
+                        text = str(detection[1][0]) if isinstance(detection[1], (list, tuple)) else str(detection[1])
+                        conf = float(detection[1][1]) if isinstance(detection[1], (list, tuple)) and len(detection[1]) > 1 else 1.0
+                        # Show first 20 detections for debugging
+                        if i < 20:
+                            print(f"  {i}: '{text}' (confidence: {conf:.2f})", file=sys.stderr)
+                        # Use lower confidence threshold (0.2 instead of 0.3)
+                        if conf > 0.2:
+                            text_parts.append(text)
+                            # Categorize detections
+                            if any(char.isdigit() for char in text) and '.' in text:
+                                numbers.append(text)
+                            elif len(text) > 3 and text.isalpha():
+                                medical_terms.append(text)
+                full_text = '\n'.join(text_parts)
                 # Clean up
                 if os.path.exists(temp_img):
                     os.unlink(temp_img)
+                print(f"Extracted {len(text_parts)} text pieces ({len(numbers)} numbers, {len(medical_terms)} terms)", file=sys.stderr)
+                # Return comprehensive result
+                print(json.dumps({
+                    "success": True,
+                    "text": full_text,
+                    "detections": detections,
+                    "numbers_found": numbers[:10],  # First 10 numbers
+                    "terms_found": medical_terms[:10],  # First 10 terms
+                    "settings": "High-quality 300 DPI with medical optimization"
+                }))
             else:
+                print("First page result is empty", file=sys.stderr)
+                print(json.dumps({"success": False, "error": "No text detected"}))
+        else:
+            print("OCR returned None", file=sys.stderr)
+            print(json.dumps({"success": False, "error": "OCR returned no results"}))
     except Exception as e:
+        # Clean up on error
+        if os.path.exists("/tmp/high_quality_page.png"):
+            os.unlink("/tmp/high_quality_page.png")
         print(f"Error: {e}", file=sys.stderr)
         import traceback
         traceback.print_exc(file=sys.stderr)
         print(json.dumps({"success": False, "error": str(e)}))
 if __name__ == "__main__":
+    test_high_quality_ocr()