Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

mbuckle committed on Jun 4, 2025

Commit

2a0cc07

1 Parent(s): b92fc27

Diagnostic test

Browse files

Files changed (3) hide show

archive/app - enhanced.py +106 -0
enhanced_paddle_test.py → archive/enhanced_paddle_test.py +0 -0
diagnostic_test.py +180 -0

archive/app - enhanced.py ADDED Viewed

	@@ -0,0 +1,106 @@

+import subprocess
+import json
+import sys
+import gradio as gr
def _parse_json_output(stdout):
    """Decode the JSON document the OCR script wrote to stdout.

    The whole output is tried first.  If that fails, the last non-empty line
    is tried on its own, because libraries used by the script may log to
    stdout ahead of the final JSON line.

    Raises:
        json.JSONDecodeError: the error from parsing the full output, when
            neither attempt succeeds.
    """
    text = stdout.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError as full_error:
        lines = [line for line in text.splitlines() if line.strip()]
        if len(lines) > 1:
            try:
                return json.loads(lines[-1])
            except json.JSONDecodeError:
                pass
        raise full_error


def _format_confidence(value):
    """Render a confidence with two decimals; missing or non-numeric -> 0.00."""
    try:
        return f"{float(value):.2f}"
    except (TypeError, ValueError):
        return "0.00"


def _format_results(result):
    """Turn the script's result dict into (summary, full text, lab display)."""
    text = result.get('text') or ''
    lab_values = result.get('lab_values') or {}
    if not isinstance(lab_values, dict):
        lab_values = {}
    # Entries are expected to be dicts; wrap anything else so .get() works.
    entries = [
        (name, data if isinstance(data, dict) else {'value': data})
        for name, data in lab_values.items()
    ]
    # str() each item: the script may emit numbers as JSON numbers, and
    # str.join() only accepts strings.
    numbers = ', '.join(str(item) for item in (result.get('numbers_found') or [])[:10])
    terms = ', '.join(str(item) for item in (result.get('terms_found') or [])[:10])

    # Blank lines separate the bullet list from the following labels;
    # without them Markdown folds the labels into the last bullet.
    summary_lines = [
        "",
        "**Enhanced OCR Results:**",
        f"- **Total Detections:** {result.get('total_detections', 0)}",
        f"- **Pages Processed:** {result.get('pages_processed', 0)}",
        f"- **Text Length:** {len(text)}",
        f"- **Lab Values Found:** {len(lab_values)}",
        f"- **Settings:** {result.get('settings', 'Unknown')}",
        "",
        f"**Sample Numbers:** {numbers}",
        "",
        f"**Sample Terms:** {terms}",
        "",
        "**Lab Values Detected:**",
    ]
    lab_parts = ["**Detected Lab Values:**\n\n"]
    if entries:
        for name, data in entries:
            confidence = _format_confidence(data.get('confidence', 0))
            summary_lines.append(
                f"- **{name}:** {data.get('value', 'N/A')} (confidence: {confidence})"
            )
            lab_parts.append(
                f"**{name}:** {data.get('value', 'N/A')}\n"
                f"  - Raw text: {data.get('raw_text', 'N/A')}\n"
                f"  - Confidence: {confidence}\n\n"
            )
    else:
        summary_lines.append("- No lab values detected with current patterns")
        lab_parts.append(
            "No lab values detected. The OCR may need pattern adjustments for this document format.\n"
        )
    return "\n".join(summary_lines) + "\n", text, "".join(lab_parts)


def test_ocr_minimal(file, *, script_path="/home/user/app/enhanced_paddle_test.py", timeout=300):
    """Run the OCR script on an uploaded file and format its JSON output.

    Args:
        file: The uploaded file, either a path string or an object whose
            ``name`` attribute is the path; ``None`` when nothing was uploaded.
        script_path: Script executed with the current interpreter; it
            receives the file path as its only argument and must print a
            JSON object on stdout.
        timeout: Seconds to wait for the script before giving up.

    Returns:
        A ``(summary_markdown, extracted_text, lab_values_markdown)`` tuple.
        Every failure is reported as a message in the first element with the
        other two empty; nothing is raised to the caller.
    """
    if file is None:
        return "No file uploaded", "", ""
    try:
        input_path = file if isinstance(file, str) else file.name
        command = [sys.executable, script_path, input_path]
        print(f"Running: {' '.join(command)}")
        process = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        print(f"Return code: {process.returncode}")
        print(f"Stderr: {process.stderr}")
        if process.returncode != 0:
            return f"Process failed with code {process.returncode}\nStderr: {process.stderr}", "", ""
        try:
            result = _parse_json_output(process.stdout)
        except json.JSONDecodeError as e:
            return f"JSON parse error: {e}\nStdout: {process.stdout}", "", ""
        if not isinstance(result, dict):
            return f"Unexpected OCR output (expected a JSON object)\nStdout: {process.stdout}", "", ""
        if result.get('error') and not result.get('text'):
            # The script exited cleanly but reported a failure in its payload.
            return f"OCR script reported an error: {result['error']}", "", ""
        return _format_results(result)
    except subprocess.TimeoutExpired:
        return f"Process timed out after {timeout / 60:g} minutes", "", ""
    except Exception as e:
        # UI boundary: show the problem in the summary pane instead of raising.
        return f"Error: {e}", "", ""


# A UI handler, not a pytest test, despite the ``test_`` prefix: keep test
# runners from collecting it when this module is imported into a test module.
test_ocr_minimal.__test__ = False
# Enhanced Gradio interface: one upload row, two side-by-side result panes,
# and a full-width box for the raw extracted text.
_INTRO_TEXT = "This processes all pages with medical-specific patterns and extracts lab values similar to the local implementation."

with gr.Blocks(title="Enhanced Medical OCR Test") as demo:
    # Page header.
    gr.Markdown("# Enhanced Medical Document OCR")
    gr.Markdown(_INTRO_TEXT)

    # Input row: PDF picker next to the trigger button.
    with gr.Row():
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        test_btn = gr.Button("Run Enhanced OCR", variant="primary")

    # Result row: summary on the left, lab values on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Results Summary")
            summary_output = gr.Markdown(label="Summary")
        with gr.Column():
            gr.Markdown("### Lab Values")
            lab_output = gr.Markdown(label="Lab Values")

    # Full OCR text.
    with gr.Row():
        gr.Markdown("### Full Extracted Text")
        text_output = gr.Textbox(label="Complete OCR Text", lines=20, max_lines=30)

    # The handler returns (summary, text, lab display); the outputs list
    # follows that order rather than the on-screen order.
    test_btn.click(
        fn=test_ocr_minimal,
        inputs=[file_input],
        outputs=[summary_output, text_output, lab_output],
    )

if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)

enhanced_paddle_test.py → archive/enhanced_paddle_test.py RENAMED Viewed

File without changes

diagnostic_test.py ADDED Viewed

	@@ -0,0 +1,180 @@

+#!/usr/bin/env python3
+# diagnostic_test.py - Debug PaddleOCR performance issues
+import sys
+import os
+import json
+import fitz
+import cv2
+import numpy as np
+from paddleocr import PaddleOCR
def _count_detections(ocr_result):
    """Return how many detections an ``ocr()`` result holds for its first image.

    An empty or ``None`` result, or one whose first entry is empty or
    ``None``, counts as zero.
    """
    if ocr_result and ocr_result[0]:
        return len(ocr_result[0])
    return 0


def _collect_samples(ocr_result, limit=10):
    """Format up to ``limit`` detections as ``'text' (conf)`` and log each to stderr."""
    samples = []
    if not (ocr_result and ocr_result[0]):
        return samples
    for index, detection in enumerate(ocr_result[0][:limit]):
        if len(detection) < 2:
            continue
        text_info = detection[1]
        if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
            text, conf = str(text_info[0]), float(text_info[1])
        else:
            # No separate score available: report the value as-is.
            text, conf = str(text_info), 1.0
        samples.append(f"'{text}' ({conf:.2f})")
        print(f"Sample {index}: '{text}' (conf: {conf:.2f})", file=sys.stderr)
    return samples


def diagnostic_test():
    """Compare three render/OCR configurations on the first page of a PDF.

    The PDF path is read from ``sys.argv[1]``.  The first page is rendered at
    the default resolution, at 300 DPI, and as a contrast-enhanced grayscale
    image; each image is passed to a differently configured PaddleOCR
    instance and the detection counts are compared.

    Progress and diagnostics go to stderr.  Exactly one JSON object is
    printed to stdout: ``{"success": true, "diagnostics": {...}}`` on
    success, ``{"success": false, "error": ...}`` on failure, or
    ``{"error": ...}`` when no path argument was given.  Nothing is raised.
    """
    import tempfile  # local import: only this entry point needs it

    if len(sys.argv) < 2:
        print(json.dumps({"error": "No file path provided"}))
        return
    file_path = sys.argv[1]
    try:
        print("=== DIAGNOSTIC TEST START ===", file=sys.stderr)
        # Check system info
        print(f"Python version: {sys.version}", file=sys.stderr)
        print(f"OpenCV version: {cv2.__version__}", file=sys.stderr)
        # Check PaddlePaddle installation (best effort; never fatal)
        try:
            import paddle
            print(f"PaddlePaddle version: {paddle.__version__}", file=sys.stderr)
        except Exception as exc:
            print(f"PaddlePaddle not available: {exc}", file=sys.stderr)

        # A private temp directory avoids clashes between concurrent runs and
        # is removed even when a later step raises.
        with tempfile.TemporaryDirectory(prefix="paddle_diag_") as work_dir:
            temp_72 = os.path.join(work_dir, "test_72dpi.png")
            temp_300 = os.path.join(work_dir, "test_300dpi.png")
            temp_enhanced = os.path.join(work_dir, "test_enhanced.png")

            doc = fitz.open(file_path)
            try:
                total_pages = len(doc)
                print(f"PDF pages: {total_pages}", file=sys.stderr)
                if total_pages == 0:
                    raise ValueError("PDF contains no pages")
                # All render methods are compared on the first page only.
                page = doc[0]

                # Method 1: Standard quality (72 DPI)
                print("\n=== METHOD 1: Standard 72 DPI ===", file=sys.stderr)
                pix_72 = page.get_pixmap(alpha=False)
                pix_72.save(temp_72)
                print(f"72 DPI image: {pix_72.width}x{pix_72.height}, size: {os.path.getsize(temp_72)}", file=sys.stderr)

                # Method 2: High quality (300 DPI)
                print("\n=== METHOD 2: High 300 DPI ===", file=sys.stderr)
                mat = fitz.Matrix(300/72, 300/72)
                pix_300 = page.get_pixmap(matrix=mat, alpha=False)
                pix_300.save(temp_300)
                print(f"300 DPI image: {pix_300.width}x{pix_300.height}, size: {os.path.getsize(temp_300)}", file=sys.stderr)

                # Method 3: Contrast-enhanced grayscale of the 300 DPI render
                print("\n=== METHOD 3: Preprocessed Image ===", file=sys.stderr)
                img_array = np.frombuffer(pix_300.samples, dtype=np.uint8).reshape(pix_300.height, pix_300.width, 3)
                # PyMuPDF delivers the samples in RGB order, so convert
                # straight from RGB; an extra BGR<->RGB swap would exchange
                # the red and blue weights in the grayscale result.
                gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
                # Increase contrast
                clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
                enhanced = clahe.apply(gray)
                # imwrite signals failure through its return value.
                if not cv2.imwrite(temp_enhanced, enhanced):
                    raise OSError(f"Could not write {temp_enhanced}")
                print(f"Enhanced image saved: {os.path.getsize(temp_enhanced)} bytes", file=sys.stderr)
            finally:
                doc.close()

            # NOTE(review): the keyword arguments below are passed exactly as
            # in the original script - confirm them against the installed
            # PaddleOCR version.
            # Test OCR with minimal settings first
            print("\n=== OCR TEST 1: Minimal Settings ===", file=sys.stderr)
            ocr_minimal = PaddleOCR(use_angle_cls=False, lang='en', show_log=False)
            result_minimal = ocr_minimal.ocr(temp_72, cls=False)
            minimal_detections = _count_detections(result_minimal)
            print(f"Minimal OCR on 72 DPI: {minimal_detections} detections", file=sys.stderr)

            # Test OCR with your current settings
            print("\n=== OCR TEST 2: Current Settings ===", file=sys.stderr)
            ocr_current = PaddleOCR(
                use_angle_cls=True,
                lang='en',
                show_log=False,
                use_gpu=False,
                det_limit_side_len=2880,
                det_limit_type='max',
                rec_batch_num=8,
                max_text_length=50,
                use_space_char=True,
                drop_score=0.1
            )
            result_current = ocr_current.ocr(temp_300, cls=True)
            current_detections = _count_detections(result_current)
            print(f"Current OCR on 300 DPI: {current_detections} detections", file=sys.stderr)

            # Test OCR with more aggressive settings
            print("\n=== OCR TEST 3: Aggressive Settings ===", file=sys.stderr)
            ocr_aggressive = PaddleOCR(
                use_angle_cls=True,
                lang='en',
                show_log=False,
                use_gpu=False,
                det_limit_side_len=4000,    # Even higher
                det_limit_type='max',
                rec_batch_num=1,            # Lower batch for memory
                max_text_length=100,        # Longer text
                use_space_char=True,
                drop_score=0.05             # Very low threshold
            )
            result_aggressive = ocr_aggressive.ocr(temp_enhanced, cls=True)
            aggressive_detections = _count_detections(result_aggressive)
            print(f"Aggressive OCR on enhanced: {aggressive_detections} detections", file=sys.stderr)

        # Pick the method with the most detections; strict ">" keeps the
        # earliest method on ties.
        best_method, best_result, best_count = "none", None, 0
        candidates = (
            ("minimal", result_minimal, minimal_detections),
            ("current", result_current, current_detections),
            ("aggressive", result_aggressive, aggressive_detections),
        )
        for method, ocr_result, count in candidates:
            if count > best_count:
                best_method, best_result, best_count = method, ocr_result, count
        print(f"\nBest method: {best_method} with {best_count} detections", file=sys.stderr)

        # Extract and show sample text from best result
        sample_texts = _collect_samples(best_result, limit=10)

        # Return diagnostic results
        result = {
            "success": True,
            "diagnostics": {
                "total_pages": total_pages,
                "minimal_detections": minimal_detections,
                "current_detections": current_detections,
                "aggressive_detections": aggressive_detections,
                "best_method": best_method,
                "best_count": best_count,
                "sample_texts": sample_texts
            }
        }
        print(json.dumps(result))
    except Exception as e:
        # Script boundary: report the failure as JSON so the caller can parse it.
        print(f"Diagnostic error: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))


if __name__ == "__main__":
    diagnostic_test()