Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

mbuckle committed on Jun 4, 2025

Commit

96be125

1 Parent(s): 2a0cc07

Diagnostic test

Browse files

Files changed (1) hide show

app.py +33 -58

app.py CHANGED Viewed

@@ -3,22 +3,22 @@ import json
 import sys
 import gradio as gr
-def test_ocr_minimal(file):
     if file is None:
-        return "No file uploaded", "", ""
     try:
-        # Run the enhanced test script
-        script_path = "/home/user/app/enhanced_paddle_test.py"
         command = [sys.executable, script_path, file.name]
-        print(f"Running: {' '.join(command)}")
         process = subprocess.run(
             command,
             capture_output=True,
             text=True,
-            timeout=300  # 5 minutes for multi-page processing
         )
         print(f"Return code: {process.returncode}")
@@ -27,79 +27,54 @@ def test_ocr_minimal(file):
         if process.returncode == 0:
             try:
                 result = json.loads(process.stdout.strip())
-                # Format the comprehensive results
-                summary = f"""
-**Enhanced OCR Results:**
-- **Total Detections:** {result.get('total_detections', 0)}
-- **Pages Processed:** {result.get('pages_processed', 0)}
-- **Text Length:** {len(result.get('text', ''))}
-- **Lab Values Found:** {len(result.get('lab_values', {}))}
-- **Settings:** {result.get('settings', 'Unknown')}
-**Sample Numbers:** {', '.join(result.get('numbers_found', [])[:10])}
-**Sample Terms:** {', '.join(result.get('terms_found', [])[:10])}
-**Lab Values Detected:**
 """
-                # Add lab values to summary
-                lab_values = result.get('lab_values', {})
-                if lab_values:
-                    for name, data in lab_values.items():
-                        summary += f"- **{name}:** {data.get('value', 'N/A')} (confidence: {data.get('confidence', 0):.2f})\n"
                 else:
-                    summary += "- No lab values detected with current patterns\n"
-                # Format lab values for display
-                lab_display = "**Detected Lab Values:**\n\n"
-                if lab_values:
-                    for name, data in lab_values.items():
-                        lab_display += f"**{name}:** {data.get('value', 'N/A')}\n"
-                        lab_display += f"  - Raw text: {data.get('raw_text', 'N/A')}\n"
-                        lab_display += f"  - Confidence: {data.get('confidence', 0):.2f}\n\n"
-                else:
-                    lab_display += "No lab values detected. The OCR may need pattern adjustments for this document format.\n"
-                return summary, result.get('text', ''), lab_display
             except json.JSONDecodeError as e:
-                return f"JSON parse error: {e}\nStdout: {process.stdout}", "", ""
         else:
-            return f"Process failed with code {process.returncode}\nStderr: {process.stderr}", "", ""
-    except subprocess.TimeoutExpired:
-        return "Process timed out after 5 minutes", "", ""
     except Exception as e:
-        return f"Error: {e}", "", ""
-# Enhanced Gradio interface
-with gr.Blocks(title="Enhanced Medical OCR Test") as demo:
-    gr.Markdown("# Enhanced Medical Document OCR")
-    gr.Markdown("This processes all pages with medical-specific patterns and extracts lab values similar to the local implementation.")
     with gr.Row():
         file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-        test_btn = gr.Button("Run Enhanced OCR", variant="primary")
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("### Results Summary")
-            summary_output = gr.Markdown(label="Summary")
-        with gr.Column():
-            gr.Markdown("### Lab Values")
-            lab_output = gr.Markdown(label="Lab Values")
-    with gr.Row():
-        gr.Markdown("### Full Extracted Text")
-        text_output = gr.Textbox(label="Complete OCR Text", lines=20, max_lines=30)
     test_btn.click(
-        fn=test_ocr_minimal,
         inputs=[file_input],
-        outputs=[summary_output, text_output, lab_output]
     )
 if __name__ == "__main__":

 import sys
 import gradio as gr
+def run_diagnostic(file):
     if file is None:
+        return "No file uploaded"
     try:
+        # Run the diagnostic script
+        script_path = "/home/user/app/diagnostic_test.py"
         command = [sys.executable, script_path, file.name]
+        print(f"Running diagnostic: {' '.join(command)}")
         process = subprocess.run(
             command,
             capture_output=True,
             text=True,
+            timeout=300
         )
         print(f"Return code: {process.returncode}")
         if process.returncode == 0:
             try:
                 result = json.loads(process.stdout.strip())
+                diagnostics = result.get('diagnostics', {})
+                output = f"""
+# Diagnostic Results
+## Detection Counts:
+- **Minimal Settings (72 DPI):** {diagnostics.get('minimal_detections', 0)}
+- **Current Settings (300 DPI):** {diagnostics.get('current_detections', 0)}
+- **Aggressive Settings (Enhanced):** {diagnostics.get('aggressive_detections', 0)}
+## Best Method: {diagnostics.get('best_method', 'none')} ({diagnostics.get('best_count', 0)} detections)
+## Sample Detected Text:
 """
+                sample_texts = diagnostics.get('sample_texts', [])
+                if sample_texts:
+                    for i, text in enumerate(sample_texts):
+                        output += f"{i+1}. {text}\n"
                 else:
+                    output += "No text detected in any method\n"
+                output += f"\n**Total Pages:** {diagnostics.get('total_pages', 0)}"
+                return output
             except json.JSONDecodeError as e:
+                return f"JSON parse error: {e}\nOutput: {process.stdout}"
         else:
+            return f"Process failed: {process.stderr}"
     except Exception as e:
+        return f"Error: {e}"
+# Simple diagnostic interface
+with gr.Blocks(title="PaddleOCR Diagnostic") as demo:
+    gr.Markdown("# PaddleOCR Performance Diagnostic")
+    gr.Markdown("This will test different OCR settings to identify why detection is poor.")
     with gr.Row():
         file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+        test_btn = gr.Button("Run Diagnostic", variant="primary")
+    output = gr.Markdown(label="Diagnostic Results")
     test_btn.click(
+        fn=run_diagnostic,
         inputs=[file_input],
+        outputs=[output]
     )
 if __name__ == "__main__":