# paddleocr-processor / archive /app - enhanced.py
# mbuckle's picture
# Diagnostic test
# 2a0cc07
import subprocess
import json
import sys
import gradio as gr
def _format_confidence(value):
    """Render a confidence score with two decimals, or "N/A" if it is not numeric."""
    try:
        return f"{float(value):.2f}"
    except (TypeError, ValueError):
        return "N/A"


def _join_sample(items, limit=10):
    """Comma-join at most *limit* items, stringifying each one first.

    JSON numbers arrive as int/float, which ``str.join`` would reject.
    """
    return ", ".join(str(item) for item in list(items or [])[:limit])


def _format_ocr_result(result):
    """Turn the parsed OCR payload (a dict) into (summary, text, lab_display)."""
    text = result.get("text", "")
    if text is None:
        text = ""

    lab_values = result.get("lab_values")
    if not isinstance(lab_values, dict):
        lab_values = {}

    header_lines = [
        "",
        "**Enhanced OCR Results:**",
        f"- **Total Detections:** {result.get('total_detections', 0)}",
        f"- **Pages Processed:** {result.get('pages_processed', 0)}",
        f"- **Text Length:** {len(text)}",
        f"- **Lab Values Found:** {len(lab_values)}",
        f"- **Settings:** {result.get('settings', 'Unknown')}",
        f"**Sample Numbers:** {_join_sample(result.get('numbers_found'))}",
        f"**Sample Terms:** {_join_sample(result.get('terms_found'))}",
        "**Lab Values Detected:**",
        "",
    ]
    summary_parts = ["\n".join(header_lines)]
    lab_parts = ["**Detected Lab Values:**\n\n"]

    if lab_values:
        # One pass fills both the summary bullet list and the detailed panel.
        for name, data in lab_values.items():
            if not isinstance(data, dict):
                # Tolerate a bare value instead of a {value, raw_text, confidence} record.
                data = {"value": data}
            value = data.get("value", "N/A")
            confidence = _format_confidence(data.get("confidence", 0))
            summary_parts.append(
                f"- **{name}:** {value} (confidence: {confidence})\n"
            )
            lab_parts.append(f"**{name}:** {value}\n")
            lab_parts.append(f" - Raw text: {data.get('raw_text', 'N/A')}\n")
            lab_parts.append(f" - Confidence: {confidence}\n\n")
    else:
        summary_parts.append("- No lab values detected with current patterns\n")
        lab_parts.append(
            "No lab values detected. The OCR may need pattern adjustments for this document format.\n"
        )

    return "".join(summary_parts), text, "".join(lab_parts)


def test_ocr_minimal(file):
    """Run the enhanced OCR script on an uploaded file and format its output.

    The script is executed in a child interpreter and is expected to print a
    single JSON object on stdout.

    Args:
        file: ``None`` when nothing was uploaded, a path string, or an object
            exposing the path through a ``.name`` attribute.

    Returns:
        A ``(summary, text, lab_display)`` tuple of strings. On any failure
        the first element carries the error message and the other two are
        empty strings; the function never raises.
    """
    if file is None:
        return "No file uploaded", "", ""

    try:
        # Accept both a plain path string and a temp-file style wrapper.
        pdf_path = file if isinstance(file, str) else file.name

        # Run the enhanced test script
        script_path = "/home/user/app/enhanced_paddle_test.py"
        command = [sys.executable, script_path, pdf_path]
        print(f"Running: {' '.join(command)}")

        process = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=300,  # 5 minutes for multi-page processing
        )
        print(f"Return code: {process.returncode}")
        print(f"Stderr: {process.stderr}")

        if process.returncode != 0:
            return (
                f"Process failed with code {process.returncode}\nStderr: {process.stderr}",
                "",
                "",
            )

        try:
            result = json.loads(process.stdout.strip())
        except json.JSONDecodeError as e:
            return f"JSON parse error: {e}\nStdout: {process.stdout}", "", ""

        if not isinstance(result, dict):
            # Valid JSON but not the expected object; show it for diagnosis.
            return (
                f"Unexpected OCR output (expected a JSON object)\nStdout: {process.stdout}",
                "",
                "",
            )

        return _format_ocr_result(result)
    except subprocess.TimeoutExpired:
        return "Process timed out after 5 minutes", "", ""
    except Exception as e:
        # UI boundary: report the problem in the summary pane instead of raising.
        return f"Error: {e}", "", ""


# The name starts with "test_" but this is a UI callback, not a unit test;
# keep pytest from trying to collect it.
test_ocr_minimal.__test__ = False
# Enhanced Gradio interface: one upload control, one trigger button, and
# three output panes (summary, lab values, full text).
# NOTE(review): the nesting of the Row/Column contents below follows the usual
# Blocks layout convention — confirm against the deployed app.
with gr.Blocks(title="Enhanced Medical OCR Test") as demo:
    gr.Markdown("# Enhanced Medical Document OCR")
    gr.Markdown("This processes all pages with medical-specific patterns and extracts lab values similar to the local implementation.")

    # Input row: PDF upload next to the run button.
    with gr.Row():
        file_input = gr.File(
            label="Upload PDF",
            file_types=[".pdf"],
        )
        test_btn = gr.Button(
            "Run Enhanced OCR",
            variant="primary",
        )

    # Result row: summary on the left, detected lab values on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Results Summary")
            summary_output = gr.Markdown(label="Summary")
        with gr.Column():
            gr.Markdown("### Lab Values")
            lab_output = gr.Markdown(label="Lab Values")

    # Full OCR text underneath.
    with gr.Row():
        gr.Markdown("### Full Extracted Text")
        text_output = gr.Textbox(
            label="Complete OCR Text",
            lines=20,
            max_lines=30,
        )

    # The handler returns (summary, text, lab_display); the outputs list
    # must follow that same order.
    test_btn.click(
        fn=test_ocr_minimal,
        inputs=[file_input],
        outputs=[summary_output, text_output, lab_output],
    )

if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)