mbuckle committed on
Commit
96be125
·
1 Parent(s): 2a0cc07

Diagnostic test

Browse files
Files changed (1) hide show
  1. app.py +33 -58
app.py CHANGED
@@ -3,22 +3,22 @@ import json
3
  import sys
4
  import gradio as gr
5
 
6
- def test_ocr_minimal(file):
7
  if file is None:
8
- return "No file uploaded", "", ""
9
 
10
  try:
11
- # Run the enhanced test script
12
- script_path = "/home/user/app/enhanced_paddle_test.py"
13
  command = [sys.executable, script_path, file.name]
14
 
15
- print(f"Running: {' '.join(command)}")
16
 
17
  process = subprocess.run(
18
  command,
19
  capture_output=True,
20
  text=True,
21
- timeout=300 # 5 minutes for multi-page processing
22
  )
23
 
24
  print(f"Return code: {process.returncode}")
@@ -27,79 +27,54 @@ def test_ocr_minimal(file):
27
  if process.returncode == 0:
28
  try:
29
  result = json.loads(process.stdout.strip())
 
30
 
31
- # Format the comprehensive results
32
- summary = f"""
33
- **Enhanced OCR Results:**
34
- - **Total Detections:** {result.get('total_detections', 0)}
35
- - **Pages Processed:** {result.get('pages_processed', 0)}
36
- - **Text Length:** {len(result.get('text', ''))}
37
- - **Lab Values Found:** {len(result.get('lab_values', {}))}
38
- - **Settings:** {result.get('settings', 'Unknown')}
39
 
40
- **Sample Numbers:** {', '.join(result.get('numbers_found', [])[:10])}
 
 
 
41
 
42
- **Sample Terms:** {', '.join(result.get('terms_found', [])[:10])}
43
 
44
- **Lab Values Detected:**
45
  """
46
-
47
- # Add lab values to summary
48
- lab_values = result.get('lab_values', {})
49
- if lab_values:
50
- for name, data in lab_values.items():
51
- summary += f"- **{name}:** {data.get('value', 'N/A')} (confidence: {data.get('confidence', 0):.2f})\n"
52
  else:
53
- summary += "- No lab values detected with current patterns\n"
54
 
55
- # Format lab values for display
56
- lab_display = "**Detected Lab Values:**\n\n"
57
- if lab_values:
58
- for name, data in lab_values.items():
59
- lab_display += f"**{name}:** {data.get('value', 'N/A')}\n"
60
- lab_display += f" - Raw text: {data.get('raw_text', 'N/A')}\n"
61
- lab_display += f" - Confidence: {data.get('confidence', 0):.2f}\n\n"
62
- else:
63
- lab_display += "No lab values detected. The OCR may need pattern adjustments for this document format.\n"
64
 
65
- return summary, result.get('text', ''), lab_display
66
 
67
  except json.JSONDecodeError as e:
68
- return f"JSON parse error: {e}\nStdout: {process.stdout}", "", ""
69
  else:
70
- return f"Process failed with code {process.returncode}\nStderr: {process.stderr}", "", ""
71
 
72
- except subprocess.TimeoutExpired:
73
- return "Process timed out after 5 minutes", "", ""
74
  except Exception as e:
75
- return f"Error: {e}", "", ""
76
 
77
- # Enhanced Gradio interface
78
- with gr.Blocks(title="Enhanced Medical OCR Test") as demo:
79
- gr.Markdown("# Enhanced Medical Document OCR")
80
- gr.Markdown("This processes all pages with medical-specific patterns and extracts lab values similar to the local implementation.")
81
 
82
  with gr.Row():
83
  file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
84
- test_btn = gr.Button("Run Enhanced OCR", variant="primary")
85
-
86
- with gr.Row():
87
- with gr.Column():
88
- gr.Markdown("### Results Summary")
89
- summary_output = gr.Markdown(label="Summary")
90
-
91
- with gr.Column():
92
- gr.Markdown("### Lab Values")
93
- lab_output = gr.Markdown(label="Lab Values")
94
 
95
- with gr.Row():
96
- gr.Markdown("### Full Extracted Text")
97
- text_output = gr.Textbox(label="Complete OCR Text", lines=20, max_lines=30)
98
 
99
  test_btn.click(
100
- fn=test_ocr_minimal,
101
  inputs=[file_input],
102
- outputs=[summary_output, text_output, lab_output]
103
  )
104
 
105
  if __name__ == "__main__":
 
3
  import sys
4
  import gradio as gr
5
 
6
+ def run_diagnostic(file):
7
  if file is None:
8
+ return "No file uploaded"
9
 
10
  try:
11
+ # Run the diagnostic script
12
+ script_path = "/home/user/app/diagnostic_test.py"
13
  command = [sys.executable, script_path, file.name]
14
 
15
+ print(f"Running diagnostic: {' '.join(command)}")
16
 
17
  process = subprocess.run(
18
  command,
19
  capture_output=True,
20
  text=True,
21
+ timeout=300
22
  )
23
 
24
  print(f"Return code: {process.returncode}")
 
27
  if process.returncode == 0:
28
  try:
29
  result = json.loads(process.stdout.strip())
30
+ diagnostics = result.get('diagnostics', {})
31
 
32
+ output = f"""
33
+ # Diagnostic Results
 
 
 
 
 
 
34
 
35
+ ## Detection Counts:
36
+ - **Minimal Settings (72 DPI):** {diagnostics.get('minimal_detections', 0)}
37
+ - **Current Settings (300 DPI):** {diagnostics.get('current_detections', 0)}
38
+ - **Aggressive Settings (Enhanced):** {diagnostics.get('aggressive_detections', 0)}
39
 
40
+ ## Best Method: {diagnostics.get('best_method', 'none')} ({diagnostics.get('best_count', 0)} detections)
41
 
42
+ ## Sample Detected Text:
43
  """
44
+ sample_texts = diagnostics.get('sample_texts', [])
45
+ if sample_texts:
46
+ for i, text in enumerate(sample_texts):
47
+ output += f"{i+1}. {text}\n"
 
 
48
  else:
49
+ output += "No text detected in any method\n"
50
 
51
+ output += f"\n**Total Pages:** {diagnostics.get('total_pages', 0)}"
 
 
 
 
 
 
 
 
52
 
53
+ return output
54
 
55
  except json.JSONDecodeError as e:
56
+ return f"JSON parse error: {e}\nOutput: {process.stdout}"
57
  else:
58
+ return f"Process failed: {process.stderr}"
59
 
 
 
60
  except Exception as e:
61
+ return f"Error: {e}"
62
 
63
+ # Simple diagnostic interface
64
+ with gr.Blocks(title="PaddleOCR Diagnostic") as demo:
65
+ gr.Markdown("# PaddleOCR Performance Diagnostic")
66
+ gr.Markdown("This will test different OCR settings to identify why detection is poor.")
67
 
68
  with gr.Row():
69
  file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
70
+ test_btn = gr.Button("Run Diagnostic", variant="primary")
 
 
 
 
 
 
 
 
 
71
 
72
+ output = gr.Markdown(label="Diagnostic Results")
 
 
73
 
74
  test_btn.click(
75
+ fn=run_diagnostic,
76
  inputs=[file_input],
77
+ outputs=[output]
78
  )
79
 
80
  if __name__ == "__main__":