mbuckle committed on
Commit
b92fc27
·
1 Parent(s): 3ca6417

Enhanced version 5

Browse files
Files changed (2) hide show
  1. app.py +53 -22
  2. enhanced_paddle_test.py +140 -77
app.py CHANGED
@@ -5,10 +5,10 @@ import gradio as gr
5
 
6
  def test_ocr_minimal(file):
7
  if file is None:
8
- return "No file uploaded", ""
9
 
10
  try:
11
- # Run the focused high-quality test script
12
  script_path = "/home/user/app/enhanced_paddle_test.py"
13
  command = [sys.executable, script_path, file.name]
14
 
@@ -18,57 +18,88 @@ def test_ocr_minimal(file):
18
  command,
19
  capture_output=True,
20
  text=True,
21
- timeout=180 # Increased to 3 minutes for high-quality processing
22
  )
23
 
24
  print(f"Return code: {process.returncode}")
25
  print(f"Stderr: {process.stderr}")
26
- print(f"Stdout: {process.stdout}")
27
 
28
  if process.returncode == 0:
29
  try:
30
  result = json.loads(process.stdout.strip())
31
 
32
- # Format the focused results
33
  summary = f"""
34
- **High-Quality OCR Results:**
35
- - **Detections Found:** {result.get('detections', 0)}
 
36
  - **Text Length:** {len(result.get('text', ''))}
 
37
  - **Settings:** {result.get('settings', 'Unknown')}
38
 
39
- **Sample Numbers Found:** {', '.join(result.get('numbers_found', []))}
40
 
41
- **Sample Terms Found:** {', '.join(result.get('terms_found', []))}
 
 
42
  """
43
 
44
- return summary, result.get('text', '')
45
- except json.JSONDecodeError:
46
- return f"JSON parse error. Stdout: {process.stdout}", ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  else:
48
- return f"Process failed with code {process.returncode}\nStderr: {process.stderr}", ""
49
 
 
 
50
  except Exception as e:
51
- return f"Error: {e}", ""
52
 
53
- # Simple Gradio interface for testing
54
- with gr.Blocks(title="Focused High-Quality OCR Test") as demo:
55
- gr.Markdown("# Focused High-Quality OCR Test")
56
- gr.Markdown("This uses optimized settings for medical documents: 300 DPI, medical-specific OCR parameters, and lower confidence thresholds.")
57
 
58
  with gr.Row():
59
  file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
60
- test_btn = gr.Button("Run High-Quality OCR Test")
61
 
62
  with gr.Row():
63
- summary_output = gr.Markdown(label="Results Summary")
 
 
 
 
 
 
64
 
65
  with gr.Row():
66
- text_output = gr.Textbox(label="Extracted Text", lines=15)
 
67
 
68
  test_btn.click(
69
  fn=test_ocr_minimal,
70
  inputs=[file_input],
71
- outputs=[summary_output, text_output]
72
  )
73
 
74
  if __name__ == "__main__":
 
5
 
6
  def test_ocr_minimal(file):
7
  if file is None:
8
+ return "No file uploaded", "", ""
9
 
10
  try:
11
+ # Run the enhanced test script
12
  script_path = "/home/user/app/enhanced_paddle_test.py"
13
  command = [sys.executable, script_path, file.name]
14
 
 
18
  command,
19
  capture_output=True,
20
  text=True,
21
+ timeout=300 # 5 minutes for multi-page processing
22
  )
23
 
24
  print(f"Return code: {process.returncode}")
25
  print(f"Stderr: {process.stderr}")
 
26
 
27
  if process.returncode == 0:
28
  try:
29
  result = json.loads(process.stdout.strip())
30
 
31
+ # Format the comprehensive results
32
  summary = f"""
33
+ **Enhanced OCR Results:**
34
+ - **Total Detections:** {result.get('total_detections', 0)}
35
+ - **Pages Processed:** {result.get('pages_processed', 0)}
36
  - **Text Length:** {len(result.get('text', ''))}
37
+ - **Lab Values Found:** {len(result.get('lab_values', {}))}
38
  - **Settings:** {result.get('settings', 'Unknown')}
39
 
40
+ **Sample Numbers:** {', '.join(result.get('numbers_found', [])[:10])}
41
 
42
+ **Sample Terms:** {', '.join(result.get('terms_found', [])[:10])}
43
+
44
+ **Lab Values Detected:**
45
  """
46
 
47
+ # Add lab values to summary
48
+ lab_values = result.get('lab_values', {})
49
+ if lab_values:
50
+ for name, data in lab_values.items():
51
+ summary += f"- **{name}:** {data.get('value', 'N/A')} (confidence: {data.get('confidence', 0):.2f})\n"
52
+ else:
53
+ summary += "- No lab values detected with current patterns\n"
54
+
55
+ # Format lab values for display
56
+ lab_display = "**Detected Lab Values:**\n\n"
57
+ if lab_values:
58
+ for name, data in lab_values.items():
59
+ lab_display += f"**{name}:** {data.get('value', 'N/A')}\n"
60
+ lab_display += f" - Raw text: {data.get('raw_text', 'N/A')}\n"
61
+ lab_display += f" - Confidence: {data.get('confidence', 0):.2f}\n\n"
62
+ else:
63
+ lab_display += "No lab values detected. The OCR may need pattern adjustments for this document format.\n"
64
+
65
+ return summary, result.get('text', ''), lab_display
66
+
67
+ except json.JSONDecodeError as e:
68
+ return f"JSON parse error: {e}\nStdout: {process.stdout}", "", ""
69
  else:
70
+ return f"Process failed with code {process.returncode}\nStderr: {process.stderr}", "", ""
71
 
72
+ except subprocess.TimeoutExpired:
73
+ return "Process timed out after 5 minutes", "", ""
74
  except Exception as e:
75
+ return f"Error: {e}", "", ""
76
 
77
+ # Enhanced Gradio interface
78
+ with gr.Blocks(title="Enhanced Medical OCR Test") as demo:
79
+ gr.Markdown("# Enhanced Medical Document OCR")
80
+ gr.Markdown("This processes all pages with medical-specific patterns and extracts lab values similar to the local implementation.")
81
 
82
  with gr.Row():
83
  file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
84
+ test_btn = gr.Button("Run Enhanced OCR", variant="primary")
85
 
86
  with gr.Row():
87
+ with gr.Column():
88
+ gr.Markdown("### Results Summary")
89
+ summary_output = gr.Markdown(label="Summary")
90
+
91
+ with gr.Column():
92
+ gr.Markdown("### Lab Values")
93
+ lab_output = gr.Markdown(label="Lab Values")
94
 
95
  with gr.Row():
96
+ gr.Markdown("### Full Extracted Text")
97
+ text_output = gr.Textbox(label="Complete OCR Text", lines=20, max_lines=30)
98
 
99
  test_btn.click(
100
  fn=test_ocr_minimal,
101
  inputs=[file_input],
102
+ outputs=[summary_output, text_output, lab_output]
103
  )
104
 
105
  if __name__ == "__main__":
enhanced_paddle_test.py CHANGED
@@ -1,5 +1,5 @@
1
  #!/usr/bin/env python3
2
- # focused_paddle_test.py - Quick test focused on high-quality settings
3
 
4
  import sys
5
  import os
@@ -19,113 +19,176 @@ def test_high_quality_ocr():
19
 
20
  # Open PDF
21
  doc = fitz.open(file_path)
22
- print(f"PDF has {len(doc)} pages", file=sys.stderr)
 
23
 
24
- # Convert first page with high quality settings
25
- page = doc[0]
 
 
26
 
27
- # Use higher DPI and better quality settings
28
- mat = fitz.Matrix(300/72, 300/72) # 300 DPI like professional scanners
29
- pix = page.get_pixmap(matrix=mat, alpha=False) # No alpha for better OCR
30
-
31
- temp_img = "/tmp/high_quality_page.png"
32
- pix.save(temp_img)
33
-
34
- if os.path.exists(temp_img):
35
- img_size = os.path.getsize(temp_img)
36
- print(f"High quality image: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)
37
- else:
38
- print("Failed to create high quality image", file=sys.stderr)
39
- doc.close()
40
- return
41
-
42
- doc.close()
43
-
44
- # Initialize OCR with optimized settings for medical documents
45
  print("Initializing OCR with medical document settings...", file=sys.stderr)
46
  ocr = PaddleOCR(
47
  use_angle_cls=True, # Detect text orientation
48
  lang='en', # English language
49
  show_log=False, # Suppress logs
50
  use_gpu=False, # CPU mode for serverless
51
- det_limit_side_len=1960, # Higher detection limit for high-res images
52
  det_limit_type='max', # Max side length limit
53
- rec_batch_num=6, # Process more text regions at once
54
- max_text_length=25, # Allow longer text detection
55
  use_space_char=True, # Preserve spaces in text
56
- drop_score=0.2 # Lower threshold to catch more text
57
  )
58
  print("OCR initialized with medical settings", file=sys.stderr)
59
 
60
- # Run OCR with these optimized settings
61
- print("Running optimized OCR...", file=sys.stderr)
62
- result = ocr.ocr(temp_img, cls=True)
63
-
64
- print(f"OCR result type: {type(result)}", file=sys.stderr)
65
- if result:
66
- print(f"Result length: {len(result)}", file=sys.stderr)
67
- if result[0]:
68
- detections = len(result[0])
69
- print(f"High-quality approach found {detections} detections", file=sys.stderr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  # Extract text with lower confidence threshold
72
- text_parts = []
73
- medical_terms = []
74
- numbers = []
75
 
76
  for i, detection in enumerate(result[0]):
77
  if len(detection) >= 2:
78
- text = str(detection[1][0]) if isinstance(detection[1], (list, tuple)) else str(detection[1])
79
- conf = float(detection[1][1]) if isinstance(detection[1], (list, tuple)) and len(detection[1]) > 1 else 1.0
 
 
 
 
 
80
 
81
- # Show first 20 detections for debugging
82
- if i < 20:
83
  print(f" {i}: '{text}' (confidence: {conf:.2f})", file=sys.stderr)
84
 
85
- # Use lower confidence threshold (0.2 instead of 0.3)
86
- if conf > 0.2:
87
- text_parts.append(text)
 
88
 
89
  # Categorize detections
90
- if any(char.isdigit() for char in text) and '.' in text:
91
- numbers.append(text)
92
- elif len(text) > 3 and text.isalpha():
93
- medical_terms.append(text)
94
-
95
- full_text = '\n'.join(text_parts)
 
96
 
97
- # Clean up
98
- if os.path.exists(temp_img):
99
- os.unlink(temp_img)
100
-
101
- print(f"Extracted {len(text_parts)} text pieces ({len(numbers)} numbers, {len(medical_terms)} terms)", file=sys.stderr)
102
-
103
- # Return comprehensive result
104
- print(json.dumps({
105
- "success": True,
106
- "text": full_text,
107
- "detections": detections,
108
- "numbers_found": numbers[:10], # First 10 numbers
109
- "terms_found": medical_terms[:10], # First 10 terms
110
- "settings": "High-quality 300 DPI with medical optimization"
111
- }))
112
-
113
- else:
114
- print("First page result is empty", file=sys.stderr)
115
- print(json.dumps({"success": False, "error": "No text detected"}))
116
- else:
117
- print("OCR returned None", file=sys.stderr)
118
- print(json.dumps({"success": False, "error": "OCR returned no results"}))
 
 
 
 
 
 
 
 
119
 
120
  except Exception as e:
121
  # Clean up on error
122
- if os.path.exists("/tmp/high_quality_page.png"):
123
- os.unlink("/tmp/high_quality_page.png")
124
-
 
 
125
  print(f"Error: {e}", file=sys.stderr)
126
  import traceback
127
  traceback.print_exc(file=sys.stderr)
128
  print(json.dumps({"success": False, "error": str(e)}))
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  if __name__ == "__main__":
131
  test_high_quality_ocr()
 
1
  #!/usr/bin/env python3
2
+ # enhanced_paddle_test.py - Improved to match local implementation
3
 
4
  import sys
5
  import os
 
19
 
20
  # Open PDF
21
  doc = fitz.open(file_path)
22
+ total_pages = len(doc)
23
+ print(f"PDF has {total_pages} pages", file=sys.stderr)
24
 
25
+ all_text_parts = []
26
+ all_numbers = []
27
+ all_medical_terms = []
28
+ total_detections = 0
29
 
30
+ # Initialize OCR once with optimized settings for medical documents
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  print("Initializing OCR with medical document settings...", file=sys.stderr)
32
  ocr = PaddleOCR(
33
  use_angle_cls=True, # Detect text orientation
34
  lang='en', # English language
35
  show_log=False, # Suppress logs
36
  use_gpu=False, # CPU mode for serverless
37
+ det_limit_side_len=2880, # Higher detection limit for high-res images
38
  det_limit_type='max', # Max side length limit
39
+ rec_batch_num=8, # Process more text regions at once
40
+ max_text_length=50, # Allow longer text detection
41
  use_space_char=True, # Preserve spaces in text
42
+ drop_score=0.1 # Much lower threshold to catch more text
43
  )
44
  print("OCR initialized with medical settings", file=sys.stderr)
45
 
46
+ # Process all pages (not just first page)
47
+ for page_num in range(total_pages):
48
+ print(f"Processing page {page_num + 1} of {total_pages}", file=sys.stderr)
49
+
50
+ page = doc[page_num]
51
+
52
+ # Use higher DPI and better quality settings
53
+ mat = fitz.Matrix(300/72, 300/72) # 300 DPI like professional scanners
54
+ pix = page.get_pixmap(matrix=mat, alpha=False) # No alpha for better OCR
55
+
56
+ temp_img = f"/tmp/high_quality_page_{page_num}.png"
57
+ pix.save(temp_img)
58
+
59
+ if os.path.exists(temp_img):
60
+ img_size = os.path.getsize(temp_img)
61
+ print(f"High quality image: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)
62
+ else:
63
+ print(f"Failed to create high quality image for page {page_num}", file=sys.stderr)
64
+ continue
65
+
66
+ # Run OCR on this page
67
+ print(f"Running optimized OCR on page {page_num + 1}...", file=sys.stderr)
68
+ result = ocr.ocr(temp_img, cls=True)
69
+
70
+ if result and result[0]:
71
+ page_detections = len(result[0])
72
+ total_detections += page_detections
73
+ print(f"Page {page_num + 1}: found {page_detections} detections", file=sys.stderr)
74
 
75
  # Extract text with lower confidence threshold
76
+ page_text_parts = []
 
 
77
 
78
  for i, detection in enumerate(result[0]):
79
  if len(detection) >= 2:
80
+ text_info = detection[1]
81
+ if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
82
+ text = str(text_info[0])
83
+ conf = float(text_info[1])
84
+ else:
85
+ text = str(text_info)
86
+ conf = 1.0
87
 
88
+ # Show some detections for debugging (first page only)
89
+ if page_num == 0 and i < 20:
90
  print(f" {i}: '{text}' (confidence: {conf:.2f})", file=sys.stderr)
91
 
92
+ # Use very low confidence threshold (0.1 instead of 0.2)
93
+ if conf > 0.1 and len(text.strip()) > 0:
94
+ page_text_parts.append(text)
95
+ all_text_parts.append(text)
96
 
97
  # Categorize detections
98
+ if any(char.isdigit() for char in text):
99
+ # Look for numbers with decimals or medical values
100
+ if '.' in text or any(c.isdigit() for c in text):
101
+ all_numbers.append(text)
102
+ elif len(text) > 2 and any(c.isalpha() for c in text):
103
+ # Look for potential medical terms
104
+ all_medical_terms.append(text)
105
 
106
+ print(f"Page {page_num + 1}: extracted {len(page_text_parts)} text pieces", file=sys.stderr)
107
+
108
+ # Clean up page image
109
+ if os.path.exists(temp_img):
110
+ os.unlink(temp_img)
111
+
112
+ doc.close()
113
+
114
+ # Combine all text
115
+ full_text = '\n'.join(all_text_parts)
116
+
117
+ print(f"Total extracted: {len(all_text_parts)} text pieces ({len(all_numbers)} numbers, {len(all_medical_terms)} terms)", file=sys.stderr)
118
+ print(f"Total detections across {total_pages} pages: {total_detections}", file=sys.stderr)
119
+
120
+ # Apply basic lab patterns similar to local implementation
121
+ lab_values = apply_basic_patterns(full_text)
122
+
123
+ # Return comprehensive result
124
+ result_data = {
125
+ "success": True,
126
+ "text": full_text,
127
+ "total_detections": total_detections,
128
+ "pages_processed": total_pages,
129
+ "numbers_found": all_numbers[:20], # First 20 numbers
130
+ "terms_found": all_medical_terms[:20], # First 20 terms
131
+ "lab_values": lab_values,
132
+ "settings": f"High-quality 300 DPI with medical optimization, {total_pages} pages"
133
+ }
134
+
135
+ print(json.dumps(result_data))
136
 
137
  except Exception as e:
138
  # Clean up on error
139
+ for i in range(10): # Clean up any temp files
140
+ temp_file = f"/tmp/high_quality_page_{i}.png"
141
+ if os.path.exists(temp_file):
142
+ os.unlink(temp_file)
143
+
144
  print(f"Error: {e}", file=sys.stderr)
145
  import traceback
146
  traceback.print_exc(file=sys.stderr)
147
  print(json.dumps({"success": False, "error": str(e)}))
148
 
149
def apply_basic_patterns(text):
    """Extract common lab values from OCR text with simple regex patterns.

    The text is whitespace-normalized first, so a test name and its value
    that ended up on separate lines (one OCR detection per line) still
    match as "Name value".  Only the first occurrence of each test is
    reported.

    Args:
        text: Full OCR text.  ``None`` or an empty string yields ``{}``.

    Returns:
        dict mapping test name to a dict with keys ``"value"`` (float),
        ``"raw_text"`` (the matched substring) and ``"confidence"`` (a
        fixed 0.8 placeholder, not derived from OCR scores).
    """
    import re  # local import: the file's top-level imports are not all visible here

    lab_values = {}

    if not text:
        return lab_values

    # An integer or decimal number.  A bare trailing "." (e.g. the end of a
    # sentence) is deliberately NOT consumed, so raw_text stays clean.
    number = r'(\d+(?:\.\d+)?)'

    # Test-name prefixes.  Each one is anchored with \b below so that a short
    # name cannot match inside a longer token (e.g. "RBC" inside "NRBC").
    name_patterns = {
        'TSH': r'TSH[:\s]*',
        'Testosterone': r'Testosterone[:\s]*',
        'C-Reactive Protein': r'C[-\s]*Reactive[-\s]*Protein[:\s]*',
        'HDL': r'HDL[-\s]*C?[:\s]*',
        'LDL': r'LDL[-\s]*C?[:\s]*',
        'Triglycerides': r'Triglycerides[:\s]*',
        'Glucose': r'Glucose[:\s]*',
        'Creatinine': r'Creatinine[:\s]*',
        'Hemoglobin': r'Hemoglobin[:\s]*',
        'WBC': r'WBC[:\s]*',
        'RBC': r'RBC[:\s]*',
    }

    # Collapse every whitespace run (including newlines) to a single space.
    normalized_text = re.sub(r'\s+', ' ', text)

    for test_name, prefix in name_patterns.items():
        match = re.search(r'\b' + prefix + number, normalized_text, re.IGNORECASE)
        if match is None:
            continue

        # The capture group is digits with an optional fractional part, so
        # float() cannot fail here; no exception handling is needed.
        value = float(match.group(1))
        lab_values[test_name] = {
            "value": value,
            "raw_text": match.group(0),
            "confidence": 0.8,
        }
        # Diagnostics go to stderr; stdout is reserved for the JSON result.
        print(f"Found {test_name}: {value}", file=sys.stderr)

    return lab_values
192
+
193
  if __name__ == "__main__":
194
  test_high_quality_ocr()