mbuckle committed on
Commit
3ca6417
·
1 Parent(s): 503febe

Enhanced paddle test version 2

Browse files
Files changed (2) hide show
  1. app.py +14 -17
  2. enhanced_paddle_test.py +95 -83
app.py CHANGED
@@ -8,7 +8,7 @@ def test_ocr_minimal(file):
8
  return "No file uploaded", ""
9
 
10
  try:
11
- # Run the enhanced test script
12
  script_path = "/home/user/app/enhanced_paddle_test.py"
13
  command = [sys.executable, script_path, file.name]
14
 
@@ -18,7 +18,7 @@ def test_ocr_minimal(file):
18
  command,
19
  capture_output=True,
20
  text=True,
21
- timeout=120
22
  )
23
 
24
  print(f"Return code: {process.returncode}")
@@ -29,21 +29,18 @@ def test_ocr_minimal(file):
29
  try:
30
  result = json.loads(process.stdout.strip())
31
 
32
- # Format the enhanced results
33
  summary = f"""
34
- **Results Summary:**
35
- - **Best Approach:** {result.get('best_approach', 'Unknown')}
36
- - **Best Detections:** {result.get('detections', 0)}
37
  - **Text Length:** {len(result.get('text', ''))}
 
38
 
39
- **All Approaches:**
 
 
40
  """
41
 
42
- # Add results for each approach
43
- all_results = result.get('all_results', {})
44
- for approach_name, approach_data in all_results.items():
45
- summary += f"\n- **{approach_name}:** {approach_data.get('detections', 0)} detections"
46
-
47
  return summary, result.get('text', '')
48
  except json.JSONDecodeError:
49
  return f"JSON parse error. Stdout: {process.stdout}", ""
@@ -54,19 +51,19 @@ def test_ocr_minimal(file):
54
  return f"Error: {e}", ""
55
 
56
  # Simple Gradio interface for testing
57
- with gr.Blocks(title="Enhanced OCR Test") as demo:
58
- gr.Markdown("# Enhanced OCR Test - Multiple Approaches")
59
- gr.Markdown("This will test different DPI settings and OCR configurations to find the best quality match for your local implementation.")
60
 
61
  with gr.Row():
62
  file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
63
- test_btn = gr.Button("Test Multiple OCR Approaches")
64
 
65
  with gr.Row():
66
  summary_output = gr.Markdown(label="Results Summary")
67
 
68
  with gr.Row():
69
- text_output = gr.Textbox(label="Best Extracted Text", lines=15)
70
 
71
  test_btn.click(
72
  fn=test_ocr_minimal,
 
8
  return "No file uploaded", ""
9
 
10
  try:
11
+ # Run the focused high-quality test script
12
  script_path = "/home/user/app/enhanced_paddle_test.py"
13
  command = [sys.executable, script_path, file.name]
14
 
 
18
  command,
19
  capture_output=True,
20
  text=True,
21
+ timeout=180 # Increased to 3 minutes for high-quality processing
22
  )
23
 
24
  print(f"Return code: {process.returncode}")
 
29
  try:
30
  result = json.loads(process.stdout.strip())
31
 
32
+ # Format the focused results
33
  summary = f"""
34
+ **High-Quality OCR Results:**
35
+ - **Detections Found:** {result.get('detections', 0)}
 
36
  - **Text Length:** {len(result.get('text', ''))}
37
+ - **Settings:** {result.get('settings', 'Unknown')}
38
 
39
+ **Sample Numbers Found:** {', '.join(result.get('numbers_found', []))}
40
+
41
+ **Sample Terms Found:** {', '.join(result.get('terms_found', []))}
42
  """
43
 
 
 
 
 
 
44
  return summary, result.get('text', '')
45
  except json.JSONDecodeError:
46
  return f"JSON parse error. Stdout: {process.stdout}", ""
 
51
  return f"Error: {e}", ""
52
 
53
  # Simple Gradio interface for testing
54
+ with gr.Blocks(title="Focused High-Quality OCR Test") as demo:
55
+ gr.Markdown("# Focused High-Quality OCR Test")
56
+ gr.Markdown("This uses optimized settings for medical documents: 300 DPI, medical-specific OCR parameters, and lower confidence thresholds.")
57
 
58
  with gr.Row():
59
  file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
60
+ test_btn = gr.Button("Run High-Quality OCR Test")
61
 
62
  with gr.Row():
63
  summary_output = gr.Markdown(label="Results Summary")
64
 
65
  with gr.Row():
66
+ text_output = gr.Textbox(label="Extracted Text", lines=15)
67
 
68
  test_btn.click(
69
  fn=test_ocr_minimal,
enhanced_paddle_test.py CHANGED
@@ -1,5 +1,5 @@
1
  #!/usr/bin/env python3
2
- # enhanced_paddle_test.py - Trying to match your local implementation quality
3
 
4
  import sys
5
  import os
@@ -7,7 +7,7 @@ import json
7
  import fitz
8
  from paddleocr import PaddleOCR
9
 
10
- def test_multiple_approaches():
11
  if len(sys.argv) < 2:
12
  print(json.dumps({"error": "No file path provided"}))
13
  return
@@ -15,105 +15,117 @@ def test_multiple_approaches():
15
  file_path = sys.argv[1]
16
 
17
  try:
18
- print(f"Testing multiple OCR approaches on: {file_path}", file=sys.stderr)
19
-
20
- # Test different DPI settings and OCR configurations
21
- approaches = [
22
- {"name": "High DPI (300)", "dpi": 300, "det_limit_side_len": 1960},
23
- {"name": "Medium DPI (200)", "dpi": 200, "det_limit_side_len": 1280},
24
- {"name": "Low DPI (150)", "dpi": 150, "det_limit_side_len": 960},
25
- {"name": "Your Local Settings", "dpi": 200, "det_limit_side_len": None}
26
- ]
27
 
 
28
  doc = fitz.open(file_path)
29
  print(f"PDF has {len(doc)} pages", file=sys.stderr)
30
 
31
- all_results = {}
 
32
 
33
- for approach in approaches:
34
- print(f"\n=== Testing {approach['name']} ===", file=sys.stderr)
35
-
36
- # Convert first page with specific DPI
37
- page = doc[0]
38
- mat = fitz.Matrix(approach['dpi']/72, approach['dpi']/72)
39
- pix = page.get_pixmap(matrix=mat)
40
-
41
- temp_img = f"/tmp/test_{approach['dpi']}.png"
42
- pix.save(temp_img)
43
-
44
- if os.path.exists(temp_img):
45
- img_size = os.path.getsize(temp_img)
46
- print(f"Image: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)
47
-
48
- # Initialize OCR with specific settings
49
- ocr_kwargs = {
50
- 'use_angle_cls': True,
51
- 'lang': 'en',
52
- 'show_log': False,
53
- 'use_gpu': False
54
- }
55
-
56
- if approach['det_limit_side_len']:
57
- ocr_kwargs['det_limit_side_len'] = approach['det_limit_side_len']
58
-
59
- print(f"OCR settings: {ocr_kwargs}", file=sys.stderr)
60
- ocr = PaddleOCR(**ocr_kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- # Run OCR
63
- result = ocr.ocr(temp_img, cls=True)
 
 
64
 
65
- if result and result[0]:
66
- detections = len(result[0])
67
- print(f"Detections: {detections}", file=sys.stderr)
68
-
69
- # Extract all text
70
- text_parts = []
71
- for i, detection in enumerate(result[0]):
72
- if len(detection) >= 2:
73
- text = str(detection[1][0]) if isinstance(detection[1], (list, tuple)) else str(detection[1])
74
- conf = float(detection[1][1]) if isinstance(detection[1], (list, tuple)) and len(detection[1]) > 1 else 1.0
 
 
75
 
76
- if conf > 0.3: # Lower threshold for testing
77
- text_parts.append(text)
78
- if i < 10: # Show first 10 detections
79
- print(f" {i}: '{text}' ({conf:.2f})", file=sys.stderr)
80
-
81
- all_results[approach['name']] = {
82
- 'detections': detections,
83
- 'text': '\n'.join(text_parts),
84
- 'settings': ocr_kwargs
85
- }
86
- else:
87
- print("No detections", file=sys.stderr)
88
- all_results[approach['name']] = {'detections': 0, 'text': '', 'settings': ocr_kwargs}
89
 
90
  # Clean up
91
  if os.path.exists(temp_img):
92
  os.unlink(temp_img)
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  else:
94
- print(f"Failed to create image: {temp_img}", file=sys.stderr)
95
-
96
- doc.close()
97
-
98
- # Find best result
99
- best_approach = max(all_results.keys(), key=lambda k: all_results[k]['detections'])
100
-
101
- print(f"\nBest approach: {best_approach} with {all_results[best_approach]['detections']} detections", file=sys.stderr)
102
-
103
- # Return the best result
104
- print(json.dumps({
105
- "success": True,
106
- "best_approach": best_approach,
107
- "all_results": all_results,
108
- "text": all_results[best_approach]['text'],
109
- "detections": all_results[best_approach]['detections']
110
- }))
111
 
112
  except Exception as e:
 
 
 
 
113
  print(f"Error: {e}", file=sys.stderr)
114
  import traceback
115
  traceback.print_exc(file=sys.stderr)
116
  print(json.dumps({"success": False, "error": str(e)}))
117
 
118
  if __name__ == "__main__":
119
- test_multiple_approaches()
 
1
  #!/usr/bin/env python3
2
+ # focused_paddle_test.py - Quick test focused on high-quality settings
3
 
4
  import sys
5
  import os
 
7
  import fitz
8
  from paddleocr import PaddleOCR
9
 
10
+ def test_high_quality_ocr():
11
  if len(sys.argv) < 2:
12
  print(json.dumps({"error": "No file path provided"}))
13
  return
 
15
  file_path = sys.argv[1]
16
 
17
  try:
18
+ print(f"Testing high-quality OCR on: {file_path}", file=sys.stderr)
 
 
 
 
 
 
 
 
19
 
20
+ # Open PDF
21
  doc = fitz.open(file_path)
22
  print(f"PDF has {len(doc)} pages", file=sys.stderr)
23
 
24
+ # Convert first page with high quality settings
25
+ page = doc[0]
26
 
27
+ # Use higher DPI and better quality settings
28
+ mat = fitz.Matrix(300/72, 300/72) # 300 DPI like professional scanners
29
+ pix = page.get_pixmap(matrix=mat, alpha=False) # No alpha for better OCR
30
+
31
+ temp_img = "/tmp/high_quality_page.png"
32
+ pix.save(temp_img)
33
+
34
+ if os.path.exists(temp_img):
35
+ img_size = os.path.getsize(temp_img)
36
+ print(f"High quality image: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)
37
+ else:
38
+ print("Failed to create high quality image", file=sys.stderr)
39
+ doc.close()
40
+ return
41
+
42
+ doc.close()
43
+
44
+ # Initialize OCR with optimized settings for medical documents
45
+ print("Initializing OCR with medical document settings...", file=sys.stderr)
46
+ ocr = PaddleOCR(
47
+ use_angle_cls=True, # Detect text orientation
48
+ lang='en', # English language
49
+ show_log=False, # Suppress logs
50
+ use_gpu=False, # CPU mode for serverless
51
+ det_limit_side_len=1960, # Higher detection limit for high-res images
52
+ det_limit_type='max', # Max side length limit
53
+ rec_batch_num=6, # Process more text regions at once
54
+ max_text_length=25, # Allow longer text detection
55
+ use_space_char=True, # Preserve spaces in text
56
+ drop_score=0.2 # Lower threshold to catch more text
57
+ )
58
+ print("OCR initialized with medical settings", file=sys.stderr)
59
+
60
+ # Run OCR with these optimized settings
61
+ print("Running optimized OCR...", file=sys.stderr)
62
+ result = ocr.ocr(temp_img, cls=True)
63
+
64
+ print(f"OCR result type: {type(result)}", file=sys.stderr)
65
+ if result:
66
+ print(f"Result length: {len(result)}", file=sys.stderr)
67
+ if result[0]:
68
+ detections = len(result[0])
69
+ print(f"High-quality approach found {detections} detections", file=sys.stderr)
70
 
71
+ # Extract text with lower confidence threshold
72
+ text_parts = []
73
+ medical_terms = []
74
+ numbers = []
75
 
76
+ for i, detection in enumerate(result[0]):
77
+ if len(detection) >= 2:
78
+ text = str(detection[1][0]) if isinstance(detection[1], (list, tuple)) else str(detection[1])
79
+ conf = float(detection[1][1]) if isinstance(detection[1], (list, tuple)) and len(detection[1]) > 1 else 1.0
80
+
81
+ # Show first 20 detections for debugging
82
+ if i < 20:
83
+ print(f" {i}: '{text}' (confidence: {conf:.2f})", file=sys.stderr)
84
+
85
+ # Use lower confidence threshold (0.2 instead of 0.3)
86
+ if conf > 0.2:
87
+ text_parts.append(text)
88
 
89
+ # Categorize detections
90
+ if any(char.isdigit() for char in text) and '.' in text:
91
+ numbers.append(text)
92
+ elif len(text) > 3 and text.isalpha():
93
+ medical_terms.append(text)
94
+
95
+ full_text = '\n'.join(text_parts)
 
 
 
 
 
 
96
 
97
  # Clean up
98
  if os.path.exists(temp_img):
99
  os.unlink(temp_img)
100
+
101
+ print(f"Extracted {len(text_parts)} text pieces ({len(numbers)} numbers, {len(medical_terms)} terms)", file=sys.stderr)
102
+
103
+ # Return comprehensive result
104
+ print(json.dumps({
105
+ "success": True,
106
+ "text": full_text,
107
+ "detections": detections,
108
+ "numbers_found": numbers[:10], # First 10 numbers
109
+ "terms_found": medical_terms[:10], # First 10 terms
110
+ "settings": "High-quality 300 DPI with medical optimization"
111
+ }))
112
+
113
  else:
114
+ print("First page result is empty", file=sys.stderr)
115
+ print(json.dumps({"success": False, "error": "No text detected"}))
116
+ else:
117
+ print("OCR returned None", file=sys.stderr)
118
+ print(json.dumps({"success": False, "error": "OCR returned no results"}))
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  except Exception as e:
121
+ # Clean up on error
122
+ if os.path.exists("/tmp/high_quality_page.png"):
123
+ os.unlink("/tmp/high_quality_page.png")
124
+
125
  print(f"Error: {e}", file=sys.stderr)
126
  import traceback
127
  traceback.print_exc(file=sys.stderr)
128
  print(json.dumps({"success": False, "error": str(e)}))
129
 
130
  if __name__ == "__main__":
131
+ test_high_quality_ocr()