mbuckle committed on
Commit
2a0cc07
·
1 Parent(s): b92fc27

Diagnostic test

Browse files
archive/app - enhanced.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import json
3
+ import sys
4
+ import gradio as gr
5
+
6
def _format_confidence(value):
    """Render a confidence score with two decimals, or ``N/A`` if it is not numeric."""
    try:
        return f"{float(value):.2f}"
    except (TypeError, ValueError):
        return "N/A"


def _format_ocr_result(result):
    """Turn the OCR script's decoded JSON payload into the three UI outputs.

    Args:
        result: Dict decoded from the script's stdout. Keys read here:
            ``total_detections``, ``pages_processed``, ``text``,
            ``lab_values``, ``settings``, ``numbers_found``, ``terms_found``.

    Returns:
        ``(summary_markdown, extracted_text, lab_values_markdown)``.
    """
    # `or` fallbacks also cover explicit JSON nulls, which `.get(key, default)`
    # would pass straight through to len()/join().
    text = result.get('text') or ''
    lab_values = result.get('lab_values') or {}
    # str() each item: JSON numbers would otherwise make str.join raise.
    numbers = ', '.join(str(item) for item in (result.get('numbers_found') or [])[:10])
    terms = ', '.join(str(item) for item in (result.get('terms_found') or [])[:10])

    summary_parts = ["\n".join([
        "",
        "**Enhanced OCR Results:**",
        f"- **Total Detections:** {result.get('total_detections', 0)}",
        f"- **Pages Processed:** {result.get('pages_processed', 0)}",
        f"- **Text Length:** {len(text)}",
        f"- **Lab Values Found:** {len(lab_values)}",
        f"- **Settings:** {result.get('settings', 'Unknown')}",
        "",
        f"**Sample Numbers:** {numbers}",
        "",
        f"**Sample Terms:** {terms}",
        "",
        "**Lab Values Detected:**",
        "",
    ])]
    lab_parts = ["**Detected Lab Values:**\n\n"]

    if lab_values:
        for name, data in lab_values.items():
            value = data.get('value', 'N/A')
            confidence = _format_confidence(data.get('confidence', 0))
            summary_parts.append(f"- **{name}:** {value} (confidence: {confidence})\n")
            lab_parts.append(f"**{name}:** {value}\n")
            lab_parts.append(f"  - Raw text: {data.get('raw_text', 'N/A')}\n")
            lab_parts.append(f"  - Confidence: {confidence}\n\n")
    else:
        summary_parts.append("- No lab values detected with current patterns\n")
        lab_parts.append(
            "No lab values detected. The OCR may need pattern adjustments for this document format.\n"
        )

    return "".join(summary_parts), text, "".join(lab_parts)


def test_ocr_minimal(file):
    """Run the enhanced OCR script on an uploaded file and format its output.

    Args:
        file: Value delivered by the ``gr.File`` input: ``None`` when nothing
            was uploaded, a filesystem path string, or an object whose
            ``.name`` attribute holds the path.

    Returns:
        A 3-tuple ``(summary_markdown, extracted_text, lab_values_markdown)``.
        On any failure the first element carries the error message and the
        other two are empty strings; no exception escapes to the UI.
    """
    if file is None:
        return "No file uploaded", "", ""

    try:
        # Accept both shapes Gradio can hand over: a plain path string or a
        # temp-file wrapper exposing the path as `.name`.
        file_path = file if isinstance(file, str) else file.name

        # NOTE(review): this commit moves enhanced_paddle_test.py under
        # archive/, so this absolute path may no longer exist — confirm.
        script_path = "/home/user/app/enhanced_paddle_test.py"
        command = [sys.executable, script_path, file_path]

        print(f"Running: {' '.join(command)}")

        process = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=300,  # 5 minutes for multi-page processing
        )

        print(f"Return code: {process.returncode}")
        print(f"Stderr: {process.stderr}")

        if process.returncode != 0:
            return f"Process failed with code {process.returncode}\nStderr: {process.stderr}", "", ""

        try:
            result = json.loads(process.stdout.strip())
        except json.JSONDecodeError as e:
            return f"JSON parse error: {e}\nStdout: {process.stdout}", "", ""

        return _format_ocr_result(result)

    except subprocess.TimeoutExpired:
        return "Process timed out after 5 minutes", "", ""
    except Exception as e:
        # UI boundary: report the problem in the summary pane instead of crashing.
        return f"Error: {e}", "", ""


# Despite its name this is a Gradio callback, not a pytest test. Opting out of
# collection keeps pytest from treating `file` as a fixture if this module is
# ever imported during a test run.
test_ocr_minimal.__test__ = False
76
+
77
# Gradio UI: upload a PDF, run the OCR callback, and show its three outputs.
with gr.Blocks(title="Enhanced Medical OCR Test") as demo:
    gr.Markdown("# Enhanced Medical Document OCR")
    gr.Markdown("This processes all pages with medical-specific patterns and extracts lab values similar to the local implementation.")

    # Input row: the PDF upload widget next to the trigger button.
    with gr.Row():
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        test_btn = gr.Button("Run Enhanced OCR", variant="primary")

    # Two side-by-side Markdown panes: overall summary and per-lab-value detail.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Results Summary")
            summary_output = gr.Markdown(label="Summary")

        with gr.Column():
            gr.Markdown("### Lab Values")
            lab_output = gr.Markdown(label="Lab Values")

    # Full OCR text in a scrollable textbox.
    with gr.Row():
        gr.Markdown("### Full Extracted Text")
        text_output = gr.Textbox(label="Complete OCR Text", lines=20, max_lines=30)

    # Output order must match the callback's return tuple:
    # (summary markdown, extracted text, lab values markdown).
    callback_outputs = [summary_output, text_output, lab_output]
    test_btn.click(fn=test_ocr_minimal, inputs=[file_input], outputs=callback_outputs)

if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)
enhanced_paddle_test.py → archive/enhanced_paddle_test.py RENAMED
File without changes
diagnostic_test.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # diagnostic_test.py - Debug PaddleOCR performance issues
3
+
4
+ import sys
5
+ import os
6
+ import json
7
+ import fitz
8
+ import cv2
9
+ import numpy as np
10
+ from paddleocr import PaddleOCR
11
+
12
def _count_detections(ocr_result):
    """Return how many text regions an ``ocr()`` result holds in its first entry."""
    if ocr_result and ocr_result[0]:
        return len(ocr_result[0])
    return 0


def _parse_detection(detection):
    """Return ``(text, confidence)`` for one detection, or ``None`` if it is too short."""
    if len(detection) < 2:
        return None
    text_info = detection[1]
    if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
        return str(text_info[0]), float(text_info[1])
    # No separate score present: treat the value as text with full confidence.
    return str(text_info), 1.0


def diagnostic_test():
    """Compare three render/OCR configurations on the first page of a PDF.

    The PDF path is read from ``sys.argv[1]``. Progress and sample output go
    to stderr. Exactly one JSON document is printed to stdout:

    * on success: ``{"success": true, "diagnostics": {...}}`` with the page
      count, the detection count for each configuration, the best method and
      up to ten sample texts from it;
    * on failure: ``{"success": false, "error": "<message>"}``.

    The three runs are: default settings on a 72 DPI render, the "current"
    settings on a 300 DPI render, and "aggressive" settings on a
    contrast-enhanced (CLAHE) grayscale version of the 300 DPI render.
    """
    # Local imports keep this block self-contained without touching the
    # file's import header.
    import shutil
    import tempfile
    import traceback

    if len(sys.argv) < 2:
        print(json.dumps({"success": False, "error": "No file path provided"}))
        return

    file_path = sys.argv[1]
    temp_dir = None

    try:
        print("=== DIAGNOSTIC TEST START ===", file=sys.stderr)

        # Check system info
        print(f"Python version: {sys.version}", file=sys.stderr)
        print(f"OpenCV version: {cv2.__version__}", file=sys.stderr)

        # Check PaddlePaddle installation (informational only). Catching
        # Exception rather than using a bare except lets Ctrl-C and
        # SystemExit propagate.
        try:
            import paddle
            print(f"PaddlePaddle version: {paddle.__version__}", file=sys.stderr)
        except Exception:
            print("PaddlePaddle not available", file=sys.stderr)

        # A private scratch directory avoids collisions between concurrent
        # runs and is removed in one go in the `finally` below.
        temp_dir = tempfile.mkdtemp(prefix="ocr_diagnostic_")
        temp_72 = os.path.join(temp_dir, "test_72dpi.png")
        temp_300 = os.path.join(temp_dir, "test_300dpi.png")
        temp_enhanced = os.path.join(temp_dir, "test_enhanced.png")

        # Open PDF and render the first page three ways
        doc = fitz.open(file_path)
        try:
            total_pages = len(doc)
            print(f"PDF pages: {total_pages}", file=sys.stderr)
            if total_pages == 0:
                raise ValueError("PDF contains no pages")

            page = doc[0]

            # Method 1: Standard quality (72 DPI)
            print("\n=== METHOD 1: Standard 72 DPI ===", file=sys.stderr)
            pix_72 = page.get_pixmap(alpha=False)
            pix_72.save(temp_72)
            print(f"72 DPI image: {pix_72.width}x{pix_72.height}, size: {os.path.getsize(temp_72)}", file=sys.stderr)

            # Method 2: High quality (300 DPI)
            print("\n=== METHOD 2: High 300 DPI ===", file=sys.stderr)
            mat = fitz.Matrix(300/72, 300/72)
            pix_300 = page.get_pixmap(matrix=mat, alpha=False)
            pix_300.save(temp_300)
            print(f"300 DPI image: {pix_300.width}x{pix_300.height}, size: {os.path.getsize(temp_300)}", file=sys.stderr)

            # Method 3: Contrast-enhanced grayscale
            print("\n=== METHOD 3: Preprocessed Image ===", file=sys.stderr)
            img_array = np.frombuffer(pix_300.samples, dtype=np.uint8).reshape(pix_300.height, pix_300.width, 3)
            # The pixmap samples are already in RGB order, so convert straight
            # to gray. Swapping channels first would apply the red and blue
            # luminance weights to the wrong channels.
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
            # Increase local contrast
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            enhanced = clahe.apply(gray)
            cv2.imwrite(temp_enhanced, enhanced)
            print(f"Enhanced image saved: {os.path.getsize(temp_enhanced)} bytes", file=sys.stderr)
        finally:
            # Close the document even if rendering fails.
            doc.close()

        # Test OCR with minimal settings first
        print("\n=== OCR TEST 1: Minimal Settings ===", file=sys.stderr)
        ocr_minimal = PaddleOCR(use_angle_cls=False, lang='en', show_log=False)
        result_minimal = ocr_minimal.ocr(temp_72, cls=False)
        minimal_detections = _count_detections(result_minimal)
        print(f"Minimal OCR on 72 DPI: {minimal_detections} detections", file=sys.stderr)

        # Test OCR with your current settings
        print("\n=== OCR TEST 2: Current Settings ===", file=sys.stderr)
        ocr_current = PaddleOCR(
            use_angle_cls=True,
            lang='en',
            show_log=False,
            use_gpu=False,
            det_limit_side_len=2880,
            det_limit_type='max',
            rec_batch_num=8,
            max_text_length=50,
            use_space_char=True,
            drop_score=0.1,
        )
        result_current = ocr_current.ocr(temp_300, cls=True)
        current_detections = _count_detections(result_current)
        print(f"Current OCR on 300 DPI: {current_detections} detections", file=sys.stderr)

        # Test OCR with more aggressive settings
        print("\n=== OCR TEST 3: Aggressive Settings ===", file=sys.stderr)
        ocr_aggressive = PaddleOCR(
            use_angle_cls=True,
            lang='en',
            show_log=False,
            use_gpu=False,
            det_limit_side_len=4000,  # Even higher
            det_limit_type='max',
            rec_batch_num=1,  # Lower batch for memory
            max_text_length=100,  # Longer text
            use_space_char=True,
            drop_score=0.05,  # Very low threshold
        )
        result_aggressive = ocr_aggressive.ocr(temp_enhanced, cls=True)
        aggressive_detections = _count_detections(result_aggressive)
        print(f"Aggressive OCR on enhanced: {aggressive_detections} detections", file=sys.stderr)

        # Pick the configuration with the most detections. The strict `>`
        # means ties go to the earlier (cheaper) configuration.
        best_method, best_result, best_count = "none", None, 0
        candidates = (
            ("minimal", result_minimal, minimal_detections),
            ("current", result_current, current_detections),
            ("aggressive", result_aggressive, aggressive_detections),
        )
        for method, ocr_result, count in candidates:
            if count > best_count:
                best_method, best_result, best_count = method, ocr_result, count

        print(f"\nBest method: {best_method} with {best_count} detections", file=sys.stderr)

        # Extract and show sample text from best result
        sample_texts = []
        if best_result and best_result[0]:
            for i, detection in enumerate(best_result[0][:10]):  # First 10 only
                parsed = _parse_detection(detection)
                if parsed is None:
                    continue
                text, conf = parsed
                sample_texts.append(f"'{text}' ({conf:.2f})")
                print(f"Sample {i}: '{text}' (conf: {conf:.2f})", file=sys.stderr)

        # Return diagnostic results
        result = {
            "success": True,
            "diagnostics": {
                "total_pages": total_pages,
                "minimal_detections": minimal_detections,
                "current_detections": current_detections,
                "aggressive_detections": aggressive_detections,
                "best_method": best_method,
                "best_count": best_count,
                "sample_texts": sample_texts,
            },
        }

        print(json.dumps(result))

    except Exception as e:
        # Top-level boundary: details go to stderr, and stdout still carries
        # one JSON document describing the failure.
        print(f"Diagnostic error: {e}", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))
    finally:
        # Clean up the rendered images whether or not the run succeeded.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)


if __name__ == "__main__":
    diagnostic_test()