Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Community

mbuckle committed on Jun 3, 2025

Commit

b92fc27

1 Parent(s): 3ca6417

Enhanced version 5

Browse files

Files changed (2) hide show

app.py +53 -22
enhanced_paddle_test.py +140 -77

app.py CHANGED Viewed

@@ -5,10 +5,10 @@ import gradio as gr
 def test_ocr_minimal(file):
     if file is None:
-        return "No file uploaded", ""
     try:
-        # Run the focused high-quality test script
         script_path = "/home/user/app/enhanced_paddle_test.py"
         command = [sys.executable, script_path, file.name]
@@ -18,57 +18,88 @@ def test_ocr_minimal(file):
             command,
             capture_output=True,
             text=True,
-            timeout=180  # Increased to 3 minutes for high-quality processing
         )
         print(f"Return code: {process.returncode}")
         print(f"Stderr: {process.stderr}")
-        print(f"Stdout: {process.stdout}")
         if process.returncode == 0:
             try:
                 result = json.loads(process.stdout.strip())
-                # Format the focused results
                 summary = f"""
-**High-Quality OCR Results:**
-- **Detections Found:** {result.get('detections', 0)}
 - **Text Length:** {len(result.get('text', ''))}
 - **Settings:** {result.get('settings', 'Unknown')}
-**Sample Numbers Found:** {', '.join(result.get('numbers_found', []))}
-**Sample Terms Found:** {', '.join(result.get('terms_found', []))}
 """
-                return summary, result.get('text', '')
-            except json.JSONDecodeError:
-                return f"JSON parse error. Stdout: {process.stdout}", ""
         else:
-            return f"Process failed with code {process.returncode}\nStderr: {process.stderr}", ""
     except Exception as e:
-        return f"Error: {e}", ""
-# Simple Gradio interface for testing
-with gr.Blocks(title="Focused High-Quality OCR Test") as demo:
-    gr.Markdown("# Focused High-Quality OCR Test")
-    gr.Markdown("This uses optimized settings for medical documents: 300 DPI, medical-specific OCR parameters, and lower confidence thresholds.")
     with gr.Row():
         file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-        test_btn = gr.Button("Run High-Quality OCR Test")
     with gr.Row():
-        summary_output = gr.Markdown(label="Results Summary")
     with gr.Row():
-        text_output = gr.Textbox(label="Extracted Text", lines=15)
     test_btn.click(
         fn=test_ocr_minimal,
         inputs=[file_input],
-        outputs=[summary_output, text_output]
     )
 if __name__ == "__main__":

 def test_ocr_minimal(file):
     if file is None:
+        return "No file uploaded", "", ""
     try:
+        # Run the enhanced test script
         script_path = "/home/user/app/enhanced_paddle_test.py"
         command = [sys.executable, script_path, file.name]
             command,
             capture_output=True,
             text=True,
+            timeout=300  # 5 minutes for multi-page processing
         )
         print(f"Return code: {process.returncode}")
         print(f"Stderr: {process.stderr}")
         if process.returncode == 0:
             try:
                 result = json.loads(process.stdout.strip())
+                # Format the comprehensive results
                 summary = f"""
+**Enhanced OCR Results:**
+- **Total Detections:** {result.get('total_detections', 0)}
+- **Pages Processed:** {result.get('pages_processed', 0)}
 - **Text Length:** {len(result.get('text', ''))}
+- **Lab Values Found:** {len(result.get('lab_values', {}))}
 - **Settings:** {result.get('settings', 'Unknown')}
+**Sample Numbers:** {', '.join(result.get('numbers_found', [])[:10])}
+**Sample Terms:** {', '.join(result.get('terms_found', [])[:10])}
+**Lab Values Detected:**
 """
+                # Add lab values to summary
+                lab_values = result.get('lab_values', {})
+                if lab_values:
+                    for name, data in lab_values.items():
+                        summary += f"- **{name}:** {data.get('value', 'N/A')} (confidence: {data.get('confidence', 0):.2f})\n"
+                else:
+                    summary += "- No lab values detected with current patterns\n"
+                # Format lab values for display
+                lab_display = "**Detected Lab Values:**\n\n"
+                if lab_values:
+                    for name, data in lab_values.items():
+                        lab_display += f"**{name}:** {data.get('value', 'N/A')}\n"
+                        lab_display += f"  - Raw text: {data.get('raw_text', 'N/A')}\n"
+                        lab_display += f"  - Confidence: {data.get('confidence', 0):.2f}\n\n"
+                else:
+                    lab_display += "No lab values detected. The OCR may need pattern adjustments for this document format.\n"
+                return summary, result.get('text', ''), lab_display
+            except json.JSONDecodeError as e:
+                return f"JSON parse error: {e}\nStdout: {process.stdout}", "", ""
         else:
+            return f"Process failed with code {process.returncode}\nStderr: {process.stderr}", "", ""
+    except subprocess.TimeoutExpired:
+        return "Process timed out after 5 minutes", "", ""
     except Exception as e:
+        return f"Error: {e}", "", ""
+# Enhanced Gradio interface
+with gr.Blocks(title="Enhanced Medical OCR Test") as demo:
+    gr.Markdown("# Enhanced Medical Document OCR")
+    gr.Markdown("This processes all pages with medical-specific patterns and extracts lab values similar to the local implementation.")
     with gr.Row():
         file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+        test_btn = gr.Button("Run Enhanced OCR", variant="primary")
     with gr.Row():
+        with gr.Column():
+            gr.Markdown("### Results Summary")
+            summary_output = gr.Markdown(label="Summary")
+        with gr.Column():
+            gr.Markdown("### Lab Values")
+            lab_output = gr.Markdown(label="Lab Values")
     with gr.Row():
+        gr.Markdown("### Full Extracted Text")
+        text_output = gr.Textbox(label="Complete OCR Text", lines=20, max_lines=30)
     test_btn.click(
         fn=test_ocr_minimal,
         inputs=[file_input],
+        outputs=[summary_output, text_output, lab_output]
     )
 if __name__ == "__main__":

enhanced_paddle_test.py CHANGED Viewed

@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# focused_paddle_test.py - Quick test focused on high-quality settings
 import sys
 import os
@@ -19,113 +19,176 @@ def test_high_quality_ocr():
         # Open PDF
         doc = fitz.open(file_path)
-        print(f"PDF has {len(doc)} pages", file=sys.stderr)
-        # Convert first page with high quality settings
-        page = doc[0]
-        # Use higher DPI and better quality settings
-        mat = fitz.Matrix(300/72, 300/72)  # 300 DPI like professional scanners
-        pix = page.get_pixmap(matrix=mat, alpha=False)  # No alpha for better OCR
-        temp_img = "/tmp/high_quality_page.png"
-        pix.save(temp_img)
-        if os.path.exists(temp_img):
-            img_size = os.path.getsize(temp_img)
-            print(f"High quality image: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)
-        else:
-            print("Failed to create high quality image", file=sys.stderr)
-            doc.close()
-            return
-        doc.close()
-        # Initialize OCR with optimized settings for medical documents
         print("Initializing OCR with medical document settings...", file=sys.stderr)
         ocr = PaddleOCR(
             use_angle_cls=True,          # Detect text orientation
             lang='en',                   # English language
             show_log=False,              # Suppress logs
             use_gpu=False,               # CPU mode for serverless
-            det_limit_side_len=1960,     # Higher detection limit for high-res images
             det_limit_type='max',        # Max side length limit
-            rec_batch_num=6,             # Process more text regions at once
-            max_text_length=25,          # Allow longer text detection
             use_space_char=True,         # Preserve spaces in text
-            drop_score=0.2               # Lower threshold to catch more text
         )
         print("OCR initialized with medical settings", file=sys.stderr)
-        # Run OCR with these optimized settings
-        print("Running optimized OCR...", file=sys.stderr)
-        result = ocr.ocr(temp_img, cls=True)
-        print(f"OCR result type: {type(result)}", file=sys.stderr)
-        if result:
-            print(f"Result length: {len(result)}", file=sys.stderr)
-            if result[0]:
-                detections = len(result[0])
-                print(f"High-quality approach found {detections} detections", file=sys.stderr)
                 # Extract text with lower confidence threshold
-                text_parts = []
-                medical_terms = []
-                numbers = []
                 for i, detection in enumerate(result[0]):
                     if len(detection) >= 2:
-                        text = str(detection[1][0]) if isinstance(detection[1], (list, tuple)) else str(detection[1])
-                        conf = float(detection[1][1]) if isinstance(detection[1], (list, tuple)) and len(detection[1]) > 1 else 1.0
-                        # Show first 20 detections for debugging
-                        if i < 20:
                             print(f"  {i}: '{text}' (confidence: {conf:.2f})", file=sys.stderr)
-                        # Use lower confidence threshold (0.2 instead of 0.3)
-                        if conf > 0.2:
-                            text_parts.append(text)
                             # Categorize detections
-                            if any(char.isdigit() for char in text) and '.' in text:
-                                numbers.append(text)
-                            elif len(text) > 3 and text.isalpha():
-                                medical_terms.append(text)
-                full_text = '\n'.join(text_parts)
-                # Clean up
-                if os.path.exists(temp_img):
-                    os.unlink(temp_img)
-                print(f"Extracted {len(text_parts)} text pieces ({len(numbers)} numbers, {len(medical_terms)} terms)", file=sys.stderr)
-                # Return comprehensive result
-                print(json.dumps({
-                    "success": True,
-                    "text": full_text,
-                    "detections": detections,
-                    "numbers_found": numbers[:10],  # First 10 numbers
-                    "terms_found": medical_terms[:10],  # First 10 terms
-                    "settings": "High-quality 300 DPI with medical optimization"
-                }))
-            else:
-                print("First page result is empty", file=sys.stderr)
-                print(json.dumps({"success": False, "error": "No text detected"}))
-        else:
-            print("OCR returned None", file=sys.stderr)
-            print(json.dumps({"success": False, "error": "OCR returned no results"}))
     except Exception as e:
         # Clean up on error
-        if os.path.exists("/tmp/high_quality_page.png"):
-            os.unlink("/tmp/high_quality_page.png")
         print(f"Error: {e}", file=sys.stderr)
         import traceback
         traceback.print_exc(file=sys.stderr)
         print(json.dumps({"success": False, "error": str(e)}))
 if __name__ == "__main__":
     test_high_quality_ocr()

 #!/usr/bin/env python3
+# enhanced_paddle_test.py - Improved to match local implementation
 import sys
 import os
         # Open PDF
         doc = fitz.open(file_path)
+        total_pages = len(doc)
+        print(f"PDF has {total_pages} pages", file=sys.stderr)
+        all_text_parts = []
+        all_numbers = []
+        all_medical_terms = []
+        total_detections = 0
+        # Initialize OCR once with optimized settings for medical documents
         print("Initializing OCR with medical document settings...", file=sys.stderr)
         ocr = PaddleOCR(
             use_angle_cls=True,          # Detect text orientation
             lang='en',                   # English language
             show_log=False,              # Suppress logs
             use_gpu=False,               # CPU mode for serverless
+            det_limit_side_len=2880,     # Higher detection limit for high-res images
             det_limit_type='max',        # Max side length limit
+            rec_batch_num=8,             # Process more text regions at once
+            max_text_length=50,          # Allow longer text detection
             use_space_char=True,         # Preserve spaces in text
+            drop_score=0.1               # Much lower threshold to catch more text
         )
         print("OCR initialized with medical settings", file=sys.stderr)
+        # Process all pages (not just first page)
+        for page_num in range(total_pages):
+            print(f"Processing page {page_num + 1} of {total_pages}", file=sys.stderr)
+            page = doc[page_num]
+            # Use higher DPI and better quality settings
+            mat = fitz.Matrix(300/72, 300/72)  # 300 DPI like professional scanners
+            pix = page.get_pixmap(matrix=mat, alpha=False)  # No alpha for better OCR
+            temp_img = f"/tmp/high_quality_page_{page_num}.png"
+            pix.save(temp_img)
+            if os.path.exists(temp_img):
+                img_size = os.path.getsize(temp_img)
+                print(f"High quality image: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)
+            else:
+                print(f"Failed to create high quality image for page {page_num}", file=sys.stderr)
+                continue
+            # Run OCR on this page
+            print(f"Running optimized OCR on page {page_num + 1}...", file=sys.stderr)
+            result = ocr.ocr(temp_img, cls=True)
+            if result and result[0]:
+                page_detections = len(result[0])
+                total_detections += page_detections
+                print(f"Page {page_num + 1}: found {page_detections} detections", file=sys.stderr)
                 # Extract text with lower confidence threshold
+                page_text_parts = []
                 for i, detection in enumerate(result[0]):
                     if len(detection) >= 2:
+                        text_info = detection[1]
+                        if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
+                            text = str(text_info[0])
+                            conf = float(text_info[1])
+                        else:
+                            text = str(text_info)
+                            conf = 1.0
+                        # Show some detections for debugging (first page only)
+                        if page_num == 0 and i < 20:
                             print(f"  {i}: '{text}' (confidence: {conf:.2f})", file=sys.stderr)
+                        # Use very low confidence threshold (0.1 instead of 0.2)
+                        if conf > 0.1 and len(text.strip()) > 0:
+                            page_text_parts.append(text)
+                            all_text_parts.append(text)
                             # Categorize detections
+                            if any(char.isdigit() for char in text):
+                                # Look for numbers with decimals or medical values
+                                if '.' in text or any(c.isdigit() for c in text):
+                                    all_numbers.append(text)
+                            elif len(text) > 2 and any(c.isalpha() for c in text):
+                                # Look for potential medical terms
+                                all_medical_terms.append(text)
+                print(f"Page {page_num + 1}: extracted {len(page_text_parts)} text pieces", file=sys.stderr)
+            # Clean up page image
+            if os.path.exists(temp_img):
+                os.unlink(temp_img)
+        doc.close()
+        # Combine all text
+        full_text = '\n'.join(all_text_parts)
+        print(f"Total extracted: {len(all_text_parts)} text pieces ({len(all_numbers)} numbers, {len(all_medical_terms)} terms)", file=sys.stderr)
+        print(f"Total detections across {total_pages} pages: {total_detections}", file=sys.stderr)
+        # Apply basic lab patterns similar to local implementation
+        lab_values = apply_basic_patterns(full_text)
+        # Return comprehensive result
+        result_data = {
+            "success": True,
+            "text": full_text,
+            "total_detections": total_detections,
+            "pages_processed": total_pages,
+            "numbers_found": all_numbers[:20],  # First 20 numbers
+            "terms_found": all_medical_terms[:20],  # First 20 terms
+            "lab_values": lab_values,
+            "settings": f"High-quality 300 DPI with medical optimization, {total_pages} pages"
+        }
+        print(json.dumps(result_data))
     except Exception as e:
         # Clean up on error
+        for i in range(10):  # Clean up any temp files
+            temp_file = f"/tmp/high_quality_page_{i}.png"
+            if os.path.exists(temp_file):
+                os.unlink(temp_file)
         print(f"Error: {e}", file=sys.stderr)
         import traceback
         traceback.print_exc(file=sys.stderr)
         print(json.dumps({"success": False, "error": str(e)}))
+def apply_basic_patterns(text):
+    """Apply basic lab value patterns similar to local implementation"""
+    lab_values = {}
+    if not text:
+        return lab_values
+    # Define basic patterns for common lab values
+    patterns = {
+        'TSH': r'TSH[:\s]*(\d+\.?\d*)',
+        'Testosterone': r'Testosterone[:\s]*(\d+\.?\d*)',
+        'C-Reactive Protein': r'C[-\s]*Reactive[-\s]*Protein[:\s]*(\d+\.?\d*)',
+        'HDL': r'HDL[-\s]*C?[:\s]*(\d+\.?\d*)',
+        'LDL': r'LDL[-\s]*C?[:\s]*(\d+\.?\d*)',
+        'Triglycerides': r'Triglycerides[:\s]*(\d+\.?\d*)',
+        'Glucose': r'Glucose[:\s]*(\d+\.?\d*)',
+        'Creatinine': r'Creatinine[:\s]*(\d+\.?\d*)',
+        'Hemoglobin': r'Hemoglobin[:\s]*(\d+\.?\d*)',
+        'WBC': r'WBC[:\s]*(\d+\.?\d*)',
+        'RBC': r'RBC[:\s]*(\d+\.?\d*)'
+    }
+    import re
+    # Normalize text for pattern matching
+    normalized_text = re.sub(r'\s+', ' ', text)
+    for test_name, pattern in patterns.items():
+        try:
+            match = re.search(pattern, normalized_text, re.IGNORECASE)
+            if match:
+                value = float(match.group(1))
+                lab_values[test_name] = {
+                    "value": value,
+                    "raw_text": match.group(0),
+                    "confidence": 0.8
+                }
+                print(f"Found {test_name}: {value}", file=sys.stderr)
+        except (ValueError, IndexError) as e:
+            print(f"Error parsing {test_name}: {e}", file=sys.stderr)
+            continue
+    return lab_values
 if __name__ == "__main__":
     test_high_quality_ocr()