Spaces:

VanguardAI
/

Arabic-OCR

Running

App Files Files Community

VanguardAI committed on Nov 5, 2025

Commit

bc396b3

verified ·

1 Parent(s): 9866ebc

Update app.py

Browse files

Files changed (1) hide show

app.py +151 -34

app.py CHANGED Viewed

@@ -22,27 +22,40 @@ from arabic_corrector import get_corrector
 # Constants
 MIN_PIXELS = 3136
-MAX_PIXELS = 11289600
 IMAGE_FACTOR = 28
-# Prompts
-prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
-1. Bbox format: [x1, y1, x2, y2]
-2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
-3. Text Extraction & Formatting Rules:
-    - Picture: For the 'Picture' category, the text field should be omitted.
-    - Formula: Format its text as LaTeX.
-    - Table: Format its text as HTML.
-    - All Others (Text, Title, etc.): Format their text as Markdown.
 4. Constraints:
-    - The output text must be the original text from the image, with no translation.
-    - All layout elements must be sorted according to human reading order.
-5. Final Output: The entire output must be a single JSON object.
 """
 # Utility functions
@@ -540,6 +553,91 @@ def _generate_text_and_confidence_for_crop(
         return "", 0.0
 def estimate_text_density(image: Image.Image) -> float:
     """
     Estimate text density in image using pixel analysis.
@@ -570,6 +668,7 @@ def estimate_text_density(image: Image.Image) -> float:
 def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
     """
     Intelligently determine if image should be chunked for better accuracy.
     Returns (should_chunk, reason).
     """
@@ -577,24 +676,28 @@ def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
     total_pixels = width * height
     density = estimate_text_density(image)
-    # Criteria for chunking (prioritizing ACCURACY)
-    # 1. Very large images (>8MP) - model struggles with layout detection
-    if total_pixels > 8_000_000:
-        return True, f"Large image ({total_pixels/1_000_000:.1f}MP) - chunking for better layout detection"
-    # 2. Dense text (>25% coverage) in large image - overwhelming for single pass
-    if density > 0.25 and total_pixels > 4_000_000:
-        return True, f"Dense text ({density*100:.1f}% coverage) in large image - chunking for accuracy"
-    # 3. Very dense text (>40%) regardless of size - likely tables/forms
-    if density > 0.40:
-        return True, f"Very dense text ({density*100:.1f}% coverage) - likely structured document, chunking"
-    # 4. Extreme aspect ratio - likely scrolled document
     aspect_ratio = max(width, height) / min(width, height)
-    if aspect_ratio > 3.0 and total_pixels > 3_000_000:
-        return True, f"Extreme aspect ratio ({aspect_ratio:.1f}) - chunking vertically"
     return False, "Image size and density within optimal range"
@@ -603,25 +706,32 @@ def chunk_image_intelligently(image: Image.Image) -> List[Dict[str, Any]]:
     """
     Chunk image into optimal pieces for processing.
     Uses overlap to prevent text cutting and smart sizing for accuracy.
     Returns list of chunks with metadata.
     """
     width, height = image.size
     # Determine optimal chunk size based on density and dimensions
     density = estimate_text_density(image)
-    if density > 0.40:
-        # Very dense - use smaller chunks for better accuracy
-        chunk_size = 1600
     elif density > 0.25:
-        # Moderate density
-        chunk_size = 2048
     else:
         # Lower density - can use larger chunks
-        chunk_size = 2800
-    overlap = 150  # Generous overlap to prevent text cutting
     chunks = []
     chunk_id = 0
@@ -793,6 +903,13 @@ def process_image(
             # Try to parse JSON output
             layout_data = json.loads(raw_output)
             # 🎯 INTELLIGENT CONFIDENCE SCORING
             # Count text regions to determine if per-region scoring is feasible
             num_text_regions = sum(1 for item in layout_data

 # Constants
 MIN_PIXELS = 3136
+MAX_PIXELS = 16000000  # Increased from 11289600 for better detail on dense forms
 IMAGE_FACTOR = 28
+# Prompts - Enhanced for granular form and dense text detection
+prompt = """Please output the detailed layout information from this document image. This may be a form, table, or dense text document. Detect EVERY text element individually with maximum granularity.
+CRITICAL REQUIREMENTS FOR FORMS AND DENSE TEXT:
+- Detect EACH LINE of text as a SEPARATE bbox - do NOT group multiple lines together
+- For forms: detect each field, label, checkbox, and filled value separately
+- For tables: detect each cell as an individual element
+- Include ALL text regions no matter how small
+- Be extremely precise with bounding boxes - they should tightly fit each text element
+1. Bbox format: [x1, y1, x2, y2] - must be tight and accurate for each element
+2. Layout Categories: ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title']
+   - Use 'Text' for form fields, labels, and general text
+   - Use 'List-item' for form checkboxes and bullet points
+3. Text Extraction Rules:
+    - Picture: Omit text field
+    - Formula: Format as LaTeX
+    - Table: Format as HTML (detect each cell separately)
+    - All Others: Extract exact original text in Markdown format
 4. Constraints:
+    - Output original text with NO translation
+    - Detect handwritten and typed text equally
+    - Sort elements by human reading order (top-to-bottom, left-to-right)
+    - For forms: maintain field relationships (label + value pairs)
+5. Output Format: Single JSON array with maximum detail and granularity.
+REMEMBER: More bboxes = better! Aim for line-level or field-level detection, not paragraph-level.
 """
 # Utility functions
         return "", 0.0
def split_large_regions(layout_data: List[Dict], image_height: int) -> List[Dict]:
    """
    Split large bounding boxes into smaller line-level regions.

    A region is split into equal-height horizontal slices when one of the
    following holds (checked in this order):

    1. Its text contains more than one non-blank line: one slice per line,
       each slice receiving its own line of text.
    2. Its bbox is taller than 80px: ``max(2, int(height / 40))`` slices.
    3. Its bbox is taller than 60px and less than 3x as wide as it is tall:
       ``max(2, int(height / 35))`` slices.

    For cases 2 and 3 the text cannot be attributed to individual slices, so
    every slice keeps the full text and is tagged with ``is_split=True`` and
    its ``split_index``.

    Items are passed through unchanged when they cannot be split safely:
    missing/malformed bbox (not four numbers), empty or non-string text,
    a non-positive bbox height, or a category in
    Picture / Formula / Table / Section-header / Title.

    Args:
        layout_data: List of layout items (dicts) with 'bbox', 'category'
            and 'text' keys. Input items are never mutated.
        image_height: Height of the image. Currently not used by the
            heuristics; kept so existing callers keep working.
    Returns:
        New list of layout items in the original order, with split regions
        replaced by their slices (shallow copies of the source item).
    """
    # Categories that must stay as a single region.
    unsplittable = ('Picture', 'Formula', 'Table', 'Section-header', 'Title')

    result = []
    for item in layout_data:
        bbox = item.get('bbox', [])
        category = item.get('category', 'Text')
        text = item.get('text', '')

        # Model output is untrusted JSON: only attempt a split when the bbox
        # really is four numbers and the text is a non-empty string.
        can_split = (
            isinstance(text, str)
            and bool(text)
            and isinstance(bbox, (list, tuple))
            and len(bbox) == 4
            and all(isinstance(v, (int, float)) for v in bbox)
            and category not in unsplittable
        )
        if not can_split:
            result.append(item)
            continue

        x1, y1, x2, y2 = bbox
        width = x2 - x1
        height = y2 - y1
        if height <= 0:
            # Degenerate/inverted box: slicing would only create more bad boxes.
            result.append(item)
            continue

        # Blank lines carry no text, so they must not get a region of their own.
        text_lines = [line for line in text.strip().split('\n') if line.strip()]
        has_line_texts = len(text_lines) > 1

        if has_line_texts:
            # Explicit line breaks: one slice per line of text.
            line_count = len(text_lines)
        elif height > 80:
            # Tall box: assume ~40px per line of text.
            line_count = max(2, int(height / 40))
        elif height > 60 and width / height < 3:
            # Too narrow for a single line of this height: assume ~35px per line.
            line_count = max(2, int(height / 35))
        else:
            result.append(item)
            continue

        for i in range(line_count):
            new_item = item.copy()
            # Multiply before dividing so neighbouring slices share an edge
            # and the last slice ends on y2.
            top = int(y1 + height * i / line_count)
            bottom = int(y1 + height * (i + 1) / line_count)
            new_item['bbox'] = [x1, top, x2, bottom]
            if has_line_texts:
                new_item['text'] = text_lines[i]
            else:
                # Text cannot be divided; keep it and mark the slice so
                # consumers can recognise duplicates.
                new_item['text'] = text
                new_item['is_split'] = True
                new_item['split_index'] = i
            result.append(new_item)
    return result
 def estimate_text_density(image: Image.Image) -> float:
     """
     Estimate text density in image using pixel analysis.
 def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
     """
     Intelligently determine if image should be chunked for better accuracy.
+    Enhanced for dense forms and structured documents.
     Returns (should_chunk, reason).
     """
     total_pixels = width * height
     density = estimate_text_density(image)
+    # Criteria for chunking (OPTIMIZED FOR FORMS AND DENSE TEXT)
+    # 1. Very large images (>6MP) - reduced threshold for better form detection
+    if total_pixels > 6_000_000:
+        return True, f"Large image ({total_pixels/1_000_000:.1f}MP) - chunking for granular layout detection"
+    # 2. Dense text (>20% coverage) in medium+ images - forms often hit this
+    if density > 0.20 and total_pixels > 3_000_000:
+        return True, f"Dense text ({density*100:.1f}% coverage) - form/document chunking for accuracy"
+    # 3. Very dense text (>30%) regardless of size - CRITICAL for forms
+    if density > 0.30:
+        return True, f"Very dense text ({density*100:.1f}% coverage) - form detected, aggressive chunking"
+    # 4. Extreme aspect ratio - likely scrolled document or long form
     aspect_ratio = max(width, height) / min(width, height)
+    if aspect_ratio > 2.5 and total_pixels > 2_500_000:
+        return True, f"Extreme aspect ratio ({aspect_ratio:.1f}) - document chunking"
+    # 5. Medium density + medium size - conservative chunking for forms
+    if density > 0.15 and total_pixels > 4_500_000:
+        return True, f"Medium-high density ({density*100:.1f}%) on large image - preventive chunking"
     return False, "Image size and density within optimal range"
     """
     Chunk image into optimal pieces for processing.
     Uses overlap to prevent text cutting and smart sizing for accuracy.
+    OPTIMIZED FOR DENSE FORMS: Smaller chunks, more overlap, better granularity.
     Returns list of chunks with metadata.
     """
     width, height = image.size
     # Determine optimal chunk size based on density and dimensions
+    # AGGRESSIVE chunking for forms: smaller sizes = better layout detection
     density = estimate_text_density(image)
+    if density > 0.35:
+        # Very dense forms - use small chunks for maximum accuracy
+        chunk_size = 1400
+        overlap = 200  # Extra overlap for dense text
     elif density > 0.25:
+        # Moderate density forms
+        chunk_size = 1600
+        overlap = 180
+    elif density > 0.15:
+        # Light-medium density
+        chunk_size = 2000
+        overlap = 160
     else:
         # Lower density - can use larger chunks
+        chunk_size = 2400
+        overlap = 150
     chunks = []
     chunk_id = 0
             # Try to parse JSON output
             layout_data = json.loads(raw_output)
+            # 🔧 SMART REGION SPLITTING: Break large bboxes into line-level regions
+            # Critical for forms where model groups multiple fields/lines together
+            original_count = len(layout_data)
+            layout_data = split_large_regions(layout_data, image.height)
+            if len(layout_data) > original_count:
+                print(f"📐 Split {original_count} regions into {len(layout_data)} granular regions (+{len(layout_data)-original_count} regions)")
             # 🎯 INTELLIGENT CONFIDENCE SCORING
             # Count text regions to determine if per-region scoring is feasible
             num_text_regions = sum(1 for item in layout_data