Spaces:

VanguardAI
/

Arabic-OCR

Running

App Files Community

VanguardAI committed on Nov 5, 2025

Commit

98b1d96

verified ·

1 Parent(s): bc396b3

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -152

app.py CHANGED Viewed

@@ -22,40 +22,27 @@ from arabic_corrector import get_corrector
 # Constants
 MIN_PIXELS = 3136
-MAX_PIXELS = 16000000  # Increased from 11289600 for better detail on dense forms
 IMAGE_FACTOR = 28
-# Prompts - Enhanced for granular form and dense text detection
-prompt = """Please output the detailed layout information from this document image. This may be a form, table, or dense text document. Detect EVERY text element individually with maximum granularity.
-CRITICAL REQUIREMENTS FOR FORMS AND DENSE TEXT:
-- Detect EACH LINE of text as a SEPARATE bbox - do NOT group multiple lines together
-- For forms: detect each field, label, checkbox, and filled value separately
-- For tables: detect each cell as an individual element
-- Include ALL text regions no matter how small
-- Be extremely precise with bounding boxes - they should tightly fit each text element
-1. Bbox format: [x1, y1, x2, y2] - must be tight and accurate for each element
-2. Layout Categories: ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title']
-   - Use 'Text' for form fields, labels, and general text
-   - Use 'List-item' for form checkboxes and bullet points
-3. Text Extraction Rules:
-    - Picture: Omit text field
-    - Formula: Format as LaTeX
-    - Table: Format as HTML (detect each cell separately)
-    - All Others: Extract exact original text in Markdown format
 4. Constraints:
-    - Output original text with NO translation
-    - Detect handwritten and typed text equally
-    - Sort elements by human reading order (top-to-bottom, left-to-right)
-    - For forms: maintain field relationships (label + value pairs)
-5. Output Format: Single JSON array with maximum detail and granularity.
-REMEMBER: More bboxes = better! Aim for line-level or field-level detection, not paragraph-level.
 """
 # Utility functions
@@ -553,91 +540,6 @@ def _generate_text_and_confidence_for_crop(
         return "", 0.0
-def split_large_regions(layout_data: List[Dict], image_height: int) -> List[Dict]:
-    """
-    Split large bounding boxes into smaller line-level regions for better granularity.
-    Critical for dense forms where model groups multiple lines together.
-    Args:
-        layout_data: List of layout items with bbox
-        image_height: Height of the image for context
-    Returns:
-        Enhanced layout data with split regions
-    """
-    result = []
-    for item in layout_data:
-        bbox = item.get('bbox', [])
-        category = item.get('category', 'Text')
-        text = item.get('text', '')
-        if len(bbox) != 4 or not text:
-            result.append(item)
-            continue
-        x1, y1, x2, y2 = bbox
-        width = x2 - x1
-        height = y2 - y1
-        # Skip splitting for certain categories
-        if category in ['Picture', 'Formula', 'Table', 'Section-header', 'Title']:
-            result.append(item)
-            continue
-        # Heuristics for splitting:
-        # 1. Very tall regions (likely multiple lines grouped)
-        # 2. Text with newlines (definitely multiple lines)
-        # 3. Aspect ratio suggests multiple stacked lines
-        should_split = False
-        estimated_lines = 1
-        # Check if text has explicit line breaks
-        text_lines = text.strip().split('\n')
-        if len(text_lines) > 1:
-            should_split = True
-            estimated_lines = len(text_lines)
-        # Check if bbox is tall (multiple lines)
-        elif height > 80:  # Assume ~35-40px per line of text
-            should_split = True
-            estimated_lines = max(2, int(height / 40))
-        # Check if aspect ratio suggests stacked text
-        elif height > 60 and width / height < 3:  # Not wide enough for single line
-            should_split = True
-            estimated_lines = max(2, int(height / 35))
-        if should_split and estimated_lines > 1:
-            # Split the region into estimated number of lines
-            line_height = height / estimated_lines
-            for i in range(estimated_lines):
-                new_item = item.copy()
-                new_y1 = y1 + (i * line_height)
-                new_y2 = y1 + ((i + 1) * line_height)
-                new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
-                # Try to split text proportionally
-                if len(text_lines) == estimated_lines:
-                    new_item['text'] = text_lines[i]
-                elif len(text_lines) > 1:
-                    # Distribute available lines
-                    line_idx = int(i * len(text_lines) / estimated_lines)
-                    new_item['text'] = text_lines[line_idx] if line_idx < len(text_lines) else text_lines[-1]
-                else:
-                    # Keep same text but mark as split region
-                    new_item['text'] = text
-                    new_item['is_split'] = True
-                    new_item['split_index'] = i
-                result.append(new_item)
-        else:
-            # Keep as-is
-            result.append(item)
-    return result
 def estimate_text_density(image: Image.Image) -> float:
     """
     Estimate text density in image using pixel analysis.
@@ -668,7 +570,6 @@ def estimate_text_density(image: Image.Image) -> float:
 def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
     """
     Intelligently determine if image should be chunked for better accuracy.
-    Enhanced for dense forms and structured documents.
     Returns (should_chunk, reason).
     """
@@ -676,28 +577,24 @@ def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
     total_pixels = width * height
     density = estimate_text_density(image)
-    # Criteria for chunking (OPTIMIZED FOR FORMS AND DENSE TEXT)
-    # 1. Very large images (>6MP) - reduced threshold for better form detection
-    if total_pixels > 6_000_000:
-        return True, f"Large image ({total_pixels/1_000_000:.1f}MP) - chunking for granular layout detection"
-    # 2. Dense text (>20% coverage) in medium+ images - forms often hit this
-    if density > 0.20 and total_pixels > 3_000_000:
-        return True, f"Dense text ({density*100:.1f}% coverage) - form/document chunking for accuracy"
-    # 3. Very dense text (>30%) regardless of size - CRITICAL for forms
-    if density > 0.30:
-        return True, f"Very dense text ({density*100:.1f}% coverage) - form detected, aggressive chunking"
-    # 4. Extreme aspect ratio - likely scrolled document or long form
     aspect_ratio = max(width, height) / min(width, height)
-    if aspect_ratio > 2.5 and total_pixels > 2_500_000:
-        return True, f"Extreme aspect ratio ({aspect_ratio:.1f}) - document chunking"
-    # 5. Medium density + medium size - conservative chunking for forms
-    if density > 0.15 and total_pixels > 4_500_000:
-        return True, f"Medium-high density ({density*100:.1f}%) on large image - preventive chunking"
     return False, "Image size and density within optimal range"
@@ -706,32 +603,25 @@ def chunk_image_intelligently(image: Image.Image) -> List[Dict[str, Any]]:
     """
     Chunk image into optimal pieces for processing.
     Uses overlap to prevent text cutting and smart sizing for accuracy.
-    OPTIMIZED FOR DENSE FORMS: Smaller chunks, more overlap, better granularity.
     Returns list of chunks with metadata.
     """
     width, height = image.size
     # Determine optimal chunk size based on density and dimensions
-    # AGGRESSIVE chunking for forms: smaller sizes = better layout detection
     density = estimate_text_density(image)
-    if density > 0.35:
-        # Very dense forms - use small chunks for maximum accuracy
-        chunk_size = 1400
-        overlap = 200  # Extra overlap for dense text
-    elif density > 0.25:
-        # Moderate density forms
         chunk_size = 1600
-        overlap = 180
-    elif density > 0.15:
-        # Light-medium density
-        chunk_size = 2000
-        overlap = 160
     else:
         # Lower density - can use larger chunks
-        chunk_size = 2400
-        overlap = 150
     chunks = []
     chunk_id = 0
@@ -903,13 +793,6 @@ def process_image(
             # Try to parse JSON output
             layout_data = json.loads(raw_output)
-            # 🔧 SMART REGION SPLITTING: Break large bboxes into line-level regions
-            # Critical for forms where model groups multiple fields/lines together
-            original_count = len(layout_data)
-            layout_data = split_large_regions(layout_data, image.height)
-            if len(layout_data) > original_count:
-                print(f"📐 Split {original_count} regions into {len(layout_data)} granular regions (+{len(layout_data)-original_count} regions)")
             # 🎯 INTELLIGENT CONFIDENCE SCORING
             # Count text regions to determine if per-region scoring is feasible
             num_text_regions = sum(1 for item in layout_data
@@ -1659,4 +1542,4 @@ if __name__ == "__main__":
         share=False,
         debug=True,
         show_error=True
-    )

 # Constants
 MIN_PIXELS = 3136
+MAX_PIXELS = 11289600
 IMAGE_FACTOR = 28
+# Prompts
+prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
+1. Bbox format: [x1, y1, x2, y2]
+2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
+3. Text Extraction & Formatting Rules:
+    - Picture: For the 'Picture' category, the text field should be omitted.
+    - Formula: Format its text as LaTeX.
+    - Table: Format its text as HTML.
+    - All Others (Text, Title, etc.): Format their text as Markdown.
 4. Constraints:
+    - The output text must be the original text from the image, with no translation.
+    - All layout elements must be sorted according to human reading order.
+5. Final Output: The entire output must be a single JSON object.
 """
 # Utility functions
         return "", 0.0
 def estimate_text_density(image: Image.Image) -> float:
     """
     Estimate text density in image using pixel analysis.
 def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
     """
     Intelligently determine if image should be chunked for better accuracy.
     Returns (should_chunk, reason).
     """
     total_pixels = width * height
     density = estimate_text_density(image)
+    # Criteria for chunking (prioritizing ACCURACY)
+    # 1. Very large images (>8MP) - model struggles with layout detection
+    if total_pixels > 8_000_000:
+        return True, f"Large image ({total_pixels/1_000_000:.1f}MP) - chunking for better layout detection"
+    # 2. Dense text (>25% coverage) in large image - overwhelming for single pass
+    if density > 0.25 and total_pixels > 4_000_000:
+        return True, f"Dense text ({density*100:.1f}% coverage) in large image - chunking for accuracy"
+    # 3. Very dense text (>40%) regardless of size - likely tables/forms
+    if density > 0.40:
+        return True, f"Very dense text ({density*100:.1f}% coverage) - likely structured document, chunking"
+    # 4. Extreme aspect ratio - likely scrolled document
     aspect_ratio = max(width, height) / min(width, height)
+    if aspect_ratio > 3.0 and total_pixels > 3_000_000:
+        return True, f"Extreme aspect ratio ({aspect_ratio:.1f}) - chunking vertically"
     return False, "Image size and density within optimal range"
     """
     Chunk image into optimal pieces for processing.
     Uses overlap to prevent text cutting and smart sizing for accuracy.
     Returns list of chunks with metadata.
     """
     width, height = image.size
     # Determine optimal chunk size based on density and dimensions
     density = estimate_text_density(image)
+    if density > 0.40:
+        # Very dense - use smaller chunks for better accuracy
         chunk_size = 1600
+    elif density > 0.25:
+        # Moderate density
+        chunk_size = 2048
     else:
         # Lower density - can use larger chunks
+        chunk_size = 2800
+    overlap = 150  # Generous overlap to prevent text cutting
     chunks = []
     chunk_id = 0
             # Try to parse JSON output
             layout_data = json.loads(raw_output)
             # 🎯 INTELLIGENT CONFIDENCE SCORING
             # Count text regions to determine if per-region scoring is feasible
             num_text_regions = sum(1 for item in layout_data
         share=False,
         debug=True,
         show_error=True
+    )