Spaces:

VanguardAI
/

Arabic-OCR

Sleeping

App Files Files Community

VanguardAI committed on Nov 5, 2025

Commit

ba601ca

verified ·

1 Parent(s): 38e40e9

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -12

app.py CHANGED Viewed

@@ -647,13 +647,14 @@ def estimate_text_density(image: Image.Image) -> float:
 def split_text_regions_into_lines(
     image: Image.Image,
     layout_data: List[Dict[str, Any]],
-    min_line_height: int = 25,
-    max_line_height: int = 80  # More aggressive - split anything taller than ~2 lines
 ) -> List[Dict[str, Any]]:
     """
     Post-process layout data to split large text regions into individual lines.
     This ensures each line gets its own bounding box for easier verification.
     Args:
         image: Original image
@@ -681,22 +682,59 @@ def split_text_regions_into_lines(
         height = y2 - y1
         width = x2 - x1
-        print(f"   Checking region: height={height}px, width={width}px, category={category}")
         # If region is tall enough to contain multiple lines, split it
         if height > max_line_height:
-            print(f"   → Splitting! (height {height}px > threshold {max_line_height}px)")
-            # Estimate number of lines based on typical line height
-            # Arabic handwritten text: ~40-60px per line
-            # Arabic typed text: ~30-50px per line
-            avg_line_height = 45  # Middle ground
             estimated_lines = max(1, round(height / avg_line_height))
             # Don't split into too many lines (might be a paragraph)
             estimated_lines = min(estimated_lines, 10)
             line_height = height / estimated_lines
             # Split text content by newlines if available
             text_lines = text_content.split('\n') if text_content else []
@@ -706,8 +744,12 @@ def split_text_regions_into_lines(
                     if not line_text.strip():
                         continue
                     new_item = item.copy()
-                    new_y1 = y1 + (i * line_height)
-                    new_y2 = y1 + ((i + 1) * line_height)
                     new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
                     new_item['text'] = line_text.strip()
                     new_item['split_from_parent'] = True
@@ -717,8 +759,17 @@ def split_text_regions_into_lines(
                 # Split geometrically - mark for re-OCR to get accurate per-line text
                 for i in range(estimated_lines):
                     new_item = item.copy()
-                    new_y1 = y1 + (i * line_height)
-                    new_y2 = y1 + ((i + 1) * line_height)
                     new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
                     # Clear text - will be re-OCR'd per-line for accuracy
                     new_item['text'] = ""

 def split_text_regions_into_lines(
     image: Image.Image,
     layout_data: List[Dict[str, Any]],
+    min_line_height: int = 35,  # Increased to avoid cutting through text
+    max_line_height: int = 100  # Less aggressive to prevent over-splitting
 ) -> List[Dict[str, Any]]:
     """
     Post-process layout data to split large text regions into individual lines.
     This ensures each line gets its own bounding box for easier verification.
+    Uses smart estimation to avoid cutting through the middle of text lines.
     Args:
         image: Original image
         height = y2 - y1
         width = x2 - x1
+        print(f"   Checking region: height={height:.0f}px, width={width:.0f}px, category={category}")
+        # Safety check: if region is too small (< min_line_height), it might be incorrect/noise
+        if height < min_line_height:
+            print(f"   ⚠️  Region too small (height {height:.0f}px < {min_line_height}px) - may be incomplete line")
+            result.append(item)  # Keep it but flag the issue
+            continue
+        # Safety check: if region is reasonably line-sized already, keep it
+        # This prevents unnecessary splitting of well-detected regions
+        if min_line_height <= height <= max_line_height:
+            print(f"   ✓ Already good size (within {min_line_height}-{max_line_height}px range)")
+            result.append(item)
+            continue
         # If region is tall enough to contain multiple lines, split it
         if height > max_line_height:
+            print(f"   → Will split (height {height:.0f}px > threshold {max_line_height}px)")
+            # Smart line height estimation based on region characteristics
+            # Arabic handwritten: typically 50-70px per line
+            # Arabic typed: typically 35-50px per line
+            # Use conservative estimate to avoid cutting through text
+            # Estimate based on height
+            if height < 150:
+                avg_line_height = 60  # Conservative for small regions
+            elif height < 300:
+                avg_line_height = 55  # Medium regions
+            else:
+                avg_line_height = 50  # Larger documents
             estimated_lines = max(1, round(height / avg_line_height))
             # Don't split into too many lines (might be a paragraph)
             estimated_lines = min(estimated_lines, 10)
+            # Validate: ensure we have at least 2 lines to split, otherwise keep original
+            if estimated_lines < 2:
+                print(f"   → Keeping original (only {estimated_lines} line estimated)")
+                result.append(item)
+                continue
             line_height = height / estimated_lines
+            # Validate: each split line must meet minimum height requirement
+            if line_height < min_line_height:
+                print(f"   → Keeping original (split lines would be too small: {line_height:.0f}px < {min_line_height}px)")
+                result.append(item)
+                continue
+            print(f"   → Splitting into {estimated_lines} lines (each ~{line_height:.0f}px)")
             # Split text content by newlines if available
             text_lines = text_content.split('\n') if text_content else []
                     if not line_text.strip():
                         continue
                     new_item = item.copy()
+                    # Add small padding to avoid cutting through text (5% margin)
+                    margin = line_height * 0.05
+                    new_y1 = y1 + (i * line_height) + (margin if i > 0 else 0)
+                    new_y2 = y1 + ((i + 1) * line_height) - (margin if i < estimated_lines - 1 else 0)
                     new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
                     new_item['text'] = line_text.strip()
                     new_item['split_from_parent'] = True
                 # Split geometrically - mark for re-OCR to get accurate per-line text
                 for i in range(estimated_lines):
                     new_item = item.copy()
+                    # Add padding between lines to avoid cutting through text
+                    # 8% margin between lines to create separation
+                    margin = line_height * 0.08
+                    new_y1 = y1 + (i * line_height) + (margin if i > 0 else 0)
+                    new_y2 = y1 + ((i + 1) * line_height) - (margin if i < estimated_lines - 1 else 0)
+                    # Ensure bbox is valid
+                    if new_y2 <= new_y1:
+                        continue
                     new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
                     # Clear text - will be re-OCR'd per-line for accuracy
                     new_item['text'] = ""