Spaces:

VanguardAI
/

Arabic-OCR

Sleeping

App Files Files Community

VanguardAI committed on Nov 5, 2025

Commit

c265696

verified ·

1 Parent(s): ba601ca

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -64

app.py CHANGED Viewed

@@ -647,14 +647,13 @@ def estimate_text_density(image: Image.Image) -> float:
 def split_text_regions_into_lines(
     image: Image.Image,
     layout_data: List[Dict[str, Any]],
-    min_line_height: int = 35,  # Increased to avoid cutting through text
-    max_line_height: int = 100  # Less aggressive to prevent over-splitting
 ) -> List[Dict[str, Any]]:
     """
     Post-process layout data to split large text regions into individual lines.
     This ensures each line gets its own bounding box for easier verification.
-    Uses smart estimation to avoid cutting through the middle of text lines.
     Args:
         image: Original image
@@ -682,59 +681,22 @@ def split_text_regions_into_lines(
         height = y2 - y1
         width = x2 - x1
-        print(f"   Checking region: height={height:.0f}px, width={width:.0f}px, category={category}")
-        # Safety check: if region is too small (< min_line_height), it might be incorrect/noise
-        if height < min_line_height:
-            print(f"   ⚠️  Region too small (height {height:.0f}px < {min_line_height}px) - may be incomplete line")
-            result.append(item)  # Keep it but flag the issue
-            continue
-        # Safety check: if region is reasonably line-sized already, keep it
-        # This prevents unnecessary splitting of well-detected regions
-        if min_line_height <= height <= max_line_height:
-            print(f"   ✓ Already good size (within {min_line_height}-{max_line_height}px range)")
-            result.append(item)
-            continue
         # If region is tall enough to contain multiple lines, split it
         if height > max_line_height:
-            print(f"   → Will split (height {height:.0f}px > threshold {max_line_height}px)")
-            # Smart line height estimation based on region characteristics
-            # Arabic handwritten: typically 50-70px per line
-            # Arabic typed: typically 35-50px per line
-            # Use conservative estimate to avoid cutting through text
-            # Estimate based on height
-            if height < 150:
-                avg_line_height = 60  # Conservative for small regions
-            elif height < 300:
-                avg_line_height = 55  # Medium regions
-            else:
-                avg_line_height = 50  # Larger documents
             estimated_lines = max(1, round(height / avg_line_height))
             # Don't split into too many lines (might be a paragraph)
             estimated_lines = min(estimated_lines, 10)
-            # Validate: ensure we have at least 2 lines to split, otherwise keep original
-            if estimated_lines < 2:
-                print(f"   → Keeping original (only {estimated_lines} line estimated)")
-                result.append(item)
-                continue
             line_height = height / estimated_lines
-            # Validate: each split line must meet minimum height requirement
-            if line_height < min_line_height:
-                print(f"   → Keeping original (split lines would be too small: {line_height:.0f}px < {min_line_height}px)")
-                result.append(item)
-                continue
-            print(f"   → Splitting into {estimated_lines} lines (each ~{line_height:.0f}px)")
             # Split text content by newlines if available
             text_lines = text_content.split('\n') if text_content else []
@@ -744,12 +706,8 @@ def split_text_regions_into_lines(
                     if not line_text.strip():
                         continue
                     new_item = item.copy()
-                    # Add small padding to avoid cutting through text (5% margin)
-                    margin = line_height * 0.05
-                    new_y1 = y1 + (i * line_height) + (margin if i > 0 else 0)
-                    new_y2 = y1 + ((i + 1) * line_height) - (margin if i < estimated_lines - 1 else 0)
                     new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
                     new_item['text'] = line_text.strip()
                     new_item['split_from_parent'] = True
@@ -759,17 +717,8 @@ def split_text_regions_into_lines(
                 # Split geometrically - mark for re-OCR to get accurate per-line text
                 for i in range(estimated_lines):
                     new_item = item.copy()
-                    # Add padding between lines to avoid cutting through text
-                    # 8% margin between lines to create separation
-                    margin = line_height * 0.08
-                    new_y1 = y1 + (i * line_height) + (margin if i > 0 else 0)
-                    new_y2 = y1 + ((i + 1) * line_height) - (margin if i < estimated_lines - 1 else 0)
-                    # Ensure bbox is valid
-                    if new_y2 <= new_y1:
-                        continue
                     new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
                     # Clear text - will be re-OCR'd per-line for accuracy
                     new_item['text'] = ""
@@ -1853,4 +1802,4 @@ if __name__ == "__main__":
         share=False,
         debug=True,
         show_error=True
-    )

 def split_text_regions_into_lines(
     image: Image.Image,
     layout_data: List[Dict[str, Any]],
+    min_line_height: int = 25,
+    max_line_height: int = 80  # More aggressive - split anything taller than ~2 lines
 ) -> List[Dict[str, Any]]:
     """
     Post-process layout data to split large text regions into individual lines.
     This ensures each line gets its own bounding box for easier verification.
     Args:
         image: Original image
         height = y2 - y1
         width = x2 - x1
+        print(f"   Checking region: height={height}px, width={width}px, category={category}")
         # If region is tall enough to contain multiple lines, split it
         if height > max_line_height:
+            print(f"   → Splitting! (height {height}px > threshold {max_line_height}px)")
+            # Estimate number of lines based on typical line height
+            # Arabic handwritten text: ~40-60px per line
+            # Arabic typed text: ~30-50px per line
+            avg_line_height = 45  # Middle ground
             estimated_lines = max(1, round(height / avg_line_height))
             # Don't split into too many lines (might be a paragraph)
             estimated_lines = min(estimated_lines, 10)
             line_height = height / estimated_lines
             # Split text content by newlines if available
             text_lines = text_content.split('\n') if text_content else []
                     if not line_text.strip():
                         continue
                     new_item = item.copy()
+                    new_y1 = y1 + (i * line_height)
+                    new_y2 = y1 + ((i + 1) * line_height)
                     new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
                     new_item['text'] = line_text.strip()
                     new_item['split_from_parent'] = True
                 # Split geometrically - mark for re-OCR to get accurate per-line text
                 for i in range(estimated_lines):
                     new_item = item.copy()
+                    new_y1 = y1 + (i * line_height)
+                    new_y2 = y1 + ((i + 1) * line_height)
                     new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
                     # Clear text - will be re-OCR'd per-line for accuracy
                     new_item['text'] = ""
         share=False,
         debug=True,
         show_error=True
+    )