Spaces:

VanguardAI
/

Arabic-OCR

Sleeping

App Files Files Community

VanguardAI committed on Nov 5, 2025

Commit

67294ce

verified ·

1 Parent(s): 6896445

Update app.py

Browse files

Files changed (1) hide show

app.py +257 -100

app.py CHANGED Viewed

@@ -644,7 +644,112 @@ def estimate_text_density(image: Image.Image) -> float:
         return 0.1  # Default to low density
-def detect_line_spacing(image: Image.Image, bbox: List[int]) -> float:
     """
     Detect average line spacing in a text region using horizontal projection analysis.
@@ -662,37 +767,32 @@ def detect_line_spacing(image: Image.Image, bbox: List[int]) -> float:
             return None
         # Horizontal projection: sum of dark pixels per row
-        # Text lines will have higher values
-        row_sums = np.sum(img_array < 128, axis=1)  # Count dark pixels per row
-        if len(row_sums) < 10:  # Need at least some rows
             return None
         # Find peaks (text lines) and valleys (spacing between lines)
-        # Use adaptive threshold to identify text rows
         mean_val = np.mean(row_sums)
         std_val = np.std(row_sums)
-        threshold = max(mean_val * 0.3, mean_val - std_val * 0.5)
         text_rows = np.where(row_sums > threshold)[0]
         if len(text_rows) < 2:
-            return None  # Can't detect spacing with less than 2 text rows
-        # Find gaps between text rows (line spacing)
         # Group consecutive rows to find line centers
         line_centers = []
         current_group = [text_rows[0]]
         for i in range(1, len(text_rows)):
-            if text_rows[i] - text_rows[i-1] <= 3:  # Consecutive or very close rows
                 current_group.append(text_rows[i])
             else:
-                # End of current line, start new
                 line_centers.append(int(np.mean(current_group)))
                 current_group = [text_rows[i]]
-        # Add last group
         if current_group:
             line_centers.append(int(np.mean(current_group)))
@@ -703,41 +803,118 @@ def detect_line_spacing(image: Image.Image, bbox: List[int]) -> float:
         spacings = []
         for i in range(len(line_centers) - 1):
             spacing = line_centers[i+1] - line_centers[i]
-            if spacing > 10:  # Minimum reasonable spacing
                 spacings.append(spacing)
         if spacings:
-            # Use median for robustness against outliers
-            avg_spacing = np.median(spacings)
-            print(f"   → Detected {len(line_centers)} lines with avg spacing {avg_spacing:.1f}px")
-            return float(avg_spacing)
         return None
     except Exception as e:
-        print(f"   ⚠️ Could not detect line spacing: {e}")
         return None
 def split_text_regions_into_lines(
     image: Image.Image,
     layout_data: List[Dict[str, Any]],
-    min_line_height: int = 25,
-    max_line_height: int = 80  # More aggressive - split anything taller than ~2 lines
 ) -> List[Dict[str, Any]]:
     """
-    Post-process layout data to split large text regions into individual lines.
-    Uses intelligent line spacing detection and padding to avoid cutting through text.
     Args:
         image: Original image
         layout_data: Layout detection results
-        min_line_height: Minimum height for a text line (pixels)
-        max_line_height: Maximum height for a single line before splitting
     Returns:
         Updated layout data with lines split
     """
     result = []
     split_count = 0
@@ -755,110 +932,90 @@ def split_text_regions_into_lines(
         height = y2 - y1
         width = x2 - x1
-        print(f"   Checking region: height={height}px, width={width}px, category={category}")
-        # If region is already reasonably line-sized, keep it
-        if height <= max_line_height:
-            print(f"   ✓ Already line-sized (height {height}px <= {max_line_height}px)")
-            result.append(item)
-            continue
-        # If region is tall enough to contain multiple lines, split it
-        print(f"   → Splitting! (height {height}px > threshold {max_line_height}px)")
-        # Try to detect actual line spacing from the image
-        detected_spacing = detect_line_spacing(image, bbox)
-        if detected_spacing and detected_spacing > min_line_height:
-            # Use detected spacing for more accurate splitting
-            estimated_lines = max(2, round(height / detected_spacing))
-            line_height = detected_spacing
-            print(f"   → Detected line spacing: {detected_spacing:.1f}px, splitting into ~{estimated_lines} lines")
-        else:
-            # Fallback to estimated line height
-            # Arabic handwritten text: ~40-60px per line
-            # Arabic typed text: ~30-50px per line
-            avg_line_height = 45  # Middle ground
-            estimated_lines = max(2, round(height / avg_line_height))
-            line_height = height / estimated_lines
-            print(f"   → Using estimated line height: {avg_line_height}px, splitting into {estimated_lines} lines")
-        # Don't split into too many lines (might be a paragraph)
-        estimated_lines = min(estimated_lines, 10)
-        # Calculate padding to avoid cutting through text
-        # Use 10% of line height as padding, but at least 3px
-        padding = max(3, int(line_height * 0.1))
-        # Split text content by newlines if available
-        text_lines = text_content.split('\n') if text_content else []
-        # If we have the same number of text lines as estimated, use them
-        if len(text_lines) == estimated_lines and len(text_lines) > 1:
-            for i, line_text in enumerate(text_lines):
-                if not line_text.strip():
-                    continue
                 new_item = item.copy()
-                # Calculate bbox with padding to avoid cutting text
-                if i == 0:
-                    # First line: pad bottom only
-                    new_y1 = y1
-                    new_y2 = y1 + line_height + padding
-                elif i == estimated_lines - 1:
-                    # Last line: pad top only
-                    new_y1 = y1 + (i * line_height) - padding
-                    new_y2 = y2
-                else:
-                    # Middle lines: pad both top and bottom
-                    new_y1 = y1 + (i * line_height) - padding
-                    new_y2 = y1 + ((i + 1) * line_height) + padding
-                # Ensure bbox is valid and within image bounds
-                new_y1 = max(y1, int(new_y1))
-                new_y2 = min(y2, int(new_y2))
-                if new_y2 > new_y1:  # Valid bbox
-                    new_item['bbox'] = [x1, new_y1, x2, new_y2]
-                    new_item['text'] = line_text.strip()
-                    new_item['split_from_parent'] = True
-                    result.append(new_item)
             split_count += 1
-        else:
-            # Split geometrically - mark for re-OCR to get accurate per-line text
             for i in range(estimated_lines):
                 new_item = item.copy()
-                # Calculate bbox with padding to avoid cutting text
                 if i == 0:
-                    # First line: pad bottom only
                     new_y1 = y1
                     new_y2 = y1 + line_height + padding
                 elif i == estimated_lines - 1:
-                    # Last line: pad top only
                     new_y1 = y1 + (i * line_height) - padding
                     new_y2 = y2
                 else:
-                    # Middle lines: pad both top and bottom
                     new_y1 = y1 + (i * line_height) - padding
                     new_y2 = y1 + ((i + 1) * line_height) + padding
-                # Ensure bbox is valid and within image bounds
                 new_y1 = max(y1, int(new_y1))
                 new_y2 = min(y2, int(new_y2))
-                if new_y2 > new_y1:  # Valid bbox
                     new_item['bbox'] = [x1, new_y1, x2, new_y2]
-                    # Clear text - will be re-OCR'd per-line for accuracy
                     new_item['text'] = ""
                     new_item['split_from_parent'] = True
-                    new_item['needs_reocr'] = True  # Flag for re-processing
                     new_item['line_number'] = i + 1
                     result.append(new_item)
             split_count += 1
     if split_count > 0:
-        print(f"📏 Split {split_count} large regions into individual lines ({len(layout_data)} → {len(result)} regions)")
     return result

         return 0.1  # Default to low density
+def analyze_image_line_characteristics(image: Image.Image) -> Dict[str, float]:
+    """
+    Analyze image to determine optimal line detection parameters.
+    Works adaptively for any image type (sparse, dense, tables, forms).
+    Returns dict with: avg_line_height, min_line_height, max_line_height, line_spacing
+    """
+    try:
+        width, height = image.size  # width is not used in this function
+        gray = image.convert('L')  # single-channel grayscale
+        img_array = np.array(gray)
+        # Horizontal projection: number of pixels darker than 128 in each row
+        row_sums = np.sum(img_array < 128, axis=1)
+        if len(row_sums) < 10:
+            # Fewer than 10 pixel rows: skip the analysis and derive every value from height alone
+            return {
+                'avg_line_height': height / 10,  # Assume ~10 lines
+                'min_line_height': max(15, height / 20),
+                'max_line_height': height / 3,  # Split if > 1/3 of image height
+                'line_spacing': height / 15
+            }
+        # Rows whose dark-pixel count exceeds an adaptive threshold are treated as text rows
+        mean_val = np.mean(row_sums)
+        std_val = np.std(row_sums)
+        threshold = max(mean_val * 0.2, mean_val - std_val * 0.3)  # never below 20% of the mean
+        text_rows = np.where(row_sums > threshold)[0]
+        if len(text_rows) < 2:
+            # Fewer than two text rows found: estimate from height (~50 px per line, at least 5 lines)
+            estimated_lines = max(5, height // 50)
+            return {
+                'avg_line_height': height / estimated_lines,
+                'min_line_height': max(15, height / (estimated_lines * 2)),
+                'max_line_height': height / 2,  # Split if > half image
+                'line_spacing': height / estimated_lines
+            }
+        # Group text rows into lines; each group's mean row index becomes that line's center
+        line_centers = []
+        current_group = [text_rows[0]]
+        for i in range(1, len(text_rows)):
+            if text_rows[i] - text_rows[i-1] <= 5:  # Same line: gap of at most 5 rows
+                current_group.append(text_rows[i])
+            else:
+                line_centers.append(int(np.mean(current_group)))
+                current_group = [text_rows[i]]
+        if current_group:  # flush the final group
+            line_centers.append(int(np.mean(current_group)))
+        if len(line_centers) < 2:
+            # Only one line center found, so spacing cannot be measured (~60 px per line, at least 3 lines)
+            estimated_lines = max(3, height // 60)
+            return {
+                'avg_line_height': height / estimated_lines,
+                'min_line_height': max(20, height / (estimated_lines * 2)),
+                'max_line_height': height / 2,
+                'line_spacing': height / estimated_lines
+            }
+        # Distances between adjacent line centers
+        spacings = []
+        for i in range(len(line_centers) - 1):
+            spacing = line_centers[i+1] - line_centers[i]
+            if spacing > 8:  # Ignore gaps of 8 px or less
+                spacings.append(spacing)
+        if spacings:
+            avg_spacing = np.median(spacings)  # median, despite the "avg" name
+            min_spacing = np.percentile(spacings, 25)  # lower quartile, not the true minimum
+            max_spacing = np.percentile(spacings, 75)  # upper quartile, not the true maximum
+            return {
+                'avg_line_height': float(avg_spacing),
+                'min_line_height': float(max(15, min_spacing * 0.6)),  # 60% of the lower-quartile spacing, floor of 15
+                'max_line_height': float(max_spacing * 1.5),  # 1.5x the upper-quartile spacing = likely multi-line
+                'line_spacing': float(avg_spacing),
+                'num_lines_detected': len(line_centers)  # extra key, returned only on this path; an int, not a float
+            }
+        # Fallback: line centers were found but no spacing above 8 px survived the filter
+        estimated_lines = max(3, height // 50)
+        return {
+            'avg_line_height': height / estimated_lines,
+            'min_line_height': max(20, height / (estimated_lines * 2)),
+            'max_line_height': height / 2,
+            'line_spacing': height / estimated_lines
+        }
+    except Exception as e:
+        print(f"   ⚠️ Error analyzing image: {e}")
+        # Fixed defaults that do not depend on the image
+        width, height = image.size  # NOTE(review): unused below; would raise again here if image.size caused the failure
+        return {
+            'avg_line_height': 50,
+            'min_line_height': 25,
+            'max_line_height': 100,
+            'line_spacing': 50
+        }
+def detect_line_spacing(image: Image.Image, bbox: List[int]) -> Optional[float]:
     """
     Detect average line spacing in a text region using horizontal projection analysis.
             return None
         # Horizontal projection: sum of dark pixels per row
+        row_sums = np.sum(img_array < 128, axis=1)
+        if len(row_sums) < 10:
             return None
         # Find peaks (text lines) and valleys (spacing between lines)
         mean_val = np.mean(row_sums)
         std_val = np.std(row_sums)
+        threshold = max(mean_val * 0.25, mean_val - std_val * 0.4)
         text_rows = np.where(row_sums > threshold)[0]
         if len(text_rows) < 2:
+            return None
         # Group consecutive rows to find line centers
         line_centers = []
         current_group = [text_rows[0]]
         for i in range(1, len(text_rows)):
+            if text_rows[i] - text_rows[i-1] <= 3:
                 current_group.append(text_rows[i])
             else:
                 line_centers.append(int(np.mean(current_group)))
                 current_group = [text_rows[i]]
         if current_group:
             line_centers.append(int(np.mean(current_group)))
         spacings = []
         for i in range(len(line_centers) - 1):
             spacing = line_centers[i+1] - line_centers[i]
+            if spacing > 10:
                 spacings.append(spacing)
         if spacings:
+            return float(np.median(spacings))
         return None
     except Exception as e:
         return None
+def detect_actual_line_breaks_in_region(image: Image.Image, bbox: List[int]) -> List[int]:
+    """
+    Detect actual line break positions within a text region using horizontal projection.
+    Returns list of y-coordinates where lines break.
+    """
+    try:
+        x1, y1, x2, y2 = bbox  # passed to crop() as (left, upper, right, lower)
+        crop = image.crop((x1, y1, x2, y2))
+        gray = crop.convert('L')  # single-channel grayscale
+        img_array = np.array(gray)
+        if img_array.size == 0:  # empty crop: nothing to analyze
+            return []
+        # Horizontal projection: number of pixels darker than 128 in each row of the crop
+        row_sums = np.sum(img_array < 128, axis=1)
+        if len(row_sums) < 5:  # fewer than 5 pixel rows: treated as unsplittable
+            return []
+        # Find valleys (spaces between lines) and peaks (text lines)
+        mean_val = np.mean(row_sums)
+        std_val = np.std(row_sums)
+        text_threshold = max(mean_val * 0.25, mean_val - std_val * 0.4)  # never below 25% of the mean
+        space_threshold = mean_val * 0.15  # Much lower for spaces
+        # Find text rows and space rows
+        text_rows = np.where(row_sums > text_threshold)[0]
+        space_rows = np.where(row_sums < space_threshold)[0]  # NOTE(review): computed but not used below
+        if len(text_rows) < 2:
+            return []
+        # Group text rows that are at most 3 rows apart into one line
+        line_groups = []
+        current_group = [text_rows[0]]
+        for i in range(1, len(text_rows)):
+            if text_rows[i] - text_rows[i-1] <= 3:
+                current_group.append(text_rows[i])
+            else:
+                if len(current_group) > 0:  # always true here: current_group is never empty
+                    line_groups.append(current_group)
+                current_group = [text_rows[i]]
+        if len(current_group) > 0:  # flush the final group
+            line_groups.append(current_group)
+        if len(line_groups) < 2:
+            return []  # Single line or can't detect
+        # Break point = midpoint of the gap between one line's last row and the next line's first row
+        break_points = []
+        for i in range(len(line_groups) - 1):
+            last_row_of_line1 = max(line_groups[i])
+            first_row_of_line2 = min(line_groups[i+1])
+            break_point = (last_row_of_line1 + first_row_of_line2) // 2
+            break_points.append(y1 + break_point)  # Offset by y1: crop row -> full-image y coordinate
+        return break_points  # one entry per gap, in top-to-bottom order
+    except Exception as e:
+        print(f"   ⚠️ Error detecting line breaks: {e}")
+        return []  # any failure is reported as "no breaks detected"
 def split_text_regions_into_lines(
     image: Image.Image,
     layout_data: List[Dict[str, Any]],
+    min_line_height: Optional[int] = None,
+    max_line_height: Optional[int] = None
 ) -> List[Dict[str, Any]]:
     """
+    Intelligently split text regions into individual lines.
+    ADAPTIVE APPROACH:
+    - Analyzes image to determine optimal parameters
+    - Detects actual line breaks using image analysis
+    - Works for any image type (sparse, dense, tables, forms)
+    - No hardcoded thresholds
     Args:
         image: Original image
         layout_data: Layout detection results
+        min_line_height: Optional override (auto-detected if None)
+        max_line_height: Optional override (auto-detected if None)
     Returns:
         Updated layout data with lines split
     """
+    # Analyze image to get adaptive parameters
+    img_chars = analyze_image_line_characteristics(image)
+    adaptive_min = min_line_height if min_line_height else int(img_chars['min_line_height'])
+    adaptive_max = max_line_height if max_line_height else int(img_chars['max_line_height'])
+    avg_line_height = img_chars['avg_line_height']
+    print(f"\n📊 Image analysis: avg_line_height={avg_line_height:.1f}px, "
+          f"min={adaptive_min}px, max={adaptive_max}px")
+    if 'num_lines_detected' in img_chars:
+        print(f"   Detected ~{img_chars['num_lines_detected']} lines in image")
     result = []
     split_count = 0
         height = y2 - y1
         width = x2 - x1
+        # ALWAYS check if region contains multiple lines, regardless of height
+        # Use image analysis to detect actual line breaks
+        line_breaks = detect_actual_line_breaks_in_region(image, bbox)
+        if len(line_breaks) > 0:
+            # We detected actual line breaks - split at those positions
+            print(f"   Region: {category} (h={height}px) - Detected {len(line_breaks)+1} lines via image analysis")
+            # Create lines based on detected breaks
+            current_y = y1
+            for i, break_y in enumerate(line_breaks):
+                # Create line from current_y to break_y
                 new_item = item.copy()
+                new_item['bbox'] = [x1, int(current_y), x2, int(break_y)]
+                new_item['text'] = ""  # Will be re-OCR'd
+                new_item['split_from_parent'] = True
+                new_item['needs_reocr'] = True
+                new_item['line_number'] = i + 1
+                result.append(new_item)
+                current_y = break_y
+            # Add last line
+            new_item = item.copy()
+            new_item['bbox'] = [x1, int(current_y), x2, y2]
+            new_item['text'] = ""
+            new_item['split_from_parent'] = True
+            new_item['needs_reocr'] = True
+            new_item['line_number'] = len(line_breaks) + 1
+            result.append(new_item)
             split_count += 1
+        elif height > adaptive_max:
+            # No line breaks detected but region is tall - use spacing-based split
+            print(f"   Region: {category} (h={height}px) - Tall region, using spacing-based split")
+            # Try to detect spacing in this specific region
+            detected_spacing = detect_line_spacing(image, bbox)
+            if detected_spacing and detected_spacing > adaptive_min:
+                line_height = detected_spacing
+                estimated_lines = max(2, round(height / line_height))
+            else:
+                line_height = avg_line_height
+                estimated_lines = max(2, round(height / line_height))
+            estimated_lines = min(estimated_lines, 15)  # Cap at 15 lines
+            # Calculate padding (adaptive: 8% of line height, min 2px)
+            padding = max(2, int(line_height * 0.08))
+            # Split geometrically
             for i in range(estimated_lines):
                 new_item = item.copy()
                 if i == 0:
                     new_y1 = y1
                     new_y2 = y1 + line_height + padding
                 elif i == estimated_lines - 1:
                     new_y1 = y1 + (i * line_height) - padding
                     new_y2 = y2
                 else:
                     new_y1 = y1 + (i * line_height) - padding
                     new_y2 = y1 + ((i + 1) * line_height) + padding
                 new_y1 = max(y1, int(new_y1))
                 new_y2 = min(y2, int(new_y2))
+                if new_y2 > new_y1:
                     new_item['bbox'] = [x1, new_y1, x2, new_y2]
                     new_item['text'] = ""
                     new_item['split_from_parent'] = True
+                    new_item['needs_reocr'] = True
                     new_item['line_number'] = i + 1
                     result.append(new_item)
             split_count += 1
+        else:
+            # Region is reasonably sized - keep as is
+            print(f"   Region: {category} (h={height}px) - Keeping as single line")
+            result.append(item)
     if split_count > 0:
+        print(f"📏 Split {split_count} regions into {len(result)} total lines")
     return result