VanguardAI committed on
Commit
ba601ca
·
verified ·
1 Parent(s): 38e40e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -12
app.py CHANGED
@@ -647,13 +647,14 @@ def estimate_text_density(image: Image.Image) -> float:
647
  def split_text_regions_into_lines(
648
  image: Image.Image,
649
  layout_data: List[Dict[str, Any]],
650
- min_line_height: int = 25,
651
- max_line_height: int = 80 # More aggressive - split anything taller than ~2 lines
652
  ) -> List[Dict[str, Any]]:
653
  """
654
  Post-process layout data to split large text regions into individual lines.
655
 
656
  This ensures each line gets its own bounding box for easier verification.
 
657
 
658
  Args:
659
  image: Original image
@@ -681,22 +682,59 @@ def split_text_regions_into_lines(
681
  height = y2 - y1
682
  width = x2 - x1
683
 
684
- print(f" Checking region: height={height}px, width={width}px, category={category}")
 
 
 
 
 
 
 
 
 
 
 
 
 
685
 
686
  # If region is tall enough to contain multiple lines, split it
687
  if height > max_line_height:
688
- print(f" → Splitting! (height {height}px > threshold {max_line_height}px)")
689
- # Estimate number of lines based on typical line height
690
- # Arabic handwritten text: ~40-60px per line
691
- # Arabic typed text: ~30-50px per line
692
- avg_line_height = 45 # Middle ground
 
 
 
 
 
 
 
 
 
 
693
  estimated_lines = max(1, round(height / avg_line_height))
694
 
695
  # Don't split into too many lines (might be a paragraph)
696
  estimated_lines = min(estimated_lines, 10)
697
 
 
 
 
 
 
 
698
  line_height = height / estimated_lines
699
 
 
 
 
 
 
 
 
 
700
  # Split text content by newlines if available
701
  text_lines = text_content.split('\n') if text_content else []
702
 
@@ -706,8 +744,12 @@ def split_text_regions_into_lines(
706
  if not line_text.strip():
707
  continue
708
  new_item = item.copy()
709
- new_y1 = y1 + (i * line_height)
710
- new_y2 = y1 + ((i + 1) * line_height)
 
 
 
 
711
  new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
712
  new_item['text'] = line_text.strip()
713
  new_item['split_from_parent'] = True
@@ -717,8 +759,17 @@ def split_text_regions_into_lines(
717
  # Split geometrically - mark for re-OCR to get accurate per-line text
718
  for i in range(estimated_lines):
719
  new_item = item.copy()
720
- new_y1 = y1 + (i * line_height)
721
- new_y2 = y1 + ((i + 1) * line_height)
 
 
 
 
 
 
 
 
 
722
  new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
723
  # Clear text - will be re-OCR'd per-line for accuracy
724
  new_item['text'] = ""
 
647
  def split_text_regions_into_lines(
648
  image: Image.Image,
649
  layout_data: List[Dict[str, Any]],
650
+ min_line_height: int = 35, # Increased to avoid cutting through text
651
+ max_line_height: int = 100 # Less aggressive to prevent over-splitting
652
  ) -> List[Dict[str, Any]]:
653
  """
654
  Post-process layout data to split large text regions into individual lines.
655
 
656
  This ensures each line gets its own bounding box for easier verification.
657
+ Uses smart estimation to avoid cutting through the middle of text lines.
658
 
659
  Args:
660
  image: Original image
 
682
  height = y2 - y1
683
  width = x2 - x1
684
 
685
+ print(f" Checking region: height={height:.0f}px, width={width:.0f}px, category={category}")
686
+
687
+ # Safety check: if region is too small (< min_line_height), it might be incorrect/noise
688
+ if height < min_line_height:
689
+ print(f" ⚠️ Region too small (height {height:.0f}px < {min_line_height}px) - may be incomplete line")
690
+ result.append(item) # Keep it but flag the issue
691
+ continue
692
+
693
+ # Safety check: if region is reasonably line-sized already, keep it
694
+ # This prevents unnecessary splitting of well-detected regions
695
+ if min_line_height <= height <= max_line_height:
696
+ print(f" ✓ Already good size (within {min_line_height}-{max_line_height}px range)")
697
+ result.append(item)
698
+ continue
699
 
700
  # If region is tall enough to contain multiple lines, split it
701
  if height > max_line_height:
702
+ print(f" → Will split (height {height:.0f}px > threshold {max_line_height}px)")
703
+
704
+ # Smart line height estimation based on region characteristics
705
+ # Arabic handwritten: typically 50-70px per line
706
+ # Arabic typed: typically 35-50px per line
707
+ # Use conservative estimate to avoid cutting through text
708
+
709
+ # Estimate based on height
710
+ if height < 150:
711
+ avg_line_height = 60 # Conservative for small regions
712
+ elif height < 300:
713
+ avg_line_height = 55 # Medium regions
714
+ else:
715
+ avg_line_height = 50 # Larger documents
716
+
717
  estimated_lines = max(1, round(height / avg_line_height))
718
 
719
  # Don't split into too many lines (might be a paragraph)
720
  estimated_lines = min(estimated_lines, 10)
721
 
722
+ # Validate: ensure we have at least 2 lines to split, otherwise keep original
723
+ if estimated_lines < 2:
724
+ print(f" → Keeping original (only {estimated_lines} line estimated)")
725
+ result.append(item)
726
+ continue
727
+
728
  line_height = height / estimated_lines
729
 
730
+ # Validate: each split line must meet minimum height requirement
731
+ if line_height < min_line_height:
732
+ print(f" → Keeping original (split lines would be too small: {line_height:.0f}px < {min_line_height}px)")
733
+ result.append(item)
734
+ continue
735
+
736
+ print(f" → Splitting into {estimated_lines} lines (each ~{line_height:.0f}px)")
737
+
738
  # Split text content by newlines if available
739
  text_lines = text_content.split('\n') if text_content else []
740
 
 
744
  if not line_text.strip():
745
  continue
746
  new_item = item.copy()
747
+
748
+ # Add small padding to avoid cutting through text (5% margin)
749
+ margin = line_height * 0.05
750
+ new_y1 = y1 + (i * line_height) + (margin if i > 0 else 0)
751
+ new_y2 = y1 + ((i + 1) * line_height) - (margin if i < estimated_lines - 1 else 0)
752
+
753
  new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
754
  new_item['text'] = line_text.strip()
755
  new_item['split_from_parent'] = True
 
759
  # Split geometrically - mark for re-OCR to get accurate per-line text
760
  for i in range(estimated_lines):
761
  new_item = item.copy()
762
+
763
+ # Add padding between lines to avoid cutting through text
764
+ # 8% margin between lines to create separation
765
+ margin = line_height * 0.08
766
+ new_y1 = y1 + (i * line_height) + (margin if i > 0 else 0)
767
+ new_y2 = y1 + ((i + 1) * line_height) - (margin if i < estimated_lines - 1 else 0)
768
+
769
+ # Ensure bbox is valid
770
+ if new_y2 <= new_y1:
771
+ continue
772
+
773
  new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
774
  # Clear text - will be re-OCR'd per-line for accuracy
775
  new_item['text'] = ""