VanguardAI committed on
Commit
c265696
·
verified ·
1 Parent(s): ba601ca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -64
app.py CHANGED
@@ -647,14 +647,13 @@ def estimate_text_density(image: Image.Image) -> float:
647
  def split_text_regions_into_lines(
648
  image: Image.Image,
649
  layout_data: List[Dict[str, Any]],
650
- min_line_height: int = 35, # Increased to avoid cutting through text
651
- max_line_height: int = 100 # Less aggressive to prevent over-splitting
652
  ) -> List[Dict[str, Any]]:
653
  """
654
  Post-process layout data to split large text regions into individual lines.
655
 
656
  This ensures each line gets its own bounding box for easier verification.
657
- Uses smart estimation to avoid cutting through the middle of text lines.
658
 
659
  Args:
660
  image: Original image
@@ -682,59 +681,22 @@ def split_text_regions_into_lines(
682
  height = y2 - y1
683
  width = x2 - x1
684
 
685
- print(f" Checking region: height={height:.0f}px, width={width:.0f}px, category={category}")
686
-
687
- # Safety check: if region is too small (< min_line_height), it might be incorrect/noise
688
- if height < min_line_height:
689
- print(f" ⚠️ Region too small (height {height:.0f}px < {min_line_height}px) - may be incomplete line")
690
- result.append(item) # Keep it but flag the issue
691
- continue
692
-
693
- # Safety check: if region is reasonably line-sized already, keep it
694
- # This prevents unnecessary splitting of well-detected regions
695
- if min_line_height <= height <= max_line_height:
696
- print(f" ✓ Already good size (within {min_line_height}-{max_line_height}px range)")
697
- result.append(item)
698
- continue
699
 
700
  # If region is tall enough to contain multiple lines, split it
701
  if height > max_line_height:
702
- print(f" → Will split (height {height:.0f}px > threshold {max_line_height}px)")
703
-
704
- # Smart line height estimation based on region characteristics
705
- # Arabic handwritten: typically 50-70px per line
706
- # Arabic typed: typically 35-50px per line
707
- # Use conservative estimate to avoid cutting through text
708
-
709
- # Estimate based on height
710
- if height < 150:
711
- avg_line_height = 60 # Conservative for small regions
712
- elif height < 300:
713
- avg_line_height = 55 # Medium regions
714
- else:
715
- avg_line_height = 50 # Larger documents
716
-
717
  estimated_lines = max(1, round(height / avg_line_height))
718
 
719
  # Don't split into too many lines (might be a paragraph)
720
  estimated_lines = min(estimated_lines, 10)
721
 
722
- # Validate: ensure we have at least 2 lines to split, otherwise keep original
723
- if estimated_lines < 2:
724
- print(f" → Keeping original (only {estimated_lines} line estimated)")
725
- result.append(item)
726
- continue
727
-
728
  line_height = height / estimated_lines
729
 
730
- # Validate: each split line must meet minimum height requirement
731
- if line_height < min_line_height:
732
- print(f" → Keeping original (split lines would be too small: {line_height:.0f}px < {min_line_height}px)")
733
- result.append(item)
734
- continue
735
-
736
- print(f" → Splitting into {estimated_lines} lines (each ~{line_height:.0f}px)")
737
-
738
  # Split text content by newlines if available
739
  text_lines = text_content.split('\n') if text_content else []
740
 
@@ -744,12 +706,8 @@ def split_text_regions_into_lines(
744
  if not line_text.strip():
745
  continue
746
  new_item = item.copy()
747
-
748
- # Add small padding to avoid cutting through text (5% margin)
749
- margin = line_height * 0.05
750
- new_y1 = y1 + (i * line_height) + (margin if i > 0 else 0)
751
- new_y2 = y1 + ((i + 1) * line_height) - (margin if i < estimated_lines - 1 else 0)
752
-
753
  new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
754
  new_item['text'] = line_text.strip()
755
  new_item['split_from_parent'] = True
@@ -759,17 +717,8 @@ def split_text_regions_into_lines(
759
  # Split geometrically - mark for re-OCR to get accurate per-line text
760
  for i in range(estimated_lines):
761
  new_item = item.copy()
762
-
763
- # Add padding between lines to avoid cutting through text
764
- # 8% margin between lines to create separation
765
- margin = line_height * 0.08
766
- new_y1 = y1 + (i * line_height) + (margin if i > 0 else 0)
767
- new_y2 = y1 + ((i + 1) * line_height) - (margin if i < estimated_lines - 1 else 0)
768
-
769
- # Ensure bbox is valid
770
- if new_y2 <= new_y1:
771
- continue
772
-
773
  new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
774
  # Clear text - will be re-OCR'd per-line for accuracy
775
  new_item['text'] = ""
@@ -1853,4 +1802,4 @@ if __name__ == "__main__":
1853
  share=False,
1854
  debug=True,
1855
  show_error=True
1856
- )
 
647
  def split_text_regions_into_lines(
648
  image: Image.Image,
649
  layout_data: List[Dict[str, Any]],
650
+ min_line_height: int = 25,
651
+ max_line_height: int = 80 # More aggressive - split anything taller than ~2 lines
652
  ) -> List[Dict[str, Any]]:
653
  """
654
  Post-process layout data to split large text regions into individual lines.
655
 
656
  This ensures each line gets its own bounding box for easier verification.
 
657
 
658
  Args:
659
  image: Original image
 
681
  height = y2 - y1
682
  width = x2 - x1
683
 
684
+ print(f" Checking region: height={height}px, width={width}px, category={category}")
 
 
 
 
 
 
 
 
 
 
 
 
 
685
 
686
  # If region is tall enough to contain multiple lines, split it
687
  if height > max_line_height:
688
+ print(f" → Splitting! (height {height}px > threshold {max_line_height}px)")
689
+ # Estimate number of lines based on typical line height
690
+ # Arabic handwritten text: ~40-60px per line
691
+ # Arabic typed text: ~30-50px per line
692
+ avg_line_height = 45 # Middle ground
 
 
 
 
 
 
 
 
 
 
693
  estimated_lines = max(1, round(height / avg_line_height))
694
 
695
  # Don't split into too many lines (might be a paragraph)
696
  estimated_lines = min(estimated_lines, 10)
697
 
 
 
 
 
 
 
698
  line_height = height / estimated_lines
699
 
 
 
 
 
 
 
 
 
700
  # Split text content by newlines if available
701
  text_lines = text_content.split('\n') if text_content else []
702
 
 
706
  if not line_text.strip():
707
  continue
708
  new_item = item.copy()
709
+ new_y1 = y1 + (i * line_height)
710
+ new_y2 = y1 + ((i + 1) * line_height)
 
 
 
 
711
  new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
712
  new_item['text'] = line_text.strip()
713
  new_item['split_from_parent'] = True
 
717
  # Split geometrically - mark for re-OCR to get accurate per-line text
718
  for i in range(estimated_lines):
719
  new_item = item.copy()
720
+ new_y1 = y1 + (i * line_height)
721
+ new_y2 = y1 + ((i + 1) * line_height)
 
 
 
 
 
 
 
 
 
722
  new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
723
  # Clear text - will be re-OCR'd per-line for accuracy
724
  new_item['text'] = ""
 
1802
  share=False,
1803
  debug=True,
1804
  show_error=True
1805
+ )