VanguardAI committed on
Commit
6896445
·
verified · 
1 Parent(s): c265696

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +176 -36
app.py CHANGED
@@ -644,6 +644,80 @@ def estimate_text_density(image: Image.Image) -> float:
644
  return 0.1 # Default to low density
645
 
646
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
647
  def split_text_regions_into_lines(
648
  image: Image.Image,
649
  layout_data: List[Dict[str, Any]],
@@ -653,7 +727,7 @@ def split_text_regions_into_lines(
653
  """
654
  Post-process layout data to split large text regions into individual lines.
655
 
656
- This ensures each line gets its own bounding box for easier verification.
657
 
658
  Args:
659
  image: Original image
@@ -683,53 +757,105 @@ def split_text_regions_into_lines(
683
 
684
  print(f" Checking region: height={height}px, width={width}px, category={category}")
685
 
 
 
 
 
 
 
686
  # If region is tall enough to contain multiple lines, split it
687
- if height > max_line_height:
688
- print(f" β†’ Splitting! (height {height}px > threshold {max_line_height}px)")
689
- # Estimate number of lines based on typical line height
 
 
 
 
 
 
 
 
 
690
  # Arabic handwritten text: ~40-60px per line
691
  # Arabic typed text: ~30-50px per line
692
  avg_line_height = 45 # Middle ground
693
- estimated_lines = max(1, round(height / avg_line_height))
694
-
695
- # Don't split into too many lines (might be a paragraph)
696
- estimated_lines = min(estimated_lines, 10)
697
-
698
  line_height = height / estimated_lines
699
-
700
- # Split text content by newlines if available
701
- text_lines = text_content.split('\n') if text_content else []
702
-
703
- # If we have the same number of text lines as estimated, use them
704
- if len(text_lines) == estimated_lines and len(text_lines) > 1:
705
- for i, line_text in enumerate(text_lines):
706
- if not line_text.strip():
707
- continue
708
- new_item = item.copy()
709
- new_y1 = y1 + (i * line_height)
710
- new_y2 = y1 + ((i + 1) * line_height)
711
- new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
712
  new_item['text'] = line_text.strip()
713
  new_item['split_from_parent'] = True
714
  result.append(new_item)
715
- split_count += 1
716
- else:
717
- # Split geometrically - mark for re-OCR to get accurate per-line text
718
- for i in range(estimated_lines):
719
- new_item = item.copy()
720
- new_y1 = y1 + (i * line_height)
721
- new_y2 = y1 + ((i + 1) * line_height)
722
- new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
723
  # Clear text - will be re-OCR'd per-line for accuracy
724
  new_item['text'] = ""
725
  new_item['split_from_parent'] = True
726
  new_item['needs_reocr'] = True # Flag for re-processing
727
  new_item['line_number'] = i + 1
728
  result.append(new_item)
729
- split_count += 1
730
- else:
731
- # Region is already line-sized, keep as is
732
- result.append(item)
733
 
734
  if split_count > 0:
735
  print(f"📏 Split {split_count} large regions into individual lines ({len(layout_data)} → {len(result)} regions)")
@@ -996,8 +1122,22 @@ def process_image(
996
  if x2 <= x1 or y2 <= y1:
997
  continue
998
 
 
 
 
 
 
 
 
999
  # Crop and preprocess the line region
1000
- crop_img = image.crop((x1, y1, x2, y2))
 
 
 
 
 
 
 
1001
 
1002
  # Apply preprocessing to enhance handwriting quality
1003
  crop_img = preprocess_for_handwriting_ocr(crop_img)
@@ -1802,4 +1942,4 @@ if __name__ == "__main__":
1802
  share=False,
1803
  debug=True,
1804
  show_error=True
1805
- )
 
644
  return 0.1 # Default to low density
645
 
646
 
647
+ def detect_line_spacing(image: Image.Image, bbox: List[int]) -> float:
648
+ """
649
+ Detect average line spacing in a text region using horizontal projection analysis.
650
+
651
+ Returns estimated line height in pixels, or None if detection fails.
652
+ """
653
+ try:
654
+ x1, y1, x2, y2 = bbox
655
+ crop = image.crop((x1, y1, x2, y2))
656
+
657
+ # Convert to grayscale
658
+ gray = crop.convert('L')
659
+ img_array = np.array(gray)
660
+
661
+ if img_array.size == 0:
662
+ return None
663
+
664
+ # Horizontal projection: sum of dark pixels per row
665
+ # Text lines will have higher values
666
+ row_sums = np.sum(img_array < 128, axis=1) # Count dark pixels per row
667
+
668
+ if len(row_sums) < 10: # Need at least some rows
669
+ return None
670
+
671
+ # Find peaks (text lines) and valleys (spacing between lines)
672
+ # Use adaptive threshold to identify text rows
673
+ mean_val = np.mean(row_sums)
674
+ std_val = np.std(row_sums)
675
+ threshold = max(mean_val * 0.3, mean_val - std_val * 0.5)
676
+
677
+ text_rows = np.where(row_sums > threshold)[0]
678
+
679
+ if len(text_rows) < 2:
680
+ return None # Can't detect spacing with less than 2 text rows
681
+
682
+ # Find gaps between text rows (line spacing)
683
+ # Group consecutive rows to find line centers
684
+ line_centers = []
685
+ current_group = [text_rows[0]]
686
+
687
+ for i in range(1, len(text_rows)):
688
+ if text_rows[i] - text_rows[i-1] <= 3: # Consecutive or very close rows
689
+ current_group.append(text_rows[i])
690
+ else:
691
+ # End of current line, start new
692
+ line_centers.append(int(np.mean(current_group)))
693
+ current_group = [text_rows[i]]
694
+
695
+ # Add last group
696
+ if current_group:
697
+ line_centers.append(int(np.mean(current_group)))
698
+
699
+ if len(line_centers) < 2:
700
+ return None
701
+
702
+ # Calculate spacing between line centers
703
+ spacings = []
704
+ for i in range(len(line_centers) - 1):
705
+ spacing = line_centers[i+1] - line_centers[i]
706
+ if spacing > 10: # Minimum reasonable spacing
707
+ spacings.append(spacing)
708
+
709
+ if spacings:
710
+ # Use median for robustness against outliers
711
+ avg_spacing = np.median(spacings)
712
+ print(f" → Detected {len(line_centers)} lines with avg spacing {avg_spacing:.1f}px")
713
+ return float(avg_spacing)
714
+
715
+ return None
716
+ except Exception as e:
717
+ print(f" ⚠️ Could not detect line spacing: {e}")
718
+ return None
719
+
720
+
721
  def split_text_regions_into_lines(
722
  image: Image.Image,
723
  layout_data: List[Dict[str, Any]],
 
727
  """
728
  Post-process layout data to split large text regions into individual lines.
729
 
730
+ Uses intelligent line spacing detection and padding to avoid cutting through text.
731
 
732
  Args:
733
  image: Original image
 
757
 
758
  print(f" Checking region: height={height}px, width={width}px, category={category}")
759
 
760
+ # If region is already reasonably line-sized, keep it
761
+ if height <= max_line_height:
762
+ print(f" ✓ Already line-sized (height {height}px <= {max_line_height}px)")
763
+ result.append(item)
764
+ continue
765
+
766
  # If region is tall enough to contain multiple lines, split it
767
+ print(f" → Splitting! (height {height}px > threshold {max_line_height}px)")
768
+
769
+ # Try to detect actual line spacing from the image
770
+ detected_spacing = detect_line_spacing(image, bbox)
771
+
772
+ if detected_spacing and detected_spacing > min_line_height:
773
+ # Use detected spacing for more accurate splitting
774
+ estimated_lines = max(2, round(height / detected_spacing))
775
+ line_height = detected_spacing
776
+ print(f" → Detected line spacing: {detected_spacing:.1f}px, splitting into ~{estimated_lines} lines")
777
+ else:
778
+ # Fallback to estimated line height
779
  # Arabic handwritten text: ~40-60px per line
780
  # Arabic typed text: ~30-50px per line
781
  avg_line_height = 45 # Middle ground
782
+ estimated_lines = max(2, round(height / avg_line_height))
 
 
 
 
783
  line_height = height / estimated_lines
784
+ print(f" → Using estimated line height: {avg_line_height}px, splitting into {estimated_lines} lines")
785
+
786
+ # Don't split into too many lines (might be a paragraph)
787
+ estimated_lines = min(estimated_lines, 10)
788
+
789
+ # Calculate padding to avoid cutting through text
790
+ # Use 10% of line height as padding, but at least 3px
791
+ padding = max(3, int(line_height * 0.1))
792
+
793
+ # Split text content by newlines if available
794
+ text_lines = text_content.split('\n') if text_content else []
795
+
796
+ # If we have the same number of text lines as estimated, use them
797
+ if len(text_lines) == estimated_lines and len(text_lines) > 1:
798
+ for i, line_text in enumerate(text_lines):
799
+ if not line_text.strip():
800
+ continue
801
+ new_item = item.copy()
802
+
803
+ # Calculate bbox with padding to avoid cutting text
804
+ if i == 0:
805
+ # First line: pad bottom only
806
+ new_y1 = y1
807
+ new_y2 = y1 + line_height + padding
808
+ elif i == estimated_lines - 1:
809
+ # Last line: pad top only
810
+ new_y1 = y1 + (i * line_height) - padding
811
+ new_y2 = y2
812
+ else:
813
+ # Middle lines: pad both top and bottom
814
+ new_y1 = y1 + (i * line_height) - padding
815
+ new_y2 = y1 + ((i + 1) * line_height) + padding
816
+
817
+ # Ensure bbox is valid and within image bounds
818
+ new_y1 = max(y1, int(new_y1))
819
+ new_y2 = min(y2, int(new_y2))
820
+
821
+ if new_y2 > new_y1: # Valid bbox
822
+ new_item['bbox'] = [x1, new_y1, x2, new_y2]
823
  new_item['text'] = line_text.strip()
824
  new_item['split_from_parent'] = True
825
  result.append(new_item)
826
+ split_count += 1
827
+ else:
828
+ # Split geometrically - mark for re-OCR to get accurate per-line text
829
+ for i in range(estimated_lines):
830
+ new_item = item.copy()
831
+
832
+ # Calculate bbox with padding to avoid cutting text
833
+ if i == 0:
834
+ # First line: pad bottom only
835
+ new_y1 = y1
836
+ new_y2 = y1 + line_height + padding
837
+ elif i == estimated_lines - 1:
838
+ # Last line: pad top only
839
+ new_y1 = y1 + (i * line_height) - padding
840
+ new_y2 = y2
841
+ else:
842
+ # Middle lines: pad both top and bottom
843
+ new_y1 = y1 + (i * line_height) - padding
844
+ new_y2 = y1 + ((i + 1) * line_height) + padding
845
+
846
+ # Ensure bbox is valid and within image bounds
847
+ new_y1 = max(y1, int(new_y1))
848
+ new_y2 = min(y2, int(new_y2))
849
+
850
+ if new_y2 > new_y1: # Valid bbox
851
+ new_item['bbox'] = [x1, new_y1, x2, new_y2]
852
  # Clear text - will be re-OCR'd per-line for accuracy
853
  new_item['text'] = ""
854
  new_item['split_from_parent'] = True
855
  new_item['needs_reocr'] = True # Flag for re-processing
856
  new_item['line_number'] = i + 1
857
  result.append(new_item)
858
+ split_count += 1
 
 
 
859
 
860
  if split_count > 0:
861
  print(f"📏 Split {split_count} large regions into individual lines ({len(layout_data)} → {len(result)} regions)")
 
1122
  if x2 <= x1 or y2 <= y1:
1123
  continue
1124
 
1125
+ # Add small safety margin to ensure we capture full text
1126
+ margin = 2 # Small margin to avoid edge clipping
1127
+ crop_x1 = max(0, x1 - margin)
1128
+ crop_y1 = max(0, y1 - margin)
1129
+ crop_x2 = min(image.width, x2 + margin)
1130
+ crop_y2 = min(image.height, y2 + margin)
1131
+
1132
  # Crop and preprocess the line region
1133
+ crop_img = image.crop((crop_x1, crop_y1, crop_x2, crop_y2))
1134
+
1135
+ # Validate crop is reasonable size
1136
+ if crop_img.size[0] < 10 or crop_img.size[1] < 10:
1137
+ print(f" ⚠️ Skipping line {idx+1}: crop too small ({crop_img.size})")
1138
+ item['text'] = "[Crop too small]"
1139
+ item['confidence'] = 0.0
1140
+ continue
1141
 
1142
  # Apply preprocessing to enhance handwriting quality
1143
  crop_img = preprocess_for_handwriting_ocr(crop_img)
 
1942
  share=False,
1943
  debug=True,
1944
  show_error=True
1945
+ )