VanguardAI committed on
Commit
67294ce
·
verified ·
1 Parent(s): 6896445

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +257 -100
app.py CHANGED
@@ -644,7 +644,112 @@ def estimate_text_density(image: Image.Image) -> float:
644
  return 0.1 # Default to low density
645
 
646
 
647
- def detect_line_spacing(image: Image.Image, bbox: List[int]) -> float:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
  """
649
  Detect average line spacing in a text region using horizontal projection analysis.
650
 
@@ -662,37 +767,32 @@ def detect_line_spacing(image: Image.Image, bbox: List[int]) -> float:
662
  return None
663
 
664
  # Horizontal projection: sum of dark pixels per row
665
- # Text lines will have higher values
666
- row_sums = np.sum(img_array < 128, axis=1) # Count dark pixels per row
667
 
668
- if len(row_sums) < 10: # Need at least some rows
669
  return None
670
 
671
  # Find peaks (text lines) and valleys (spacing between lines)
672
- # Use adaptive threshold to identify text rows
673
  mean_val = np.mean(row_sums)
674
  std_val = np.std(row_sums)
675
- threshold = max(mean_val * 0.3, mean_val - std_val * 0.5)
676
 
677
  text_rows = np.where(row_sums > threshold)[0]
678
 
679
  if len(text_rows) < 2:
680
- return None # Can't detect spacing with less than 2 text rows
681
 
682
- # Find gaps between text rows (line spacing)
683
  # Group consecutive rows to find line centers
684
  line_centers = []
685
  current_group = [text_rows[0]]
686
 
687
  for i in range(1, len(text_rows)):
688
- if text_rows[i] - text_rows[i-1] <= 3: # Consecutive or very close rows
689
  current_group.append(text_rows[i])
690
  else:
691
- # End of current line, start new
692
  line_centers.append(int(np.mean(current_group)))
693
  current_group = [text_rows[i]]
694
 
695
- # Add last group
696
  if current_group:
697
  line_centers.append(int(np.mean(current_group)))
698
 
@@ -703,41 +803,118 @@ def detect_line_spacing(image: Image.Image, bbox: List[int]) -> float:
703
  spacings = []
704
  for i in range(len(line_centers) - 1):
705
  spacing = line_centers[i+1] - line_centers[i]
706
- if spacing > 10: # Minimum reasonable spacing
707
  spacings.append(spacing)
708
 
709
  if spacings:
710
- # Use median for robustness against outliers
711
- avg_spacing = np.median(spacings)
712
- print(f" → Detected {len(line_centers)} lines with avg spacing {avg_spacing:.1f}px")
713
- return float(avg_spacing)
714
 
715
  return None
716
  except Exception as e:
717
- print(f" ⚠️ Could not detect line spacing: {e}")
718
  return None
719
 
720
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721
  def split_text_regions_into_lines(
722
  image: Image.Image,
723
  layout_data: List[Dict[str, Any]],
724
- min_line_height: int = 25,
725
- max_line_height: int = 80 # More aggressive - split anything taller than ~2 lines
726
  ) -> List[Dict[str, Any]]:
727
  """
728
- Post-process layout data to split large text regions into individual lines.
729
 
730
- Uses intelligent line spacing detection and padding to avoid cutting through text.
 
 
 
 
731
 
732
  Args:
733
  image: Original image
734
  layout_data: Layout detection results
735
- min_line_height: Minimum height for a text line (pixels)
736
- max_line_height: Maximum height for a single line before splitting
737
 
738
  Returns:
739
  Updated layout data with lines split
740
  """
 
 
 
 
 
 
 
 
 
 
 
741
  result = []
742
  split_count = 0
743
 
@@ -755,110 +932,90 @@ def split_text_regions_into_lines(
755
  height = y2 - y1
756
  width = x2 - x1
757
 
758
- print(f" Checking region: height={height}px, width={width}px, category={category}")
 
 
759
 
760
- # If region is already reasonably line-sized, keep it
761
- if height <= max_line_height:
762
- print(f" Already line-sized (height {height}px <= {max_line_height}px)")
763
- result.append(item)
764
- continue
765
-
766
- # If region is tall enough to contain multiple lines, split it
767
- print(f" → Splitting! (height {height}px > threshold {max_line_height}px)")
768
-
769
- # Try to detect actual line spacing from the image
770
- detected_spacing = detect_line_spacing(image, bbox)
771
-
772
- if detected_spacing and detected_spacing > min_line_height:
773
- # Use detected spacing for more accurate splitting
774
- estimated_lines = max(2, round(height / detected_spacing))
775
- line_height = detected_spacing
776
- print(f" → Detected line spacing: {detected_spacing:.1f}px, splitting into ~{estimated_lines} lines")
777
- else:
778
- # Fallback to estimated line height
779
- # Arabic handwritten text: ~40-60px per line
780
- # Arabic typed text: ~30-50px per line
781
- avg_line_height = 45 # Middle ground
782
- estimated_lines = max(2, round(height / avg_line_height))
783
- line_height = height / estimated_lines
784
- print(f" → Using estimated line height: {avg_line_height}px, splitting into {estimated_lines} lines")
785
-
786
- # Don't split into too many lines (might be a paragraph)
787
- estimated_lines = min(estimated_lines, 10)
788
-
789
- # Calculate padding to avoid cutting through text
790
- # Use 10% of line height as padding, but at least 3px
791
- padding = max(3, int(line_height * 0.1))
792
-
793
- # Split text content by newlines if available
794
- text_lines = text_content.split('\n') if text_content else []
795
-
796
- # If we have the same number of text lines as estimated, use them
797
- if len(text_lines) == estimated_lines and len(text_lines) > 1:
798
- for i, line_text in enumerate(text_lines):
799
- if not line_text.strip():
800
- continue
801
  new_item = item.copy()
802
-
803
- # Calculate bbox with padding to avoid cutting text
804
- if i == 0:
805
- # First line: pad bottom only
806
- new_y1 = y1
807
- new_y2 = y1 + line_height + padding
808
- elif i == estimated_lines - 1:
809
- # Last line: pad top only
810
- new_y1 = y1 + (i * line_height) - padding
811
- new_y2 = y2
812
- else:
813
- # Middle lines: pad both top and bottom
814
- new_y1 = y1 + (i * line_height) - padding
815
- new_y2 = y1 + ((i + 1) * line_height) + padding
816
-
817
- # Ensure bbox is valid and within image bounds
818
- new_y1 = max(y1, int(new_y1))
819
- new_y2 = min(y2, int(new_y2))
820
-
821
- if new_y2 > new_y1: # Valid bbox
822
- new_item['bbox'] = [x1, new_y1, x2, new_y2]
823
- new_item['text'] = line_text.strip()
824
- new_item['split_from_parent'] = True
825
- result.append(new_item)
826
  split_count += 1
827
- else:
828
- # Split geometrically - mark for re-OCR to get accurate per-line text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
829
  for i in range(estimated_lines):
830
  new_item = item.copy()
831
 
832
- # Calculate bbox with padding to avoid cutting text
833
  if i == 0:
834
- # First line: pad bottom only
835
  new_y1 = y1
836
  new_y2 = y1 + line_height + padding
837
  elif i == estimated_lines - 1:
838
- # Last line: pad top only
839
  new_y1 = y1 + (i * line_height) - padding
840
  new_y2 = y2
841
  else:
842
- # Middle lines: pad both top and bottom
843
  new_y1 = y1 + (i * line_height) - padding
844
  new_y2 = y1 + ((i + 1) * line_height) + padding
845
 
846
- # Ensure bbox is valid and within image bounds
847
  new_y1 = max(y1, int(new_y1))
848
  new_y2 = min(y2, int(new_y2))
849
 
850
- if new_y2 > new_y1: # Valid bbox
851
  new_item['bbox'] = [x1, new_y1, x2, new_y2]
852
- # Clear text - will be re-OCR'd per-line for accuracy
853
  new_item['text'] = ""
854
  new_item['split_from_parent'] = True
855
- new_item['needs_reocr'] = True # Flag for re-processing
856
  new_item['line_number'] = i + 1
857
  result.append(new_item)
 
858
  split_count += 1
 
 
 
 
 
859
 
860
  if split_count > 0:
861
- print(f"📏 Split {split_count} large regions into individual lines ({len(layout_data)} {len(result)} regions)")
862
 
863
  return result
864
 
 
644
  return 0.1 # Default to low density
645
 
646
 
647
def _estimated_line_params(height: int, estimated_lines: int, min_floor: int) -> Dict[str, float]:
    """Build fallback line metrics assuming `estimated_lines` evenly spaced lines."""
    per_line = height / estimated_lines
    return {
        'avg_line_height': per_line,
        'min_line_height': max(min_floor, height / (estimated_lines * 2)),
        'max_line_height': height / 2,  # Split anything taller than half the image
        'line_spacing': per_line,
    }


def analyze_image_line_characteristics(image: Image.Image) -> Dict[str, float]:
    """
    Analyze an image to derive line detection parameters.

    Builds a horizontal projection (count of pixels darker than 128 per row),
    groups rows above an adaptive threshold into text lines, and derives
    metrics from the distances between consecutive line centers.

    Args:
        image: Image to analyze; it is converted to grayscale ('L') first.

    Returns:
        Dict with keys 'avg_line_height', 'min_line_height',
        'max_line_height' and 'line_spacing'. When spacing could be measured
        the dict also contains 'num_lines_detected'. If lines cannot be
        measured, height-based estimates are returned; if analysis fails
        entirely, fixed defaults (50 / 25 / 100 / 50) are returned.
    """
    try:
        _, height = image.size
        img_array = np.array(image.convert('L'))

        # Horizontal projection: number of dark pixels in every row
        row_sums = np.sum(img_array < 128, axis=1)

        if len(row_sums) < 10:
            # Too few rows to analyze; derive everything from the height
            return {
                'avg_line_height': height / 10,  # Assume ~10 lines
                'min_line_height': max(15, height / 20),
                'max_line_height': height / 3,  # Split if > 1/3 of image height
                'line_spacing': height / 15,
            }

        # Rows whose dark-pixel count exceeds the adaptive threshold are text
        mean_val = np.mean(row_sums)
        std_val = np.std(row_sums)
        threshold = max(mean_val * 0.2, mean_val - std_val * 0.3)
        text_rows = np.flatnonzero(row_sums > threshold)

        if len(text_rows) < 2:
            # No clear text lines detected, use conservative estimates
            return _estimated_line_params(height, max(5, height // 50), 15)

        # Text rows at most 5px apart belong to the same line; a larger gap
        # starts a new group. Each group is reduced to its (truncated) mean row.
        group_starts = np.flatnonzero(np.diff(text_rows) > 5) + 1
        line_centers = [
            int(np.mean(group)) for group in np.split(text_rows, group_starts)
        ]

        if len(line_centers) < 2:
            # A single line gives no spacing information
            return _estimated_line_params(height, max(3, height // 60), 20)

        # Distances between neighbouring centers; 8px or less is treated as noise
        spacings = [gap for gap in np.diff(line_centers) if gap > 8]

        if spacings:
            avg_spacing = np.median(spacings)
            min_spacing = np.percentile(spacings, 25)
            max_spacing = np.percentile(spacings, 75)

            return {
                'avg_line_height': float(avg_spacing),
                'min_line_height': float(max(15, min_spacing * 0.6)),  # 60% of lower quartile
                'max_line_height': float(max_spacing * 1.5),  # 1.5x upper quartile = likely multi-line
                'line_spacing': float(avg_spacing),
                'num_lines_detected': len(line_centers),
            }

        # Lines were found but every gap was too small to trust
        return _estimated_line_params(height, max(3, height // 50), 20)

    except Exception as e:
        # Best-effort boundary: never touch `image` again here, because the
        # image object itself may be what caused the failure.
        print(f" ⚠️ Error analyzing image: {e}")
        return {
            'avg_line_height': 50,
            'min_line_height': 25,
            'max_line_height': 100,
            'line_spacing': 50,
        }
750
+
751
+
752
+ def detect_line_spacing(image: Image.Image, bbox: List[int]) -> Optional[float]:
753
  """
754
  Detect average line spacing in a text region using horizontal projection analysis.
755
 
 
767
  return None
768
 
769
  # Horizontal projection: sum of dark pixels per row
770
+ row_sums = np.sum(img_array < 128, axis=1)
 
771
 
772
+ if len(row_sums) < 10:
773
  return None
774
 
775
  # Find peaks (text lines) and valleys (spacing between lines)
 
776
  mean_val = np.mean(row_sums)
777
  std_val = np.std(row_sums)
778
+ threshold = max(mean_val * 0.25, mean_val - std_val * 0.4)
779
 
780
  text_rows = np.where(row_sums > threshold)[0]
781
 
782
  if len(text_rows) < 2:
783
+ return None
784
 
 
785
  # Group consecutive rows to find line centers
786
  line_centers = []
787
  current_group = [text_rows[0]]
788
 
789
  for i in range(1, len(text_rows)):
790
+ if text_rows[i] - text_rows[i-1] <= 3:
791
  current_group.append(text_rows[i])
792
  else:
 
793
  line_centers.append(int(np.mean(current_group)))
794
  current_group = [text_rows[i]]
795
 
 
796
  if current_group:
797
  line_centers.append(int(np.mean(current_group)))
798
 
 
803
  spacings = []
804
  for i in range(len(line_centers) - 1):
805
  spacing = line_centers[i+1] - line_centers[i]
806
+ if spacing > 10:
807
  spacings.append(spacing)
808
 
809
  if spacings:
810
+ return float(np.median(spacings))
 
 
 
811
 
812
  return None
813
  except Exception as e:
 
814
  return None
815
 
816
 
817
def detect_actual_line_breaks_in_region(image: Image.Image, bbox: List[int]) -> List[int]:
    """
    Detect line break positions inside a text region using horizontal projection.

    The region is cropped, converted to grayscale, and the number of pixels
    darker than 128 is counted per row. Rows above an adaptive threshold are
    text rows; text rows separated by more than 3px belong to different lines.
    A break is placed midway between the last row of one line and the first
    row of the next.

    Args:
        image: Image containing the region.
        bbox: Region as [x1, y1, x2, y2] in image coordinates.

    Returns:
        Break y-coordinates in image coordinates (top to bottom), as plain
        ints. Empty when fewer than two lines are found, the crop is empty or
        shorter than 5 rows, or any error occurs.
    """
    try:
        x1, y1, x2, y2 = bbox
        img_array = np.array(image.crop((x1, y1, x2, y2)).convert('L'))

        if img_array.size == 0:
            return []

        # Horizontal projection: number of dark pixels in every row
        row_sums = np.sum(img_array < 128, axis=1)

        if len(row_sums) < 5:
            return []

        mean_val = np.mean(row_sums)
        std_val = np.std(row_sums)
        text_threshold = max(mean_val * 0.25, mean_val - std_val * 0.4)
        text_rows = np.flatnonzero(row_sums > text_threshold)

        if len(text_rows) < 2:
            return []

        # text_rows is sorted, so a jump of more than 3px between neighbours
        # marks the end of one line (index i) and the start of the next (i + 1).
        line_ends = np.flatnonzero(np.diff(text_rows) > 3)

        # Midpoint of each gap, shifted by y1 into image coordinates. No gaps
        # means a single line (or nothing detectable) and yields [].
        return [
            int(y1 + (text_rows[i] + text_rows[i + 1]) // 2)
            for i in line_ends
        ]

    except Exception as e:
        # Best-effort: callers treat an empty list as "no breaks detected"
        print(f" ⚠️ Error detecting line breaks: {e}")
        return []
881
+
882
+
883
  def split_text_regions_into_lines(
884
  image: Image.Image,
885
  layout_data: List[Dict[str, Any]],
886
+ min_line_height: Optional[int] = None,
887
+ max_line_height: Optional[int] = None
888
  ) -> List[Dict[str, Any]]:
889
  """
890
+ Intelligently split text regions into individual lines.
891
 
892
+ ADAPTIVE APPROACH:
893
+ - Analyzes image to determine optimal parameters
894
+ - Detects actual line breaks using image analysis
895
+ - Works for any image type (sparse, dense, tables, forms)
896
+ - No hardcoded thresholds
897
 
898
  Args:
899
  image: Original image
900
  layout_data: Layout detection results
901
+ min_line_height: Optional override (auto-detected if None)
902
+ max_line_height: Optional override (auto-detected if None)
903
 
904
  Returns:
905
  Updated layout data with lines split
906
  """
907
+ # Analyze image to get adaptive parameters
908
+ img_chars = analyze_image_line_characteristics(image)
909
+ adaptive_min = min_line_height if min_line_height else int(img_chars['min_line_height'])
910
+ adaptive_max = max_line_height if max_line_height else int(img_chars['max_line_height'])
911
+ avg_line_height = img_chars['avg_line_height']
912
+
913
+ print(f"\n📊 Image analysis: avg_line_height={avg_line_height:.1f}px, "
914
+ f"min={adaptive_min}px, max={adaptive_max}px")
915
+ if 'num_lines_detected' in img_chars:
916
+ print(f" Detected ~{img_chars['num_lines_detected']} lines in image")
917
+
918
  result = []
919
  split_count = 0
920
 
 
932
  height = y2 - y1
933
  width = x2 - x1
934
 
935
+ # ALWAYS check if region contains multiple lines, regardless of height
936
+ # Use image analysis to detect actual line breaks
937
+ line_breaks = detect_actual_line_breaks_in_region(image, bbox)
938
 
939
+ if len(line_breaks) > 0:
940
+ # We detected actual line breaks - split at those positions
941
+ print(f" Region: {category} (h={height}px) - Detected {len(line_breaks)+1} lines via image analysis")
942
+
943
+ # Create lines based on detected breaks
944
+ current_y = y1
945
+ for i, break_y in enumerate(line_breaks):
946
+ # Create line from current_y to break_y
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
947
  new_item = item.copy()
948
+ new_item['bbox'] = [x1, int(current_y), x2, int(break_y)]
949
+ new_item['text'] = "" # Will be re-OCR'd
950
+ new_item['split_from_parent'] = True
951
+ new_item['needs_reocr'] = True
952
+ new_item['line_number'] = i + 1
953
+ result.append(new_item)
954
+ current_y = break_y
955
+
956
+ # Add last line
957
+ new_item = item.copy()
958
+ new_item['bbox'] = [x1, int(current_y), x2, y2]
959
+ new_item['text'] = ""
960
+ new_item['split_from_parent'] = True
961
+ new_item['needs_reocr'] = True
962
+ new_item['line_number'] = len(line_breaks) + 1
963
+ result.append(new_item)
 
 
 
 
 
 
 
 
964
  split_count += 1
965
+
966
+ elif height > adaptive_max:
967
+ # No line breaks detected but region is tall - use spacing-based split
968
+ print(f" Region: {category} (h={height}px) - Tall region, using spacing-based split")
969
+
970
+ # Try to detect spacing in this specific region
971
+ detected_spacing = detect_line_spacing(image, bbox)
972
+
973
+ if detected_spacing and detected_spacing > adaptive_min:
974
+ line_height = detected_spacing
975
+ estimated_lines = max(2, round(height / line_height))
976
+ else:
977
+ line_height = avg_line_height
978
+ estimated_lines = max(2, round(height / line_height))
979
+
980
+ estimated_lines = min(estimated_lines, 15) # Cap at 15 lines
981
+
982
+ # Calculate padding (adaptive: 8% of line height, min 2px)
983
+ padding = max(2, int(line_height * 0.08))
984
+
985
+ # Split geometrically
986
  for i in range(estimated_lines):
987
  new_item = item.copy()
988
 
 
989
  if i == 0:
 
990
  new_y1 = y1
991
  new_y2 = y1 + line_height + padding
992
  elif i == estimated_lines - 1:
 
993
  new_y1 = y1 + (i * line_height) - padding
994
  new_y2 = y2
995
  else:
 
996
  new_y1 = y1 + (i * line_height) - padding
997
  new_y2 = y1 + ((i + 1) * line_height) + padding
998
 
 
999
  new_y1 = max(y1, int(new_y1))
1000
  new_y2 = min(y2, int(new_y2))
1001
 
1002
+ if new_y2 > new_y1:
1003
  new_item['bbox'] = [x1, new_y1, x2, new_y2]
 
1004
  new_item['text'] = ""
1005
  new_item['split_from_parent'] = True
1006
+ new_item['needs_reocr'] = True
1007
  new_item['line_number'] = i + 1
1008
  result.append(new_item)
1009
+
1010
  split_count += 1
1011
+
1012
+ else:
1013
+ # Region is reasonably sized - keep as is
1014
+ print(f" Region: {category} (h={height}px) - Keeping as single line")
1015
+ result.append(item)
1016
 
1017
  if split_count > 0:
1018
+ print(f"📏 Split {split_count} regions into {len(result)} total lines")
1019
 
1020
  return result
1021