Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -647,14 +647,13 @@ def estimate_text_density(image: Image.Image) -> float:
|
|
| 647 |
def split_text_regions_into_lines(
|
| 648 |
image: Image.Image,
|
| 649 |
layout_data: List[Dict[str, Any]],
|
| 650 |
-
min_line_height: int =
|
| 651 |
-
max_line_height: int =
|
| 652 |
) -> List[Dict[str, Any]]:
|
| 653 |
"""
|
| 654 |
Post-process layout data to split large text regions into individual lines.
|
| 655 |
|
| 656 |
This ensures each line gets its own bounding box for easier verification.
|
| 657 |
-
Uses smart estimation to avoid cutting through the middle of text lines.
|
| 658 |
|
| 659 |
Args:
|
| 660 |
image: Original image
|
|
@@ -682,59 +681,22 @@ def split_text_regions_into_lines(
|
|
| 682 |
height = y2 - y1
|
| 683 |
width = x2 - x1
|
| 684 |
|
| 685 |
-
print(f" Checking region: height={height
|
| 686 |
-
|
| 687 |
-
# Safety check: if region is too small (< min_line_height), it might be incorrect/noise
|
| 688 |
-
if height < min_line_height:
|
| 689 |
-
print(f" ⚠️ Region too small (height {height:.0f}px < {min_line_height}px) - may be incomplete line")
|
| 690 |
-
result.append(item) # Keep it but flag the issue
|
| 691 |
-
continue
|
| 692 |
-
|
| 693 |
-
# Safety check: if region is reasonably line-sized already, keep it
|
| 694 |
-
# This prevents unnecessary splitting of well-detected regions
|
| 695 |
-
if min_line_height <= height <= max_line_height:
|
| 696 |
-
print(f" ✓ Already good size (within {min_line_height}-{max_line_height}px range)")
|
| 697 |
-
result.append(item)
|
| 698 |
-
continue
|
| 699 |
|
| 700 |
# If region is tall enough to contain multiple lines, split it
|
| 701 |
if height > max_line_height:
|
| 702 |
-
print(f" →
|
| 703 |
-
|
| 704 |
-
#
|
| 705 |
-
# Arabic
|
| 706 |
-
|
| 707 |
-
# Use conservative estimate to avoid cutting through text
|
| 708 |
-
|
| 709 |
-
# Estimate based on height
|
| 710 |
-
if height < 150:
|
| 711 |
-
avg_line_height = 60 # Conservative for small regions
|
| 712 |
-
elif height < 300:
|
| 713 |
-
avg_line_height = 55 # Medium regions
|
| 714 |
-
else:
|
| 715 |
-
avg_line_height = 50 # Larger documents
|
| 716 |
-
|
| 717 |
estimated_lines = max(1, round(height / avg_line_height))
|
| 718 |
|
| 719 |
# Don't split into too many lines (might be a paragraph)
|
| 720 |
estimated_lines = min(estimated_lines, 10)
|
| 721 |
|
| 722 |
-
# Validate: ensure we have at least 2 lines to split, otherwise keep original
|
| 723 |
-
if estimated_lines < 2:
|
| 724 |
-
print(f" → Keeping original (only {estimated_lines} line estimated)")
|
| 725 |
-
result.append(item)
|
| 726 |
-
continue
|
| 727 |
-
|
| 728 |
line_height = height / estimated_lines
|
| 729 |
|
| 730 |
-
# Validate: each split line must meet minimum height requirement
|
| 731 |
-
if line_height < min_line_height:
|
| 732 |
-
print(f" → Keeping original (split lines would be too small: {line_height:.0f}px < {min_line_height}px)")
|
| 733 |
-
result.append(item)
|
| 734 |
-
continue
|
| 735 |
-
|
| 736 |
-
print(f" → Splitting into {estimated_lines} lines (each ~{line_height:.0f}px)")
|
| 737 |
-
|
| 738 |
# Split text content by newlines if available
|
| 739 |
text_lines = text_content.split('\n') if text_content else []
|
| 740 |
|
|
@@ -744,12 +706,8 @@ def split_text_regions_into_lines(
|
|
| 744 |
if not line_text.strip():
|
| 745 |
continue
|
| 746 |
new_item = item.copy()
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
margin = line_height * 0.05
|
| 750 |
-
new_y1 = y1 + (i * line_height) + (margin if i > 0 else 0)
|
| 751 |
-
new_y2 = y1 + ((i + 1) * line_height) - (margin if i < estimated_lines - 1 else 0)
|
| 752 |
-
|
| 753 |
new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
|
| 754 |
new_item['text'] = line_text.strip()
|
| 755 |
new_item['split_from_parent'] = True
|
|
@@ -759,17 +717,8 @@ def split_text_regions_into_lines(
|
|
| 759 |
# Split geometrically - mark for re-OCR to get accurate per-line text
|
| 760 |
for i in range(estimated_lines):
|
| 761 |
new_item = item.copy()
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
# 8% margin between lines to create separation
|
| 765 |
-
margin = line_height * 0.08
|
| 766 |
-
new_y1 = y1 + (i * line_height) + (margin if i > 0 else 0)
|
| 767 |
-
new_y2 = y1 + ((i + 1) * line_height) - (margin if i < estimated_lines - 1 else 0)
|
| 768 |
-
|
| 769 |
-
# Ensure bbox is valid
|
| 770 |
-
if new_y2 <= new_y1:
|
| 771 |
-
continue
|
| 772 |
-
|
| 773 |
new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
|
| 774 |
# Clear text - will be re-OCR'd per-line for accuracy
|
| 775 |
new_item['text'] = ""
|
|
@@ -1853,4 +1802,4 @@ if __name__ == "__main__":
|
|
| 1853 |
share=False,
|
| 1854 |
debug=True,
|
| 1855 |
show_error=True
|
| 1856 |
-
)
|
|
|
|
| 647 |
def split_text_regions_into_lines(
|
| 648 |
image: Image.Image,
|
| 649 |
layout_data: List[Dict[str, Any]],
|
| 650 |
+
min_line_height: int = 25,
|
| 651 |
+
max_line_height: int = 80 # More aggressive - split anything taller than ~2 lines
|
| 652 |
) -> List[Dict[str, Any]]:
|
| 653 |
"""
|
| 654 |
Post-process layout data to split large text regions into individual lines.
|
| 655 |
|
| 656 |
This ensures each line gets its own bounding box for easier verification.
|
|
|
|
| 657 |
|
| 658 |
Args:
|
| 659 |
image: Original image
|
|
|
|
| 681 |
height = y2 - y1
|
| 682 |
width = x2 - x1
|
| 683 |
|
| 684 |
+
print(f" Checking region: height={height}px, width={width}px, category={category}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 685 |
|
| 686 |
# If region is tall enough to contain multiple lines, split it
|
| 687 |
if height > max_line_height:
|
| 688 |
+
print(f" → Splitting! (height {height}px > threshold {max_line_height}px)")
|
| 689 |
+
# Estimate number of lines based on typical line height
|
| 690 |
+
# Arabic handwritten text: ~40-60px per line
|
| 691 |
+
# Arabic typed text: ~30-50px per line
|
| 692 |
+
avg_line_height = 45 # Middle ground
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 693 |
estimated_lines = max(1, round(height / avg_line_height))
|
| 694 |
|
| 695 |
# Don't split into too many lines (might be a paragraph)
|
| 696 |
estimated_lines = min(estimated_lines, 10)
|
| 697 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 698 |
line_height = height / estimated_lines
|
| 699 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 700 |
# Split text content by newlines if available
|
| 701 |
text_lines = text_content.split('\n') if text_content else []
|
| 702 |
|
|
|
|
| 706 |
if not line_text.strip():
|
| 707 |
continue
|
| 708 |
new_item = item.copy()
|
| 709 |
+
new_y1 = y1 + (i * line_height)
|
| 710 |
+
new_y2 = y1 + ((i + 1) * line_height)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 711 |
new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
|
| 712 |
new_item['text'] = line_text.strip()
|
| 713 |
new_item['split_from_parent'] = True
|
|
|
|
| 717 |
# Split geometrically - mark for re-OCR to get accurate per-line text
|
| 718 |
for i in range(estimated_lines):
|
| 719 |
new_item = item.copy()
|
| 720 |
+
new_y1 = y1 + (i * line_height)
|
| 721 |
+
new_y2 = y1 + ((i + 1) * line_height)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
|
| 723 |
# Clear text - will be re-OCR'd per-line for accuracy
|
| 724 |
new_item['text'] = ""
|
|
|
|
| 1802 |
share=False,
|
| 1803 |
debug=True,
|
| 1804 |
show_error=True
|
| 1805 |
+
)
|