Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -647,13 +647,14 @@ def estimate_text_density(image: Image.Image) -> float:
|
|
| 647 |
def split_text_regions_into_lines(
|
| 648 |
image: Image.Image,
|
| 649 |
layout_data: List[Dict[str, Any]],
|
| 650 |
-
min_line_height: int =
|
| 651 |
-
max_line_height: int =
|
| 652 |
) -> List[Dict[str, Any]]:
|
| 653 |
"""
|
| 654 |
Post-process layout data to split large text regions into individual lines.
|
| 655 |
|
| 656 |
This ensures each line gets its own bounding box for easier verification.
|
|
|
|
| 657 |
|
| 658 |
Args:
|
| 659 |
image: Original image
|
|
@@ -681,22 +682,59 @@ def split_text_regions_into_lines(
|
|
| 681 |
height = y2 - y1
|
| 682 |
width = x2 - x1
|
| 683 |
|
| 684 |
-
print(f" Checking region: height={height}px, width={width}px, category={category}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 685 |
|
| 686 |
# If region is tall enough to contain multiple lines, split it
|
| 687 |
if height > max_line_height:
|
| 688 |
-
print(f" →
|
| 689 |
-
|
| 690 |
-
#
|
| 691 |
-
# Arabic
|
| 692 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 693 |
estimated_lines = max(1, round(height / avg_line_height))
|
| 694 |
|
| 695 |
# Don't split into too many lines (might be a paragraph)
|
| 696 |
estimated_lines = min(estimated_lines, 10)
|
| 697 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 698 |
line_height = height / estimated_lines
|
| 699 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 700 |
# Split text content by newlines if available
|
| 701 |
text_lines = text_content.split('\n') if text_content else []
|
| 702 |
|
|
@@ -706,8 +744,12 @@ def split_text_regions_into_lines(
|
|
| 706 |
if not line_text.strip():
|
| 707 |
continue
|
| 708 |
new_item = item.copy()
|
| 709 |
-
|
| 710 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 711 |
new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
|
| 712 |
new_item['text'] = line_text.strip()
|
| 713 |
new_item['split_from_parent'] = True
|
|
@@ -717,8 +759,17 @@ def split_text_regions_into_lines(
|
|
| 717 |
# Split geometrically - mark for re-OCR to get accurate per-line text
|
| 718 |
for i in range(estimated_lines):
|
| 719 |
new_item = item.copy()
|
| 720 |
-
|
| 721 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
|
| 723 |
# Clear text - will be re-OCR'd per-line for accuracy
|
| 724 |
new_item['text'] = ""
|
|
|
|
| 647 |
def split_text_regions_into_lines(
|
| 648 |
image: Image.Image,
|
| 649 |
layout_data: List[Dict[str, Any]],
|
| 650 |
+
min_line_height: int = 35, # Increased to avoid cutting through text
|
| 651 |
+
max_line_height: int = 100 # Less aggressive to prevent over-splitting
|
| 652 |
) -> List[Dict[str, Any]]:
|
| 653 |
"""
|
| 654 |
Post-process layout data to split large text regions into individual lines.
|
| 655 |
|
| 656 |
This ensures each line gets its own bounding box for easier verification.
|
| 657 |
+
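Estimates the line count from the region height (assuming roughly 50-60px per line) and insets the interior edges of each split box to reduce the chance of cutting through a text line.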
|
| 658 |
|
| 659 |
Args:
|
| 660 |
image: Original image
|
|
|
|
| 682 |
height = y2 - y1
|
| 683 |
width = x2 - x1
|
| 684 |
|
| 685 |
+
print(f" Checking region: height={height:.0f}px, width={width:.0f}px, category={category}")
|
| 686 |
+
|
| 687 |
+
# Safety check: if region is too small (< min_line_height), it might be incorrect/noise
|
| 688 |
+
if height < min_line_height:
|
| 689 |
+
print(f" ⚠️ Region too small (height {height:.0f}px < {min_line_height}px) - may be incomplete line")
|
| 690 |
+
result.append(item) # Keep it but flag the issue
|
| 691 |
+
continue
|
| 692 |
+
|
| 693 |
+
# Safety check: if region is reasonably line-sized already, keep it
|
| 694 |
+
# This prevents unnecessary splitting of well-detected regions
|
| 695 |
+
if min_line_height <= height <= max_line_height:
|
| 696 |
+
print(f" ✓ Already good size (within {min_line_height}-{max_line_height}px range)")
|
| 697 |
+
result.append(item)
|
| 698 |
+
continue
|
| 699 |
|
| 700 |
# If region is tall enough to contain multiple lines, split it
|
| 701 |
if height > max_line_height:
|
| 702 |
+
print(f" → Will split (height {height:.0f}px > threshold {max_line_height}px)")
|
| 703 |
+
|
| 704 |
+
# Line height estimation based on the region's height only (no content analysis)
|
| 705 |
+
# Arabic handwritten: typically 50-70px per line
|
| 706 |
+
# Arabic typed: typically 35-50px per line
|
| 707 |
+
# Use conservative estimate to avoid cutting through text
|
| 708 |
+
|
| 709 |
+
# Estimate based on height
|
| 710 |
+
if height < 150:
|
| 711 |
+
avg_line_height = 60 # Conservative for small regions
|
| 712 |
+
elif height < 300:
|
| 713 |
+
avg_line_height = 55 # Medium regions
|
| 714 |
+
else:
|
| 715 |
+
avg_line_height = 50 # Larger documents
|
| 716 |
+
|
| 717 |
estimated_lines = max(1, round(height / avg_line_height))
|
| 718 |
|
| 719 |
# Don't split into too many lines (might be a paragraph)
|
| 720 |
estimated_lines = min(estimated_lines, 10)
|
| 721 |
|
| 722 |
+
# Validate: ensure we have at least 2 lines to split, otherwise keep original
|
| 723 |
+
if estimated_lines < 2:
|
| 724 |
+
print(f" → Keeping original (only {estimated_lines} line estimated)")
|
| 725 |
+
result.append(item)
|
| 726 |
+
continue
|
| 727 |
+
|
| 728 |
line_height = height / estimated_lines
|
| 729 |
|
| 730 |
+
# Validate: each split line must meet minimum height requirement
|
| 731 |
+
if line_height < min_line_height:
|
| 732 |
+
print(f" → Keeping original (split lines would be too small: {line_height:.0f}px < {min_line_height}px)")
|
| 733 |
+
result.append(item)
|
| 734 |
+
continue
|
| 735 |
+
|
| 736 |
+
print(f" → Splitting into {estimated_lines} lines (each ~{line_height:.0f}px)")
|
| 737 |
+
|
| 738 |
# Split text content by newlines if available
|
| 739 |
text_lines = text_content.split('\n') if text_content else []
|
| 740 |
|
|
|
|
| 744 |
if not line_text.strip():
|
| 745 |
continue
|
| 746 |
new_item = item.copy()
|
| 747 |
+
|
| 748 |
+
# Inset each interior edge by 5% of the line height (boxes shrink; ~10% gap between neighbours) to avoid cutting through text
|
| 749 |
+
margin = line_height * 0.05
|
| 750 |
+
new_y1 = y1 + (i * line_height) + (margin if i > 0 else 0)
|
| 751 |
+
new_y2 = y1 + ((i + 1) * line_height) - (margin if i < estimated_lines - 1 else 0)
|
| 752 |
+
|
| 753 |
new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
|
| 754 |
new_item['text'] = line_text.strip()
|
| 755 |
new_item['split_from_parent'] = True
|
|
|
|
| 759 |
# Split geometrically - mark for re-OCR to get accurate per-line text
|
| 760 |
for i in range(estimated_lines):
|
| 761 |
new_item = item.copy()
|
| 762 |
+
|
| 763 |
+
# Add padding between lines to avoid cutting through text
|
| 764 |
+
# 8% inset on each interior edge, i.e. a gap of ~16% of the line height between adjacent boxes
|
| 765 |
+
margin = line_height * 0.08
|
| 766 |
+
new_y1 = y1 + (i * line_height) + (margin if i > 0 else 0)
|
| 767 |
+
new_y2 = y1 + ((i + 1) * line_height) - (margin if i < estimated_lines - 1 else 0)
|
| 768 |
+
|
| 769 |
+
# Ensure bbox is valid
|
| 770 |
+
if new_y2 <= new_y1:
|
| 771 |
+
continue
|
| 772 |
+
|
| 773 |
new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
|
| 774 |
# Clear text - will be re-OCR'd per-line for accuracy
|
| 775 |
new_item['text'] = ""
|