Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,40 +22,27 @@ from arabic_corrector import get_corrector
|
|
| 22 |
|
| 23 |
# Constants
|
| 24 |
MIN_PIXELS = 3136
|
| 25 |
-
MAX_PIXELS =
|
| 26 |
IMAGE_FACTOR = 28
|
| 27 |
|
| 28 |
-
# Prompts
|
| 29 |
-
prompt = """Please output the
|
| 30 |
|
| 31 |
-
|
| 32 |
-
- Detect EACH LINE of text as a SEPARATE bbox - do NOT group multiple lines together
|
| 33 |
-
- For forms: detect each field, label, checkbox, and filled value separately
|
| 34 |
-
- For tables: detect each cell as an individual element
|
| 35 |
-
- Include ALL text regions no matter how small
|
| 36 |
-
- Be extremely precise with bounding boxes - they should tightly fit each text element
|
| 37 |
|
| 38 |
-
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
- Picture: Omit text field
|
| 46 |
-
- Formula: Format as LaTeX
|
| 47 |
-
- Table: Format as HTML (detect each cell separately)
|
| 48 |
-
- All Others: Extract exact original text in Markdown format
|
| 49 |
|
| 50 |
4. Constraints:
|
| 51 |
-
-
|
| 52 |
-
-
|
| 53 |
-
- Sort elements by human reading order (top-to-bottom, left-to-right)
|
| 54 |
-
- For forms: maintain field relationships (label + value pairs)
|
| 55 |
-
|
| 56 |
-
5. Output Format: Single JSON array with maximum detail and granularity.
|
| 57 |
|
| 58 |
-
|
| 59 |
"""
|
| 60 |
|
| 61 |
# Utility functions
|
|
@@ -553,91 +540,6 @@ def _generate_text_and_confidence_for_crop(
|
|
| 553 |
return "", 0.0
|
| 554 |
|
| 555 |
|
| 556 |
-
def split_large_regions(layout_data: List[Dict], image_height: int) -> List[Dict]:
|
| 557 |
-
"""
|
| 558 |
-
Split large bounding boxes into smaller line-level regions for better granularity.
|
| 559 |
-
Critical for dense forms where model groups multiple lines together.
|
| 560 |
-
|
| 561 |
-
Args:
|
| 562 |
-
layout_data: List of layout items with bbox
|
| 563 |
-
image_height: Height of the image for context
|
| 564 |
-
|
| 565 |
-
Returns:
|
| 566 |
-
Enhanced layout data with split regions
|
| 567 |
-
"""
|
| 568 |
-
result = []
|
| 569 |
-
|
| 570 |
-
for item in layout_data:
|
| 571 |
-
bbox = item.get('bbox', [])
|
| 572 |
-
category = item.get('category', 'Text')
|
| 573 |
-
text = item.get('text', '')
|
| 574 |
-
|
| 575 |
-
if len(bbox) != 4 or not text:
|
| 576 |
-
result.append(item)
|
| 577 |
-
continue
|
| 578 |
-
|
| 579 |
-
x1, y1, x2, y2 = bbox
|
| 580 |
-
width = x2 - x1
|
| 581 |
-
height = y2 - y1
|
| 582 |
-
|
| 583 |
-
# Skip splitting for certain categories
|
| 584 |
-
if category in ['Picture', 'Formula', 'Table', 'Section-header', 'Title']:
|
| 585 |
-
result.append(item)
|
| 586 |
-
continue
|
| 587 |
-
|
| 588 |
-
# Heuristics for splitting:
|
| 589 |
-
# 1. Very tall regions (likely multiple lines grouped)
|
| 590 |
-
# 2. Text with newlines (definitely multiple lines)
|
| 591 |
-
# 3. Aspect ratio suggests multiple stacked lines
|
| 592 |
-
|
| 593 |
-
should_split = False
|
| 594 |
-
estimated_lines = 1
|
| 595 |
-
|
| 596 |
-
# Check if text has explicit line breaks
|
| 597 |
-
text_lines = text.strip().split('\n')
|
| 598 |
-
if len(text_lines) > 1:
|
| 599 |
-
should_split = True
|
| 600 |
-
estimated_lines = len(text_lines)
|
| 601 |
-
# Check if bbox is tall (multiple lines)
|
| 602 |
-
elif height > 80: # Assume ~35-40px per line of text
|
| 603 |
-
should_split = True
|
| 604 |
-
estimated_lines = max(2, int(height / 40))
|
| 605 |
-
# Check if aspect ratio suggests stacked text
|
| 606 |
-
elif height > 60 and width / height < 3: # Not wide enough for single line
|
| 607 |
-
should_split = True
|
| 608 |
-
estimated_lines = max(2, int(height / 35))
|
| 609 |
-
|
| 610 |
-
if should_split and estimated_lines > 1:
|
| 611 |
-
# Split the region into estimated number of lines
|
| 612 |
-
line_height = height / estimated_lines
|
| 613 |
-
|
| 614 |
-
for i in range(estimated_lines):
|
| 615 |
-
new_item = item.copy()
|
| 616 |
-
new_y1 = y1 + (i * line_height)
|
| 617 |
-
new_y2 = y1 + ((i + 1) * line_height)
|
| 618 |
-
new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
|
| 619 |
-
|
| 620 |
-
# Try to split text proportionally
|
| 621 |
-
if len(text_lines) == estimated_lines:
|
| 622 |
-
new_item['text'] = text_lines[i]
|
| 623 |
-
elif len(text_lines) > 1:
|
| 624 |
-
# Distribute available lines
|
| 625 |
-
line_idx = int(i * len(text_lines) / estimated_lines)
|
| 626 |
-
new_item['text'] = text_lines[line_idx] if line_idx < len(text_lines) else text_lines[-1]
|
| 627 |
-
else:
|
| 628 |
-
# Keep same text but mark as split region
|
| 629 |
-
new_item['text'] = text
|
| 630 |
-
new_item['is_split'] = True
|
| 631 |
-
new_item['split_index'] = i
|
| 632 |
-
|
| 633 |
-
result.append(new_item)
|
| 634 |
-
else:
|
| 635 |
-
# Keep as-is
|
| 636 |
-
result.append(item)
|
| 637 |
-
|
| 638 |
-
return result
|
| 639 |
-
|
| 640 |
-
|
| 641 |
def estimate_text_density(image: Image.Image) -> float:
|
| 642 |
"""
|
| 643 |
Estimate text density in image using pixel analysis.
|
|
@@ -668,7 +570,6 @@ def estimate_text_density(image: Image.Image) -> float:
|
|
| 668 |
def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
|
| 669 |
"""
|
| 670 |
Intelligently determine if image should be chunked for better accuracy.
|
| 671 |
-
Enhanced for dense forms and structured documents.
|
| 672 |
|
| 673 |
Returns (should_chunk, reason).
|
| 674 |
"""
|
|
@@ -676,28 +577,24 @@ def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
|
|
| 676 |
total_pixels = width * height
|
| 677 |
density = estimate_text_density(image)
|
| 678 |
|
| 679 |
-
# Criteria for chunking (
|
| 680 |
|
| 681 |
-
# 1. Very large images (>
|
| 682 |
-
if total_pixels >
|
| 683 |
-
return True, f"Large image ({total_pixels/1_000_000:.1f}MP) - chunking for
|
| 684 |
|
| 685 |
-
# 2. Dense text (>
|
| 686 |
-
if density > 0.
|
| 687 |
-
return True, f"Dense text ({density*100:.1f}% coverage) -
|
| 688 |
|
| 689 |
-
# 3. Very dense text (>
|
| 690 |
-
if density > 0.
|
| 691 |
-
return True, f"Very dense text ({density*100:.1f}% coverage) -
|
| 692 |
|
| 693 |
-
# 4. Extreme aspect ratio - likely scrolled document
|
| 694 |
aspect_ratio = max(width, height) / min(width, height)
|
| 695 |
-
if aspect_ratio >
|
| 696 |
-
return True, f"Extreme aspect ratio ({aspect_ratio:.1f}) -
|
| 697 |
-
|
| 698 |
-
# 5. Medium density + medium size - conservative chunking for forms
|
| 699 |
-
if density > 0.15 and total_pixels > 4_500_000:
|
| 700 |
-
return True, f"Medium-high density ({density*100:.1f}%) on large image - preventive chunking"
|
| 701 |
|
| 702 |
return False, "Image size and density within optimal range"
|
| 703 |
|
|
@@ -706,32 +603,25 @@ def chunk_image_intelligently(image: Image.Image) -> List[Dict[str, Any]]:
|
|
| 706 |
"""
|
| 707 |
Chunk image into optimal pieces for processing.
|
| 708 |
Uses overlap to prevent text cutting and smart sizing for accuracy.
|
| 709 |
-
OPTIMIZED FOR DENSE FORMS: Smaller chunks, more overlap, better granularity.
|
| 710 |
|
| 711 |
Returns list of chunks with metadata.
|
| 712 |
"""
|
| 713 |
width, height = image.size
|
| 714 |
|
| 715 |
# Determine optimal chunk size based on density and dimensions
|
| 716 |
-
# AGGRESSIVE chunking for forms: smaller sizes = better layout detection
|
| 717 |
density = estimate_text_density(image)
|
| 718 |
|
| 719 |
-
if density > 0.
|
| 720 |
-
# Very dense
|
| 721 |
-
chunk_size = 1400
|
| 722 |
-
overlap = 200 # Extra overlap for dense text
|
| 723 |
-
elif density > 0.25:
|
| 724 |
-
# Moderate density forms
|
| 725 |
chunk_size = 1600
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
chunk_size = 2000
|
| 730 |
-
overlap = 160
|
| 731 |
else:
|
| 732 |
# Lower density - can use larger chunks
|
| 733 |
-
chunk_size =
|
| 734 |
-
|
|
|
|
| 735 |
|
| 736 |
chunks = []
|
| 737 |
chunk_id = 0
|
|
@@ -903,13 +793,6 @@ def process_image(
|
|
| 903 |
# Try to parse JSON output
|
| 904 |
layout_data = json.loads(raw_output)
|
| 905 |
|
| 906 |
-
# 🔧 SMART REGION SPLITTING: Break large bboxes into line-level regions
|
| 907 |
-
# Critical for forms where model groups multiple fields/lines together
|
| 908 |
-
original_count = len(layout_data)
|
| 909 |
-
layout_data = split_large_regions(layout_data, image.height)
|
| 910 |
-
if len(layout_data) > original_count:
|
| 911 |
-
print(f"📐 Split {original_count} regions into {len(layout_data)} granular regions (+{len(layout_data)-original_count} regions)")
|
| 912 |
-
|
| 913 |
# 🎯 INTELLIGENT CONFIDENCE SCORING
|
| 914 |
# Count text regions to determine if per-region scoring is feasible
|
| 915 |
num_text_regions = sum(1 for item in layout_data
|
|
@@ -1659,4 +1542,4 @@ if __name__ == "__main__":
|
|
| 1659 |
share=False,
|
| 1660 |
debug=True,
|
| 1661 |
show_error=True
|
| 1662 |
-
)
|
|
|
|
| 22 |
|
| 23 |
# Constants
|
| 24 |
MIN_PIXELS = 3136
|
| 25 |
+
MAX_PIXELS = 11289600
|
| 26 |
IMAGE_FACTOR = 28
|
| 27 |
|
| 28 |
+
# Prompts
|
| 29 |
+
prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
|
| 30 |
|
| 31 |
+
1. Bbox format: [x1, y1, x2, y2]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
+
2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
|
| 34 |
|
| 35 |
+
3. Text Extraction & Formatting Rules:
|
| 36 |
+
- Picture: For the 'Picture' category, the text field should be omitted.
|
| 37 |
+
- Formula: Format its text as LaTeX.
|
| 38 |
+
- Table: Format its text as HTML.
|
| 39 |
+
- All Others (Text, Title, etc.): Format their text as Markdown.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
4. Constraints:
|
| 42 |
+
- The output text must be the original text from the image, with no translation.
|
| 43 |
+
- All layout elements must be sorted according to human reading order.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
+
5. Final Output: The entire output must be a single JSON object.
|
| 46 |
"""
|
| 47 |
|
| 48 |
# Utility functions
|
|
|
|
| 540 |
return "", 0.0
|
| 541 |
|
| 542 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
def estimate_text_density(image: Image.Image) -> float:
|
| 544 |
"""
|
| 545 |
Estimate text density in image using pixel analysis.
|
|
|
|
| 570 |
def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
|
| 571 |
"""
|
| 572 |
Intelligently determine if image should be chunked for better accuracy.
|
|
|
|
| 573 |
|
| 574 |
Returns (should_chunk, reason).
|
| 575 |
"""
|
|
|
|
| 577 |
total_pixels = width * height
|
| 578 |
density = estimate_text_density(image)
|
| 579 |
|
| 580 |
+
# Criteria for chunking (prioritizing ACCURACY)
|
| 581 |
|
| 582 |
+
# 1. Very large images (>8MP) - model struggles with layout detection
|
| 583 |
+
if total_pixels > 8_000_000:
|
| 584 |
+
return True, f"Large image ({total_pixels/1_000_000:.1f}MP) - chunking for better layout detection"
|
| 585 |
|
| 586 |
+
# 2. Dense text (>25% coverage) in large image - overwhelming for single pass
|
| 587 |
+
if density > 0.25 and total_pixels > 4_000_000:
|
| 588 |
+
return True, f"Dense text ({density*100:.1f}% coverage) in large image - chunking for accuracy"
|
| 589 |
|
| 590 |
+
# 3. Very dense text (>40%) regardless of size - likely tables/forms
|
| 591 |
+
if density > 0.40:
|
| 592 |
+
return True, f"Very dense text ({density*100:.1f}% coverage) - likely structured document, chunking"
|
| 593 |
|
| 594 |
+
# 4. Extreme aspect ratio - likely scrolled document
|
| 595 |
aspect_ratio = max(width, height) / min(width, height)
|
| 596 |
+
if aspect_ratio > 3.0 and total_pixels > 3_000_000:
|
| 597 |
+
return True, f"Extreme aspect ratio ({aspect_ratio:.1f}) - chunking vertically"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 598 |
|
| 599 |
return False, "Image size and density within optimal range"
|
| 600 |
|
|
|
|
| 603 |
"""
|
| 604 |
Chunk image into optimal pieces for processing.
|
| 605 |
Uses overlap to prevent text cutting and smart sizing for accuracy.
|
|
|
|
| 606 |
|
| 607 |
Returns list of chunks with metadata.
|
| 608 |
"""
|
| 609 |
width, height = image.size
|
| 610 |
|
| 611 |
# Determine optimal chunk size based on density and dimensions
|
|
|
|
| 612 |
density = estimate_text_density(image)
|
| 613 |
|
| 614 |
+
if density > 0.40:
|
| 615 |
+
# Very dense - use smaller chunks for better accuracy
|
|
|
|
|
|
|
|
|
|
|
|
|
| 616 |
chunk_size = 1600
|
| 617 |
+
elif density > 0.25:
|
| 618 |
+
# Moderate density
|
| 619 |
+
chunk_size = 2048
|
|
|
|
|
|
|
| 620 |
else:
|
| 621 |
# Lower density - can use larger chunks
|
| 622 |
+
chunk_size = 2800
|
| 623 |
+
|
| 624 |
+
overlap = 150 # Generous overlap to prevent text cutting
|
| 625 |
|
| 626 |
chunks = []
|
| 627 |
chunk_id = 0
|
|
|
|
| 793 |
# Try to parse JSON output
|
| 794 |
layout_data = json.loads(raw_output)
|
| 795 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 796 |
# 🎯 INTELLIGENT CONFIDENCE SCORING
|
| 797 |
# Count text regions to determine if per-region scoring is feasible
|
| 798 |
num_text_regions = sum(1 for item in layout_data
|
|
|
|
| 1542 |
share=False,
|
| 1543 |
debug=True,
|
| 1544 |
show_error=True
|
| 1545 |
+
)
|