Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,27 +22,40 @@ from arabic_corrector import get_corrector
|
|
| 22 |
|
| 23 |
# Constants
|
| 24 |
MIN_PIXELS = 3136
|
| 25 |
-
MAX_PIXELS = 11289600
|
| 26 |
IMAGE_FACTOR = 28
|
| 27 |
|
| 28 |
-
# Prompts
|
| 29 |
-
prompt = """Please output the layout information from
|
| 30 |
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
4. Constraints:
|
| 42 |
-
-
|
| 43 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
-
|
| 46 |
"""
|
| 47 |
|
| 48 |
# Utility functions
|
|
@@ -540,6 +553,91 @@ def _generate_text_and_confidence_for_crop(
|
|
| 540 |
return "", 0.0
|
| 541 |
|
| 542 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
def estimate_text_density(image: Image.Image) -> float:
|
| 544 |
"""
|
| 545 |
Estimate text density in image using pixel analysis.
|
|
@@ -570,6 +668,7 @@ def estimate_text_density(image: Image.Image) -> float:
|
|
| 570 |
def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
|
| 571 |
"""
|
| 572 |
Intelligently determine if image should be chunked for better accuracy.
|
|
|
|
| 573 |
|
| 574 |
Returns (should_chunk, reason).
|
| 575 |
"""
|
|
@@ -577,24 +676,28 @@ def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
|
|
| 577 |
total_pixels = width * height
|
| 578 |
density = estimate_text_density(image)
|
| 579 |
|
| 580 |
-
# Criteria for chunking (
|
| 581 |
|
| 582 |
-
# 1. Very large images (>
|
| 583 |
-
if total_pixels >
|
| 584 |
-
return True, f"Large image ({total_pixels/1_000_000:.1f}MP) - chunking for
|
| 585 |
|
| 586 |
-
# 2. Dense text (>
|
| 587 |
-
if density > 0.
|
| 588 |
-
return True, f"Dense text ({density*100:.1f}% coverage)
|
| 589 |
|
| 590 |
-
# 3. Very dense text (>
|
| 591 |
-
if density > 0.
|
| 592 |
-
return True, f"Very dense text ({density*100:.1f}% coverage) -
|
| 593 |
|
| 594 |
-
# 4. Extreme aspect ratio - likely scrolled document
|
| 595 |
aspect_ratio = max(width, height) / min(width, height)
|
| 596 |
-
if aspect_ratio >
|
| 597 |
-
return True, f"Extreme aspect ratio ({aspect_ratio:.1f}) - chunking
|
|
|
|
|
|
|
|
|
|
|
|
|
| 598 |
|
| 599 |
return False, "Image size and density within optimal range"
|
| 600 |
|
|
@@ -603,25 +706,32 @@ def chunk_image_intelligently(image: Image.Image) -> List[Dict[str, Any]]:
|
|
| 603 |
"""
|
| 604 |
Chunk image into optimal pieces for processing.
|
| 605 |
Uses overlap to prevent text cutting and smart sizing for accuracy.
|
|
|
|
| 606 |
|
| 607 |
Returns list of chunks with metadata.
|
| 608 |
"""
|
| 609 |
width, height = image.size
|
| 610 |
|
| 611 |
# Determine optimal chunk size based on density and dimensions
|
|
|
|
| 612 |
density = estimate_text_density(image)
|
| 613 |
|
| 614 |
-
if density > 0.
|
| 615 |
-
# Very dense - use
|
| 616 |
-
chunk_size =
|
|
|
|
| 617 |
elif density > 0.25:
|
| 618 |
-
# Moderate density
|
| 619 |
-
chunk_size =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
else:
|
| 621 |
# Lower density - can use larger chunks
|
| 622 |
-
chunk_size =
|
| 623 |
-
|
| 624 |
-
overlap = 150 # Generous overlap to prevent text cutting
|
| 625 |
|
| 626 |
chunks = []
|
| 627 |
chunk_id = 0
|
|
@@ -793,6 +903,13 @@ def process_image(
|
|
| 793 |
# Try to parse JSON output
|
| 794 |
layout_data = json.loads(raw_output)
|
| 795 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 796 |
# 🎯 INTELLIGENT CONFIDENCE SCORING
|
| 797 |
# Count text regions to determine if per-region scoring is feasible
|
| 798 |
num_text_regions = sum(1 for item in layout_data
|
|
|
|
| 22 |
|
| 23 |
# Constants
|
| 24 |
MIN_PIXELS = 3136
|
| 25 |
+
MAX_PIXELS = 16000000 # Increased from 11289600 for better detail on dense forms
|
| 26 |
IMAGE_FACTOR = 28
|
| 27 |
|
| 28 |
+
# Prompts - Enhanced for granular form and dense text detection
|
| 29 |
+
prompt = """Please output the detailed layout information from this document image. This may be a form, table, or dense text document. Detect EVERY text element individually with maximum granularity.
|
| 30 |
|
| 31 |
+
CRITICAL REQUIREMENTS FOR FORMS AND DENSE TEXT:
|
| 32 |
+
- Detect EACH LINE of text as a SEPARATE bbox - do NOT group multiple lines together
|
| 33 |
+
- For forms: detect each field, label, checkbox, and filled value separately
|
| 34 |
+
- For tables: detect each cell as an individual element
|
| 35 |
+
- Include ALL text regions no matter how small
|
| 36 |
+
- Be extremely precise with bounding boxes - they should tightly fit each text element
|
| 37 |
|
| 38 |
+
1. Bbox format: [x1, y1, x2, y2] - must be tight and accurate for each element
|
| 39 |
|
| 40 |
+
2. Layout Categories: ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title']
|
| 41 |
+
- Use 'Text' for form fields, labels, and general text
|
| 42 |
+
- Use 'List-item' for form checkboxes and bullet points
|
| 43 |
+
|
| 44 |
+
3. Text Extraction Rules:
|
| 45 |
+
- Picture: Omit text field
|
| 46 |
+
- Formula: Format as LaTeX
|
| 47 |
+
- Table: Format as HTML (detect each cell separately)
|
| 48 |
+
- All Others: Extract exact original text in Markdown format
|
| 49 |
|
| 50 |
4. Constraints:
|
| 51 |
+
- Output original text with NO translation
|
| 52 |
+
- Detect handwritten and typed text equally
|
| 53 |
+
- Sort elements by human reading order (top-to-bottom, left-to-right)
|
| 54 |
+
- For forms: maintain field relationships (label + value pairs)
|
| 55 |
+
|
| 56 |
+
5. Output Format: Single JSON array with maximum detail and granularity.
|
| 57 |
|
| 58 |
+
REMEMBER: More bboxes = better! Aim for line-level or field-level detection, not paragraph-level.
|
| 59 |
"""
|
| 60 |
|
| 61 |
# Utility functions
|
|
|
|
| 553 |
return "", 0.0
|
| 554 |
|
| 555 |
|
| 556 |
+
def split_large_regions(layout_data: List[Dict], image_height: int) -> List[Dict]:
    """
    Split large bounding boxes into smaller line-level regions for better granularity.
    Critical for dense forms where model groups multiple lines together.

    A region is split when one of these holds (checked in this order):
      1. Its text contains more than one non-blank line -> one region per line.
      2. Its bbox is taller than 80px -> ``max(2, int(height / 40))`` regions.
      3. Its bbox is taller than 60px and less than 3x as wide as it is tall
         -> ``max(2, int(height / 35))`` regions.

    Split regions are equal-height horizontal bands of the original bbox. Each
    band is a shallow copy of the source item with ``bbox`` replaced and
    ``is_split`` / ``split_index`` added. When the split was triggered by
    geometry alone (cases 2 and 3) the text cannot be divided reliably, so
    every band carries the full original text.

    Items are passed through unchanged when they have no usable bbox (not a
    4-element list/tuple of numbers, or non-positive height), no text, a
    category that must stay whole, or too little height to give each text
    line at least one pixel.

    Args:
        layout_data: List of layout items with bbox
        image_height: Height of the image for context (currently unused;
            kept so existing callers keep working)

    Returns:
        Enhanced layout data with split regions. Input items are not mutated.
    """
    # Categories whose regions are always kept whole. A tuple (not a set) so
    # that an unhashable category value simply fails the membership test.
    keep_whole = ('Picture', 'Formula', 'Table', 'Section-header', 'Title')

    result = []

    for item in layout_data:
        bbox = item.get('bbox')
        category = item.get('category', 'Text')
        text = item.get('text', '')

        # Model output is untrusted JSON: pass anything malformed through
        # untouched instead of raising on unpacking or arithmetic.
        bbox_ok = (
            isinstance(bbox, (list, tuple))
            and len(bbox) == 4
            and all(isinstance(v, (int, float)) for v in bbox)
        )
        if not bbox_ok or not isinstance(text, str) or not text:
            result.append(item)
            continue

        x1, y1, x2, y2 = bbox
        width = x2 - x1
        height = y2 - y1

        # Skip protected categories and inverted / zero-height boxes.
        if category in keep_whole or height <= 0:
            result.append(item)
            continue

        # Non-blank text lines only: a blank line would otherwise become a
        # region with empty text and shift every following band.
        text_lines = [
            line.rstrip('\r')
            for line in text.strip().split('\n')
            if line.strip()
        ]

        if len(text_lines) > 1:
            # Explicit line breaks: definitely multiple lines.
            estimated_lines = len(text_lines)
        elif height > 80:
            # Tall bbox; assume ~35-40px per line of text.
            estimated_lines = max(2, int(height / 40))
        elif height > 60 and width / height < 3:
            # Not wide enough for a single line: likely stacked text.
            estimated_lines = max(2, int(height / 35))
        else:
            estimated_lines = 1

        # Keep as-is when no split is needed, or when the bands would be
        # less than one pixel tall (degenerate boxes after int truncation).
        if estimated_lines <= 1 or estimated_lines > height:
            result.append(item)
            continue

        line_height = height / estimated_lines
        # Band edges; the last edge is pinned to y2 so float rounding cannot
        # leave the final band one pixel short of the original bbox.
        edges = [int(y1 + i * line_height) for i in range(estimated_lines)]
        edges.append(int(y2))

        per_line_text = len(text_lines) == estimated_lines

        for i in range(estimated_lines):
            new_item = item.copy()
            new_item['bbox'] = [x1, edges[i], x2, edges[i + 1]]
            # One text line per band when available; otherwise keep the
            # same text and rely on the split markers below.
            new_item['text'] = text_lines[i] if per_line_text else text
            new_item['is_split'] = True
            new_item['split_index'] = i
            result.append(new_item)

    return result
|
| 639 |
+
|
| 640 |
+
|
| 641 |
def estimate_text_density(image: Image.Image) -> float:
|
| 642 |
"""
|
| 643 |
Estimate text density in image using pixel analysis.
|
|
|
|
| 668 |
def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
|
| 669 |
"""
|
| 670 |
Intelligently determine if image should be chunked for better accuracy.
|
| 671 |
+
Enhanced for dense forms and structured documents.
|
| 672 |
|
| 673 |
Returns (should_chunk, reason).
|
| 674 |
"""
|
|
|
|
| 676 |
total_pixels = width * height
|
| 677 |
density = estimate_text_density(image)
|
| 678 |
|
| 679 |
+
# Criteria for chunking (OPTIMIZED FOR FORMS AND DENSE TEXT)
|
| 680 |
|
| 681 |
+
# 1. Very large images (>6MP) - reduced threshold for better form detection
|
| 682 |
+
if total_pixels > 6_000_000:
|
| 683 |
+
return True, f"Large image ({total_pixels/1_000_000:.1f}MP) - chunking for granular layout detection"
|
| 684 |
|
| 685 |
+
# 2. Dense text (>20% coverage) in medium+ images - forms often hit this
|
| 686 |
+
if density > 0.20 and total_pixels > 3_000_000:
|
| 687 |
+
return True, f"Dense text ({density*100:.1f}% coverage) - form/document chunking for accuracy"
|
| 688 |
|
| 689 |
+
# 3. Very dense text (>30%) regardless of size - CRITICAL for forms
|
| 690 |
+
if density > 0.30:
|
| 691 |
+
return True, f"Very dense text ({density*100:.1f}% coverage) - form detected, aggressive chunking"
|
| 692 |
|
| 693 |
+
# 4. Extreme aspect ratio - likely scrolled document or long form
|
| 694 |
aspect_ratio = max(width, height) / min(width, height)
|
| 695 |
+
if aspect_ratio > 2.5 and total_pixels > 2_500_000:
|
| 696 |
+
return True, f"Extreme aspect ratio ({aspect_ratio:.1f}) - document chunking"
|
| 697 |
+
|
| 698 |
+
# 5. Medium density + medium size - conservative chunking for forms
|
| 699 |
+
if density > 0.15 and total_pixels > 4_500_000:
|
| 700 |
+
return True, f"Medium-high density ({density*100:.1f}%) on large image - preventive chunking"
|
| 701 |
|
| 702 |
return False, "Image size and density within optimal range"
|
| 703 |
|
|
|
|
| 706 |
"""
|
| 707 |
Chunk image into optimal pieces for processing.
|
| 708 |
Uses overlap to prevent text cutting and smart sizing for accuracy.
|
| 709 |
+
OPTIMIZED FOR DENSE FORMS: Smaller chunks, more overlap, better granularity.
|
| 710 |
|
| 711 |
Returns list of chunks with metadata.
|
| 712 |
"""
|
| 713 |
width, height = image.size
|
| 714 |
|
| 715 |
# Determine optimal chunk size based on density and dimensions
|
| 716 |
+
# AGGRESSIVE chunking for forms: smaller sizes = better layout detection
|
| 717 |
density = estimate_text_density(image)
|
| 718 |
|
| 719 |
+
if density > 0.35:
|
| 720 |
+
# Very dense forms - use small chunks for maximum accuracy
|
| 721 |
+
chunk_size = 1400
|
| 722 |
+
overlap = 200 # Extra overlap for dense text
|
| 723 |
elif density > 0.25:
|
| 724 |
+
# Moderate density forms
|
| 725 |
+
chunk_size = 1600
|
| 726 |
+
overlap = 180
|
| 727 |
+
elif density > 0.15:
|
| 728 |
+
# Light-medium density
|
| 729 |
+
chunk_size = 2000
|
| 730 |
+
overlap = 160
|
| 731 |
else:
|
| 732 |
# Lower density - can use larger chunks
|
| 733 |
+
chunk_size = 2400
|
| 734 |
+
overlap = 150
|
|
|
|
| 735 |
|
| 736 |
chunks = []
|
| 737 |
chunk_id = 0
|
|
|
|
| 903 |
# Try to parse JSON output
|
| 904 |
layout_data = json.loads(raw_output)
|
| 905 |
|
| 906 |
+
# 🔧 SMART REGION SPLITTING: Break large bboxes into line-level regions
|
| 907 |
+
# Critical for forms where model groups multiple fields/lines together
|
| 908 |
+
original_count = len(layout_data)
|
| 909 |
+
layout_data = split_large_regions(layout_data, image.height)
|
| 910 |
+
if len(layout_data) > original_count:
|
| 911 |
+
print(f"📐 Split {original_count} regions into {len(layout_data)} granular regions (+{len(layout_data)-original_count} regions)")
|
| 912 |
+
|
| 913 |
# 🎯 INTELLIGENT CONFIDENCE SCORING
|
| 914 |
# Count text regions to determine if per-region scoring is feasible
|
| 915 |
num_text_regions = sum(1 for item in layout_data
|