VanguardAI committed on
Commit
98b1d96
·
verified ·
1 Parent(s): bc396b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -152
app.py CHANGED
@@ -22,40 +22,27 @@ from arabic_corrector import get_corrector
22
 
23
  # Constants
24
  MIN_PIXELS = 3136
25
- MAX_PIXELS = 16000000 # Increased from 11289600 for better detail on dense forms
26
  IMAGE_FACTOR = 28
27
 
28
- # Prompts - Enhanced for granular form and dense text detection
29
- prompt = """Please output the detailed layout information from this document image. This may be a form, table, or dense text document. Detect EVERY text element individually with maximum granularity.
30
 
31
- CRITICAL REQUIREMENTS FOR FORMS AND DENSE TEXT:
32
- - Detect EACH LINE of text as a SEPARATE bbox - do NOT group multiple lines together
33
- - For forms: detect each field, label, checkbox, and filled value separately
34
- - For tables: detect each cell as an individual element
35
- - Include ALL text regions no matter how small
36
- - Be extremely precise with bounding boxes - they should tightly fit each text element
37
 
38
- 1. Bbox format: [x1, y1, x2, y2] - must be tight and accurate for each element
39
 
40
- 2. Layout Categories: ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title']
41
- - Use 'Text' for form fields, labels, and general text
42
- - Use 'List-item' for form checkboxes and bullet points
43
-
44
- 3. Text Extraction Rules:
45
- - Picture: Omit text field
46
- - Formula: Format as LaTeX
47
- - Table: Format as HTML (detect each cell separately)
48
- - All Others: Extract exact original text in Markdown format
49
 
50
  4. Constraints:
51
- - Output original text with NO translation
52
- - Detect handwritten and typed text equally
53
- - Sort elements by human reading order (top-to-bottom, left-to-right)
54
- - For forms: maintain field relationships (label + value pairs)
55
-
56
- 5. Output Format: Single JSON array with maximum detail and granularity.
57
 
58
- REMEMBER: More bboxes = better! Aim for line-level or field-level detection, not paragraph-level.
59
  """
60
 
61
  # Utility functions
@@ -553,91 +540,6 @@ def _generate_text_and_confidence_for_crop(
553
  return "", 0.0
554
 
555
 
556
- def split_large_regions(layout_data: List[Dict], image_height: int) -> List[Dict]:
557
- """
558
- Split large bounding boxes into smaller line-level regions for better granularity.
559
- Critical for dense forms where model groups multiple lines together.
560
-
561
- Args:
562
- layout_data: List of layout items with bbox
563
- image_height: Height of the image for context
564
-
565
- Returns:
566
- Enhanced layout data with split regions
567
- """
568
- result = []
569
-
570
- for item in layout_data:
571
- bbox = item.get('bbox', [])
572
- category = item.get('category', 'Text')
573
- text = item.get('text', '')
574
-
575
- if len(bbox) != 4 or not text:
576
- result.append(item)
577
- continue
578
-
579
- x1, y1, x2, y2 = bbox
580
- width = x2 - x1
581
- height = y2 - y1
582
-
583
- # Skip splitting for certain categories
584
- if category in ['Picture', 'Formula', 'Table', 'Section-header', 'Title']:
585
- result.append(item)
586
- continue
587
-
588
- # Heuristics for splitting:
589
- # 1. Very tall regions (likely multiple lines grouped)
590
- # 2. Text with newlines (definitely multiple lines)
591
- # 3. Aspect ratio suggests multiple stacked lines
592
-
593
- should_split = False
594
- estimated_lines = 1
595
-
596
- # Check if text has explicit line breaks
597
- text_lines = text.strip().split('\n')
598
- if len(text_lines) > 1:
599
- should_split = True
600
- estimated_lines = len(text_lines)
601
- # Check if bbox is tall (multiple lines)
602
- elif height > 80: # Assume ~35-40px per line of text
603
- should_split = True
604
- estimated_lines = max(2, int(height / 40))
605
- # Check if aspect ratio suggests stacked text
606
- elif height > 60 and width / height < 3: # Not wide enough for single line
607
- should_split = True
608
- estimated_lines = max(2, int(height / 35))
609
-
610
- if should_split and estimated_lines > 1:
611
- # Split the region into estimated number of lines
612
- line_height = height / estimated_lines
613
-
614
- for i in range(estimated_lines):
615
- new_item = item.copy()
616
- new_y1 = y1 + (i * line_height)
617
- new_y2 = y1 + ((i + 1) * line_height)
618
- new_item['bbox'] = [x1, int(new_y1), x2, int(new_y2)]
619
-
620
- # Try to split text proportionally
621
- if len(text_lines) == estimated_lines:
622
- new_item['text'] = text_lines[i]
623
- elif len(text_lines) > 1:
624
- # Distribute available lines
625
- line_idx = int(i * len(text_lines) / estimated_lines)
626
- new_item['text'] = text_lines[line_idx] if line_idx < len(text_lines) else text_lines[-1]
627
- else:
628
- # Keep same text but mark as split region
629
- new_item['text'] = text
630
- new_item['is_split'] = True
631
- new_item['split_index'] = i
632
-
633
- result.append(new_item)
634
- else:
635
- # Keep as-is
636
- result.append(item)
637
-
638
- return result
639
-
640
-
641
  def estimate_text_density(image: Image.Image) -> float:
642
  """
643
  Estimate text density in image using pixel analysis.
@@ -668,7 +570,6 @@ def estimate_text_density(image: Image.Image) -> float:
668
  def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
669
  """
670
  Intelligently determine if image should be chunked for better accuracy.
671
- Enhanced for dense forms and structured documents.
672
 
673
  Returns (should_chunk, reason).
674
  """
@@ -676,28 +577,24 @@ def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
676
  total_pixels = width * height
677
  density = estimate_text_density(image)
678
 
679
- # Criteria for chunking (OPTIMIZED FOR FORMS AND DENSE TEXT)
680
 
681
- # 1. Very large images (>6MP) - reduced threshold for better form detection
682
- if total_pixels > 6_000_000:
683
- return True, f"Large image ({total_pixels/1_000_000:.1f}MP) - chunking for granular layout detection"
684
 
685
- # 2. Dense text (>20% coverage) in medium+ images - forms often hit this
686
- if density > 0.20 and total_pixels > 3_000_000:
687
- return True, f"Dense text ({density*100:.1f}% coverage) - form/document chunking for accuracy"
688
 
689
- # 3. Very dense text (>30%) regardless of size - CRITICAL for forms
690
- if density > 0.30:
691
- return True, f"Very dense text ({density*100:.1f}% coverage) - form detected, aggressive chunking"
692
 
693
- # 4. Extreme aspect ratio - likely scrolled document or long form
694
  aspect_ratio = max(width, height) / min(width, height)
695
- if aspect_ratio > 2.5 and total_pixels > 2_500_000:
696
- return True, f"Extreme aspect ratio ({aspect_ratio:.1f}) - document chunking"
697
-
698
- # 5. Medium density + medium size - conservative chunking for forms
699
- if density > 0.15 and total_pixels > 4_500_000:
700
- return True, f"Medium-high density ({density*100:.1f}%) on large image - preventive chunking"
701
 
702
  return False, "Image size and density within optimal range"
703
 
@@ -706,32 +603,25 @@ def chunk_image_intelligently(image: Image.Image) -> List[Dict[str, Any]]:
706
  """
707
  Chunk image into optimal pieces for processing.
708
  Uses overlap to prevent text cutting and smart sizing for accuracy.
709
- OPTIMIZED FOR DENSE FORMS: Smaller chunks, more overlap, better granularity.
710
 
711
  Returns list of chunks with metadata.
712
  """
713
  width, height = image.size
714
 
715
  # Determine optimal chunk size based on density and dimensions
716
- # AGGRESSIVE chunking for forms: smaller sizes = better layout detection
717
  density = estimate_text_density(image)
718
 
719
- if density > 0.35:
720
- # Very dense forms - use small chunks for maximum accuracy
721
- chunk_size = 1400
722
- overlap = 200 # Extra overlap for dense text
723
- elif density > 0.25:
724
- # Moderate density forms
725
  chunk_size = 1600
726
- overlap = 180
727
- elif density > 0.15:
728
- # Light-medium density
729
- chunk_size = 2000
730
- overlap = 160
731
  else:
732
  # Lower density - can use larger chunks
733
- chunk_size = 2400
734
- overlap = 150
 
735
 
736
  chunks = []
737
  chunk_id = 0
@@ -903,13 +793,6 @@ def process_image(
903
  # Try to parse JSON output
904
  layout_data = json.loads(raw_output)
905
 
906
- # 🔧 SMART REGION SPLITTING: Break large bboxes into line-level regions
907
- # Critical for forms where model groups multiple fields/lines together
908
- original_count = len(layout_data)
909
- layout_data = split_large_regions(layout_data, image.height)
910
- if len(layout_data) > original_count:
911
- print(f"📐 Split {original_count} regions into {len(layout_data)} granular regions (+{len(layout_data)-original_count} regions)")
912
-
913
  # 🎯 INTELLIGENT CONFIDENCE SCORING
914
  # Count text regions to determine if per-region scoring is feasible
915
  num_text_regions = sum(1 for item in layout_data
@@ -1659,4 +1542,4 @@ if __name__ == "__main__":
1659
  share=False,
1660
  debug=True,
1661
  show_error=True
1662
- )
 
22
 
23
  # Constants
24
  MIN_PIXELS = 3136
25
+ MAX_PIXELS = 11289600
26
  IMAGE_FACTOR = 28
27
 
28
+ # Prompts
29
+ prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
30
 
31
+ 1. Bbox format: [x1, y1, x2, y2]
 
 
 
 
 
32
 
33
+ 2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
34
 
35
+ 3. Text Extraction & Formatting Rules:
36
+ - Picture: For the 'Picture' category, the text field should be omitted.
37
+ - Formula: Format its text as LaTeX.
38
+ - Table: Format its text as HTML.
39
+ - All Others (Text, Title, etc.): Format their text as Markdown.
 
 
 
 
40
 
41
  4. Constraints:
42
+ - The output text must be the original text from the image, with no translation.
43
+ - All layout elements must be sorted according to human reading order.
 
 
 
 
44
 
45
+ 5. Final Output: The entire output must be a single JSON object.
46
  """
47
 
48
  # Utility functions
 
540
  return "", 0.0
541
 
542
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
543
  def estimate_text_density(image: Image.Image) -> float:
544
  """
545
  Estimate text density in image using pixel analysis.
 
570
  def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
571
  """
572
  Intelligently determine if image should be chunked for better accuracy.
 
573
 
574
  Returns (should_chunk, reason).
575
  """
 
577
  total_pixels = width * height
578
  density = estimate_text_density(image)
579
 
580
+ # Criteria for chunking (prioritizing ACCURACY)
581
 
582
+ # 1. Very large images (>8MP) - model struggles with layout detection
583
+ if total_pixels > 8_000_000:
584
+ return True, f"Large image ({total_pixels/1_000_000:.1f}MP) - chunking for better layout detection"
585
 
586
+ # 2. Dense text (>25% coverage) in large image - overwhelming for single pass
587
+ if density > 0.25 and total_pixels > 4_000_000:
588
+ return True, f"Dense text ({density*100:.1f}% coverage) in large image - chunking for accuracy"
589
 
590
+ # 3. Very dense text (>40%) regardless of size - likely tables/forms
591
+ if density > 0.40:
592
+ return True, f"Very dense text ({density*100:.1f}% coverage) - likely structured document, chunking"
593
 
594
+ # 4. Extreme aspect ratio - likely scrolled document
595
  aspect_ratio = max(width, height) / min(width, height)
596
+ if aspect_ratio > 3.0 and total_pixels > 3_000_000:
597
+ return True, f"Extreme aspect ratio ({aspect_ratio:.1f}) - chunking vertically"
 
 
 
 
598
 
599
  return False, "Image size and density within optimal range"
600
 
 
603
  """
604
  Chunk image into optimal pieces for processing.
605
  Uses overlap to prevent text cutting and smart sizing for accuracy.
 
606
 
607
  Returns list of chunks with metadata.
608
  """
609
  width, height = image.size
610
 
611
  # Determine optimal chunk size based on density and dimensions
 
612
  density = estimate_text_density(image)
613
 
614
+ if density > 0.40:
615
+ # Very dense - use smaller chunks for better accuracy
 
 
 
 
616
  chunk_size = 1600
617
+ elif density > 0.25:
618
+ # Moderate density
619
+ chunk_size = 2048
 
 
620
  else:
621
  # Lower density - can use larger chunks
622
+ chunk_size = 2800
623
+
624
+ overlap = 150 # Generous overlap to prevent text cutting
625
 
626
  chunks = []
627
  chunk_id = 0
 
793
  # Try to parse JSON output
794
  layout_data = json.loads(raw_output)
795
 
 
 
 
 
 
 
 
796
  # 🎯 INTELLIGENT CONFIDENCE SCORING
797
  # Count text regions to determine if per-region scoring is feasible
798
  num_text_regions = sum(1 for item in layout_data
 
1542
  share=False,
1543
  debug=True,
1544
  show_error=True
1545
+ )