VanguardAI committed on
Commit
38e40e9
·
verified ·
1 Parent(s): f52a750

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -8
app.py CHANGED
@@ -500,7 +500,18 @@ def _generate_text_and_confidence_for_crop(
500
  {"type": "image", "image": image},
501
  {
502
  "type": "text",
503
- "text": "Extract the exact Arabic text from this image. This may be handwritten or typed Arabic text. Output ONLY the Arabic text you see, with no translation, no English, no additional words or explanations. If there is no text, output nothing.",
 
 
 
 
 
 
 
 
 
 
 
504
  },
505
  ],
506
  }
@@ -636,8 +647,8 @@ def estimate_text_density(image: Image.Image) -> float:
636
  def split_text_regions_into_lines(
637
  image: Image.Image,
638
  layout_data: List[Dict[str, Any]],
639
- min_line_height: int = 30,
640
- max_line_height: int = 120
641
  ) -> List[Dict[str, Any]]:
642
  """
643
  Post-process layout data to split large text regions into individual lines.
@@ -668,9 +679,13 @@ def split_text_regions_into_lines(
668
 
669
  x1, y1, x2, y2 = bbox
670
  height = y2 - y1
 
 
 
671
 
672
  # If region is tall enough to contain multiple lines, split it
673
  if height > max_line_height:
 
674
  # Estimate number of lines based on typical line height
675
  # Arabic handwritten text: ~40-60px per line
676
  # Arabic typed text: ~30-50px per line
@@ -950,10 +965,20 @@ def process_image(
950
 
951
  # πŸ“ LINE-LEVEL SPLITTING: Split large text regions into individual lines
952
  # This ensures each line gets its own bounding box for easier verification
 
 
 
 
 
 
 
953
  try:
 
954
  layout_data = split_text_regions_into_lines(image, layout_data)
 
955
  except Exception as e:
956
  print(f"⚠️ Warning: Could not split text regions: {e}")
 
957
  # Continue with original layout data
958
 
959
  # πŸ”„ RE-OCR SPLIT LINES: For split regions, perform per-line OCR
@@ -979,15 +1004,43 @@ def process_image(
979
 
980
  # Re-OCR this specific line
981
  line_text, line_conf = _generate_text_and_confidence_for_crop(crop_img)
982
- item['text'] = line_text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
983
  item['confidence'] = line_conf
984
  item['reocr_completed'] = True
985
 
986
- print(f" Line {idx+1}/{len(regions_needing_reocr)}: '{line_text[:50]}...' ({line_conf:.1f}%)")
987
  except Exception as e:
988
- print(f"⚠️ Error re-OCRing line {idx}: {e}")
 
989
  item['text'] = "[OCR Failed]"
990
  item['confidence'] = 0.0
 
 
 
 
 
 
 
991
 
992
  # 🎯 INTELLIGENT CONFIDENCE SCORING
993
  # Count text regions to determine if per-region scoring is feasible
@@ -1050,9 +1103,10 @@ def process_image(
1050
 
1051
  # ✨ ARABIC TEXT CORRECTION: Apply intelligent correction to each text region
1052
  try:
1053
- print("πŸ”§ Applying Arabic text correction...")
1054
  corrector = get_corrector()
1055
 
 
1056
  for idx, item in enumerate(layout_data):
1057
  text_content = item.get('text', '')
1058
  category = item.get('category', '')
@@ -1061,6 +1115,8 @@ def process_image(
1061
  if not text_content or category in ['Picture', 'Formula', 'Table']:
1062
  continue
1063
 
 
 
1064
  # Apply correction
1065
  correction_result = corrector.correct_text(text_content)
1066
 
@@ -1073,13 +1129,17 @@ def process_image(
1073
 
1074
  # Update the text field to use corrected version
1075
  item['text'] = correction_result['corrected']
 
 
 
 
1076
 
1077
  # Regenerate markdown with corrected text
1078
  corrected_markdown = layoutjson2md(image, layout_data, text_key='text')
1079
  result['markdown_content_corrected'] = corrected_markdown
1080
  result['markdown_content_original'] = markdown_content
1081
 
1082
- print(f"βœ… Correction complete")
1083
 
1084
  except Exception as e:
1085
  print(f"⚠️ Error during Arabic correction: {e}")
 
500
  {"type": "image", "image": image},
501
  {
502
  "type": "text",
503
+ "text": """Extract ONLY the Arabic text from this image line.
504
+
505
+ STRICT RULES:
506
+ - Output ONLY Arabic characters you see in the image
507
+ - NO English words whatsoever (no 'Commission', 'Text', etc.)
508
+ - NO translations
509
+ - NO explanations
510
+ - NO additional text
511
+ - If you see handwriting, transcribe it exactly
512
+ - If there is no Arabic text, output nothing
513
+
514
+ Extract the Arabic text now:""",
515
  },
516
  ],
517
  }
 
647
  def split_text_regions_into_lines(
648
  image: Image.Image,
649
  layout_data: List[Dict[str, Any]],
650
+ min_line_height: int = 25,
651
+ max_line_height: int = 80 # More aggressive - split anything taller than ~2 lines
652
  ) -> List[Dict[str, Any]]:
653
  """
654
  Post-process layout data to split large text regions into individual lines.
 
679
 
680
  x1, y1, x2, y2 = bbox
681
  height = y2 - y1
682
+ width = x2 - x1
683
+
684
+ print(f" Checking region: height={height}px, width={width}px, category={category}")
685
 
686
  # If region is tall enough to contain multiple lines, split it
687
  if height > max_line_height:
688
+ print(f" β†’ Splitting! (height {height}px > threshold {max_line_height}px)")
689
  # Estimate number of lines based on typical line height
690
  # Arabic handwritten text: ~40-60px per line
691
  # Arabic typed text: ~30-50px per line
 
965
 
966
  # πŸ“ LINE-LEVEL SPLITTING: Split large text regions into individual lines
967
  # This ensures each line gets its own bounding box for easier verification
968
+ print(f"\nπŸ“‹ Initial layout: {len(layout_data)} regions detected")
969
+ for idx, item in enumerate(layout_data):
970
+ bbox = item.get('bbox', [])
971
+ text = item.get('text', '')[:50]
972
+ cat = item.get('category', '')
973
+ print(f" Region {idx+1}: {cat} - '{text}...' bbox={bbox}")
974
+
975
  try:
976
+ layout_data_before = len(layout_data)
977
  layout_data = split_text_regions_into_lines(image, layout_data)
978
+ print(f"πŸ“ After splitting: {layout_data_before} β†’ {len(layout_data)} regions")
979
  except Exception as e:
980
  print(f"⚠️ Warning: Could not split text regions: {e}")
981
+ traceback.print_exc()
982
  # Continue with original layout data
983
 
984
  # πŸ”„ RE-OCR SPLIT LINES: For split regions, perform per-line OCR
 
1004
 
1005
  # Re-OCR this specific line
1006
  line_text, line_conf = _generate_text_and_confidence_for_crop(crop_img)
1007
+
1008
+ # AGGRESSIVE FILTERING: Remove any English words/hallucinations
1009
+ line_text = line_text.strip()
1010
+
1011
+ # Remove common English hallucinations
1012
+ english_hallucinations = [
1013
+ 'Commission', 'commission', 'COMMISSION',
1014
+ 'The', 'the', 'and', 'or', 'of', 'in', 'to', 'a', 'is',
1015
+ 'Text', 'text', 'Title', 'title', 'Caption', 'caption',
1016
+ 'Page', 'page', 'Document', 'document', 'Image', 'image'
1017
+ ]
1018
+
1019
+ for hallucination in english_hallucinations:
1020
+ line_text = line_text.replace(hallucination, '').strip()
1021
+
1022
+ # Remove any remaining Latin alphabet (keep only Arabic, numbers, punctuation)
1023
+ import re
1024
+ # Keep: Arabic letters, Arabic numbers, spaces, basic punctuation
1025
+ line_text = re.sub(r'[a-zA-Z]+', '', line_text).strip()
1026
+
1027
+ item['text'] = line_text
1028
  item['confidence'] = line_conf
1029
  item['reocr_completed'] = True
1030
 
1031
+ print(f" βœ“ Line {idx+1}/{len(regions_needing_reocr)}: '{line_text[:50]}...' (conf: {line_conf:.1f}%)")
1032
  except Exception as e:
1033
+ print(f" βœ— Error re-OCRing line {idx}: {e}")
1034
+ traceback.print_exc()
1035
  item['text'] = "[OCR Failed]"
1036
  item['confidence'] = 0.0
1037
+
1038
+ print(f"\nβœ… Re-OCR complete. Final layout has {len(layout_data)} regions:")
1039
+ for idx, item in enumerate(layout_data):
1040
+ text = item.get('text', '')[:50]
1041
+ conf = item.get('confidence', 0)
1042
+ reocr = item.get('reocr_completed', False)
1043
+ print(f" Region {idx+1}: '{text}...' (conf={conf:.1f}%, re-OCR={reocr})")
1044
 
1045
  # 🎯 INTELLIGENT CONFIDENCE SCORING
1046
  # Count text regions to determine if per-region scoring is feasible
 
1103
 
1104
  # ✨ ARABIC TEXT CORRECTION: Apply intelligent correction to each text region
1105
  try:
1106
+ print(f"\nπŸ”§ Applying Arabic text correction to {len(layout_data)} regions...")
1107
  corrector = get_corrector()
1108
 
1109
+ corrections_applied = 0
1110
  for idx, item in enumerate(layout_data):
1111
  text_content = item.get('text', '')
1112
  category = item.get('category', '')
 
1115
  if not text_content or category in ['Picture', 'Formula', 'Table']:
1116
  continue
1117
 
1118
+ print(f" Correcting region {idx+1}: '{text_content[:40]}...'")
1119
+
1120
  # Apply correction
1121
  correction_result = corrector.correct_text(text_content)
1122
 
 
1129
 
1130
  # Update the text field to use corrected version
1131
  item['text'] = correction_result['corrected']
1132
+
1133
+ if correction_result['corrections_made'] > 0:
1134
+ corrections_applied += correction_result['corrections_made']
1135
+ print(f" β†’ Made {correction_result['corrections_made']} corrections")
1136
 
1137
  # Regenerate markdown with corrected text
1138
  corrected_markdown = layoutjson2md(image, layout_data, text_key='text')
1139
  result['markdown_content_corrected'] = corrected_markdown
1140
  result['markdown_content_original'] = markdown_content
1141
 
1142
+ print(f"βœ… Correction complete: {corrections_applied} total corrections made across {len(layout_data)} regions")
1143
 
1144
  except Exception as e:
1145
  print(f"⚠️ Error during Arabic correction: {e}")