Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -500,7 +500,18 @@ def _generate_text_and_confidence_for_crop(
|
|
| 500 |
{"type": "image", "image": image},
|
| 501 |
{
|
| 502 |
"type": "text",
|
| 503 |
-
"text": "Extract the
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 504 |
},
|
| 505 |
],
|
| 506 |
}
|
|
@@ -636,8 +647,8 @@ def estimate_text_density(image: Image.Image) -> float:
|
|
| 636 |
def split_text_regions_into_lines(
|
| 637 |
image: Image.Image,
|
| 638 |
layout_data: List[Dict[str, Any]],
|
| 639 |
-
min_line_height: int =
|
| 640 |
-
max_line_height: int =
|
| 641 |
) -> List[Dict[str, Any]]:
|
| 642 |
"""
|
| 643 |
Post-process layout data to split large text regions into individual lines.
|
|
@@ -668,9 +679,13 @@ def split_text_regions_into_lines(
|
|
| 668 |
|
| 669 |
x1, y1, x2, y2 = bbox
|
| 670 |
height = y2 - y1
|
|
|
|
|
|
|
|
|
|
| 671 |
|
| 672 |
# If region is tall enough to contain multiple lines, split it
|
| 673 |
if height > max_line_height:
|
|
|
|
| 674 |
# Estimate number of lines based on typical line height
|
| 675 |
# Arabic handwritten text: ~40-60px per line
|
| 676 |
# Arabic typed text: ~30-50px per line
|
|
@@ -950,10 +965,20 @@ def process_image(
|
|
| 950 |
|
| 951 |
# π LINE-LEVEL SPLITTING: Split large text regions into individual lines
|
| 952 |
# This ensures each line gets its own bounding box for easier verification
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 953 |
try:
|
|
|
|
| 954 |
layout_data = split_text_regions_into_lines(image, layout_data)
|
|
|
|
| 955 |
except Exception as e:
|
| 956 |
print(f"⚠️ Warning: Could not split text regions: {e}")
|
|
|
|
| 957 |
# Continue with original layout data
|
| 958 |
|
| 959 |
# π RE-OCR SPLIT LINES: For split regions, perform per-line OCR
|
|
@@ -979,15 +1004,43 @@ def process_image(
|
|
| 979 |
|
| 980 |
# Re-OCR this specific line
|
| 981 |
line_text, line_conf = _generate_text_and_confidence_for_crop(crop_img)
|
| 982 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 983 |
item['confidence'] = line_conf
|
| 984 |
item['reocr_completed'] = True
|
| 985 |
|
| 986 |
-
print(f" Line {idx+1}/{len(regions_needing_reocr)}: '{line_text[:50]}...' ({line_conf:.1f}%)")
|
| 987 |
except Exception as e:
|
| 988 |
-
print(f"
|
|
|
|
| 989 |
item['text'] = "[OCR Failed]"
|
| 990 |
item['confidence'] = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 991 |
|
| 992 |
# 🎯 INTELLIGENT CONFIDENCE SCORING
|
| 993 |
# Count text regions to determine if per-region scoring is feasible
|
|
@@ -1050,9 +1103,10 @@ def process_image(
|
|
| 1050 |
|
| 1051 |
# ✨ ARABIC TEXT CORRECTION: Apply intelligent correction to each text region
|
| 1052 |
try:
|
| 1053 |
-
print("🔧 Applying Arabic text correction...")
|
| 1054 |
corrector = get_corrector()
|
| 1055 |
|
|
|
|
| 1056 |
for idx, item in enumerate(layout_data):
|
| 1057 |
text_content = item.get('text', '')
|
| 1058 |
category = item.get('category', '')
|
|
@@ -1061,6 +1115,8 @@ def process_image(
|
|
| 1061 |
if not text_content or category in ['Picture', 'Formula', 'Table']:
|
| 1062 |
continue
|
| 1063 |
|
|
|
|
|
|
|
| 1064 |
# Apply correction
|
| 1065 |
correction_result = corrector.correct_text(text_content)
|
| 1066 |
|
|
@@ -1073,13 +1129,17 @@ def process_image(
|
|
| 1073 |
|
| 1074 |
# Update the text field to use corrected version
|
| 1075 |
item['text'] = correction_result['corrected']
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1076 |
|
| 1077 |
# Regenerate markdown with corrected text
|
| 1078 |
corrected_markdown = layoutjson2md(image, layout_data, text_key='text')
|
| 1079 |
result['markdown_content_corrected'] = corrected_markdown
|
| 1080 |
result['markdown_content_original'] = markdown_content
|
| 1081 |
|
| 1082 |
-
print(f"✅ Correction complete")
|
| 1083 |
|
| 1084 |
except Exception as e:
|
| 1085 |
print(f"⚠️ Error during Arabic correction: {e}")
|
|
|
|
| 500 |
{"type": "image", "image": image},
|
| 501 |
{
|
| 502 |
"type": "text",
|
| 503 |
+
"text": """Extract ONLY the Arabic text from this image line.
|
| 504 |
+
|
| 505 |
+
STRICT RULES:
|
| 506 |
+
- Output ONLY Arabic characters you see in the image
|
| 507 |
+
- NO English words whatsoever (no 'Commission', 'Text', etc.)
|
| 508 |
+
- NO translations
|
| 509 |
+
- NO explanations
|
| 510 |
+
- NO additional text
|
| 511 |
+
- If you see handwriting, transcribe it exactly
|
| 512 |
+
- If there is no Arabic text, output nothing
|
| 513 |
+
|
| 514 |
+
Extract the Arabic text now:""",
|
| 515 |
},
|
| 516 |
],
|
| 517 |
}
|
|
|
|
| 647 |
def split_text_regions_into_lines(
|
| 648 |
image: Image.Image,
|
| 649 |
layout_data: List[Dict[str, Any]],
|
| 650 |
+
min_line_height: int = 25,
|
| 651 |
+
max_line_height: int = 80 # More aggressive - split anything taller than ~2 lines
|
| 652 |
) -> List[Dict[str, Any]]:
|
| 653 |
"""
|
| 654 |
Post-process layout data to split large text regions into individual lines.
|
|
|
|
| 679 |
|
| 680 |
x1, y1, x2, y2 = bbox
|
| 681 |
height = y2 - y1
|
| 682 |
+
width = x2 - x1
|
| 683 |
+
|
| 684 |
+
print(f" Checking region: height={height}px, width={width}px, category={category}")
|
| 685 |
|
| 686 |
# If region is tall enough to contain multiple lines, split it
|
| 687 |
if height > max_line_height:
|
| 688 |
+
print(f" β Splitting! (height {height}px > threshold {max_line_height}px)")
|
| 689 |
# Estimate number of lines based on typical line height
|
| 690 |
# Arabic handwritten text: ~40-60px per line
|
| 691 |
# Arabic typed text: ~30-50px per line
|
|
|
|
| 965 |
|
| 966 |
# π LINE-LEVEL SPLITTING: Split large text regions into individual lines
|
| 967 |
# This ensures each line gets its own bounding box for easier verification
|
| 968 |
+
print(f"\nπ Initial layout: {len(layout_data)} regions detected")
|
| 969 |
+
for idx, item in enumerate(layout_data):
|
| 970 |
+
bbox = item.get('bbox', [])
|
| 971 |
+
text = item.get('text', '')[:50]
|
| 972 |
+
cat = item.get('category', '')
|
| 973 |
+
print(f" Region {idx+1}: {cat} - '{text}...' bbox={bbox}")
|
| 974 |
+
|
| 975 |
try:
|
| 976 |
+
layout_data_before = len(layout_data)
|
| 977 |
layout_data = split_text_regions_into_lines(image, layout_data)
|
| 978 |
+
print(f"π After splitting: {layout_data_before} β {len(layout_data)} regions")
|
| 979 |
except Exception as e:
|
| 980 |
print(f"⚠️ Warning: Could not split text regions: {e}")
|
| 981 |
+
traceback.print_exc()
|
| 982 |
# Continue with original layout data
|
| 983 |
|
| 984 |
# π RE-OCR SPLIT LINES: For split regions, perform per-line OCR
|
|
|
|
| 1004 |
|
| 1005 |
# Re-OCR this specific line
|
| 1006 |
line_text, line_conf = _generate_text_and_confidence_for_crop(crop_img)
|
| 1007 |
+
|
| 1008 |
+
# AGGRESSIVE FILTERING: Remove any English words/hallucinations
|
| 1009 |
+
line_text = line_text.strip()
|
| 1010 |
+
|
| 1011 |
+
# Remove common English hallucinations
|
| 1012 |
+
english_hallucinations = [
|
| 1013 |
+
'Commission', 'commission', 'COMMISSION',
|
| 1014 |
+
'The', 'the', 'and', 'or', 'of', 'in', 'to', 'a', 'is',
|
| 1015 |
+
'Text', 'text', 'Title', 'title', 'Caption', 'caption',
|
| 1016 |
+
'Page', 'page', 'Document', 'document', 'Image', 'image'
|
| 1017 |
+
]
|
| 1018 |
+
|
| 1019 |
+
for hallucination in english_hallucinations:
|
| 1020 |
+
line_text = line_text.replace(hallucination, '').strip()
|
| 1021 |
+
|
| 1022 |
+
# Remove any remaining Latin alphabet (keep only Arabic, numbers, punctuation)
|
| 1023 |
+
import re
|
| 1024 |
+
# Keep: Arabic letters, Arabic numbers, spaces, basic punctuation
|
| 1025 |
+
line_text = re.sub(r'[a-zA-Z]+', '', line_text).strip()
|
| 1026 |
+
|
| 1027 |
+
item['text'] = line_text
|
| 1028 |
item['confidence'] = line_conf
|
| 1029 |
item['reocr_completed'] = True
|
| 1030 |
|
| 1031 |
+
print(f" β Line {idx+1}/{len(regions_needing_reocr)}: '{line_text[:50]}...' (conf: {line_conf:.1f}%)")
|
| 1032 |
except Exception as e:
|
| 1033 |
+
print(f" β Error re-OCRing line {idx}: {e}")
|
| 1034 |
+
traceback.print_exc()
|
| 1035 |
item['text'] = "[OCR Failed]"
|
| 1036 |
item['confidence'] = 0.0
|
| 1037 |
+
|
| 1038 |
+
print(f"\n✅ Re-OCR complete. Final layout has {len(layout_data)} regions:")
|
| 1039 |
+
for idx, item in enumerate(layout_data):
|
| 1040 |
+
text = item.get('text', '')[:50]
|
| 1041 |
+
conf = item.get('confidence', 0)
|
| 1042 |
+
reocr = item.get('reocr_completed', False)
|
| 1043 |
+
print(f" Region {idx+1}: '{text}...' (conf={conf:.1f}%, re-OCR={reocr})")
|
| 1044 |
|
| 1045 |
# 🎯 INTELLIGENT CONFIDENCE SCORING
|
| 1046 |
# Count text regions to determine if per-region scoring is feasible
|
|
|
|
| 1103 |
|
| 1104 |
# ✨ ARABIC TEXT CORRECTION: Apply intelligent correction to each text region
|
| 1105 |
try:
|
| 1106 |
+
print(f"\n🔧 Applying Arabic text correction to {len(layout_data)} regions...")
|
| 1107 |
corrector = get_corrector()
|
| 1108 |
|
| 1109 |
+
corrections_applied = 0
|
| 1110 |
for idx, item in enumerate(layout_data):
|
| 1111 |
text_content = item.get('text', '')
|
| 1112 |
category = item.get('category', '')
|
|
|
|
| 1115 |
if not text_content or category in ['Picture', 'Formula', 'Table']:
|
| 1116 |
continue
|
| 1117 |
|
| 1118 |
+
print(f" Correcting region {idx+1}: '{text_content[:40]}...'")
|
| 1119 |
+
|
| 1120 |
# Apply correction
|
| 1121 |
correction_result = corrector.correct_text(text_content)
|
| 1122 |
|
|
|
|
| 1129 |
|
| 1130 |
# Update the text field to use corrected version
|
| 1131 |
item['text'] = correction_result['corrected']
|
| 1132 |
+
|
| 1133 |
+
if correction_result['corrections_made'] > 0:
|
| 1134 |
+
corrections_applied += correction_result['corrections_made']
|
| 1135 |
+
print(f" β Made {correction_result['corrections_made']} corrections")
|
| 1136 |
|
| 1137 |
# Regenerate markdown with corrected text
|
| 1138 |
corrected_markdown = layoutjson2md(image, layout_data, text_key='text')
|
| 1139 |
result['markdown_content_corrected'] = corrected_markdown
|
| 1140 |
result['markdown_content_original'] = markdown_content
|
| 1141 |
|
| 1142 |
+
print(f"✅ Correction complete: {corrections_applied} total corrections made across {len(layout_data)} regions")
|
| 1143 |
|
| 1144 |
except Exception as e:
|
| 1145 |
print(f"⚠️ Error during Arabic correction: {e}")
|