VanguardAI committed on
Commit
38e40e9
·
verified ·
1 Parent(s): f52a750

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -8
app.py CHANGED
@@ -500,7 +500,18 @@ def _generate_text_and_confidence_for_crop(
500
  {"type": "image", "image": image},
501
  {
502
  "type": "text",
503
- "text": "Extract the exact Arabic text from this image. This may be handwritten or typed Arabic text. Output ONLY the Arabic text you see, with no translation, no English, no additional words or explanations. If there is no text, output nothing.",
 
 
 
 
 
 
 
 
 
 
 
504
  },
505
  ],
506
  }
@@ -636,8 +647,8 @@ def estimate_text_density(image: Image.Image) -> float:
636
  def split_text_regions_into_lines(
637
  image: Image.Image,
638
  layout_data: List[Dict[str, Any]],
639
- min_line_height: int = 30,
640
- max_line_height: int = 120
641
  ) -> List[Dict[str, Any]]:
642
  """
643
  Post-process layout data to split large text regions into individual lines.
@@ -668,9 +679,13 @@ def split_text_regions_into_lines(
668
 
669
  x1, y1, x2, y2 = bbox
670
  height = y2 - y1
 
 
 
671
 
672
  # If region is tall enough to contain multiple lines, split it
673
  if height > max_line_height:
 
674
  # Estimate number of lines based on typical line height
675
  # Arabic handwritten text: ~40-60px per line
676
  # Arabic typed text: ~30-50px per line
@@ -950,10 +965,20 @@ def process_image(
950
 
951
  # πŸ“ LINE-LEVEL SPLITTING: Split large text regions into individual lines
952
  # This ensures each line gets its own bounding box for easier verification
 
 
 
 
 
 
 
953
  try:
 
954
  layout_data = split_text_regions_into_lines(image, layout_data)
 
955
  except Exception as e:
956
  print(f"⚠️ Warning: Could not split text regions: {e}")
 
957
  # Continue with original layout data
958
 
959
  # πŸ”„ RE-OCR SPLIT LINES: For split regions, perform per-line OCR
@@ -979,15 +1004,43 @@ def process_image(
979
 
980
  # Re-OCR this specific line
981
  line_text, line_conf = _generate_text_and_confidence_for_crop(crop_img)
982
- item['text'] = line_text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
983
  item['confidence'] = line_conf
984
  item['reocr_completed'] = True
985
 
986
- print(f" Line {idx+1}/{len(regions_needing_reocr)}: '{line_text[:50]}...' ({line_conf:.1f}%)")
987
  except Exception as e:
988
- print(f"⚠️ Error re-OCRing line {idx}: {e}")
 
989
  item['text'] = "[OCR Failed]"
990
  item['confidence'] = 0.0
 
 
 
 
 
 
 
991
 
992
  # 🎯 INTELLIGENT CONFIDENCE SCORING
993
  # Count text regions to determine if per-region scoring is feasible
@@ -1050,9 +1103,10 @@ def process_image(
1050
 
1051
  # ✨ ARABIC TEXT CORRECTION: Apply intelligent correction to each text region
1052
  try:
1053
- print("πŸ”§ Applying Arabic text correction...")
1054
  corrector = get_corrector()
1055
 
 
1056
  for idx, item in enumerate(layout_data):
1057
  text_content = item.get('text', '')
1058
  category = item.get('category', '')
@@ -1061,6 +1115,8 @@ def process_image(
1061
  if not text_content or category in ['Picture', 'Formula', 'Table']:
1062
  continue
1063
 
 
 
1064
  # Apply correction
1065
  correction_result = corrector.correct_text(text_content)
1066
 
@@ -1073,13 +1129,17 @@ def process_image(
1073
 
1074
  # Update the text field to use corrected version
1075
  item['text'] = correction_result['corrected']
 
 
 
 
1076
 
1077
  # Regenerate markdown with corrected text
1078
  corrected_markdown = layoutjson2md(image, layout_data, text_key='text')
1079
  result['markdown_content_corrected'] = corrected_markdown
1080
  result['markdown_content_original'] = markdown_content
1081
 
1082
- print(f"βœ… Correction complete")
1083
 
1084
  except Exception as e:
1085
  print(f"⚠️ Error during Arabic correction: {e}")
 
500
  {"type": "image", "image": image},
501
  {
502
  "type": "text",
503
+ "text": """Extract ONLY the Arabic text from this image line.
504
+
505
+ STRICT RULES:
506
+ - Output ONLY Arabic characters you see in the image
507
+ - NO English words whatsoever (no 'Commission', 'Text', etc.)
508
+ - NO translations
509
+ - NO explanations
510
+ - NO additional text
511
+ - If you see handwriting, transcribe it exactly
512
+ - If there is no Arabic text, output nothing
513
+
514
+ Extract the Arabic text now:""",
515
  },
516
  ],
517
  }
 
647
  def split_text_regions_into_lines(
648
  image: Image.Image,
649
  layout_data: List[Dict[str, Any]],
650
+ min_line_height: int = 25,
651
+ max_line_height: int = 80 # More aggressive - split anything taller than ~2 lines
652
  ) -> List[Dict[str, Any]]:
653
  """
654
  Post-process layout data to split large text regions into individual lines.
 
679
 
680
  x1, y1, x2, y2 = bbox
681
  height = y2 - y1
682
+ width = x2 - x1
683
+
684
+ print(f" Checking region: height={height}px, width={width}px, category={category}")
685
 
686
  # If region is tall enough to contain multiple lines, split it
687
  if height > max_line_height:
688
+ print(f" β†’ Splitting! (height {height}px > threshold {max_line_height}px)")
689
  # Estimate number of lines based on typical line height
690
  # Arabic handwritten text: ~40-60px per line
691
  # Arabic typed text: ~30-50px per line
 
965
 
966
  # πŸ“ LINE-LEVEL SPLITTING: Split large text regions into individual lines
967
  # This ensures each line gets its own bounding box for easier verification
968
+ print(f"\nπŸ“‹ Initial layout: {len(layout_data)} regions detected")
969
+ for idx, item in enumerate(layout_data):
970
+ bbox = item.get('bbox', [])
971
+ text = item.get('text', '')[:50]
972
+ cat = item.get('category', '')
973
+ print(f" Region {idx+1}: {cat} - '{text}...' bbox={bbox}")
974
+
975
  try:
976
+ layout_data_before = len(layout_data)
977
  layout_data = split_text_regions_into_lines(image, layout_data)
978
+ print(f"πŸ“ After splitting: {layout_data_before} β†’ {len(layout_data)} regions")
979
  except Exception as e:
980
  print(f"⚠️ Warning: Could not split text regions: {e}")
981
+ traceback.print_exc()
982
  # Continue with original layout data
983
 
984
  # πŸ”„ RE-OCR SPLIT LINES: For split regions, perform per-line OCR
 
1004
 
1005
  # Re-OCR this specific line
1006
  line_text, line_conf = _generate_text_and_confidence_for_crop(crop_img)
1007
+
1008
+ # AGGRESSIVE FILTERING: Remove any English words/hallucinations
1009
+ line_text = line_text.strip()
1010
+
1011
+ # Remove common English hallucinations
1012
+ english_hallucinations = [
1013
+ 'Commission', 'commission', 'COMMISSION',
1014
+ 'The', 'the', 'and', 'or', 'of', 'in', 'to', 'a', 'is',
1015
+ 'Text', 'text', 'Title', 'title', 'Caption', 'caption',
1016
+ 'Page', 'page', 'Document', 'document', 'Image', 'image'
1017
+ ]
1018
+
1019
+ for hallucination in english_hallucinations:
1020
+ line_text = line_text.replace(hallucination, '').strip()
1021
+
1022
+ # Remove any remaining Latin alphabet (keep only Arabic, numbers, punctuation)
1023
+ import re
1024
+ # Keep: Arabic letters, Arabic numbers, spaces, basic punctuation
1025
+ line_text = re.sub(r'[a-zA-Z]+', '', line_text).strip()
1026
+
1027
+ item['text'] = line_text
1028
  item['confidence'] = line_conf
1029
  item['reocr_completed'] = True
1030
 
1031
+ print(f" βœ“ Line {idx+1}/{len(regions_needing_reocr)}: '{line_text[:50]}...' (conf: {line_conf:.1f}%)")
1032
  except Exception as e:
1033
+ print(f" βœ— Error re-OCRing line {idx}: {e}")
1034
+ traceback.print_exc()
1035
  item['text'] = "[OCR Failed]"
1036
  item['confidence'] = 0.0
1037
+
1038
+ print(f"\nβœ… Re-OCR complete. Final layout has {len(layout_data)} regions:")
1039
+ for idx, item in enumerate(layout_data):
1040
+ text = item.get('text', '')[:50]
1041
+ conf = item.get('confidence', 0)
1042
+ reocr = item.get('reocr_completed', False)
1043
+ print(f" Region {idx+1}: '{text}...' (conf={conf:.1f}%, re-OCR={reocr})")
1044
 
1045
  # 🎯 INTELLIGENT CONFIDENCE SCORING
1046
  # Count text regions to determine if per-region scoring is feasible
 
1103
 
1104
  # ✨ ARABIC TEXT CORRECTION: Apply intelligent correction to each text region
1105
  try:
1106
+ print(f"\nπŸ”§ Applying Arabic text correction to {len(layout_data)} regions...")
1107
  corrector = get_corrector()
1108
 
1109
+ corrections_applied = 0
1110
  for idx, item in enumerate(layout_data):
1111
  text_content = item.get('text', '')
1112
  category = item.get('category', '')
 
1115
  if not text_content or category in ['Picture', 'Formula', 'Table']:
1116
  continue
1117
 
1118
+ print(f" Correcting region {idx+1}: '{text_content[:40]}...'")
1119
+
1120
  # Apply correction
1121
  correction_result = corrector.correct_text(text_content)
1122
 
 
1129
 
1130
  # Update the text field to use corrected version
1131
  item['text'] = correction_result['corrected']
1132
+
1133
+ if correction_result['corrections_made'] > 0:
1134
+ corrections_applied += correction_result['corrections_made']
1135
+ print(f" β†’ Made {correction_result['corrections_made']} corrections")
1136
 
1137
  # Regenerate markdown with corrected text
1138
  corrected_markdown = layoutjson2md(image, layout_data, text_key='text')
1139
  result['markdown_content_corrected'] = corrected_markdown
1140
  result['markdown_content_original'] = markdown_content
1141
 
1142
+ print(f"βœ… Correction complete: {corrections_applied} total corrections made across {len(layout_data)} regions")
1143
 
1144
  except Exception as e:
1145
  print(f"⚠️ Error during Arabic correction: {e}")