Spaces:

GiantAnalytics
/

ArabicOCRExtractor

Running

App Files Files Community

GiantAnalytics committed on Mar 21, 2025

Commit

0e09218

verified ·

1 Parent(s): a7b5b52

Create app.py

Browse files

Files changed (1) hide show

app.py +154 -0

app.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import gradio as gr
+import easyocr
+import cv2
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont
+import os
+import requests
+from pathlib import Path
+import pandas as pd
+import pytesseract
+from pytesseract import Output
def get_font():
    """Return the path of the Roboto TTF file, downloading it on first use.

    The font is cached as ``Roboto-Regular.ttf`` in the current working
    directory. A download only happens when that file is missing or empty.

    Returns:
        The font file path as a string.

    Raises:
        requests.RequestException: if the download times out or the server
            answers with an HTTP error status. Nothing is cached in that case,
            so a later call will retry.
    """
    font_path = Path("Roboto-Regular.ttf")
    # A zero-byte file can only be left over from a failed write; treat it as
    # "not cached" rather than handing an unusable font to the caller.
    if font_path.exists() and font_path.stat().st_size > 0:
        return str(font_path)

    font_url = "https://github.com/google/fonts/raw/main/apache/roboto/Roboto-Regular.ttf"
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(font_url, timeout=30)
    # Refuse to cache an error page (e.g. a 404 body) as if it were the font.
    response.raise_for_status()

    # Write to a sibling file first and rename, so an interrupted write never
    # leaves a truncated file under the final cached name.
    partial_path = font_path.with_name(font_path.name + ".part")
    partial_path.write_bytes(response.content)
    partial_path.replace(font_path)
    return str(font_path)
# Module-level EasyOCR reader for French, created once at import time and
# reused by every OCR call. Pass gpu=False when no GPU is available.
reader = easyocr.Reader(lang_list=['fr'], gpu=True)
def _group_words_into_rows(words, row_tolerance=10):
    """Group ``(top, left, text)`` word tuples into rows of cell texts.

    Words are visited top-to-bottom. A new row starts whenever a word's
    ``top`` differs from the current row's first word by more than
    ``row_tolerance`` pixels. Each finished row is then ordered left-to-right.

    Returns:
        A list of rows, each a list of the words' texts.
    """
    rows = []
    current_row = []
    row_top = None
    for top, left, text in sorted(words, key=lambda w: (w[0], w[1])):
        if row_top is None or abs(top - row_top) > row_tolerance:
            if current_row:
                rows.append(current_row)
            current_row = []
            row_top = top
        current_row.append((left, text))
    if current_row:
        rows.append(current_row)
    # Words on the same visual line rarely share the exact same `top`, so the
    # (top, left) visiting order can scramble a row; restore reading order.
    return [
        [text for _, text in sorted(row, key=lambda cell: cell[0])]
        for row in rows
    ]


def _extract_tables(image, min_confidence=50, min_words=5):
    """Run Tesseract on ``image`` and return table candidates as DataFrames.

    Each Tesseract block (``block_num``) with at least ``min_words`` confident
    words is split into rows. Blocks that yield more than one row are returned
    as a DataFrame whose short rows are padded with empty strings.
    """
    tesseract_config = r'--oem 3 --psm 6 -l fra'  # French language
    words_df = pytesseract.image_to_data(
        image, output_type=Output.DATAFRAME, config=tesseract_config
    )
    words_df = words_df.dropna(subset=['text'])
    # Whitespace-only tokens survive dropna and would become phantom cells.
    words_df = words_df[words_df['text'].astype(str).str.strip() != '']
    words_df = words_df[words_df['conf'] > min_confidence]

    tables = []
    # sort=False keeps blocks in the order Tesseract reported them.
    for _, block_df in words_df.groupby('block_num', sort=False):
        if len(block_df) < min_words:
            continue
        rows = _group_words_into_rows(
            zip(block_df['top'], block_df['left'], block_df['text'])
        )
        if len(rows) > 1:
            width = max(len(row) for row in rows)
            padded_rows = [row + [''] * (width - len(row)) for row in rows]
            tables.append(pd.DataFrame(padded_rows))
    return tables


def _annotate_image(image, results):
    """Return a copy of ``image`` with each EasyOCR detection boxed and labelled."""
    pil_image = Image.fromarray(image)
    draw = ImageDraw.Draw(pil_image)
    # Best effort: fall back to PIL's built-in font if Roboto cannot be
    # obtained or loaded.
    try:
        font = ImageFont.truetype(get_font(), size=20)
    except Exception as e:
        print(f"Error loading font: {e}")
        font = ImageFont.load_default()

    for bbox, text, confidence in results:
        # Use the extent of all four corners: for a skewed quadrilateral,
        # corners 0 and 2 alone may not form a valid (x0<=x1, y0<=y1) box.
        xs = [int(point[0]) for point in bbox]
        ys = [int(point[1]) for point in bbox]
        top_left = (min(xs), min(ys))
        bottom_right = (max(xs), max(ys))
        draw.rectangle([top_left, bottom_right], outline="red", width=3)
        draw.text(top_left, f"{text} ({confidence:.2f})", fill="blue", font=font)
    return np.array(pil_image)


def ocr_extract_text_and_tables(image):
    """Extract text and table candidates from an image and annotate it.

    Args:
        image: The picture as a numpy array, or ``None`` when nothing was
            uploaded. A 4-channel array is converted from RGBA to RGB first.

    Returns:
        A ``(text_output, tables_output, annotated_image)`` tuple:
        one ``"<text> (Confidence: <score>)"`` line per EasyOCR detection,
        the table candidates rendered as ``"Table <n>:"`` sections, and a
        numpy array of the image with detections drawn on it. When ``image``
        is ``None`` the tuple is ``("No image provided", None, None)``.
    """
    if image is None:
        return "No image provided", None, None

    if image.ndim == 3 and image.shape[2] == 4:  # RGBA
        image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)

    # 1. General text with EasyOCR (module-level `reader`).
    results = reader.readtext(image)
    text_output = "\n".join(
        f"{text} (Confidence: {confidence:.2f})"
        for _, text, confidence in results
    )

    # 2. Table candidates from Tesseract's word-level layout data.
    tables = _extract_tables(image)
    tables_output = "".join(
        f"Table {number}:\n"
        + table.to_string(index=False, header=False)
        + "\n\n"
        for number, table in enumerate(tables, start=1)
    )

    # 3. Visual overlay of the EasyOCR detections.
    annotated_image = _annotate_image(image, results)
    return text_output, tables_output, annotated_image
# Gradio UI: one uploaded image in; OCR text, table text and the annotated
# image out.
_image_input = gr.Image(type="numpy", label="Upload Image")
_result_outputs = [
    gr.Textbox(label="Extracted Text (French)", elem_classes=["output-text"]),
    gr.Textbox(label="Extracted Tables", elem_classes=["output-text"]),
    gr.Image(label="Annotated Image"),
]

iface = gr.Interface(
    fn=ocr_extract_text_and_tables,
    inputs=_image_input,
    outputs=_result_outputs,
    title="French OCR & Table Extractor",
    description=(
        "Upload an image containing French text and tables for OCR processing. "
        "The system will detect and extract both regular text and tabular data."
    ),
    examples=[],  # Example images can be listed here
    cache_examples=True,
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()