Spaces:

GiantAnalytics
/

ArabicOCRExtractor

Sleeping

File size: 4,724 Bytes

import gradio as gr
import easyocr
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import os
import requests
from pathlib import Path
import pandas as pd
import pytesseract
from pytesseract import Output
import traceback
import logging
import sys
from img2table.document import Image as Img2TableImage
from img2table.ocr import TesseractOCR
import pytesseract
import os

# Point pytesseract at the system-wide Tesseract executable.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# NOTE: the traineddata location can be pinned through the TESSDATA_PREFIX
# environment variable (e.g. "/usr/share/tesseract-ocr/4.00/tessdata/");
# it is intentionally left unset here.

# img2table OCR backend configured for French ("fra").
ocr = TesseractOCR(lang="fra")

# Send INFO-and-above records to stdout with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

# Download and cache the font file
def get_font():
    """Return the path of the annotation font, downloading it if necessary.

    The font is cached as ``Roboto-Regular.ttf`` in the current working
    directory; an existing file is reused without any network access.

    Returns:
        The font path as a string, or ``None`` when the font could not be
        downloaded or written (the failure is logged with its traceback).
    """
    font_path = Path("Roboto-Regular.ttf")
    font_url = "https://github.com/google/fonts/raw/main/apache/roboto/Roboto-Regular.ttf"
    try:
        if not font_path.exists():
            # A timeout keeps a stalled connection from hanging the caller.
            response = requests.get(font_url, timeout=10)
            # Without this an HTTP error page would be cached as the "font".
            response.raise_for_status()
            # Write to a sibling file first so that an interrupted write never
            # leaves a truncated font that later calls would treat as valid.
            partial_path = font_path.with_name(font_path.name + ".part")
            partial_path.write_bytes(response.content)
            partial_path.replace(font_path)
        return str(font_path)
    except (requests.RequestException, OSError):
        logger.exception("Error in get_font")
        return None

# Initialize EasyOCR Reader for French & English (CPU only).
# `reader` is always bound: it stays None when initialisation fails, so later
# code sees a well-defined value instead of an undefined name.
reader = None
try:
    reader = easyocr.Reader(['fr', 'en'], gpu=False)
except Exception:
    # Import-time boundary: log the full traceback and keep the module loadable.
    logger.exception("Error initializing EasyOCR")

def _to_rgb_array(image):
    """Return ``image`` (PIL image or numpy array) as a 3-channel RGB array."""
    if isinstance(image, Image.Image):
        # Pillow normalises palette, greyscale and alpha modes in one step.
        return np.array(image.convert("RGB"), dtype=np.uint8)
    if len(image.shape) == 2:  # single-channel greyscale
        return cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    if len(image.shape) == 3 and image.shape[2] == 4:  # drop the alpha channel
        return cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
    return image


def _bbox_to_rect(bbox):
    """Return the axis-aligned (left, top, right, bottom) box enclosing ``bbox``."""
    xs = [int(point[0]) for point in bbox]
    ys = [int(point[1]) for point in bbox]
    return min(xs), min(ys), max(xs), max(ys)


def _extract_table_frames(rgb_image):
    """Run img2table on ``rgb_image`` and return one DataFrame per table."""
    import tempfile

    # img2table is given a file path, so the image is written to a private
    # temporary directory: concurrent requests cannot overwrite each other's
    # file and nothing is left behind afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        image_path = os.path.join(tmp_dir, "table_image.png")
        # cv2.imwrite expects BGR channel order; PNG avoids JPEG artefacts.
        bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
        if not cv2.imwrite(image_path, bgr_image):
            raise OSError(f"Could not write temporary image to {image_path}")
        # Extraction must finish before the temporary directory is removed.
        tables = Img2TableImage(image_path).extract_tables(ocr=ocr)
        return [table.df for table in tables] if tables else []


def ocr_extract_text_and_tables(image):
    """Extract text and tables from an image.

    Args:
        image: A PIL image or a numpy array (greyscale, RGB or RGBA; colour
            arrays are interpreted in RGB channel order), or ``None``.

    Returns:
        A ``(text_output, tables_output, annotated_image)`` tuple:
        the recognised lines with their confidence, the extracted tables
        rendered as text (or ``"No tables detected."``), and an RGB numpy
        array with a red box around every detected text region. When no
        image is given or processing fails, the two strings carry the
        message and the image is ``None``.

    Side effects:
        Each extracted table is also saved as ``extracted_table_<n>.csv`` in
        the current working directory, overwriting files from earlier calls.
    """
    if image is None:
        return "No image provided", "No image provided", None

    try:
        rgb_image = _to_rgb_array(image)

        # A missing reader (None, or an undefined name when start-up
        # initialisation failed) is reported through the handler below.
        if reader is None:
            raise RuntimeError("EasyOCR reader is not available")

        # Binarise a greyscale copy for the text OCR pass.
        gray = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2GRAY)
        processed = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )

        # 1. General text via EasyOCR: (bbox, text, confidence) triples.
        results = reader.readtext(processed)
        detected_text = [
            f"{text} (Confidence: {confidence:.2f})" for _, text, confidence in results
        ]

        # 2. Structured tables via img2table, using the shared Tesseract backend.
        table_data = _extract_table_frames(rgb_image)

        # 3. Save extracted tables as CSV (optional convenience output).
        for index, df in enumerate(table_data, start=1):
            df.to_csv(f"extracted_table_{index}.csv", index=False)

        # 4. Annotate a copy of the image with a box per detected text region.
        pil_image = Image.fromarray(rgb_image)
        draw = ImageDraw.Draw(pil_image)
        for bbox, _text, _confidence in results:
            # Enclose all four corners so rotated quads still give a valid box.
            draw.rectangle(_bbox_to_rect(bbox), outline="red", width=3)
        annotated_image = np.array(pil_image)

        # Format output
        text_output = "\n".join(detected_text)
        if table_data:
            tables_output = "\n\n".join(
                df.to_string(index=False, header=False) for df in table_data
            )
        else:
            tables_output = "No tables detected."

        return text_output, tables_output, annotated_image

    except Exception as e:
        # UI boundary: keep the app responsive, but record the full traceback.
        logger.exception("OCR processing failed")
        return f"Error: {str(e)}", "Processing failed", None

# Gradio UI: one uploaded image in; extracted text, extracted tables and the
# annotated image out.
iface = gr.Interface(
    title="French OCR & Table Extractor",
    description="Upload an image containing French text and tables for OCR processing.",
    fn=ocr_extract_text_and_tables,
    # type="pil" makes Gradio hand the callback a PIL image.
    inputs=gr.Image(label="Upload Image", type="pil"),
    outputs=[
        gr.Textbox(label="Extracted Text (French)"),
        gr.Textbox(label="Extracted Tables"),
        gr.Image(label="Annotated Image"),
    ],
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()