# Spaces: GiantAnalytics / ArabicOCRExtractor (status: Sleeping)
# File size: 11,957 Bytes

import gradio as gr
import easyocr
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import os
import requests
from pathlib import Path
import pandas as pd
import pytesseract
from pytesseract import Output
import traceback
import logging
import sys

# Configure root logging (INFO level, timestamped records written to stdout)
# and create the module-level logger used throughout this file.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Download and cache the font file
def get_font():
    """Return the path of a local Roboto TTF file, downloading it on first use.

    The font is stored as ``Roboto-Regular.ttf`` in the current working
    directory and reused on later calls.

    Returns:
        The font path as a string, or ``None`` if the font could not be
        obtained. Errors are logged rather than raised so that callers can
        fall back to a default font.
    """
    try:
        logger.info("Attempting to get font...")
        font_path = Path("Roboto-Regular.ttf")
        if not font_path.exists():
            logger.info("Font not found, downloading...")
            font_url = "https://github.com/google/fonts/raw/main/apache/roboto/Roboto-Regular.ttf"
            # Bound the wait so a stalled connection cannot hang the caller.
            response = requests.get(font_url, timeout=30)
            # Without this check an HTTP error page would be saved under the
            # font's name and then be treated as a valid cached font forever.
            response.raise_for_status()
            # Write under a temporary name and rename afterwards, so an
            # interrupted write never leaves a truncated file that the
            # exists() check above would accept on the next call.
            tmp_path = font_path.with_name(font_path.name + ".part")
            tmp_path.write_bytes(response.content)
            tmp_path.replace(font_path)
            logger.info("Font downloaded successfully")
        else:
            logger.info("Font already exists")
        return str(font_path)
    except Exception as e:
        # Best effort: annotation can proceed with a default font.
        logger.error(f"Error in get_font: {str(e)}")
        logger.error(traceback.format_exc())
        return None

# Initialize EasyOCR Reader for French
# Bind the name before trying to build the reader: the except branch below
# swallows the error, and without this a failed initialization would leave
# `reader` undefined instead of explicitly None.
reader = None
try:
    logger.info("Initializing EasyOCR Reader for French...")
    # French + English recognition; gpu=False keeps inference on the CPU.
    reader = easyocr.Reader(['fr', 'en'], gpu=False)
    logger.info("EasyOCR Reader initialized successfully")
except Exception as e:
    # Keep the module importable; `reader` stays None so the failure is
    # visible to any code that checks it.
    logger.error(f"Error initializing EasyOCR: {str(e)}")
    logger.error(traceback.format_exc())

def _group_words_into_rows(block_df, row_tolerance=10):
    """Group the words of one text block into rows of cell values.

    Words are visited top-to-bottom. A new row starts whenever a word's
    ``top`` differs from the top of the row's first word by more than
    ``row_tolerance`` pixels. Within each row the cells are returned
    left-to-right.

    Args:
        block_df: DataFrame with at least ``top``, ``left`` and ``text`` columns.
        row_tolerance: Maximum vertical offset (pixels) for two words to be
            considered part of the same row.

    Returns:
        A list of rows, each a list of the ``text`` values in that row.
    """
    ordered = block_df.sort_values(['top', 'left'])
    rows = []
    current = []  # (left, text) pairs of the row being built
    row_top = None
    for top, left, text in zip(ordered['top'], ordered['left'], ordered['text']):
        if row_top is None or abs(top - row_top) > row_tolerance:
            if current:
                rows.append(current)
            current = []
            row_top = top
        current.append((left, text))
    if current:
        rows.append(current)
    # The traversal order is by `top` first, so words on one visual line whose
    # tops differ by a pixel or two arrive out of horizontal order; re-sort
    # each row by its left edge to restore column order.
    return [
        [text for _, text in sorted(cells, key=lambda cell: cell[0])]
        for cells in rows
    ]


def _axis_aligned_box(bbox):
    """Return ((min_x, min_y), (max_x, max_y)) enclosing all points of bbox."""
    xs = [int(point[0]) for point in bbox]
    ys = [int(point[1]) for point in bbox]
    return (min(xs), min(ys)), (max(xs), max(ys))


def ocr_extract_text_and_tables(image):
    """Run OCR on an image and return text, detected tables and an annotated copy.

    EasyOCR (the module-level ``reader``) provides the free-text regions.
    Pytesseract word boxes are grouped per ``block_num`` into rows to
    approximate tables. Each stage logs and swallows its own errors so that a
    failure in one stage does not discard the results of the others.

    Args:
        image: Image as a numpy array (the Gradio input is declared with
            ``type="numpy"``), or ``None`` when nothing was uploaded.

    Returns:
        A 3-tuple ``(text_output, tables_output, annotated_image)``:
        the recognized text, one line per region with its confidence; the
        detected tables rendered as plain text; and the image with boxes and
        labels drawn on it. With no image the result is
        ``("No image provided", None, None)``; on failure the first element
        is an error message and the image is ``None``.
    """
    try:
        logger.info("Starting OCR extraction...")

        if image is None:
            logger.warning("No image provided")
            return "No image provided", None, None

        # Module start-up logs and swallows a reader initialization failure,
        # so the global may be missing or None; report that clearly instead
        # of failing with an opaque NameError/AttributeError.
        ocr_reader = globals().get("reader")
        if ocr_reader is None:
            logger.error("EasyOCR reader is not available")
            return "Error: OCR engine is not available", "Processing failed", None

        logger.info(f"Image shape: {image.shape}, dtype: {image.dtype}")

        # Convert to RGB if needed
        if len(image.shape) == 3 and image.shape[2] == 4:  # RGBA
            logger.info("Converting RGBA to RGB")
            image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)

        # Create copy for table detection
        table_image = image.copy()

        # 1. First extract general text with EasyOCR
        logger.info("Running EasyOCR text detection...")
        results = ocr_reader.readtext(image)
        logger.info(f"EasyOCR detected {len(results)} text regions")

        # Prepare text output and confidence scores
        detected_text = []
        for i, (bbox, text, confidence) in enumerate(results, 1):
            logger.info(f"Text region {i}: '{text}' with confidence {confidence:.2f}")
            detected_text.append(f"{text} (Confidence: {confidence:.2f})")

        # 2. Use pytesseract for table detection and extraction
        logger.info("Running Pytesseract for table detection...")
        try:
            pytesseract_config = r'--oem 3 --psm 6 -l fra'  # French language
            logger.info(f"Pytesseract config: {pytesseract_config}")
            df = pytesseract.image_to_data(table_image, output_type=Output.DATAFRAME, config=pytesseract_config)
            logger.info(f"Pytesseract returned dataframe with shape: {df.shape}")
        except Exception as e:
            logger.error(f"Pytesseract error: {str(e)}")
            logger.error(traceback.format_exc())
            df = pd.DataFrame()  # Empty dataframe to continue processing

        # Filter out empty and low-confidence text
        try:
            if not df.empty:
                logger.info("Filtering low-confidence text...")
                df = df.dropna(subset=['text'])
                # Whitespace-only entries are not NaN, so dropna keeps them;
                # remove them so they do not show up as empty table cells.
                df = df[df['text'].astype(str).str.strip() != '']
                logger.info(f"After dropna, dataframe shape: {df.shape}")
                if 'conf' in df.columns:
                    df = df.query('conf > 50')
                    logger.info(f"After confidence filtering, dataframe shape: {df.shape}")
                else:
                    logger.warning("No 'conf' column found in pytesseract output")
        except Exception as e:
            logger.error(f"Error filtering dataframe: {str(e)}")
            logger.error(traceback.format_exc())

        # Try to identify table structures based on alignment and spacing
        tables = []

        try:
            if not df.empty and 'block_num' in df.columns:
                logger.info("Attempting to identify tables...")
                # Group by block_num which often separates tables
                blocks = df['block_num'].unique()
                logger.info(f"Found {len(blocks)} text blocks")

                for block in blocks:
                    logger.info(f"Processing block {block}")
                    block_df = df[df['block_num'] == block]
                    if len(block_df) <= 4:  # Assuming a table has at least a few cells
                        continue
                    logger.info(f"Block {block} has {len(block_df)} cells, might be a table")

                    table_rows = _group_words_into_rows(block_df)
                    logger.info(f"Extracted {len(table_rows)} rows from potential table")

                    # A single row is treated as plain text, not a table
                    if len(table_rows) <= 1:
                        continue
                    try:
                        # Pad rows to have equal length
                        max_cols = max(len(row) for row in table_rows)
                        logger.info(f"Table has {max_cols} columns")
                        padded_rows = [row + [''] * (max_cols - len(row)) for row in table_rows]

                        table_df = pd.DataFrame(padded_rows)
                        tables.append(table_df)
                        logger.info(f"Successfully created table with shape {table_df.shape}")
                    except Exception as e:
                        logger.error(f"Error creating table DataFrame: {str(e)}")
                        logger.error(traceback.format_exc())
        except Exception as e:
            logger.error(f"Error in table detection: {str(e)}")
            logger.error(traceback.format_exc())

        logger.info(f"Detected {len(tables)} tables")

        # Create annotated image
        try:
            logger.info("Creating annotated image...")
            pil_image = Image.fromarray(image)
            draw = ImageDraw.Draw(pil_image)

            # Get font for annotation
            logger.info("Loading font...")
            try:
                font_path = get_font()
                if font_path:
                    font = ImageFont.truetype(font_path, size=20)
                    logger.info("Font loaded successfully")
                else:
                    logger.warning("Font path is None, using default font")
                    font = ImageFont.load_default()
            except Exception as e:
                logger.error(f"Error loading font: {str(e)}")
                logger.error(traceback.format_exc())
                font = ImageFont.load_default()
                logger.info("Using default font instead")

            # Draw boxes and text for regular text detection
            logger.info("Drawing annotation boxes...")
            for i, (bbox, text, confidence) in enumerate(results, 1):
                try:
                    # Use the extent of all corner points rather than only
                    # corners 0 and 2: for a rotated quad those two corners
                    # can describe an inverted or partial rectangle.
                    top_left, bottom_right = _axis_aligned_box(bbox)

                    # Draw rectangle
                    draw.rectangle([top_left, bottom_right], outline="red", width=3)

                    # Draw text with confidence
                    text_with_conf = f"{text} ({confidence:.2f})"
                    draw.text(top_left, text_with_conf, fill="blue", font=font)

                    logger.info(f"Drew annotation for text region {i}")
                except Exception as e:
                    logger.error(f"Error drawing annotation for region {i}: {str(e)}")
                    continue

            # Convert back to numpy array
            annotated_image = np.array(pil_image)
            logger.info("Annotated image created successfully")
        except Exception as e:
            logger.error(f"Error creating annotated image: {str(e)}")
            logger.error(traceback.format_exc())
            annotated_image = image.copy()  # Return original image if annotation fails

        # Join detected text with proper formatting
        text_output = "\n".join(detected_text)

        # Format tables for display
        tables_output = "".join(
            f"Table {i}:\n{table.to_string(index=False, header=False)}\n\n"
            for i, table in enumerate(tables, 1)
        )

        logger.info("OCR extraction completed successfully")
        return text_output, tables_output, annotated_image

    except Exception as e:
        error_msg = f"Unexpected error in OCR extraction: {str(e)}"
        logger.error(error_msg)
        logger.error(traceback.format_exc())
        return f"Error: {error_msg}", "Processing failed", None

# Create Gradio interface
# Build the web UI at import time. Any construction error is logged and
# swallowed, in which case `iface` is left undefined.
try:
    logger.info("Creating Gradio interface...")
    iface = gr.Interface(
        fn=ocr_extract_text_and_tables,
        # The callback inspects image.shape/dtype, so it expects a numpy array.
        inputs=gr.Image(type="numpy", label="Upload Image"),
        # Three outputs, matching in order the callback's returned
        # (text_output, tables_output, annotated_image) tuple.
        outputs=[
            gr.Textbox(label="Extracted Text (French)", elem_classes=["output-text"]),
            gr.Textbox(label="Extracted Tables", elem_classes=["output-text"]),
            gr.Image(label="Annotated Image")
        ],
        title="French OCR & Table Extractor",
        description="Upload an image containing French text and tables for OCR processing. The system will detect and extract both regular text and tabular data.",
        examples=[],  # No examples are configured; add example images here
        # NOTE(review): with an empty examples list this flag presumably has
        # no effect -- confirm against the installed Gradio version.
        cache_examples=True
    )
    logger.info("Gradio interface created successfully")
except Exception as e:
    logger.error(f"Error creating Gradio interface: {str(e)}")
    logger.error(traceback.format_exc())

# Launch the interface
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    try:
        logger.info("Launching Gradio interface...")
        # If interface construction above failed, `iface` is undefined and the
        # resulting NameError is caught and logged by the handler below.
        iface.launch()
        # NOTE(review): if launch() blocks until the server stops, this line
        # is only logged at shutdown -- confirm.
        logger.info("Gradio interface launched successfully")
    except Exception as e:
        logger.error(f"Error launching Gradio interface: {str(e)}")
        logger.error(traceback.format_exc())