Spaces:

GiantAnalytics
/

ArabicOCRExtractor

Sleeping

App Files Files Community

ArabicOCRExtractor / app.py

GiantAnalytics

Update app.py

af88408 verified 11 months ago

raw

history blame contribute delete

12 kB

	import gradio as gr
	import easyocr
	import cv2
	import numpy as np
	from PIL import Image, ImageDraw, ImageFont
	import os
	import requests
	from pathlib import Path
	import pandas as pd
	import pytesseract
	from pytesseract import Output
	import traceback
	import logging
	import sys

	# Logging configuration: INFO level and above, timestamped records,
	# emitted through a stream handler bound to stdout.
	logging.basicConfig(
	    handlers=[logging.StreamHandler(sys.stdout)],
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)
	# Module-level logger used by the rest of this file.
	logger = logging.getLogger(__name__)

	# Download and cache the font file
	def get_font():
	    """Return the path of a local Roboto-Regular.ttf, downloading it if absent.

	    The font is looked up as ``Roboto-Regular.ttf`` relative to the current
	    working directory. When the file does not exist it is fetched over HTTP
	    and stored under that name.

	    Returns:
	        The font path as a string, or ``None`` if anything went wrong
	        (the error is logged, never raised).
	    """
	    try:
	        logger.info("Attempting to get font...")
	        font_path = Path("Roboto-Regular.ttf")
	        if font_path.exists():
	            logger.info("Font already exists")
	            return str(font_path)

	        logger.info("Font not found, downloading...")
	        font_url = "https://github.com/google/fonts/raw/main/apache/roboto/Roboto-Regular.ttf"
	        # A timeout keeps a stalled connection from hanging the request forever.
	        response = requests.get(font_url, timeout=30)
	        # Fail on 4xx/5xx instead of caching an HTML error page as the "font".
	        response.raise_for_status()

	        # Write to a temporary sibling and rename, so an interrupted download
	        # never leaves a truncated file under the final name.
	        partial_path = font_path.with_name(font_path.name + ".part")
	        partial_path.write_bytes(response.content)
	        partial_path.replace(font_path)
	        logger.info("Font downloaded successfully")
	        return str(font_path)
	    except Exception as e:
	        logger.error(f"Error in get_font: {str(e)}")
	        logger.error(traceback.format_exc())
	        return None

	# Initialize EasyOCR Reader for French
	# `reader` is bound up front so the name always exists: if construction
	# fails it stays None instead of being undefined (which would surface later
	# as a NameError at first use).
	reader = None
	try:
	    logger.info("Initializing EasyOCR Reader for French...")
	    reader = easyocr.Reader(['fr', 'en'], gpu=False)  # CPU only
	    logger.info("EasyOCR Reader initialized successfully")
	except Exception as e:
	    logger.error(f"Error initializing EasyOCR: {str(e)}")
	    logger.error(traceback.format_exc())

	# Thresholds used by the Tesseract-based table heuristic.
	_MIN_WORD_CONFIDENCE = 50  # keep only words whose 'conf' is strictly above this
	_MIN_TABLE_CELLS = 5       # a block with fewer words is not treated as a table
	_ROW_GAP_PX = 10           # a change in 'top' larger than this starts a new row


	def _group_block_into_rows(block_df, row_gap=_ROW_GAP_PX):
	    """Split the words of one block into rows of cell values.

	    Words are visited in ('top', 'left') order. A word whose 'top' differs
	    from the 'top' of the first word of the current row by more than
	    ``row_gap`` opens a new row.

	    Returns:
	        A list of rows; each row is a list of 'text' values ordered by 'left'.
	    """
	    rows = []
	    current = []   # (left, text) pairs of the row being built
	    row_top = None
	    for _, word in block_df.sort_values(['top', 'left']).iterrows():
	        if row_top is None or abs(word['top'] - row_top) > row_gap:
	            if current:
	                rows.append(current)
	            current = []
	            row_top = word['top']
	        current.append((word['left'], word['text']))
	    if current:
	        rows.append(current)

	    # Words inside a row arrived ordered by 'top' first, so two words on the
	    # same visual line with slightly different 'top' values could be in the
	    # wrong horizontal order. Re-sort every row by 'left' only.
	    return [
	        [text for _, text in sorted(cells, key=lambda cell: cell[0])]
	        for cells in rows
	    ]


	def _extract_tables(df):
	    """Build one DataFrame per 'block_num' group that looks like a table.

	    A block qualifies when it has at least ``_MIN_TABLE_CELLS`` words and
	    those words form more than one row. Short rows are padded with ''.
	    A failure in one block is logged and does not affect the other blocks.

	    Returns:
	        A list of pandas DataFrames (possibly empty).
	    """
	    tables = []
	    if df.empty or 'block_num' not in df.columns:
	        return tables

	    logger.info("Attempting to identify tables...")
	    blocks = df['block_num'].unique()
	    logger.info(f"Found {len(blocks)} text blocks")

	    for block in blocks:
	        try:
	            logger.info(f"Processing block {block}")
	            block_df = df[df['block_num'] == block]
	            if len(block_df) < _MIN_TABLE_CELLS:
	                continue
	            logger.info(f"Block {block} has {len(block_df)} cells, might be a table")

	            table_rows = _group_block_into_rows(block_df)
	            logger.info(f"Extracted {len(table_rows)} rows from potential table")
	            if len(table_rows) < 2:
	                continue

	            # Pad rows so every row has the same number of columns.
	            max_cols = max(len(row) for row in table_rows)
	            logger.info(f"Table has {max_cols} columns")
	            padded_rows = [row + [''] * (max_cols - len(row)) for row in table_rows]

	            table_df = pd.DataFrame(padded_rows)
	            tables.append(table_df)
	            logger.info(f"Successfully created table with shape {table_df.shape}")
	        except Exception as e:
	            logger.error(f"Error in table detection: {str(e)}")
	            logger.error(traceback.format_exc())
	    return tables


	def _load_annotation_font():
	    """Return a size-20 TrueType font from get_font(), or Pillow's default font."""
	    logger.info("Loading font...")
	    try:
	        font_path = get_font()
	        if font_path:
	            font = ImageFont.truetype(font_path, size=20)
	            logger.info("Font loaded successfully")
	            return font
	        logger.warning("Font path is None, using default font")
	    except Exception as e:
	        logger.error(f"Error loading font: {str(e)}")
	        logger.error(traceback.format_exc())
	        logger.info("Using default font instead")
	    return ImageFont.load_default()


	def _annotate_image(image, results):
	    """Return a copy of ``image`` with a red box and blue label per OCR result.

	    ``results`` is iterated as (bbox, text, confidence) triples, where bbox is
	    a sequence of (x, y) points. A region that cannot be drawn is logged and
	    skipped.
	    """
	    logger.info("Creating annotated image...")
	    pil_image = Image.fromarray(image)
	    draw = ImageDraw.Draw(pil_image)
	    font = _load_annotation_font()

	    logger.info("Drawing annotation boxes...")
	    for i, (bbox, text, confidence) in enumerate(results):
	        try:
	            # Use the extremes of all corner points rather than trusting that
	            # bbox[0]/bbox[2] are top-left/bottom-right: for tilted text those
	            # two corners can be inverted, which ImageDraw.rectangle rejects.
	            xs = [int(point[0]) for point in bbox]
	            ys = [int(point[1]) for point in bbox]
	            top_left = (min(xs), min(ys))
	            bottom_right = (max(xs), max(ys))

	            draw.rectangle([top_left, bottom_right], outline="red", width=3)
	            draw.text(top_left, f"{text} ({confidence:.2f})", fill="blue", font=font)
	            logger.info(f"Drew annotation for text region {i+1}")
	        except Exception as e:
	            logger.error(f"Error drawing annotation for region {i+1}: {str(e)}")

	    annotated = np.array(pil_image)
	    logger.info("Annotated image created successfully")
	    return annotated


	def ocr_extract_text_and_tables(image):
	    """Run OCR on an image and return text, detected tables and an annotated copy.

	    Steps: EasyOCR (module-level ``reader``) for free text, pytesseract
	    word-level data for a simple block/row based table heuristic, then Pillow
	    to draw the EasyOCR regions on the image. Each step logs and degrades
	    gracefully on failure; nothing is raised to the caller.

	    Args:
	        image: A numpy image array (its ``shape`` and ``dtype`` are read), or
	            ``None``. A 4-channel array is converted from RGBA to RGB.

	    Returns:
	        A 3-tuple ``(text_output, tables_output, annotated_image)``:
	        one "text (Confidence: x.xx)" line per EasyOCR region, the tables
	        rendered as text ('' when none were found), and the annotated image
	        array. For ``None`` input: ``("No image provided", None, None)``.
	        On failure: ``("Error: ...", "Processing failed", None)``.
	    """
	    try:
	        logger.info("Starting OCR extraction...")

	        if image is None:
	            logger.warning("No image provided")
	            return "No image provided", None, None

	        # Looked up through globals() so a reader that failed to initialize
	        # (unbound or None) gives a clear message instead of a NameError.
	        ocr_reader = globals().get("reader")
	        if ocr_reader is None:
	            error_msg = "Unexpected error in OCR extraction: EasyOCR reader is not initialized"
	            logger.error(error_msg)
	            return f"Error: {error_msg}", "Processing failed", None

	        logger.info(f"Image shape: {image.shape}, dtype: {image.dtype}")

	        # Convert to RGB if needed
	        if len(image.shape) == 3 and image.shape[2] == 4:  # RGBA
	            logger.info("Converting RGBA to RGB")
	            image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)

	        # 1. General text with EasyOCR
	        logger.info("Running EasyOCR text detection...")
	        results = ocr_reader.readtext(image)
	        logger.info(f"EasyOCR detected {len(results)} text regions")

	        detected_text = []
	        for i, (bbox, text, confidence) in enumerate(results):
	            logger.info(f"Text region {i+1}: '{text}' with confidence {confidence:.2f}")
	            detected_text.append(f"{text} (Confidence: {confidence:.2f})")

	        # 2. Word-level data from pytesseract for table detection
	        logger.info("Running Pytesseract for table detection...")
	        try:
	            pytesseract_config = r'--oem 3 --psm 6 -l fra'  # French language
	            logger.info(f"Pytesseract config: {pytesseract_config}")
	            df = pytesseract.image_to_data(image, output_type=Output.DATAFRAME, config=pytesseract_config)
	            logger.info(f"Pytesseract returned dataframe with shape: {df.shape}")
	        except Exception as e:
	            logger.error(f"Pytesseract error: {str(e)}")
	            logger.error(traceback.format_exc())
	            df = pd.DataFrame()  # Empty dataframe to continue processing

	        # Drop rows without usable text and low-confidence words
	        try:
	            if not df.empty:
	                logger.info("Filtering low-confidence text...")
	                df = df.dropna(subset=['text'])
	                # Whitespace-only entries are not NaN but are not cells either;
	                # keeping them would inflate cell counts and add blank columns.
	                df = df[df['text'].astype(str).str.strip() != '']
	                logger.info(f"After dropna, dataframe shape: {df.shape}")
	                if 'conf' in df.columns:
	                    df = df[df['conf'] > _MIN_WORD_CONFIDENCE]
	                    logger.info(f"After confidence filtering, dataframe shape: {df.shape}")
	                else:
	                    logger.warning("No 'conf' column found in pytesseract output")
	        except Exception as e:
	            logger.error(f"Error filtering dataframe: {str(e)}")
	            logger.error(traceback.format_exc())

	        tables = _extract_tables(df)
	        logger.info(f"Detected {len(tables)} tables")

	        # 3. Annotated image; fall back to an unannotated copy on failure
	        try:
	            annotated_image = _annotate_image(image, results)
	        except Exception as e:
	            logger.error(f"Error creating annotated image: {str(e)}")
	            logger.error(traceback.format_exc())
	            annotated_image = image.copy()

	        text_output = "\n".join(detected_text)
	        tables_output = "".join(
	            f"Table {index}:\n{table.to_string(index=False, header=False)}\n\n"
	            for index, table in enumerate(tables, start=1)
	        )

	        logger.info("OCR extraction completed successfully")
	        return text_output, tables_output, annotated_image

	    except Exception as e:
	        error_msg = f"Unexpected error in OCR extraction: {str(e)}"
	        logger.error(error_msg)
	        logger.error(traceback.format_exc())
	        return f"Error: {error_msg}", "Processing failed", None

	# Build the Gradio UI: one image input, and three outputs matching the
	# (text, tables, annotated image) tuple returned by the OCR function.
	try:
	    logger.info("Creating Gradio interface...")

	    # Input component is created first, then the outputs, in display order.
	    _ocr_input = gr.Image(type="numpy", label="Upload Image")
	    _ocr_outputs = [
	        gr.Textbox(label="Extracted Text (French)", elem_classes=["output-text"]),
	        gr.Textbox(label="Extracted Tables", elem_classes=["output-text"]),
	        gr.Image(label="Annotated Image"),
	    ]

	    iface = gr.Interface(
	        fn=ocr_extract_text_and_tables,
	        inputs=_ocr_input,
	        outputs=_ocr_outputs,
	        title="French OCR & Table Extractor",
	        description="Upload an image containing French text and tables for OCR processing. The system will detect and extract both regular text and tabular data.",
	        examples=[],  # no example images are configured
	        cache_examples=True,
	    )
	    logger.info("Gradio interface created successfully")
	except Exception as build_error:
	    logger.error(f"Error creating Gradio interface: {str(build_error)}")
	    logger.error(traceback.format_exc())

	# Script entry point: start the Gradio app only when this file is executed
	# directly. Any launch failure is logged with its traceback, not re-raised.
	if __name__ == "__main__":
	    try:
	        logger.info("Launching Gradio interface...")
	        iface.launch()
	        logger.info("Gradio interface launched successfully")
	    except Exception as launch_error:
	        logger.error(f"Error launching Gradio interface: {str(launch_error)}")
	        logger.error(traceback.format_exc())