# Hugging Face Space app.py by kavehtaheri (commit 62937d0) — web-page
# header text removed so the file is valid Python.
import os
import time

import arabic_reshaper
import easyocr
import google.generativeai as genai
import gradio as gr
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from bidi.algorithm import get_display
# --- CONFIGURATION ---
# Load the Gemini API key from the environment (set GEMINI_API_KEY as a
# secret in the Hugging Face Space settings). The key previously hard-coded
# here was committed to source control and must be treated as compromised —
# revoke it in the Google AI Studio console.
api_key = os.environ.get("GEMINI_API_KEY", "")
# TrueType font used to render the Persian overlay text.
PERSIAN_FONT_PATH = "vazir.ttf"
# --- GLOBAL INITIALIZATION ---
# Shared EasyOCR reader, created lazily because loading the model is slow.
reader = None


def initialize_reader():
    """Return the shared EasyOCR reader, creating it on first call."""
    global reader
    if reader is not None:
        return reader
    print("Loading EasyOCR model...")
    reader = easyocr.Reader(['en'], gpu=False, verbose=False)
    print("EasyOCR model loaded successfully!")
    return reader
# --- CORE FUNCTIONS ---
def extract_text_and_bbox(image):
    """
    Run OCR on *image* and merge every detected region into one box.

    Returns (joined text, (min_x, min_y, max_x, max_y)). When nothing is
    detected, the image is missing, or OCR raises, the box is None and the
    first element carries a human-readable message instead.
    """
    if image is None:
        return "Please upload an image first.", None
    try:
        ocr = initialize_reader()
        detections = ocr.readtext(np.array(image))
        if not detections:
            return "No text detected in the image.", None

        words = []
        left = top = float('inf')
        right = bottom = float('-inf')
        for corners, text, _confidence in detections:
            words.append(text)
            tl, tr, br, bl = corners
            # Grow the consolidated box using the relevant corner pair
            # for each edge (left corners for min_x, top corners for
            # min_y, and so on).
            left = min(left, tl[0], bl[0])
            top = min(top, tl[1], tr[1])
            right = max(right, tr[0], br[0])
            bottom = max(bottom, bl[1], br[1])

        consolidated = (int(left), int(top), int(right), int(bottom))
        return ' '.join(words), consolidated
    except Exception as e:
        return f"Error processing image: {str(e)}", None
def translate_text_gemini(text):
    """Translate English *text* to colloquial Persian via the Gemini API.

    Inputs that are empty or look like upstream status/error messages are
    rejected up front so we never send them to the model. Returns the
    translation, or an error message if the API call fails.
    """
    status_markers = ("No text", "Error", "Please upload")
    if not text or any(marker in text for marker in status_markers):
        return "No valid text to translate."
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')
        prompt = (f"Translate the following English text to Persian. Your translation should be natural, touching, and relatable, "
                  f"like casual chats with a friend—short and heartfelt. Use colloquial Persian words and contractions where appropriate. "
                  f"Do not add any extra explanations, greetings, or emojis. Output ONLY the Persian translation. "
                  f"English text: [{text}]")
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        return f"Error during translation: {str(e)}"
# --- IMAGE OVERLAY ---
def _shape_rtl(text):
    """Return *text* reshaped (joined Arabic letter forms) and then
    bidi-reordered so PIL draws it correctly right-to-left. Order matters:
    reshape first, THEN get_display."""
    return get_display(arabic_reshaper.reshape(text))


def _wrap_words(original_line, draw, font, max_width):
    """Greedily wrap *original_line* into shaped display lines no wider
    than *max_width* pixels. A single word wider than the limit still gets
    its own line. Wrapping measures the shaped text, since shaping changes
    glyph widths."""
    wrapped = []
    current = []
    for word in original_line.split():
        candidate = current + [word]
        if draw.textlength(_shape_rtl(' '.join(candidate)), font=font) <= max_width:
            current = candidate
        else:
            if current:
                wrapped.append(_shape_rtl(' '.join(current)))
            current = [word]
    if current:
        wrapped.append(_shape_rtl(' '.join(current)))
    return wrapped


def _measure_lines(draw, lines, font, spacing):
    """Return (total_height, per-line heights) for *lines* rendered in
    *font*, including *spacing* pixels between consecutive lines."""
    heights = []
    for line in lines:
        box = draw.textbbox((0, 0), line, font=font)
        heights.append(box[3] - box[1])
    total = sum(heights)
    if len(lines) > 1:
        total += (len(lines) - 1) * spacing
    return total, heights


def overlay_text_on_image(original_image, text_to_overlay, bbox):
    """
    Erase the region *bbox* (x0, y0, x1, y1) on a copy of *original_image*
    and draw *text_to_overlay* (Persian, RTL) centred inside it.

    Returns the modified copy; the input image is never mutated.
    """
    image_copy = original_image.copy()
    draw = ImageDraw.Draw(image_copy)

    # 1. "Erase" the old text: cover the padded box with a colour sampled
    #    just left of its edge (a cheap stand-in for real inpainting).
    padding = 10
    erase_box = (bbox[0] - padding, bbox[1] - padding,
                 bbox[2] + padding, bbox[3] + padding)
    try:
        sample_x = max(0, int(erase_box[0]) - 5)
        sample_y = int((erase_box[1] + erase_box[3]) / 2)
        sample_x = min(sample_x, image_copy.width - 1)
        sample_y = min(sample_y, image_copy.height - 1)
        bg_color = image_copy.getpixel((sample_x, sample_y))
    except (ValueError, IndexError):
        bg_color = (0, 0, 0)  # fall back to black if sampling fails
    draw.rectangle(erase_box, fill=bg_color)

    # 2. Split the translation into non-empty lines and shape each one
    #    for RTL display.
    target_width = (erase_box[2] - erase_box[0]) * 0.90  # keep a 10% margin
    target_height = erase_box[3] - erase_box[1]
    lines = [line.strip() for line in text_to_overlay.split('\n') if line.strip()]
    if not lines:
        lines = [text_to_overlay]  # whitespace-only input: render as-is
    reshaped_lines = [_shape_rtl(line) for line in lines]

    # 3. Largest font size (100 down, step 2) whose widest line fits the
    #    box width; fall back to 12pt if nothing above 10pt fits.
    font_size = 100
    final_font = None
    temp_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
    longest_line = max(reshaped_lines,
                       key=lambda line: draw.textlength(line, font=temp_font))
    while font_size > 10:
        font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
        if draw.textlength(longest_line, font=font) <= target_width:
            final_font = font
            break
        font_size -= 2
    if final_font is None:
        final_font = ImageFont.truetype(PERSIAN_FONT_PATH, 12)

    # 4. Word-wrap any line that is still too wide. Pairing original and
    #    shaped lines with zip() fixes the old list.index() lookup, which
    #    could pick the wrong original line when two shaped lines compared
    #    equal (and re-scanned the list on every wrap).
    final_lines = []
    for original_line, display_line in zip(lines, reshaped_lines):
        if draw.textlength(display_line, font=final_font) <= target_width:
            final_lines.append(display_line)
        else:
            final_lines.extend(
                _wrap_words(original_line, draw, final_font, target_width))

    # 5. Shrink the font further if the stacked lines overflow vertically.
    line_spacing = 20
    total_text_height, line_heights = _measure_lines(
        draw, final_lines, final_font, line_spacing)
    while total_text_height > target_height and font_size > 10:
        font_size -= 2
        final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
        total_text_height, line_heights = _measure_lines(
            draw, final_lines, final_font, line_spacing)

    # 6. Draw each line centred in the erase box: a 1px black shadow first,
    #    then white text on top ("mm" anchors at the line's midpoint).
    current_y = erase_box[1] + (target_height - total_text_height) / 2
    x_center = erase_box[0] + (erase_box[2] - erase_box[0]) / 2
    for line, line_height in zip(final_lines, line_heights):
        line_y_center = current_y + line_height / 2
        draw.text((x_center + 1, line_y_center + 1), line,
                  font=final_font, fill=(0, 0, 0), anchor="mm")
        draw.text((x_center, line_y_center), line,
                  font=final_font, fill=(255, 255, 255), anchor="mm")
        current_y += line_height + line_spacing

    return image_copy
# --- GRADIO INTERFACE ---
# Two-column layout: input image plus the OCR/translation textboxes on the
# left, the rendered output image on the right.
with gr.Blocks(title="Quote OCR Translator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📝 Quote Image Translator")
    gr.Markdown("Upload an image with English text. See the Persian translation overlaid directly on the image!")
    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(
                label="Upload Quote Image",
                type="pil",  # handlers receive/return PIL.Image objects
                sources=["upload", "clipboard"]
            )
            text_output = gr.Textbox(
                label="Extracted Quote Text (English)",
                placeholder="The detected English text will appear here...",
                lines=4,
                show_copy_button=True
            )
            translated_output = gr.Textbox(
                label="Translated Quote (Persian)",
                placeholder="The Persian translation will appear here...",
                lines=4,
                show_copy_button=True
            )
        with gr.Column(scale=1):
            image_output = gr.Image(
                label="Translated Image Output",
                type="pil"
            )

    def process_and_overlay(image):
        """Run the full pipeline (OCR -> translate -> overlay) and return
        the three values bound to the output components. Failures at any
        stage short-circuit and surface their message in the textboxes."""
        if image is None:
            return "Please upload an image.", "Translation will appear here.", None
        extracted_text, bbox = extract_text_and_bbox(image)
        if bbox is None:
            # OCR found nothing or errored; extracted_text holds the message.
            return extracted_text, "No text to translate.", None
        translated_text = translate_text_gemini(extracted_text)
        if "Error" in translated_text:
            # NOTE(review): matching on the substring "Error" is fragile —
            # a translation containing that word would be misclassified.
            return extracted_text, translated_text, None
        final_image = overlay_text_on_image(image, translated_text, bbox)
        return extracted_text, translated_text, final_image

    # Re-run the whole pipeline whenever the input image changes.
    image_input.change(
        fn=process_and_overlay,
        inputs=[image_input],
        outputs=[text_output, translated_output, image_output]
    )

    gr.Markdown("""
### 💡 How It Works:
1. **Text Detection:** The app uses OCR to find English text and its location.
2. **Inpainting:** It digitally "erases" the original text.
3. **Translation:** The text is sent to an AI for Persian translation.
4. **Overlay:** The Persian text is rendered and placed back onto the image.
""")

if __name__ == "__main__":
    demo.launch()