# Hugging Face Space app.py by kavehtaheri (commit 62937d0) — web-page
# header text removed so the file is valid Python.
import os
import time

import arabic_reshaper
import easyocr
import google.generativeai as genai
import gradio as gr
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from bidi.algorithm import get_display
# --- CONFIGURATION ---
# Load the Gemini API key from the environment (set GEMINI_API_KEY as a
# secret in the Hugging Face Space settings). The key previously hard-coded
# here was committed to source control and must be treated as compromised —
# revoke it in the Google AI Studio console.
api_key = os.environ.get("GEMINI_API_KEY", "")
# TrueType font used to render the Persian overlay text.
PERSIAN_FONT_PATH = "vazir.ttf"
# --- GLOBAL INITIALIZATION ---
# Shared EasyOCR reader, created lazily because loading the model is slow.
reader = None


def initialize_reader():
    """Return the shared EasyOCR reader, creating it on first call."""
    global reader
    if reader is not None:
        return reader
    print("Loading EasyOCR model...")
    reader = easyocr.Reader(['en'], gpu=False, verbose=False)
    print("EasyOCR model loaded successfully!")
    return reader
# --- CORE FUNCTIONS ---
def extract_text_and_bbox(image):
    """
    Run OCR on *image* and merge every detected region into one box.

    Returns (joined text, (min_x, min_y, max_x, max_y)). When nothing is
    detected, the image is missing, or OCR raises, the box is None and the
    first element carries a human-readable message instead.
    """
    if image is None:
        return "Please upload an image first.", None
    try:
        ocr = initialize_reader()
        detections = ocr.readtext(np.array(image))
        if not detections:
            return "No text detected in the image.", None

        words = []
        left = top = float('inf')
        right = bottom = float('-inf')
        for corners, text, _confidence in detections:
            words.append(text)
            tl, tr, br, bl = corners
            # Grow the consolidated box using the relevant corner pair
            # for each edge (left corners for min_x, top corners for
            # min_y, and so on).
            left = min(left, tl[0], bl[0])
            top = min(top, tl[1], tr[1])
            right = max(right, tr[0], br[0])
            bottom = max(bottom, bl[1], br[1])

        consolidated = (int(left), int(top), int(right), int(bottom))
        return ' '.join(words), consolidated
    except Exception as e:
        return f"Error processing image: {str(e)}", None
def translate_text_gemini(text):
    """Translate English *text* to colloquial Persian via the Gemini API.

    Inputs that are empty or look like upstream status/error messages are
    rejected up front so we never send them to the model. Returns the
    translation, or an error message if the API call fails.
    """
    status_markers = ("No text", "Error", "Please upload")
    if not text or any(marker in text for marker in status_markers):
        return "No valid text to translate."
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')
        prompt = (f"Translate the following English text to Persian. Your translation should be natural, touching, and relatable, "
                  f"like casual chats with a friend—short and heartfelt. Use colloquial Persian words and contractions where appropriate. "
                  f"Do not add any extra explanations, greetings, or emojis. Output ONLY the Persian translation. "
                  f"English text: [{text}]")
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        return f"Error during translation: {str(e)}"
# --- IMAGE OVERLAY ---
def _shape_rtl(text):
    """Return *text* reshaped (joined Arabic letter forms) and then
    bidi-reordered so PIL draws it correctly right-to-left. Order matters:
    reshape first, THEN get_display."""
    return get_display(arabic_reshaper.reshape(text))


def _wrap_words(original_line, draw, font, max_width):
    """Greedily wrap *original_line* into shaped display lines no wider
    than *max_width* pixels. A single word wider than the limit still gets
    its own line. Wrapping measures the shaped text, since shaping changes
    glyph widths."""
    wrapped = []
    current = []
    for word in original_line.split():
        candidate = current + [word]
        if draw.textlength(_shape_rtl(' '.join(candidate)), font=font) <= max_width:
            current = candidate
        else:
            if current:
                wrapped.append(_shape_rtl(' '.join(current)))
            current = [word]
    if current:
        wrapped.append(_shape_rtl(' '.join(current)))
    return wrapped


def _measure_lines(draw, lines, font, spacing):
    """Return (total_height, per-line heights) for *lines* rendered in
    *font*, including *spacing* pixels between consecutive lines."""
    heights = []
    for line in lines:
        box = draw.textbbox((0, 0), line, font=font)
        heights.append(box[3] - box[1])
    total = sum(heights)
    if len(lines) > 1:
        total += (len(lines) - 1) * spacing
    return total, heights


def overlay_text_on_image(original_image, text_to_overlay, bbox):
    """
    Erase the region *bbox* (x0, y0, x1, y1) on a copy of *original_image*
    and draw *text_to_overlay* (Persian, RTL) centred inside it.

    Returns the modified copy; the input image is never mutated.
    """
    image_copy = original_image.copy()
    draw = ImageDraw.Draw(image_copy)

    # 1. "Erase" the old text: cover the padded box with a colour sampled
    #    just left of its edge (a cheap stand-in for real inpainting).
    padding = 10
    erase_box = (bbox[0] - padding, bbox[1] - padding,
                 bbox[2] + padding, bbox[3] + padding)
    try:
        sample_x = max(0, int(erase_box[0]) - 5)
        sample_y = int((erase_box[1] + erase_box[3]) / 2)
        sample_x = min(sample_x, image_copy.width - 1)
        sample_y = min(sample_y, image_copy.height - 1)
        bg_color = image_copy.getpixel((sample_x, sample_y))
    except (ValueError, IndexError):
        bg_color = (0, 0, 0)  # fall back to black if sampling fails
    draw.rectangle(erase_box, fill=bg_color)

    # 2. Split the translation into non-empty lines and shape each one
    #    for RTL display.
    target_width = (erase_box[2] - erase_box[0]) * 0.90  # keep a 10% margin
    target_height = erase_box[3] - erase_box[1]
    lines = [line.strip() for line in text_to_overlay.split('\n') if line.strip()]
    if not lines:
        lines = [text_to_overlay]  # whitespace-only input: render as-is
    reshaped_lines = [_shape_rtl(line) for line in lines]

    # 3. Largest font size (100 down, step 2) whose widest line fits the
    #    box width; fall back to 12pt if nothing above 10pt fits.
    font_size = 100
    final_font = None
    temp_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
    longest_line = max(reshaped_lines,
                       key=lambda line: draw.textlength(line, font=temp_font))
    while font_size > 10:
        font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
        if draw.textlength(longest_line, font=font) <= target_width:
            final_font = font
            break
        font_size -= 2
    if final_font is None:
        final_font = ImageFont.truetype(PERSIAN_FONT_PATH, 12)

    # 4. Word-wrap any line that is still too wide. Pairing original and
    #    shaped lines with zip() fixes the old list.index() lookup, which
    #    could pick the wrong original line when two shaped lines compared
    #    equal (and re-scanned the list on every wrap).
    final_lines = []
    for original_line, display_line in zip(lines, reshaped_lines):
        if draw.textlength(display_line, font=final_font) <= target_width:
            final_lines.append(display_line)
        else:
            final_lines.extend(
                _wrap_words(original_line, draw, final_font, target_width))

    # 5. Shrink the font further if the stacked lines overflow vertically.
    line_spacing = 20
    total_text_height, line_heights = _measure_lines(
        draw, final_lines, final_font, line_spacing)
    while total_text_height > target_height and font_size > 10:
        font_size -= 2
        final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
        total_text_height, line_heights = _measure_lines(
            draw, final_lines, final_font, line_spacing)

    # 6. Draw each line centred in the erase box: a 1px black shadow first,
    #    then white text on top ("mm" anchors at the line's midpoint).
    current_y = erase_box[1] + (target_height - total_text_height) / 2
    x_center = erase_box[0] + (erase_box[2] - erase_box[0]) / 2
    for line, line_height in zip(final_lines, line_heights):
        line_y_center = current_y + line_height / 2
        draw.text((x_center + 1, line_y_center + 1), line,
                  font=final_font, fill=(0, 0, 0), anchor="mm")
        draw.text((x_center, line_y_center), line,
                  font=final_font, fill=(255, 255, 255), anchor="mm")
        current_y += line_height + line_spacing

    return image_copy
# --- GRADIO INTERFACE ---
# Two-column layout: input image plus the OCR/translation textboxes on the
# left, the rendered output image on the right.
with gr.Blocks(title="Quote OCR Translator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📝 Quote Image Translator")
    gr.Markdown("Upload an image with English text. See the Persian translation overlaid directly on the image!")
    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(
                label="Upload Quote Image",
                type="pil",  # handlers receive/return PIL.Image objects
                sources=["upload", "clipboard"]
            )
            text_output = gr.Textbox(
                label="Extracted Quote Text (English)",
                placeholder="The detected English text will appear here...",
                lines=4,
                show_copy_button=True
            )
            translated_output = gr.Textbox(
                label="Translated Quote (Persian)",
                placeholder="The Persian translation will appear here...",
                lines=4,
                show_copy_button=True
            )
        with gr.Column(scale=1):
            image_output = gr.Image(
                label="Translated Image Output",
                type="pil"
            )

    def process_and_overlay(image):
        """Run the full pipeline (OCR -> translate -> overlay) and return
        the three values bound to the output components. Failures at any
        stage short-circuit and surface their message in the textboxes."""
        if image is None:
            return "Please upload an image.", "Translation will appear here.", None
        extracted_text, bbox = extract_text_and_bbox(image)
        if bbox is None:
            # OCR found nothing or errored; extracted_text holds the message.
            return extracted_text, "No text to translate.", None
        translated_text = translate_text_gemini(extracted_text)
        if "Error" in translated_text:
            # NOTE(review): matching on the substring "Error" is fragile —
            # a translation containing that word would be misclassified.
            return extracted_text, translated_text, None
        final_image = overlay_text_on_image(image, translated_text, bbox)
        return extracted_text, translated_text, final_image

    # Re-run the whole pipeline whenever the input image changes.
    image_input.change(
        fn=process_and_overlay,
        inputs=[image_input],
        outputs=[text_output, translated_output, image_output]
    )

    gr.Markdown("""
### 💡 How It Works:
1. **Text Detection:** The app uses OCR to find English text and its location.
2. **Inpainting:** It digitally "erases" the original text.
3. **Translation:** The text is sent to an AI for Persian translation.
4. **Overlay:** The Persian text is rendered and placed back onto the image.
""")

if __name__ == "__main__":
    demo.launch()