import os

import gradio as gr
import easyocr
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import google.generativeai as genai
import time
import arabic_reshaper
from bidi.algorithm import get_display

# --- CONFIGURATION ---
# SECURITY FIX: the API key was previously hardcoded in this file (a leaked
# credential). It is now read from the environment, as recommended for
# Hugging Face Spaces secrets. Set GEMINI_API_KEY in your Space settings.
api_key = os.environ.get("GEMINI_API_KEY", "")
PERSIAN_FONT_PATH = "vazir.ttf"

# --- GLOBAL INITIALIZATION ---
# The EasyOCR reader is heavy to construct, so it is created lazily once
# and cached in this module-level variable.
reader = None


def initialize_reader():
    """Initialize the EasyOCR reader on first use and cache it globally.

    Returns:
        easyocr.Reader: a CPU-only English reader instance.
    """
    global reader
    if reader is None:
        print("Loading EasyOCR model...")
        reader = easyocr.Reader(['en'], gpu=False, verbose=False)
        print("EasyOCR model loaded successfully!")
    return reader


# --- CORE FUNCTIONS ---

def extract_text_and_bbox(image):
    """Extract all English text from *image* and one consolidated bounding box.

    Args:
        image: a PIL.Image (or None).

    Returns:
        tuple[str, tuple[int, int, int, int] | None]:
            (joined text of every detection, (min_x, min_y, max_x, max_y))
            on success; (human-readable message, None) on failure or when
            no text is found.
    """
    if image is None:
        return "Please upload an image first.", None
    try:
        reader = initialize_reader()
        img_array = np.array(image)
        results = reader.readtext(img_array)
        if not results:
            return "No text detected in the image.", None

        # Grow a single box that covers every individual detection.
        min_x, min_y = float('inf'), float('inf')
        max_x, max_y = float('-inf'), float('-inf')
        text_parts = []
        for (bbox, text, prob) in results:
            text_parts.append(text)
            # EasyOCR returns the four corners as (tl, tr, br, bl).
            (tl, tr, br, bl) = bbox
            min_x = min(min_x, tl[0], bl[0])
            min_y = min(min_y, tl[1], tr[1])
            max_x = max(max_x, tr[0], br[0])
            max_y = max(max_y, bl[1], br[1])

        extracted_text = ' '.join(text_parts)
        consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
        return extracted_text, consolidated_bbox
    except Exception as e:
        return f"Error processing image: {str(e)}"

def translate_text_gemini(text):
    """Translate English *text* to colloquial Persian via the Gemini API.

    Args:
        text: English text, or one of this app's status/error messages
              (which are detected by substring and skipped).

    Returns:
        str: the Persian translation, or an error/status message starting
        with "Error"/"No valid" that downstream code checks for.
    """
    # Upstream failures are passed along as strings; don't send them to the API.
    if not text or "No text" in text or "Error" in text or "Please upload" in text:
        return "No valid text to translate."
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')
        prompt = (f"Translate the following English text to Persian. Your translation should be natural, touching, and relatable, "
                  f"like casual chats with a friend—short and heartfelt. Use colloquial Persian words and contractions where appropriate. "
                  f"Do not add any extra explanations, greetings, or emojis. Output ONLY the Persian translation. "
                  f"English text: [{text}]")
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        return f"Error during translation: {str(e)}"


# --- CORRECTED IMAGE OVERLAY FUNCTION ---

def overlay_text_on_image(original_image, text_to_overlay, bbox):
    """Erase the region *bbox* and render RTL Persian text in its place.

    Args:
        original_image: source PIL.Image (not mutated; a copy is edited).
        text_to_overlay: Persian text; may contain '\n' for explicit lines.
        bbox: (min_x, min_y, max_x, max_y) region of the original text.

    Returns:
        PIL.Image: a copy of the image with the translation drawn on it.
    """
    image_copy = original_image.copy()
    draw = ImageDraw.Draw(image_copy)

    # 1. Erase the old text (crude inpainting): fill the padded box with a
    #    background color sampled just outside its left edge.
    padding = 10
    erase_box = (bbox[0] - padding, bbox[1] - padding,
                 bbox[2] + padding, bbox[3] + padding)
    try:
        sample_x = max(0, int(erase_box[0]) - 5)
        sample_y = int((erase_box[1] + erase_box[3]) / 2)
        sample_x = min(sample_x, image_copy.width - 1)
        sample_y = min(sample_y, image_copy.height - 1)
        bg_color = image_copy.getpixel((sample_x, sample_y))
    except (ValueError, IndexError):
        bg_color = (0, 0, 0)  # fall back to black if sampling fails
    draw.rectangle(erase_box, fill=bg_color)

    # 2. Text processing. Persian must be reshaped (contextual letter forms)
    #    and then reordered with get_display() for correct RTL rendering.
    target_width = (erase_box[2] - erase_box[0]) * 0.90  # leave a 10% margin
    target_height = erase_box[3] - erase_box[1]

    lines = [line.strip() for line in text_to_overlay.split('\n') if line.strip()]
    if not lines:
        lines = [text_to_overlay]  # single line if no newlines

    reshaped_lines = []
    for line in lines:
        reshaped = arabic_reshaper.reshape(line)
        display_text = get_display(reshaped)  # bidi reordering is essential
        reshaped_lines.append(display_text)

    # 3. Find the largest font size (<= 100) at which the widest line fits.
    font_size = 100
    final_font = None
    if reshaped_lines:
        temp_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
        longest_line = max(reshaped_lines,
                           key=lambda line: draw.textlength(line, font=temp_font))
        while font_size > 10:
            font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
            if draw.textlength(longest_line, font=font) <= target_width:
                final_font = font
                break
            font_size -= 2
    if final_font is None:
        final_font = ImageFont.truetype(PERSIAN_FONT_PATH, 12)

    # 4. Word-wrap any line that is still too wide at the chosen size.
    final_lines = []
    for idx, line in enumerate(reshaped_lines):
        if draw.textlength(line, font=final_font) <= target_width:
            final_lines.append(line)
        else:
            # BUG FIX: use the positional index rather than
            # reshaped_lines.index(line), which returns the FIRST duplicate
            # and re-wrapped the wrong original line when two input lines
            # were identical.
            original_line = lines[idx]
            words = original_line.split()
            current_line_words = []
            for word in words:
                test_words = current_line_words + [word]
                test_text = ' '.join(test_words)
                # Measure the candidate line in its final rendered form.
                test_reshaped = arabic_reshaper.reshape(test_text)
                test_display = get_display(test_reshaped)
                if draw.textlength(test_display, font=final_font) <= target_width:
                    current_line_words = test_words
                else:
                    # Line is full: flush it and start a new one with *word*.
                    if current_line_words:
                        line_text = ' '.join(current_line_words)
                        line_reshaped = arabic_reshaper.reshape(line_text)
                        line_display = get_display(line_reshaped)
                        final_lines.append(line_display)
                    current_line_words = [word]
            if current_line_words:
                line_text = ' '.join(current_line_words)
                line_reshaped = arabic_reshaper.reshape(line_text)
                line_display = get_display(line_reshaped)
                final_lines.append(line_display)

    # 5. Measure the text block; shrink the font further if the stacked
    #    lines overflow the erase box vertically.
    line_spacing = 20
    total_text_height = 0
    line_heights = []
    for line in final_lines:
        line_bbox = draw.textbbox((0, 0), line, font=final_font)
        line_height = line_bbox[3] - line_bbox[1]
        line_heights.append(line_height)
        total_text_height += line_height
    if len(final_lines) > 1:
        total_text_height += (len(final_lines) - 1) * line_spacing

    while total_text_height > target_height and font_size > 10:
        font_size -= 2
        final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
        total_text_height = 0
        line_heights = []
        for line in final_lines:
            line_bbox = draw.textbbox((0, 0), line, font=final_font)
            line_height = line_bbox[3] - line_bbox[1]
            line_heights.append(line_height)
            total_text_height += line_height
        if len(final_lines) > 1:
            total_text_height += (len(final_lines) - 1) * line_spacing

    # Center the block vertically in the erase box.
    y_start = erase_box[1] + (target_height - total_text_height) / 2

    # 6. Draw each line centered horizontally, with a 1px black drop shadow
    #    under white text for readability on any background.
    current_y = y_start
    for i, line in enumerate(final_lines):
        line_width = draw.textlength(line, font=final_font)
        x_center = erase_box[0] + (erase_box[2] - erase_box[0]) / 2
        line_y_center = current_y + line_heights[i] / 2
        draw.text(
            (x_center + 1, line_y_center + 1),
            line,
            font=final_font,
            fill=(0, 0, 0),  # black shadow
            anchor="mm"
        )
        draw.text(
            (x_center, line_y_center),
            line,
            font=final_font,
            fill=(255, 255, 255),  # white text
            anchor="mm"
        )
        current_y += line_heights[i] + line_spacing

    return image_copy


# --- GRADIO INTERFACE ---
with gr.Blocks(title="Quote OCR Translator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📝 Quote Image Translator")
    gr.Markdown("Upload an image with English text. See the Persian translation overlaid directly on the image!")

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(
                label="Upload Quote Image",
                type="pil",
                sources=["upload", "clipboard"]
            )
            text_output = gr.Textbox(
                label="Extracted Quote Text (English)",
                placeholder="The detected English text will appear here...",
                lines=4,
                show_copy_button=True
            )
            translated_output = gr.Textbox(
                label="Translated Quote (Persian)",
                placeholder="The Persian translation will appear here...",
                lines=4,
                show_copy_button=True
            )
        with gr.Column(scale=1):
            image_output = gr.Image(
                label="Translated Image Output",
                type="pil"
            )

    def process_and_overlay(image):
        """Full pipeline: OCR -> translate -> overlay, with error pass-through."""
        if image is None:
            return "Please upload an image.", "Translation will appear here.", None
        extracted_text, bbox = extract_text_and_bbox(image)
        if bbox is None:
            return extracted_text, "No text to translate.", None
        translated_text = translate_text_gemini(extracted_text)
        if "Error" in translated_text:
            return extracted_text, translated_text, None
        final_image = overlay_text_on_image(image, translated_text, bbox)
        return extracted_text, translated_text, final_image

    image_input.change(
        fn=process_and_overlay,
        inputs=[image_input],
        outputs=[text_output, translated_output, image_output]
    )

    gr.Markdown("""
    ### 💡 How It Works:
    1. **Text Detection:** The app uses OCR to find English text and its location.
    2. **Inpainting:** It digitally "erases" the original text.
    3. **Translation:** The text is sent to an AI for Persian translation.
    4. **Overlay:** The Persian text is rendered and placed back onto the image.
    """)

if __name__ == "__main__":
    demo.launch()