# Quote OCR Translator — Hugging Face Space application.
# (Scraped page header "Spaces: Sleeping" removed; it is not part of the program.)
import os
import time

import arabic_reshaper
import easyocr
import google.generativeai as genai
import gradio as gr
import numpy as np
from bidi.algorithm import get_display
from PIL import Image, ImageDraw, ImageFont
# --- CONFIGURATION ---
# Load the Gemini API key from the environment (set GEMINI_API_KEY as a
# secret in the Hugging Face Space settings). The previous revision embedded
# a literal API key in source control; that key must be treated as
# compromised and rotated. Never hard-code credentials.
api_key = os.environ.get("GEMINI_API_KEY", "")

# Path to a TrueType font with Persian glyph coverage, used when rendering
# the translated text onto the image.
PERSIAN_FONT_PATH = "vazir.ttf"
# --- GLOBAL INITIALIZATION ---
# The EasyOCR model is expensive to load, so it is constructed lazily and
# cached in this module-level variable.
reader = None

def initialize_reader():
    """Return the shared EasyOCR reader, creating it on first use."""
    global reader
    if reader is not None:
        return reader
    print("Loading EasyOCR model...")
    reader = easyocr.Reader(['en'], gpu=False, verbose=False)
    print("EasyOCR model loaded successfully!")
    return reader
# --- CORE FUNCTIONS ---
def extract_text_and_bbox(image):
    """Run OCR on *image* and merge every detection into one bounding box.

    Parameters:
        image: a PIL image (or None, in which case a prompt message is
            returned).

    Returns:
        (text, bbox) where *text* is all detected fragments joined with
        spaces and *bbox* is (min_x, min_y, max_x, max_y) as ints, or a
        human-readable message and None when nothing usable was found or an
        error occurred.
    """
    if image is None:
        return "Please upload an image first.", None
    try:
        ocr = initialize_reader()
        results = ocr.readtext(np.array(image))
        if not results:
            return "No text detected in the image.", None
        text_parts = []
        xs, ys = [], []
        for (box, text, _prob) in results:
            text_parts.append(text)
            # EasyOCR returns a quadrilateral [tl, tr, br, bl]. The previous
            # version sampled only two corners per edge, which can miss the
            # extreme coordinate on skewed detections; taking min/max over
            # all four corners always yields a covering box.
            for (px, py) in box:
                xs.append(px)
                ys.append(py)
        extracted_text = ' '.join(text_parts)
        consolidated_bbox = (int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys)))
        return extracted_text, consolidated_bbox
    except Exception as e:
        return f"Error processing image: {str(e)}", None
def translate_text_gemini(text):
    """Translate English *text* to colloquial Persian via the Gemini API.

    Returns the translation, or an explanatory message when *text* is one of
    the sentinel strings produced upstream or the API call fails.
    """
    # Upstream stages signal failure through their return string; don't send
    # those sentinels off for translation.
    sentinels = ("No text", "Error", "Please upload")
    if not text or any(marker in text for marker in sentinels):
        return "No valid text to translate."
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')
        prompt = (f"Translate the following English text to Persian. Your translation should be natural, touching, and relatable, "
                  f"like casual chats with a friend—short and heartfelt. Use colloquial Persian words and contractions where appropriate. "
                  f"Do not add any extra explanations, greetings, or emojis. Output ONLY the Persian translation. "
                  f"English text: [{text}]")
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        return f"Error during translation: {str(e)}"
# --- IMAGE OVERLAY ---
def _shape_for_display(text):
    """Return *text* reshaped for Arabic script and bidi-reordered for rendering."""
    return get_display(arabic_reshaper.reshape(text))


def _wrap_line(original_line, draw, font, max_width):
    """Greedily wrap *original_line* into display-ready lines <= *max_width*.

    Wrapping operates on the logical-order words; each candidate line is
    reshaped before measuring so the compared width matches what will
    actually be drawn.
    """
    wrapped = []
    current_words = []
    for word in original_line.split():
        candidate = ' '.join(current_words + [word])
        if draw.textlength(_shape_for_display(candidate), font=font) <= max_width:
            current_words.append(word)
        else:
            if current_words:
                wrapped.append(_shape_for_display(' '.join(current_words)))
            # A single word wider than the box becomes its own line and
            # overflows horizontally rather than looping forever.
            current_words = [word]
    if current_words:
        wrapped.append(_shape_for_display(' '.join(current_words)))
    return wrapped


def _line_heights(draw, lines, font, spacing):
    """Return (per-line pixel heights, total stacked height incl. spacing)."""
    heights = []
    for line in lines:
        box = draw.textbbox((0, 0), line, font=font)
        heights.append(box[3] - box[1])
    total = sum(heights) + max(0, len(lines) - 1) * spacing
    return heights, total


def overlay_text_on_image(original_image, text_to_overlay, bbox):
    """Erase the region *bbox* on a copy of *original_image* and draw
    *text_to_overlay* (Persian, RTL) centered inside it.

    Parameters:
        original_image: PIL image; it is not modified.
        text_to_overlay: translated text; may contain newlines.
        bbox: (min_x, min_y, max_x, max_y) region of the original text.

    Returns:
        A new PIL image with the overlay applied.
    """
    image_copy = original_image.copy()
    draw = ImageDraw.Draw(image_copy)

    # 1. Erase the old text: paint a rectangle using a background color
    #    sampled just left of the text region (crude inpainting).
    padding = 10
    erase_box = (bbox[0] - padding, bbox[1] - padding, bbox[2] + padding, bbox[3] + padding)
    try:
        sample_x = max(0, int(erase_box[0]) - 5)
        sample_y = int((erase_box[1] + erase_box[3]) / 2)
        sample_x = min(sample_x, image_copy.width - 1)
        sample_y = min(sample_y, image_copy.height - 1)
        bg_color = image_copy.getpixel((sample_x, sample_y))
    except (ValueError, IndexError):
        bg_color = (0, 0, 0)
    draw.rectangle(erase_box, fill=bg_color)

    # 2. Split the text into logical lines, then reshape + bidi-reorder each.
    target_width = (erase_box[2] - erase_box[0]) * 0.90  # leave a 10% margin
    target_height = erase_box[3] - erase_box[1]
    lines = [line.strip() for line in text_to_overlay.split('\n') if line.strip()]
    if not lines:
        lines = [text_to_overlay]
    reshaped_lines = [_shape_for_display(line) for line in lines]

    # 3. Shrink the font until the widest line fits the target width.
    font_size = 100
    final_font = None
    temp_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
    longest_line = max(reshaped_lines, key=lambda line: draw.textlength(line, font=temp_font))
    while font_size > 10:
        font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
        if draw.textlength(longest_line, font=font) <= target_width:
            final_font = font
            break
        font_size -= 2
    if final_font is None:
        # Nothing fit even at the minimum search size. BUG FIX: keep
        # font_size in sync with the fallback font so the height check in
        # step 5 can still shrink it (previously font_size was left at 10,
        # silently disabling that loop).
        font_size = 12
        final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)

    # 4. Wrap any line that is still too wide. BUG FIX: the previous version
    #    recovered the source line via reshaped_lines.index(line), which
    #    returns the FIRST match and so picks the wrong line when two lines
    #    reshape identically; iterating the pairs together avoids that.
    final_lines = []
    for original_line, display_line in zip(lines, reshaped_lines):
        if draw.textlength(display_line, font=final_font) <= target_width:
            final_lines.append(display_line)
        else:
            final_lines.extend(_wrap_line(original_line, draw, final_font, target_width))

    # 5. Shrink further if the stacked lines are taller than the box.
    #    (Lines are intentionally not re-wrapped here, matching the original
    #    behavior.)
    line_spacing = 20
    line_heights, total_text_height = _line_heights(draw, final_lines, final_font, line_spacing)
    while total_text_height > target_height and font_size > 10:
        font_size -= 2
        final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
        line_heights, total_text_height = _line_heights(draw, final_lines, final_font, line_spacing)

    # 6. Draw each line centered in the erased region, with a 1px black
    #    shadow for contrast. NOTE(review): RGB fill tuples assume an RGB(A)
    #    image; grayscale/palette inputs may need conversion — confirm with
    #    callers.
    current_y = erase_box[1] + (target_height - total_text_height) / 2
    x_center = erase_box[0] + (erase_box[2] - erase_box[0]) / 2
    for line, height in zip(final_lines, line_heights):
        line_y_center = current_y + height / 2
        draw.text(
            (x_center + 1, line_y_center + 1),
            line,
            font=final_font,
            fill=(0, 0, 0),  # shadow
            anchor="mm"
        )
        draw.text(
            (x_center, line_y_center),
            line,
            font=final_font,
            fill=(255, 255, 255),  # main text
            anchor="mm"
        )
        current_y += height + line_spacing
    return image_copy
# --- GRADIO INTERFACE ---
# Statement order inside the Blocks context defines the on-screen layout,
# so components are declared in display order: header, a two-column row,
# then the explanatory footer.
with gr.Blocks(title="Quote OCR Translator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📝 Quote Image Translator")
    gr.Markdown("Upload an image with English text. See the Persian translation overlaid directly on the image!")
    with gr.Row():
        with gr.Column(scale=1):
            # Left column: image input plus the two extracted/translated
            # text panels.
            image_input = gr.Image(
                label="Upload Quote Image",
                type="pil",
                sources=["upload", "clipboard"]
            )
            text_output = gr.Textbox(
                label="Extracted Quote Text (English)",
                placeholder="The detected English text will appear here...",
                lines=4,
                show_copy_button=True
            )
            translated_output = gr.Textbox(
                label="Translated Quote (Persian)",
                placeholder="The Persian translation will appear here...",
                lines=4,
                show_copy_button=True
            )
        with gr.Column(scale=1):
            # Right column: the final image with the translation drawn on.
            image_output = gr.Image(
                label="Translated Image Output",
                type="pil"
            )
    def process_and_overlay(image):
        """Full pipeline for one upload: OCR -> translate -> overlay.

        Returns the three output values in component order:
        (extracted English text, Persian translation, overlaid image or None).
        """
        if image is None:
            return "Please upload an image.", "Translation will appear here.", None
        extracted_text, bbox = extract_text_and_bbox(image)
        if bbox is None:
            # OCR found nothing (or errored); extracted_text carries the message.
            return extracted_text, "No text to translate.", None
        translated_text = translate_text_gemini(extracted_text)
        if "Error" in translated_text:
            # Surface the translation failure without attempting the overlay.
            return extracted_text, translated_text, None
        final_image = overlay_text_on_image(image, translated_text, bbox)
        return extracted_text, translated_text, final_image
    # Re-run the whole pipeline whenever the uploaded image changes.
    image_input.change(
        fn=process_and_overlay,
        inputs=[image_input],
        outputs=[text_output, translated_output, image_output]
    )
    gr.Markdown("""
    ### 💡 How It Works:
    1. **Text Detection:** The app uses OCR to find English text and its location.
    2. **Inpainting:** It digitally "erases" the original text.
    3. **Translation:** The text is sent to an AI for Persian translation.
    4. **Overlay:** The Persian text is rendered and placed back onto the image.
    """)
if __name__ == "__main__":
    # Start the Gradio server when this file is executed directly.
    demo.launch()