# Quote OCR Translator — Hugging Face Space app.
# Detects English text in an uploaded image, translates it to Persian,
# and renders the translation back onto the image.
import os
import time

import arabic_reshaper
import easyocr
import google.generativeai as genai
import gradio as gr
import numpy as np
from bidi.algorithm import get_display
from PIL import Image, ImageDraw, ImageFont
# --- CONFIGURATION ---
# SECURITY: never commit API keys to source control. A previous revision
# shipped a literal Gemini key here — that key must be considered leaked and
# revoked. Load the secret from the environment instead (on Hugging Face
# Spaces: Settings -> Repository secrets -> GEMINI_API_KEY).
api_key = os.environ.get("GEMINI_API_KEY", "")

# TTF font with Persian/Arabic glyph coverage, bundled alongside this file.
PERSIAN_FONT_PATH = "vazir.ttf"
# --- GLOBAL INITIALIZATION ---
reader = None  # lazily constructed EasyOCR reader (model load is slow)


def initialize_reader():
    """Return the shared EasyOCR reader, creating it on first use.

    Loading the OCR model is expensive, so the instance is cached in the
    module-level ``reader`` global and reused across all requests.
    """
    global reader
    if reader is not None:
        return reader
    print("Loading EasyOCR model...")
    reader = easyocr.Reader(['en'], gpu=False, verbose=False)
    print("EasyOCR model loaded successfully!")
    return reader
# --- CORE FUNCTIONS ---
def extract_text_and_bbox(image):
    """
    Extracts text and calculates a single consolidated bounding box for all text found.

    Args:
        image: A PIL image, or None.

    Returns:
        (text, bbox) where text is all detected fragments joined by spaces and
        bbox is (min_x, min_y, max_x, max_y) in pixels; or (message, None)
        when there is no image, no detected text, or an OCR error.
    """
    if image is None:
        return "Please upload an image first.", None
    try:
        reader = initialize_reader()
        img_array = np.array(image)
        results = reader.readtext(img_array)
        if not results:
            return "No text detected in the image.", None
        min_x = min_y = float('inf')
        max_x = max_y = float('-inf')
        text_parts = []
        for (bbox, text, prob) in results:
            text_parts.append(text)
            # EasyOCR boxes are arbitrary quadrilaterals. Take the extremes
            # over ALL four corners — the previous version sampled only two
            # corners per edge, which under-covers rotated/skewed text.
            for (x, y) in bbox:
                min_x = min(min_x, x)
                min_y = min(min_y, y)
                max_x = max(max_x, x)
                max_y = max(max_y, y)
        extracted_text = ' '.join(text_parts)
        consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
        return extracted_text, consolidated_bbox
    except Exception as e:
        # Surface OCR failures to the UI as a message rather than crashing.
        return f"Error processing image: {str(e)}", None
def translate_text_gemini(text):
    """Translate English *text* into colloquial Persian via the Gemini API.

    Upstream sentinel messages ("No text...", "Error...", "Please upload...")
    are short-circuited rather than sent to the model. Returns the translated
    string, or an error message on API failure.
    """
    skip_markers = ("No text", "Error", "Please upload")
    if not text or any(marker in text for marker in skip_markers):
        return "No valid text to translate."
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')
        prompt = (
            f"Translate the following English text to Persian. Your translation should be natural, touching, and relatable, "
            f"like casual chats with a friend—short and heartfelt. Use colloquial Persian words and contractions where appropriate. "
            f"Do not add any extra explanations, greetings, or emojis. Output ONLY the Persian translation. "
            f"English text: [{text}]"
        )
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        return f"Error during translation: {str(e)}"
# --- CORRECTED IMAGE OVERLAY FUNCTION ---
def overlay_text_on_image(original_image, text_to_overlay, bbox):
    """
    Overlays Persian text onto an image, erasing the content within the given bounding box.
    Fixed to properly handle RTL text rendering like the working example.

    Args:
        original_image: PIL image to draw over (a copy is modified and returned).
        text_to_overlay: Persian text; explicit line breaks split it into lines.
        bbox: (min_x, min_y, max_x, max_y) pixel box of the original text.

    Returns:
        A new PIL image with the box erased and the Persian text centered in it.
    """
    image_copy = original_image.copy()
    draw = ImageDraw.Draw(image_copy)
    # 1. Erase the old text (Inpainting)
    padding = 10
    erase_box = (bbox[0] - padding, bbox[1] - padding, bbox[2] + padding, bbox[3] + padding)
    try:
        # Approximate the background color by sampling one pixel just left of
        # the erase box, vertically centered; clamp the point into bounds.
        sample_x = max(0, int(erase_box[0]) - 5)
        sample_y = int((erase_box[1] + erase_box[3]) / 2)
        sample_x = min(sample_x, image_copy.width - 1)
        sample_y = min(sample_y, image_copy.height - 1)
        bg_color = image_copy.getpixel((sample_x, sample_y))
    except (ValueError, IndexError):
        bg_color = (0, 0, 0)  # sampling failed; fall back to black
    draw.rectangle(erase_box, fill=bg_color)
    # 2. Text processing following the working pattern
    target_width = (erase_box[2] - erase_box[0]) * 0.90  # 90% like in working code
    target_height = erase_box[3] - erase_box[1]
    # Split text into lines (or words if needed for wrapping)
    lines = [line.strip() for line in text_to_overlay.split('\n') if line.strip()]
    if not lines:
        lines = [text_to_overlay]  # Single line if no newlines
    # **KEY FIX**: Reshape ALL lines first, then apply get_display()
    # (arabic_reshaper joins letters into contextual glyph forms; get_display
    # reorders them for right-to-left rendering).
    reshaped_lines = []
    for line in lines:
        reshaped = arabic_reshaper.reshape(line)
        display_text = get_display(reshaped)  # This was missing!
        reshaped_lines.append(display_text)
    # 3. Find optimal font size
    font_size = 100
    final_font = None
    # Find the longest line for font sizing (like in working code)
    if reshaped_lines:
        temp_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
        longest_line = max(reshaped_lines, key=lambda line: draw.textlength(line, font=temp_font))
        # Reduce font size until longest line fits
        while font_size > 10:
            font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
            if draw.textlength(longest_line, font=font) <= target_width:
                final_font = font
                break
            font_size -= 2
    if final_font is None:
        final_font = ImageFont.truetype(PERSIAN_FONT_PATH, 12)  # minimum usable size
    # 4. Handle line wrapping if text is too wide
    final_lines = []
    for line in reshaped_lines:
        if draw.textlength(line, font=final_font) <= target_width:
            final_lines.append(line)
        else:
            # Need to wrap this line - split by words and rewrap.
            # NOTE(review): .index() returns the FIRST match, so a duplicated
            # line would map back to the wrong original — confirm inputs never
            # repeat a line before relying on this.
            original_line = lines[reshaped_lines.index(line)]  # Get original before reshaping
            words = original_line.split()
            current_line_words = []
            for word in words:
                test_words = current_line_words + [word]
                test_text = ' '.join(test_words)
                # Process the test text properly (reshape + bidi each attempt,
                # since glyph joining changes with word boundaries)
                test_reshaped = arabic_reshaper.reshape(test_text)
                test_display = get_display(test_reshaped)
                if draw.textlength(test_display, font=final_font) <= target_width:
                    current_line_words = test_words
                else:
                    # Line is full, save current line and start new one
                    if current_line_words:
                        line_text = ' '.join(current_line_words)
                        line_reshaped = arabic_reshaper.reshape(line_text)
                        line_display = get_display(line_reshaped)
                        final_lines.append(line_display)
                    current_line_words = [word]
            # Add remaining words
            if current_line_words:
                line_text = ' '.join(current_line_words)
                line_reshaped = arabic_reshaper.reshape(line_text)
                line_display = get_display(line_reshaped)
                final_lines.append(line_display)
    # 5. Calculate total height and center text (following working pattern)
    line_spacing = 20  # Same as working code
    total_text_height = 0
    line_heights = []
    for line in final_lines:
        line_bbox = draw.textbbox((0, 0), line, font=final_font)
        line_height = line_bbox[3] - line_bbox[1]
        line_heights.append(line_height)
        total_text_height += line_height
    # Add spacing between lines
    if len(final_lines) > 1:
        total_text_height += (len(final_lines) - 1) * line_spacing
    # Check if total height fits, if not reduce font size
    while total_text_height > target_height and font_size > 10:
        font_size -= 2
        final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
        # Recalculate heights at the smaller size
        total_text_height = 0
        line_heights = []
        for line in final_lines:
            line_bbox = draw.textbbox((0, 0), line, font=final_font)
            line_height = line_bbox[3] - line_bbox[1]
            line_heights.append(line_height)
            total_text_height += line_height
        if len(final_lines) > 1:
            total_text_height += (len(final_lines) - 1) * line_spacing
    # Center vertically in the erase box
    y_start = erase_box[1] + (target_height - total_text_height) / 2
    # 6. Draw the text (following working pattern)
    current_y = y_start
    for i, line in enumerate(final_lines):
        # Center horizontally
        line_width = draw.textlength(line, font=final_font)  # (unused; anchor="mm" centers)
        x_center = erase_box[0] + (erase_box[2] - erase_box[0]) / 2
        line_y_center = current_y + line_heights[i] / 2
        # Draw shadow for visibility (1px offset behind the main text)
        draw.text(
            (x_center + 1, line_y_center + 1),
            line,
            font=final_font,
            fill=(0, 0, 0),  # Black shadow
            anchor="mm"
        )
        # Draw main text
        draw.text(
            (x_center, line_y_center),
            line,
            font=final_font,
            fill=(255, 255, 255),  # White text
            anchor="mm"
        )
        current_y += line_heights[i] + line_spacing
    return image_copy
# --- GRADIO INTERFACE ---
with gr.Blocks(title="Quote OCR Translator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📝 Quote Image Translator")
    gr.Markdown("Upload an image with English text. See the Persian translation overlaid directly on the image!")
    with gr.Row():
        # Left column: input image plus the two text panes.
        with gr.Column(scale=1):
            image_input = gr.Image(
                label="Upload Quote Image",
                type="pil",
                sources=["upload", "clipboard"]
            )
            text_output = gr.Textbox(
                label="Extracted Quote Text (English)",
                placeholder="The detected English text will appear here...",
                lines=4,
                show_copy_button=True
            )
            translated_output = gr.Textbox(
                label="Translated Quote (Persian)",
                placeholder="The Persian translation will appear here...",
                lines=4,
                show_copy_button=True
            )
        # Right column: the final rendered image.
        with gr.Column(scale=1):
            image_output = gr.Image(
                label="Translated Image Output",
                type="pil"
            )

    def process_and_overlay(image):
        """End-to-end pipeline: OCR -> translate -> overlay, with early exits
        on a missing image, no detected text, or a translation error."""
        if image is None:
            return "Please upload an image.", "Translation will appear here.", None
        extracted_text, bbox = extract_text_and_bbox(image)
        if bbox is None:
            # bbox is None for both "no text found" and OCR errors;
            # extracted_text carries the human-readable message either way.
            return extracted_text, "No text to translate.", None
        translated_text = translate_text_gemini(extracted_text)
        if "Error" in translated_text:
            return extracted_text, translated_text, None
        final_image = overlay_text_on_image(image, translated_text, bbox)
        return extracted_text, translated_text, final_image

    # Re-run the whole pipeline whenever the uploaded image changes.
    image_input.change(
        fn=process_and_overlay,
        inputs=[image_input],
        outputs=[text_output, translated_output, image_output]
    )
    gr.Markdown("""
    ### 💡 How It Works:
    1. **Text Detection:** The app uses OCR to find English text and its location.
    2. **Inpainting:** It digitally "erases" the original text.
    3. **Translation:** The text is sent to an AI for Persian translation.
    4. **Overlay:** The Persian text is rendered and placed back onto the image.
    """)

if __name__ == "__main__":
    demo.launch()
|