Spaces:

Amandeep01
/

Signboard_Overlay_Project

Sleeping

App Files Files Community

Amandeep01 committed on May 12, 2025

Commit

3321e09

verified ·

1 Parent(s): 2ba73b6

Update app.py

Browse files

Files changed (1) hide show

app.py +217 -130

app.py CHANGED Viewed

@@ -1,166 +1,253 @@
 import gradio as gr
-import easyocr
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
-from transformers import MarianMTModel, MarianTokenizer
-import time
-import logging
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Load OCR Reader (CPU only)
-try:
-    ocr_reader = easyocr.Reader(['hi', 'mr', 'en'], gpu=False)
-except Exception as e:
-    logger.error(f"Error loading OCR Reader: {e}")
-    ocr_reader = None
-# Translator Cache
-tokenizers = {}
-models = {}
-# Translation function
-def translate_text_batch(texts, src_lang, tgt_lang):
-    try:
-        model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
-        # Check if model is in cache
-        if model_name not in models:
-            tokenizer = MarianTokenizer.from_pretrained(model_name)
-            model = MarianMTModel.from_pretrained(model_name)
-            tokenizers[model_name] = tokenizer
-            models[model_name] = model
-        else:
-            tokenizer = tokenizers[model_name]
-            model = models[model_name]
-        # Ensure texts is a list
-        if isinstance(texts, str):
-            texts = [texts]
-        # Process all texts at once
-        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
-        translated = model.generate(**inputs)
-        translated_texts = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
-        return translated_texts
-    except Exception as e:
-        logger.error(f"Translation error: {e}")
-        return [f"Translation failed: {e}" for _ in texts]
-# Overlay text on image
-def overlay_text_on_image(image_np, results, translated_texts):
-    try:
-        pil_img = Image.fromarray(image_np)
-        draw = ImageDraw.Draw(pil_img)
-        # Fallback font handling
-        try:
-            font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
-            font = ImageFont.truetype(font_path, 24)
-        except IOError:
-            # Fallback to default font if specific font not found
-            font = ImageFont.load_default()
-        # Validate inputs
-        if len(results) != len(translated_texts):
-            logger.warning("Mismatch between OCR results and translated texts")
-            return image_np
-        for ((bbox, text), translated) in zip(results, translated_texts):
-            # Ensure bbox coordinates are valid
-            try:
-                top_left = tuple(map(int, bbox[0]))
-                bottom_right = tuple(map(int, bbox[2]))
-                # Draw bounding box
-                draw.rectangle([top_left, bottom_right], outline="red", width=2)
-                # Draw translated text above the bounding box
-                text_position = (top_left[0], max(0, top_left[1] - 30))
-                draw.text(text_position, translated, fill="yellow", font=font)
-            except Exception as e:
-                logger.error(f"Error drawing text for {text}: {e}")
-        return np.array(pil_img)
-    except Exception as e:
-        logger.error(f"Overlay text error: {e}")
-        return image_np
-# Main function
-def process_image(image, target_lang):
-    try:
-        # Validate inputs
-        if image is None:
-            return "Please upload an image."
-        start_time = time.time()
-        # Convert image to numpy array
-        image_np = np.array(image)
-        # Perform OCR
-        results = ocr_reader.readtext(image_np)
-        if not results:
-            return "No text detected in the image."
-        # Extract texts from OCR results
-        texts = [item[1] for item in results]
-        # Detect source language (assume first result's language)
-        src_lang = 'en'  # Default to English
-        if results and len(results[0]) > 2:
-            detected_lang = results[0][2]
-            if detected_lang in ['hi', 'en', 'mr', 'fr', 'de', 'es']:
-                src_lang = detected_lang[:2]
-        # Translate texts
-        translated_texts = translate_text_batch(texts, src_lang, target_lang)
-        # Overlay translated text
-        overlaid_image = overlay_text_on_image(image_np,
-                                               [(r[0], r[1]) for r in results],
-                                               translated_texts)
-        # Calculate and log processing time
-        end_time = time.time()
-        logger.info(f"Processing time: {end_time - start_time} seconds")
-        return overlaid_image
-    except Exception as e:
-        logger.error(f"Process image error: {e}")
-        return f"An error occurred: {str(e)}"
-# Gradio UI
-def interface():
     with gr.Blocks() as demo:
-        gr.Markdown("# 🌍 TravelOCR: Multilingual Signboard Reader + Translator")
-        gr.Markdown("Upload a signboard image in any language and translate it!")
         with gr.Row():
             image_input = gr.Image(type="pil", label="Upload Signboard Image")
             lang_dropdown = gr.Dropdown(
                 label="Target Language",
-                choices=["en", "hi", "fr", "de", "es"],
-                value="en"
             )
-            translate_btn = gr.Button("Translate & Overlay")
-            output_img = gr.Image(type="numpy", label="Translated Output")
         translate_btn.click(
-            fn=process_image,
             inputs=[image_input, lang_dropdown],
             outputs=output_img
         )
     return demo
-# Create and launch the app
-demo = interface()
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
+import cv2
 import numpy as np
+import pytesseract
 from PIL import Image, ImageDraw, ImageFont
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import re
+class UltimateTravelOCR:
+    """OCR a signboard image, translate each detected line and draw the result.
+    Pipeline: ``preprocess_image`` -> ``extract_text`` / ``get_text_regions``
+    -> ``translate_text`` -> ``overlay_translations``. ``process_image`` runs
+    the whole pipeline and is the callback wired into the Gradio UI.
+    """
+    # Tesseract does not report a per-line language, so OCR output is treated
+    # as English; this is also the source side of the translation model name.
+    SOURCE_LANG = 'en'
+    def __init__(self):
+        # --oem 3: default engine selection; --psm 6: treat the image as a
+        # single uniform block of text; -l eng+hin: English + Hindi models.
+        self.tesseract_config = r'--oem 3 --psm 6 -l eng+hin'
+        # Per-instance caches keyed by "<src>-<tgt>" so each model is loaded once.
+        self.translation_models = {}
+        self.translation_tokenizers = {}
+    def preprocess_image(self, image):
+        """Binarise and denoise an image so Tesseract sees cleaner glyphs.
+        Args:
+            image: numpy array in PIL channel order - (H, W) grayscale,
+                (H, W, 3) RGB or (H, W, 4) RGBA.
+        Returns:
+            Single-channel array after adaptive thresholding and denoising.
+        """
+        # Arrays built from PIL images are RGB(A), not OpenCV's BGR. Using the
+        # BGR constant swaps the red/blue luminance weights, and cvtColor
+        # rejects 2-D input outright, so pick the conversion from the shape.
+        if image.ndim == 2:
+            gray = image
+        elif image.shape[2] == 4:
+            gray = cv2.cvtColor(image, cv2.COLOR_RGBA2GRAY)
+        else:
+            gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+        # Local (11x11 neighbourhood) threshold copes with uneven lighting.
+        thresh = cv2.adaptiveThreshold(
+            gray, 255,
+            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY, 11, 2
+        )
+        return cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)
+    @staticmethod
+    def _clean_text(txt):
+        """Drop punctuation/symbols and collapse runs of whitespace.
+        Combining marks (Unicode categories Mn/Mc/Me) are kept. The previous
+        ``[^\\w\\s]`` regex removed them because ``\\w`` only matches
+        alphanumerics and ``_``, which mangled Devanagari words by stripping
+        their vowel signs and virama.
+        """
+        import unicodedata  # local import: not among the module-level imports
+        kept = [
+            ch for ch in txt
+            if ch.isalnum() or ch == '_' or ch.isspace()
+            or unicodedata.category(ch).startswith('M')
+        ]
+        return ' '.join(''.join(kept).split())
+    def extract_text(self, preprocessed_image):
+        """Run Tesseract and return the non-empty, cleaned lines in OCR order.
+        Args:
+            preprocessed_image: image accepted by ``pytesseract.image_to_string``.
+        Returns:
+            list[str] of cleaned lines; empty when nothing was recognised.
+        """
+        text = pytesseract.image_to_string(
+            preprocessed_image,
+            config=self.tesseract_config
+        )
+        # Clean each line exactly once, then discard the ones that end up empty.
+        cleaned = (self._clean_text(line) for line in text.split('\n'))
+        return [line for line in cleaned if line]
+    def get_text_regions(self, preprocessed_image):
+        """Return candidate text boxes as ``(x, y, w, h)`` in reading order.
+        Args:
+            preprocessed_image: single-channel binary image.
+        Returns:
+            list of ``(x, y, w, h)`` tuples sorted top-to-bottom, then
+            left-to-right.
+        """
+        # NOTE(review): findContours treats non-zero pixels as foreground; with
+        # THRESH_BINARY dark text on a light sign is the zero side, so these
+        # contours may outline background blobs - confirm the intended polarity.
+        contours, _ = cv2.findContours(
+            preprocessed_image,
+            cv2.RETR_EXTERNAL,
+            cv2.CHAIN_APPROX_SIMPLE
+        )
+        # Area window discards speckle noise and page-sized blobs.
+        text_regions = [
+            cv2.boundingRect(contour)
+            for contour in contours
+            if 100 < cv2.contourArea(contour) < 10000
+        ]
+        # Callers pair regions with OCR lines by index, and OCR lines come in
+        # reading order, so the regions must be ordered the same way.
+        text_regions.sort(key=lambda region: (region[1], region[0]))
+        return text_regions
+    def _load_translation_model(self, src_lang, tgt_lang):
+        """Return the cached ``(model, tokenizer)`` for a language pair.
+        Loads ``Helsinki-NLP/opus-mt-<src>-<tgt>`` on first use. Returns
+        ``(None, None)`` if loading fails; failures are not cached.
+        """
+        model_key = f"{src_lang}-{tgt_lang}"
+        if model_key not in self.translation_models:
+            try:
+                model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
+                tokenizer = AutoTokenizer.from_pretrained(model_name)
+                model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+            except Exception as e:
+                print(f"Translation model loading error: {e}")
+                return None, None
+            self.translation_models[model_key] = model
+            self.translation_tokenizers[model_key] = tokenizer
+        return self.translation_models[model_key], self.translation_tokenizers[model_key]
+    def translate_text(self, text, target_lang):
+        """Translate ``text`` from ``SOURCE_LANG`` into ``target_lang``.
+        Best effort: the input is returned unchanged when it is blank, when
+        the target equals the source language, when no model can be loaded,
+        or when generation raises.
+        """
+        src_lang = self.SOURCE_LANG
+        # There is no opus-mt-en-en model; without this guard every call with
+        # target 'en' retried (and failed) a model download.
+        if not text or not text.strip() or target_lang == src_lang:
+            return text
+        try:
+            model, tokenizer = self._load_translation_model(src_lang, target_lang)
+            if model is None or tokenizer is None:
+                return text
+            inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
+            # Inference only: skip autograd bookkeeping.
+            with torch.no_grad():
+                outputs = model.generate(**inputs)
+            return tokenizer.decode(outputs[0], skip_special_tokens=True)
+        except Exception as e:
+            print(f"Translation error for '{text}': {e}")
+            return text
+    def overlay_translations(self, original_image, preprocessed_image, text_regions, lines, target_lang):
+        """Draw a box per region and its translated line just above it.
+        Args:
+            original_image: numpy image to draw on (not modified in place).
+            preprocessed_image: unused; kept for interface compatibility.
+            text_regions: ``(x, y, w, h)`` boxes, paired with ``lines`` by index.
+            lines: recognised text lines.
+            target_lang: language code passed to ``translate_text``.
+        Returns:
+            RGB numpy array with the overlay drawn.
+        """
+        # Normalise to RGB so colour fills work for grayscale/RGBA input too.
+        pil_image = Image.fromarray(original_image).convert("RGB")
+        # "RGBA" draw mode blends fills with an alpha component into the RGB
+        # image; with the default mode the alpha value was ignored and the
+        # label background came out fully opaque.
+        draw = ImageDraw.Draw(pil_image, "RGBA")
+        # NOTE(review): DejaVu Sans appears to lack Devanagari glyphs, so Hindi
+        # output may render as boxes - confirm and bundle a suitable font.
+        try:
+            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 25)
+        except IOError:
+            font = ImageFont.load_default()
+        for (x, y, w, h), text in zip(text_regions, lines):
+            if not text.strip():
+                continue
+            translated_text = self.translate_text(text, target_lang)
+            draw.rectangle(
+                [x, y, x+w, y+h],
+                outline='red',
+                width=2
+            )
+            # Label sits above the box, clamped so it never leaves the image.
+            text_position = (x, max(0, y - 35))
+            text_bbox = draw.textbbox(text_position, translated_text, font=font)
+            draw.rectangle(
+                text_bbox,
+                fill=(0, 0, 0, 128)  # Semi-transparent black
+            )
+            draw.text(
+                text_position,
+                translated_text,
+                fill='white',
+                font=font
+            )
+        return np.array(pil_image)
+    def process_image(self, image, target_lang):
+        """Run the full OCR -> translate -> overlay pipeline (Gradio callback).
+        Args:
+            image: PIL image (or array-like) from the UI, or ``None``.
+            target_lang: language code selected in the UI.
+        Returns:
+            ``None`` when no image was given; otherwise a numpy image - the
+            annotated result, or the unmodified input when no text is found or
+            any pipeline step raises.
+        """
+        if image is None:
+            return None
+        # Converted before the try block so the error handler below can always
+        # return it; previously a failure here raised UnboundLocalError there.
+        original_image = np.array(image)
+        try:
+            preprocessed_image = self.preprocess_image(original_image)
+            lines = self.extract_text(preprocessed_image)
+            if not lines:
+                print("No text detected in the image.")
+                return original_image
+            text_regions = self.get_text_regions(preprocessed_image)
+            # Too few detected boxes: fall back to full-width 30px rows, one per line.
+            if len(text_regions) < len(lines):
+                text_regions = [(0, i*30, original_image.shape[1], 30) for i in range(len(lines))]
+            return self.overlay_translations(
+                original_image,
+                preprocessed_image,
+                text_regions[:len(lines)],
+                lines,
+                target_lang
+            )
+        except Exception as e:
+            print(f"Comprehensive processing error: {e}")
+            return original_image
+# Create global OCR translator instance.
+# A single module-level instance is bound to the UI callback below, so the
+# translation models cached on it are reused across calls.
+ocr_translator = UltimateTravelOCR()
+# Gradio Interface
+def create_interface():
+    """Build the Gradio Blocks UI and return it without launching.
+    Layout: a title, a row holding the image upload and the target-language
+    dropdown, then the translate button and the output image. Clicking the
+    button calls ``ocr_translator.process_image(image, target_lang)`` and
+    shows its return value in the output image.
+    """
     with gr.Blocks() as demo:
+        gr.Markdown("# 🌍 Ultimate TravelOCR: Multilingual Signboard Translator")
         with gr.Row():
             image_input = gr.Image(type="pil", label="Upload Signboard Image")
             lang_dropdown = gr.Dropdown(
                 label="Target Language",
+                # NOTE(review): the translator hard-codes 'en' as the source
+                # language, so choosing 'en' here asks for an en->en model -
+                # confirm whether 'en' should be offered as a target.
+                choices=['en', 'hi', 'fr', 'de', 'es'],
+                value="hi"
             )
+        translate_btn = gr.Button("Translate & Overlay")
+        output_img = gr.Image(label="Translated Output")
         translate_btn.click(
+            fn=ocr_translator.process_image,
             inputs=[image_input, lang_dropdown],
             outputs=output_img
         )
     return demo
+# Build the UI at import time so `demo` exists as a module attribute;
+# the server is only started when this file is run as a script.
+demo = create_interface()
 if __name__ == "__main__":
     demo.launch()