Update app.py
app.py CHANGED
@@ -1,15 +1,17 @@
 from transformers import BlipProcessor, BlipForConditionalGeneration, MarianMTModel, MarianTokenizer
+from ultralytics import YOLO
 import torch
 import gradio as gr
 from PIL import Image
 from collections import deque
+import numpy as np
 
 # Load main BLIP model for English captioning
 processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 
-# Load YOLOv5 small model for object detection
-detect_model =
+# Load YOLOv5 small model for object detection using ultralytics package
+detect_model = YOLO('yolov5s.pt')
 
 # Setup MarianMT translation models cache for multilingual captions
 translation_models = {
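With the ultralytics package, YOLO('yolov5s.pt') fetches and caches the weights automatically on first use, so no manual download step is needed. A minimal standalone sanity check, as a sketch; the sample.jpg path is a placeholder, not part of the app:

from ultralytics import YOLO
from PIL import Image

model = YOLO('yolov5s.pt')          # weights are downloaded on first run
img = Image.open('sample.jpg')      # hypothetical local test image
results = model(img)                # ultralytics accepts PIL images directly
print(results[0].boxes.data.shape)  # (N, 6): x1, y1, x2, y2, confidence, class id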
@@ -23,13 +25,16 @@ translation_cache = {}
 def get_translation_model(lang_code):
     if lang_code not in translation_cache:
         model_name, _ = translation_models[lang_code]
-        tokenizer = MarianTokenizer.from_pretrained(model_name)
-        model = MarianMTModel.from_pretrained(model_name)
-        translation_cache[lang_code] = (tokenizer, model)
+        if model_name:
+            tokenizer = MarianTokenizer.from_pretrained(model_name)
+            model = MarianMTModel.from_pretrained(model_name)
+            translation_cache[lang_code] = (tokenizer, model)
+        else:
+            translation_cache[lang_code] = None
     return translation_cache[lang_code]
 
 def translate_caption(caption, target_lang):
-    if target_lang == "English":
+    if target_lang == "English" or translation_cache.get(target_lang) is None:
         return caption
     tokenizer, model = get_translation_model(target_lang)
     batch = tokenizer([caption], return_tensors="pt")
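One caveat with the new guard in translate_caption: translation_cache.get(target_lang) is consulted before get_translation_model has ever run for that language, so with an empty cache every non-English request returns the English caption, and the loader is never reached on later calls either. A sketch of a reordering that avoids this; the generate/decode lines mirror what the unchanged tail of the function presumably does:

def translate_caption(caption, target_lang):
    if target_lang == "English":
        return caption
    pair = get_translation_model(target_lang)  # populate the cache first
    if pair is None:                           # language with no MarianMT model
        return caption
    tokenizer, model = pair
    batch = tokenizer([caption], return_tensors="pt")
    generated = model.generate(**batch)        # assumed to match the original decoding
    return tokenizer.decode(generated[0], skip_special_tokens=True)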
@@ -48,10 +53,15 @@ def preprocess_image(image):
     return image
 
 def detect_objects(image):
-
-
-
-
+    img_np = np.array(image)
+    results = detect_model(img_np)
+    detected_objs = set()
+    for r in results:
+        for box in r.boxes.data.tolist():
+            class_id = int(box[-1])
+            label = detect_model.names[class_id]
+            detected_objs.add(label)
+    return list(detected_objs)
 
 def generate_caption(image, language):
     image = preprocess_image(image)
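Each row of r.boxes.data is [x1, y1, x2, y2, confidence, class_id], so box[-1] is the class index. The new loop keeps every detection regardless of confidence; a sketch of a variant that drops low-confidence boxes, where the 0.5 threshold is an assumption rather than anything in the diff:

def detect_objects_confident(image, threshold=0.5):
    results = detect_model(np.array(image))
    labels = set()
    for r in results:
        for *_, conf, cls in r.boxes.data.tolist():  # x1, y1, x2, y2, conf, cls
            if conf >= threshold:
                labels.add(detect_model.names[int(cls)])
    return sorted(labels)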
@@ -65,20 +75,12 @@ def generate_caption(image, language):
     last_images.append(image)
     last_captions.append(caption_translated)
 
-    # Format detected objects tags as comma-separated list
     tags = ", ".join(detected_objs) if detected_objs else "None"
-
-    # Prepare last images gallery (thumbnails and captions)
     gallery = [(img, cap) for img, cap in zip(list(last_images), list(last_captions))]
-
+
     result_text = f"Detected objects: {tags}\nCaption ({language}): {caption_translated}"
     return result_text, gallery
 
-# Gradio gallery components expect images as PIL Images or URLs, captions as texts
-def gallery_to_components(gallery):
-    images, captions = zip(*gallery) if gallery else ([], [])
-    return images, captions
-
 with gr.Blocks() as iface:
     gr.Markdown("# Image Captioning with Object Detection & Multilingual Support")
 
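Dropping gallery_to_components is consistent with Gradio's API: gr.Gallery accepts a list of (image, caption) pairs directly, which is exactly what generate_caption returns. The last_images and last_captions buffers live outside this hunk; a sketch of what their setup presumably looks like, where the maxlen of 5 is an assumption:

from collections import deque

last_images = deque(maxlen=5)    # most recent input images, oldest evicted first
last_captions = deque(maxlen=5)  # captions aligned one-to-one with last_images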
@@ -107,4 +109,5 @@ with gr.Blocks() as iface:
     outputs=[caption_output, gallery]
 )
 
-iface.launch()
+if __name__ == "__main__":
+    iface.launch()
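The __main__ guard lets app.py be imported (for example by tests) without starting the server. An equivalent entry point with an explicit bind address, as a sketch; server_name and server_port are real gradio launch arguments, but the values shown are assumptions and Spaces normally configures them itself:

if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)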