Update app.py
app.py CHANGED
@@ -1,39 +1,110 @@
-from transformers import BlipProcessor, BlipForConditionalGeneration
+from transformers import BlipProcessor, BlipForConditionalGeneration, MarianMTModel, MarianTokenizer
+import torch
 import gradio as gr
 from PIL import Image
+from collections import deque

-# Load
+# Load main BLIP model for English captioning
 processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

+# Load YOLOv5 small model for object detection (using torch hub)
+detect_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
+
+# Setup MarianMT translation models cache for multilingual captions
+translation_models = {
+    "English": None,
+    "French": ("Helsinki-NLP/opus-mt-en-fr", "Helsinki-NLP/opus-mt-fr-en"),
+    "Spanish": ("Helsinki-NLP/opus-mt-en-es", "Helsinki-NLP/opus-mt-es-en"),
+    "German": ("Helsinki-NLP/opus-mt-en-de", "Helsinki-NLP/opus-mt-de-en"),
+}
+translation_cache = {}
+
+def get_translation_model(lang_code):
+    if lang_code not in translation_cache:
+        model_name, _ = translation_models[lang_code]
+        tokenizer = MarianTokenizer.from_pretrained(model_name)
+        model = MarianMTModel.from_pretrained(model_name)
+        translation_cache[lang_code] = (tokenizer, model)
+    return translation_cache[lang_code]
+
+def translate_caption(caption, target_lang):
+    if target_lang == "English":
+        return caption
+    tokenizer, model = get_translation_model(target_lang)
+    batch = tokenizer([caption], return_tensors="pt")
+    gen = model.generate(**batch)
+    translated = tokenizer.decode(gen[0], skip_special_tokens=True)
+    return translated
+
+# Session memory for last 15 images and captions
+MEMORY_SIZE = 15
+last_images = deque([], maxlen=MEMORY_SIZE)
+last_captions = deque([], maxlen=MEMORY_SIZE)
+
 def preprocess_image(image):
     if image.mode != "RGB":
         image = image.convert("RGB")
     return image

-def …
[removed old lines 15–16 are truncated in this view]
+def detect_objects(image):
+    results = detect_model(image)
+    detected_labels = results.names
+    objs = [detected_labels[int(x)] for x in results.xyxy[0][:, -1]]
+    return list(set(objs))  # unique labels

+def generate_caption(image, language):
     image = preprocess_image(image)
     inputs = processor(image, return_tensors="pt")
-    out = model.generate(**inputs, max_length=…
[removed old lines 21–39 are truncated in this view]
+    out = model.generate(**inputs, max_length=30, num_beams=5, early_stopping=True)
+    caption_en = processor.decode(out[0], skip_special_tokens=True)
+    caption_translated = translate_caption(caption_en, language)
+    detected_objs = detect_objects(image)
+
+    # Update session memory
+    last_images.append(image)
+    last_captions.append(caption_translated)
+
+    # Format detected objects tags as comma-separated list
+    tags = ", ".join(detected_objs) if detected_objs else "None"
+
+    # Prepare last images gallery (thumbnails and captions)
+    gallery = [(img, cap) for img, cap in zip(list(last_images), list(last_captions))]
+
+    result_text = f"Detected objects: {tags}\nCaption ({language}): {caption_translated}"
+    return result_text, gallery
+
+# Gradio gallery components expect images as PIL Images or URLs, captions as texts
+def gallery_to_components(gallery):
+    images, captions = zip(*gallery) if gallery else ([], [])
+    return images, captions
+
+with gr.Blocks() as iface:
+    gr.Markdown("# Image Captioning with Object Detection & Multilingual Support")
+
+    language = gr.Dropdown(
+        label="Select Caption Language",
+        choices=["English", "French", "Spanish", "German"],
+        value="English"
+    )
+
+    image_input = gr.Image(type="pil", label="Upload Image")
+
+    caption_output = gr.Textbox(label="Caption and Detected Objects", lines=3, interactive=False)
+
+    gallery = gr.Gallery(label="Last 15 Images and Captions").style(columns=3, object_fit="contain", height="auto")
+
+    generate_btn = gr.Button("Generate Caption")
+
+    def on_generate(image, language):
+        if image is None:
+            return "Please upload an image.", []
+        return generate_caption(image, language)
+
+    generate_btn.click(
+        fn=on_generate,
+        inputs=[image_input, language],
+        outputs=[caption_output, gallery]
+    )
+
+iface.launch()
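
A compatibility note on the Gallery line above: gr.Gallery(...).style(...) follows the Gradio 3.x API, and the .style() method no longer exists in Gradio 4. A minimal sketch of the equivalent construction under Gradio 4, keeping the same label and layout options (the keyword names below assume Gradio 4's Gallery constructor):

# Gradio 4.x: the former .style() options move into the constructor.
gallery = gr.Gallery(
    label="Last 15 Images and Captions",
    columns=3,               # was .style(columns=3)
    object_fit="contain",    # was .style(object_fit="contain")
    height="auto",           # was .style(height="auto")
)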
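One behavioral note: detect_objects tags every box YOLOv5 returns, with no confidence cutoff, so weak detections can surface as tags. A sketch of a thresholded variant, assuming the same detect_model hub object as above (the 0.5 default is an illustrative choice, not something this commit sets):

def detect_objects_filtered(image, conf_threshold=0.5):
    # Same idea as detect_objects above, but drops low-confidence boxes.
    results = detect_model(image)
    names = results.names  # class index -> label
    objs = [
        names[int(cls)]
        # each row of results.xyxy[0] is [x1, y1, x2, y2, conf, cls]
        for *_box, conf, cls in results.xyxy[0].tolist()
        if conf >= conf_threshold
    ]
    return list(set(objs))  # unique labels only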