Spaces:

scmlewis
/

image_captioning

Sleeping

App Files Files Community

scmlewis committed on Oct 19, 2025

Commit

98c6b6d

verified ·

1 Parent(s): 3fdf4eb

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -74

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from transformers import BlipProcessor, BlipForConditionalGeneration, MarianMTModel, MarianTokenizer
 from ultralytics import YOLO
 import torch
 import gradio as gr
@@ -6,47 +6,16 @@ from PIL import Image
 from collections import deque
 import numpy as np
-# Load BLIP model for English captioning
 processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-# Load YOLOv5 small model for detection
 detect_model = YOLO('yolov5s.pt')
-# Setup MarianMT translation models cache for multilingual captions
-translation_models = {
-    "English": None,
-    "French": ("Helsinki-NLP/opus-mt-en-fr", "Helsinki-NLP/opus-mt-fr-en"),
-    "Spanish": ("Helsinki-NLP/opus-mt-en-es", "Helsinki-NLP/opus-mt-es-en"),
-    "German": ("Helsinki-NLP/opus-mt-en-de", "Helsinki-NLP/opus-mt-de-en"),
-}
-translation_cache = {}
-def get_translation_model(lang_code):
-    if lang_code not in translation_cache:
-        model_name, _ = translation_models[lang_code]
-        if model_name:
-            tokenizer = MarianTokenizer.from_pretrained(model_name)
-            model = MarianMTModel.from_pretrained(model_name)
-            translation_cache[lang_code] = (tokenizer, model)
-        else:
-            translation_cache[lang_code] = None
-    return translation_cache[lang_code]
-def translate_caption(caption, target_lang):
-    if target_lang == "English" or translation_cache.get(target_lang) is None:
-        return caption
-    tokenizer, model = get_translation_model(target_lang)
-    batch = tokenizer([caption], return_tensors="pt")
-    gen = model.generate(**batch)
-    translated = tokenizer.decode(gen[0], skip_special_tokens=True)
-    return translated
 MEMORY_SIZE = 15
 last_images = deque([], maxlen=MEMORY_SIZE)
-last_captions = deque([], maxlen=MEMORY_SIZE)
-last_objects = deque([], maxlen=MEMORY_SIZE)
-last_languages = deque([], maxlen=MEMORY_SIZE)
 def preprocess_image(image):
     if image.mode != "RGB":
@@ -64,99 +33,84 @@ def detect_objects(image):
             detected_objs.add(label)
     return list(detected_objs)
-def generate_caption(image, language):
     image = preprocess_image(image)
     inputs = processor(image, return_tensors="pt")
     out = model.generate(**inputs, max_length=30, num_beams=5, early_stopping=True)
-    caption_en = processor.decode(out[0], skip_special_tokens=True)
-    caption_translated = translate_caption(caption_en, language)
     detected_objs = detect_objects(image)
     # Update session memory
     last_images.append(image)
-    last_captions.append(caption_translated)
-    last_objects.append(detected_objs)
-    last_languages.append(language)
-    tags = ", ".join(detected_objs) if detected_objs else "None"
-    return caption_translated, tags
 def build_history_ui():
-    # Build list of Gradio Rows containing image, caption textbox and copy button
     rows = []
     for i in range(len(last_images)):
         img = last_images[i]
-        cap = last_captions[i]
-        obj = last_objects[i]
-        lang = last_languages[i]
-        cap_box = gr.Textbox(value=cap, lines=2, interactive=True, show_label=False)
-        copy_btn = gr.Button("Copy Caption")
         def copy_fn(caption):
             return caption
-        # Bind copy button inside lambda to close over correct caption_box
         copy_btn.click(fn=copy_fn, inputs=cap_box, outputs=cap_box)
         row = gr.Row([
             gr.Image(value=img, interactive=False, show_label=False, elem_id=f"history_img_{i}"),
             gr.Column([
-                gr.Markdown(f"**Caption ({lang}):**"),
                 cap_box,
                 copy_btn,
-                gr.Markdown(f"**Detected Objects:** {', '.join(obj) if obj else 'None'}")
             ])
         ])
         rows.append(row)
     return rows
 with gr.Blocks() as iface:
-    gr.Markdown("# Image Captioning with Object Detection & Multilingual Support")
     gr.Markdown(
         """
-        Upload an image, select the caption language, then click 'Generate Caption'.
-        The app generates a caption and detected object tags.
-        Your last 15 images and captions are displayed below for easy copying and reference.
         """
     )
-    language = gr.Dropdown(
-        label="Select Caption Language",
-        choices=["English", "French", "Spanish", "German"],
-        value="English"
-    )
     with gr.Row():
         with gr.Column(scale=2):
             image_input = gr.Image(type="pil", label="Upload Image")
             generate_btn = gr.Button("Generate Caption")
         with gr.Column(scale=3):
-            caption_output = gr.Textbox(label="Caption", lines=3, interactive=True)
-            object_output = gr.Textbox(label="Detected Objects", lines=2, interactive=False)
-            copy_btn = gr.Button("Copy Caption Text")
     history_container = gr.Column()
-    def on_generate(image, language):
         if image is None:
-            return "Please upload an image.", "", []
-        caption, objects = generate_caption(image, language)
         history = build_history_ui()
-        return caption, objects, history
     def copy_text(text):
         return gr.Textbox.update(value=text, interactive=True)
     generate_btn.click(
         fn=on_generate,
-        inputs=[image_input, language],
-        outputs=[caption_output, object_output, history_container]
     )
-    copy_btn.click(fn=copy_text, inputs=caption_output, outputs=caption_output)
 if __name__ == "__main__":
     iface.launch()

+from transformers import BlipProcessor, BlipForConditionalGeneration
 from ultralytics import YOLO
 import torch
 import gradio as gr
 from collections import deque
 import numpy as np
+# Load BLIP model for image captioning
 processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+# Load YOLOv5 model for object detection
 detect_model = YOLO('yolov5s.pt')
 MEMORY_SIZE = 15
 last_images = deque([], maxlen=MEMORY_SIZE)
+last_texts = deque([], maxlen=MEMORY_SIZE)  # will store combined caption + detected objects
 def preprocess_image(image):
     if image.mode != "RGB":
             detected_objs.add(label)
     return list(detected_objs)
+def generate_caption_with_objects(image):
     image = preprocess_image(image)
     inputs = processor(image, return_tensors="pt")
     out = model.generate(**inputs, max_length=30, num_beams=5, early_stopping=True)
+    caption = processor.decode(out[0], skip_special_tokens=True)
     detected_objs = detect_objects(image)
+    tags = ", ".join(detected_objs) if detected_objs else "None"
+    combined_text = f"Detected objects: {tags}\nCaption: {caption}"
     # Update session memory
     last_images.append(image)
+    last_texts.append(combined_text)
+    return combined_text
 def build_history_ui():
     rows = []
     for i in range(len(last_images)):
         img = last_images[i]
+        text = last_texts[i]
+        cap_box = gr.Textbox(value=text, lines=3, interactive=True, show_label=False)
+        copy_btn = gr.Button("Copy Text")
         def copy_fn(caption):
             return caption
         copy_btn.click(fn=copy_fn, inputs=cap_box, outputs=cap_box)
         row = gr.Row([
             gr.Image(value=img, interactive=False, show_label=False, elem_id=f"history_img_{i}"),
             gr.Column([
                 cap_box,
                 copy_btn,
             ])
         ])
         rows.append(row)
     return rows
 with gr.Blocks() as iface:
+    gr.Markdown("# Image Captioning with Object Detection")
     gr.Markdown(
         """
+        Upload an image and click 'Generate Caption'.
+        The app will display detected objects and a caption together.
+        Your last 15 images and combined captions are shown below.
         """
     )
     with gr.Row():
         with gr.Column(scale=2):
             image_input = gr.Image(type="pil", label="Upload Image")
             generate_btn = gr.Button("Generate Caption")
         with gr.Column(scale=3):
+            output_box = gr.Textbox(label="Caption & Detected Objects", lines=6, interactive=True)
+            copy_btn = gr.Button("Copy Text")
     history_container = gr.Column()
+    def on_generate(image):
         if image is None:
+            return "Please upload an image.", []
+        combined_text = generate_caption_with_objects(image)
         history = build_history_ui()
+        return combined_text, history
     def copy_text(text):
         return gr.Textbox.update(value=text, interactive=True)
     generate_btn.click(
         fn=on_generate,
+        inputs=image_input,
+        outputs=[output_box, history_container],
     )
+    copy_btn.click(fn=copy_text, inputs=output_box, outputs=output_box)
 if __name__ == "__main__":
     iface.launch()