Spaces:

scmlewis
/

image_captioning

Sleeping

App Files Files Community

scmlewis committed on Oct 19, 2025

Commit

1f6a8b2

verified ·

1 Parent(s): 7223770

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -15

app.py CHANGED Viewed

@@ -6,11 +6,11 @@ from PIL import Image
 from collections import deque
 import numpy as np
-# Load BLIP model for English captioning
 processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-# Load YOLOv5 small model for detection
 detect_model = YOLO('yolov5s.pt')
 translation_models = {
@@ -71,7 +71,6 @@ def generate_caption(image, language):
     caption_translated = translate_caption(caption_en, language)
     detected_objs = detect_objects(image)
-    # Update session memory
     last_images.append(image)
     last_captions.append(caption_translated)
     last_objects.append(detected_objs)
@@ -82,7 +81,6 @@ def generate_caption(image, language):
     return caption_translated, tags
 def build_history_ui():
-    # Create a list of columns with image, caption textbox and copy button
     components = []
     for i in range(len(last_images)):
         img = last_images[i]
@@ -90,32 +88,35 @@ def build_history_ui():
         obj = last_objects[i]
         lang = last_languages[i]
         copy_btn = gr.Button("Copy Caption")
-        cap_box = gr.Textbox(value=cap, lines=2, interactive=True)
         def copy_fn(caption):
-            return caption  # No direct clipboard access, but textbox selectable
         copy_btn.click(fn=copy_fn, inputs=cap_box, outputs=cap_box)
-        components.append(gr.Row([
-            gr.Image(value=img, interactive=False, label=f"Image {i+1}", elem_id=f"img_{i}"),
             gr.Column([
                 gr.Markdown(f"**Caption ({lang}):**"),
                 cap_box,
                 copy_btn,
                 gr.Markdown(f"**Detected Objects:** {', '.join(obj) if obj else 'None'}")
             ])
-        ]))
     return components
 with gr.Blocks() as iface:
     gr.Markdown("# Image Captioning with Object Detection & Multilingual Support")
     gr.Markdown(
         """
-        Upload an image, select the language for captions, then click 'Generate Caption'.
-        The app generates a descriptive caption along with detected object tags.
-        Your last 15 images and captions are displayed below for easy reference and copying.
         """
     )
@@ -134,14 +135,14 @@ with gr.Blocks() as iface:
             object_output = gr.Textbox(label="Detected Objects", lines=2, interactive=False)
             copy_btn = gr.Button("Copy Caption Text")
-    history_container = gr.Column(label="Last 15 Images & Captions", elem_classes="history-container")
     def on_generate(image, language):
         if image is None:
             return "Please upload an image.", "", []
         caption, objects = generate_caption(image, language)
-        # Rebuild history display on every generation
-        return caption, objects, build_history_ui()
     def copy_text(text):
         return gr.Textbox.update(value=text, interactive=True)

 from collections import deque
 import numpy as np
+# Load BLIP model
 processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+# Load YOLOv5 model
 detect_model = YOLO('yolov5s.pt')
 translation_models = {
     caption_translated = translate_caption(caption_en, language)
     detected_objs = detect_objects(image)
     last_images.append(image)
     last_captions.append(caption_translated)
     last_objects.append(detected_objs)
     return caption_translated, tags
 def build_history_ui():
     components = []
     for i in range(len(last_images)):
         img = last_images[i]
         obj = last_objects[i]
         lang = last_languages[i]
+        cap_box = gr.Textbox(value=cap, lines=2, interactive=True, show_label=False)
         copy_btn = gr.Button("Copy Caption")
         def copy_fn(caption):
+            return caption
         copy_btn.click(fn=copy_fn, inputs=cap_box, outputs=cap_box)
+        row = gr.Row([
+            gr.Image(value=img, interactive=False, show_label=False),
             gr.Column([
                 gr.Markdown(f"**Caption ({lang}):**"),
                 cap_box,
                 copy_btn,
                 gr.Markdown(f"**Detected Objects:** {', '.join(obj) if obj else 'None'}")
             ])
+        ])
+        components.append(row)
     return components
 with gr.Blocks() as iface:
     gr.Markdown("# Image Captioning with Object Detection & Multilingual Support")
     gr.Markdown(
         """
+        Upload an image, select the caption language, then click 'Generate Caption'.
+        The app generates a caption and detected object tags.
+        Your last 15 images and captions are displayed below for easy copying and reference.
         """
     )
             object_output = gr.Textbox(label="Detected Objects", lines=2, interactive=False)
             copy_btn = gr.Button("Copy Caption Text")
+    history_container = gr.Column()
     def on_generate(image, language):
         if image is None:
             return "Please upload an image.", "", []
         caption, objects = generate_caption(image, language)
+        history = build_history_ui()
+        return caption, objects, history
     def copy_text(text):
         return gr.Textbox.update(value=text, interactive=True)