Spaces:

scmlewis
/

image_captioning

Sleeping

App Files Files Community

scmlewis committed on Oct 19, 2025

Commit

7223770

verified ·

1 Parent(s): 9517d28

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -62

app.py CHANGED Viewed

@@ -13,7 +13,6 @@ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-capt
 # Load YOLOv5 small model for detection
 detect_model = YOLO('yolov5s.pt')
-# MarianMT translation models cache
 translation_models = {
     "English": None,
     "French": ("Helsinki-NLP/opus-mt-en-fr", "Helsinki-NLP/opus-mt-fr-en"),
@@ -45,6 +44,7 @@ def translate_caption(caption, target_lang):
 MEMORY_SIZE = 15
 last_images = deque([], maxlen=MEMORY_SIZE)
 last_captions = deque([], maxlen=MEMORY_SIZE)
 last_languages = deque([], maxlen=MEMORY_SIZE)
 def preprocess_image(image):
@@ -74,40 +74,50 @@ def generate_caption(image, language):
     # Update session memory
     last_images.append(image)
     last_captions.append(caption_translated)
     last_languages.append(language)
     tags = ", ".join(detected_objs) if detected_objs else "None"
-    result_text = f"Detected objects: {tags}\nCaption ({language}): {caption_translated}"
-    # Prepare table data for last 15 images with copyable captions and copy buttons
-    history_rows = []
-    for img, cap, lang in zip(last_images, last_captions, last_languages):
-        history_rows.append([img, cap])
-    return result_text, history_rows
-def gallery_to_table(history_rows):
-    # history_rows is list of [PIL image, caption text]
-    headers = ["Image", "Caption (click to copy)"]
-    data = []
-    for img, cap in history_rows:
-        data.append([
-            img,
-            gr.Textbox.update(value=cap, interactive=True)
-        ])
-    return headers, data
-with gr.Blocks() as iface:
-    gr.Markdown("# Image Captioning with Object Detection & Multilingual Support")
-    gr.Markdown("""
-    This app generates descriptive captions for your uploaded images, detects objects within them,
-    and supports multilingual captions. Upload an image, then click 'Generate Caption' to see results.
-    Your last 15 images and captions are saved below for easy reference and copying.
-    """)
     language = gr.Dropdown(
         label="Select Caption Language",
@@ -120,52 +130,29 @@ with gr.Blocks() as iface:
             image_input = gr.Image(type="pil", label="Upload Image")
             generate_btn = gr.Button("Generate Caption")
         with gr.Column(scale=3):
-            caption_output = gr.Textbox(
-                label="Caption & Detected Objects",
-                lines=4,
-                interactive=True
-            )
             copy_btn = gr.Button("Copy Caption Text")
-    # History table with thumbnails and copyable captions
-    history_table = gr.Dataframe(
-        headers=["Image", "Caption"],
-        row_count=(MEMORY_SIZE, MEMORY_SIZE),
-        col_count=2,
-        datatype=["image", "str"],
-        interactive=False,
-        wrap=True,
-        label="Last 15 Images and Captions"
-    )
-    def copy_text(caption_text):
-        return gr.update(value=caption_text)
-    def update_history(history_rows):
-        # Convert to format compatible with gr.Dataframe
-        data = []
-        for img, cap in history_rows:
-            data.append([img, cap])
-        return data
     def on_generate(image, language):
         if image is None:
-            return "Please upload an image.", []
-        result_text, history_rows = generate_caption(image, language)
-        history_data = update_history(history_rows)
-        return result_text, history_data
     generate_btn.click(
         fn=on_generate,
         inputs=[image_input, language],
-        outputs=[caption_output, history_table]
     )
-    copy_btn.click(
-        fn=lambda text: text,
-        inputs=[caption_output],
-        outputs=[caption_output]
-    )
 if __name__ == "__main__":
     iface.launch()

 # Load YOLOv5 small model for detection
 detect_model = YOLO('yolov5s.pt')
 translation_models = {
     "English": None,
     "French": ("Helsinki-NLP/opus-mt-en-fr", "Helsinki-NLP/opus-mt-fr-en"),
 MEMORY_SIZE = 15
 last_images = deque([], maxlen=MEMORY_SIZE)
 last_captions = deque([], maxlen=MEMORY_SIZE)
+last_objects = deque([], maxlen=MEMORY_SIZE)
 last_languages = deque([], maxlen=MEMORY_SIZE)
 def preprocess_image(image):
     # Update session memory
     last_images.append(image)
     last_captions.append(caption_translated)
+    last_objects.append(detected_objs)
     last_languages.append(language)
     tags = ", ".join(detected_objs) if detected_objs else "None"
+    return caption_translated, tags
+def build_history_ui():
+    # Create a list of rows, one per remembered image, each holding the image, its caption textbox, a copy button and the detected objects
+    components = []
+    for i in range(len(last_images)):
+        img = last_images[i]
+        cap = last_captions[i]
+        obj = last_objects[i]
+        lang = last_languages[i]
+        copy_btn = gr.Button("Copy Caption")
+        cap_box = gr.Textbox(value=cap, lines=2, interactive=True)
+        def copy_fn(caption):
+            return caption  # No direct clipboard access, but textbox selectable
+        copy_btn.click(fn=copy_fn, inputs=cap_box, outputs=cap_box)
+        components.append(gr.Row([
+            gr.Image(value=img, interactive=False, label=f"Image {i+1}", elem_id=f"img_{i}"),
+            gr.Column([
+                gr.Markdown(f"**Caption ({lang}):**"),
+                cap_box,
+                copy_btn,
+                gr.Markdown(f"**Detected Objects:** {', '.join(obj) if obj else 'None'}")
+            ])
+        ]))
+    return components
+with gr.Blocks() as iface:
+    gr.Markdown("# Image Captioning with Object Detection & Multilingual Support")
+    gr.Markdown(
+        """
+        Upload an image, select the language for captions, then click 'Generate Caption'.
+        The app generates a descriptive caption along with detected object tags.
+        Your last 15 images and captions are displayed below for easy reference and copying.
+        """
+    )
     language = gr.Dropdown(
         label="Select Caption Language",
             image_input = gr.Image(type="pil", label="Upload Image")
             generate_btn = gr.Button("Generate Caption")
         with gr.Column(scale=3):
+            caption_output = gr.Textbox(label="Caption", lines=3, interactive=True)
+            object_output = gr.Textbox(label="Detected Objects", lines=2, interactive=False)
             copy_btn = gr.Button("Copy Caption Text")
+    history_container = gr.Column(label="Last 15 Images & Captions", elem_classes="history-container")
     def on_generate(image, language):
         if image is None:
+            return "Please upload an image.", "", []
+        caption, objects = generate_caption(image, language)
+        # Rebuild history display on every generation
+        return caption, objects, build_history_ui()
+    def copy_text(text):
+        return gr.Textbox.update(value=text, interactive=True)
     generate_btn.click(
         fn=on_generate,
         inputs=[image_input, language],
+        outputs=[caption_output, object_output, history_container]
     )
+    copy_btn.click(fn=copy_text, inputs=caption_output, outputs=caption_output)
 if __name__ == "__main__":
     iface.launch()