Update app.py

app.py CHANGED
@@ -6,14 +6,14 @@ from PIL import Image
 from collections import deque
 import numpy as np
 
-# Load
+# Load BLIP model for English captioning
 processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 
-# Load YOLOv5 small model for
+# Load YOLOv5 small model for detection
 detect_model = YOLO('yolov5s.pt')
 
-#
+# MarianMT translation models cache
 translation_models = {
     "English": None,
     "French": ("Helsinki-NLP/opus-mt-en-fr", "Helsinki-NLP/opus-mt-fr-en"),
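Reviewer note on the new "MarianMT translation models cache" comment: `translate_caption` (visible in the next hunk) presumably loads the Marian pair for a language on first use. A minimal sketch of such a lazy loader, assuming `MarianMTModel`/`MarianTokenizer` from `transformers`; the `_loaded` dict and `get_translator` helper are hypothetical illustrations, not part of this commit, and only the en-to-target direction (the first name in each pair) is shown:

    from transformers import MarianMTModel, MarianTokenizer

    _loaded = {}  # hypothetical cache: model name -> (tokenizer, model)

    def get_translator(lang):
        # A None entry ("English") means no translation is needed.
        pair = translation_models.get(lang)
        if pair is None:
            return None
        name = pair[0]  # assumed en -> target direction
        if name not in _loaded:
            _loaded[name] = (MarianTokenizer.from_pretrained(name),
                             MarianMTModel.from_pretrained(name))
        return _loaded[name]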
@@ -42,10 +42,10 @@ def translate_caption(caption, target_lang):
     translated = tokenizer.decode(gen[0], skip_special_tokens=True)
     return translated
 
-# Session memory for last 15 images and captions
 MEMORY_SIZE = 15
 last_images = deque([], maxlen=MEMORY_SIZE)
 last_captions = deque([], maxlen=MEMORY_SIZE)
+last_languages = deque([], maxlen=MEMORY_SIZE)
 
 def preprocess_image(image):
     if image.mode != "RGB":
@@ -74,40 +74,97 @@ def generate_caption(image, language):
     # Update session memory
     last_images.append(image)
     last_captions.append(caption_translated)
+    last_languages.append(language)
 
     tags = ", ".join(detected_objs) if detected_objs else "None"
-    gallery = [(img, cap) for img, cap in zip(list(last_images), list(last_captions))]
-
     result_text = f"Detected objects: {tags}\nCaption ({language}): {caption_translated}"
-
+
+    # Prepare table data for last 15 images with copyable captions and copy buttons
+    history_rows = []
+    for img, cap, lang in zip(last_images, last_captions, last_languages):
+        history_rows.append([img, cap])
+
+    return result_text, history_rows
+
+def gallery_to_table(history_rows):
+    # history_rows is list of [PIL image, caption text]
+    headers = ["Image", "Caption (click to copy)"]
+    data = []
+    for img, cap in history_rows:
+        data.append([
+            img,
+            gr.Textbox.update(value=cap, interactive=True)
+        ])
+    return headers, data
 
 with gr.Blocks() as iface:
     gr.Markdown("# Image Captioning with Object Detection & Multilingual Support")
 
+    gr.Markdown("""
+
+    This app generates descriptive captions for your uploaded images, detects objects within them,
+    and supports multilingual captions. Upload an image, then click 'Generate Caption' to see results.
+
+    Your last 15 images and captions are saved below for easy reference and copying.
+
+    """)
+
     language = gr.Dropdown(
         label="Select Caption Language",
         choices=["English", "French", "Spanish", "German"],
         value="English"
     )
 
-
-
-
+    with gr.Row():
+        with gr.Column(scale=2):
+            image_input = gr.Image(type="pil", label="Upload Image")
+            generate_btn = gr.Button("Generate Caption")
+        with gr.Column(scale=3):
+            caption_output = gr.Textbox(
+                label="Caption & Detected Objects",
+                lines=4,
+                interactive=True
+            )
+            copy_btn = gr.Button("Copy Caption Text")
+
+    # History table with thumbnails and copyable captions
+    history_table = gr.Dataframe(
+        headers=["Image", "Caption"],
+        row_count=(MEMORY_SIZE, MEMORY_SIZE),
+        col_count=2,
+        datatype=["image", "str"],
+        interactive=False,
+        wrap=True,
+        label="Last 15 Images and Captions"
+    )
 
-
-
+    def copy_text(caption_text):
+        return gr.update(value=caption_text)
 
-
+    def update_history(history_rows):
+        # Convert to format compatible with gr.Dataframe
+        data = []
+        for img, cap in history_rows:
+            data.append([img, cap])
+        return data
 
     def on_generate(image, language):
         if image is None:
             return "Please upload an image.", []
-
+        result_text, history_rows = generate_caption(image, language)
+        history_data = update_history(history_rows)
+        return result_text, history_data
 
     generate_btn.click(
         fn=on_generate,
         inputs=[image_input, language],
-        outputs=[caption_output,
+        outputs=[caption_output, history_table]
+    )
+
+    copy_btn.click(
+        fn=lambda text: text,
+        inputs=[caption_output],
+        outputs=[caption_output]
     )
 
 if __name__ == "__main__":
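Reviewer note: `gallery_to_table` is defined in this commit but never wired to any component in the diff, and `gr.Textbox.update(...)` is the Gradio 3.x idiom that was removed in Gradio 4 (the commit itself already uses the portable `gr.update(...)` in `copy_text`). `gr.Dataframe` cells also take plain values, not component updates, and the loop variable `lang` in `generate_caption` is unpacked but unused. If the helper is kept, a sketch of a version returning plain cell values (same names, intent inferred from context):

    def gallery_to_table(history_rows):
        # history_rows is a list of [PIL image, caption text] pairs;
        # Dataframe cells expect plain values, not component updates.
        headers = ["Image", "Caption (click to copy)"]
        data = [[img, cap] for img, cap in history_rows]
        return headers, data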
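Reviewer note: in `gr.Dataframe` the tuple form of `row_count` is `(count, "fixed")` or `(count, "dynamic")`, so `row_count=(MEMORY_SIZE, MEMORY_SIZE)` looks like a bug, and whether `datatype=["image", "str"]` renders PIL thumbnails depends on the Gradio version. `gr.Gallery` accepts a list of `(image, caption)` tuples directly, which is exactly what the deleted `gallery` list comprehension built; a sketch assuming the same `history_rows` format (`history_gallery` is a hypothetical replacement component):

    history_gallery = gr.Gallery(label="Last 15 Images and Captions")

    def update_history(history_rows):
        # Gallery takes (image, caption) tuples directly.
        return [(img, cap) for img, cap in history_rows]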
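Reviewer note: `copy_btn.click(fn=lambda text: text, ...)` only round-trips the caption through the backend and writes it back unchanged; nothing reaches the clipboard, and the `copy_text` helper is defined but never called. Textbox has a built-in clipboard control in Gradio 3.35+, which would make both the lambda and the button redundant (a sketch, assuming the Space runs a recent enough Gradio):

    caption_output = gr.Textbox(
        label="Caption & Detected Objects",
        lines=4,
        interactive=True,
        show_copy_button=True,  # built-in copy-to-clipboard control
    )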