Update app.py
app.py CHANGED
@@ -6,16 +6,17 @@ from PIL import Image
 from collections import deque
 import numpy as np
 
-# Load BLIP model for
+# Load main BLIP model for English captioning
 processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 
-# Load YOLOv5 model for object detection
+# Load YOLOv5 small model for object detection using ultralytics package
 detect_model = YOLO('yolov5s.pt')
 
+# Session memory for last 15 images and captions
 MEMORY_SIZE = 15
 last_images = deque([], maxlen=MEMORY_SIZE)
-
+last_captions = deque([], maxlen=MEMORY_SIZE)
 
 def preprocess_image(image):
     if image.mode != "RGB":
@@ -33,84 +34,44 @@ def detect_objects(image):
         detected_objs.add(label)
     return list(detected_objs)
 
-def
+def generate_caption(image):
     image = preprocess_image(image)
     inputs = processor(image, return_tensors="pt")
     out = model.generate(**inputs, max_length=30, num_beams=5, early_stopping=True)
     caption = processor.decode(out[0], skip_special_tokens=True)
     detected_objs = detect_objects(image)
-    tags = ", ".join(detected_objs) if detected_objs else "None"
-
-    combined_text = f"Detected objects: {tags}\nCaption: {caption}"
 
     # Update session memory
     last_images.append(image)
-
-
-    return combined_text
-
-def build_history_ui():
-    rows = []
-    for i in range(len(last_images)):
-        img = last_images[i]
-        text = last_texts[i]
-
-        cap_box = gr.Textbox(value=text, lines=3, interactive=True, show_label=False)
-        copy_btn = gr.Button("Copy Text")
-
-        def copy_fn(caption):
-            return caption
+    last_captions.append(caption)
 
-
+    tags = ", ".join(detected_objs) if detected_objs else "None"
+    gallery = [(img, cap) for img, cap in zip(list(last_images), list(last_captions))]
 
-
-
-        gr.Column([
-            cap_box,
-            copy_btn,
-        ])
-    ])
-    rows.append(row)
-    return rows
+    result_text = f"Detected objects: {tags}\nCaption: {caption}"
+    return result_text, gallery
 
 with gr.Blocks() as iface:
     gr.Markdown("# Image Captioning with Object Detection")
 
-    gr.Markdown("""
-
-
-    The app will display detected objects and a caption together.
-    Your last 15 images and combined captions are shown below.
-    """
-    )
+    image_input = gr.Image(type="pil", label="Upload Image")
+
+    caption_output = gr.Textbox(label="Caption and Detected Objects", lines=3, interactive=False)
 
-
-    with gr.Column(scale=2):
-        image_input = gr.Image(type="pil", label="Upload Image")
-        generate_btn = gr.Button("Generate Caption")
-    with gr.Column(scale=3):
-        output_box = gr.Textbox(label="Caption & Detected Objects", lines=6, interactive=True)
-        copy_btn = gr.Button("Copy Text")
+    gallery = gr.Gallery(label="Last 15 Images and Captions", scale=3)
 
-
+    generate_btn = gr.Button("Generate Caption")
 
     def on_generate(image):
         if image is None:
             return "Please upload an image.", []
-
-        history = build_history_ui()
-        return combined_text, history
-
-    def copy_text(text):
-        return gr.Textbox.update(value=text, interactive=True)
+        return generate_caption(image)
 
     generate_btn.click(
         fn=on_generate,
         inputs=image_input,
-        outputs=[
+        outputs=[caption_output, gallery]
     )
 
-    copy_btn.click(fn=copy_text, inputs=output_box, outputs=output_box)
-
 if __name__ == "__main__":
     iface.launch()
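Note: both hunks start at line 6 of app.py, so the file's import header is not part of this commit (the hunk context line shows only that `from PIL import Image` sits just above). A minimal sketch of what those first lines presumably contain, inferred from the names the diff uses (gr, BlipProcessor, BlipForConditionalGeneration, YOLO, Image); the real file's header may differ:

# Assumed import header of app.py (lines above the first hunk; not shown in the diff).
# Inferred from usage in the hunks -- a sketch, not the actual file contents.
import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
from ultralytics import YOLO
from PIL import Image

The list of (image, caption) tuples that generate_caption builds is the value format gr.Gallery accepts, which is why wiring outputs=[caption_output, gallery] lets a single click update both the result text box and the session-history gallery.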