Upload 3 files
- README.md +14 -6
- app.py +128 -0
- requirements.txt +8 -0
README.md CHANGED
@@ -1,13 +1,21 @@
 ---
-title: Captioning
-emoji:
-colorFrom:
-colorTo:
+title: Batch Image Captioning
+emoji: 🖼️
+colorFrom: blue
+colorTo: purple
 sdk: gradio
-sdk_version: 6.
+sdk_version: 5.6.0
 app_file: app.py
 pinned: false
 license: apache-2.0
+hardware: zero-a10g-free
 ---

-
+# Batch Image Captioning with Qwen2.5-VL
+
+A lightweight, powerful, and customizable image captioning tool built on the `Qwen2.5-VL-3B-Instruct` model. Designed to run efficiently on the Hugging Face Spaces free tier (ZeroGPU).
+
+## Features
+- **Batch Processing**: Upload multiple images and get captions generated sequentially.
+- **Custom Instructions**: Guide the model's captioning style with a custom instruction prompt.
+- **Lightweight & Powerful**: Uses the 3B-parameter Qwen2.5-VL model for fast, high-quality, instruction-following descriptions.
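Because the Space is a standard Gradio app, the batch-captioning flow described in the README can also be driven programmatically with `gradio_client`. The sketch below is illustrative only and not part of this commit: the Space id is hypothetical, and the `/process_images` endpoint name is an assumption (the click handler in app.py sets no explicit `api_name`, so Gradio derives one from the function name). `predict` returns the final accumulated Markdown rather than the streamed intermediate results.

```python
# Illustrative client-side sketch (not part of this commit).
# Assumptions: hypothetical Space id and auto-derived endpoint name.
from gradio_client import Client, handle_file

client = Client("your-username/batch-captioning")  # hypothetical Space id

result = client.predict(
    [handle_file("cat.jpg"), handle_file("dog.jpg")],  # hypothetical local image paths
    "Describe each image in one concise sentence.",
    api_name="/process_images",  # assumed; check the Space's "Use via API" panel
)
print(result)  # final Markdown containing one caption section per image
```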
app.py ADDED
@@ -0,0 +1,128 @@
+import os
+import torch
+import gradio as gr
+from PIL import Image
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+import spaces
+
+# Configuration
+MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Load Processor
+processor = AutoProcessor.from_pretrained(MODEL_ID)
+
+# Load Model
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
+model.eval()
+print("Model loaded.")
+
+@spaces.GPU
+def process_images(image_files, instruction):
+    """
+    Process a batch of images sequentially.
+    Yields the updated results list as each image is processed.
+    """
+    if not image_files:
+        yield "No images uploaded."
+        return
+
+    results = []
+
+    for idx, img_file in enumerate(image_files):
+        try:
+            # We assume it is a path to the file passed from gradio
+            img_path = img_file.name if hasattr(img_file, 'name') else img_file
+
+            # Use Qwen-VL specific conversational format
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": img_path},
+                        {"type": "text", "text": instruction},
+                    ],
+                }
+            ]
+
+            # Preparation for inference
+            text = processor.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
+            )
+            image_inputs, video_inputs = process_vision_info(messages)
+
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+            )
+            # Move inputs to the same device as the model
+            inputs = inputs.to(model.device)
+
+            # Generate output
+            generated_ids = model.generate(**inputs, max_new_tokens=256)
+
+            # Trim the generated ids to only contain the new tokens
+            generated_ids_trimmed = [
+                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+            ]
+
+            output_text = processor.batch_decode(
+                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+            )[0]
+
+            results.append(f"### Image {idx + 1}\n**Caption:** {output_text}\n")
+
+            # Yield accumulated results so user sees progress
+            yield "\n---\n".join(results)
+
+        except Exception as e:
+            results.append(f"### Image {idx + 1}\n**Error processing image:** {str(e)}\n")
+            yield "\n---\n".join(results)
+
+# Gradio Interface Construction
+with gr.Blocks(title="Batch Image Captioning") as demo:
+    gr.Markdown("# 🖼️ Batch Image Captioning with Qwen2.5-VL")
+    gr.Markdown(
+        "Upload multiple images and provide an instruction prompt. The system uses "
+        "[Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) "
+        "to generate descriptions sequentially. Designed to run smoothly on Hugging Face ZeroGPU."
+    )
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            input_images = gr.File(
+                label="Upload Images",
+                file_count="multiple",
+                file_types=["image"],
+                type="filepath"  # returns temp paths
+            )
+
+            # Default instruction panel
+            instruction_textbox = gr.Textbox(
+                label="Instructions",
+                placeholder="Describe this image in detail...",
+                value="Provide a detailed, highly descriptive caption for this image focusing on lighting, composition, and subjects.",
+                lines=3
+            )
+
+            submit_btn = gr.Button("Generate Captions", variant="primary")
+
+        with gr.Column(scale=1):
+            output_text = gr.Markdown("Captions will appear here...", label="Results")
+
+    submit_btn.click(
+        fn=process_images,
+        inputs=[input_images, instruction_textbox],
+        outputs=output_text
+    )
+
+if __name__ == "__main__":
+    demo.launch()
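For a quick local sanity check of the `process_images` generator defined above, it can be run directly outside the Gradio UI. A minimal sketch, assuming app.py is importable from the current directory, the model weights can be downloaded, and a test image exists at the hypothetical path `sample.jpg`; the `@spaces.GPU` decorator should fall back to a no-op outside ZeroGPU hardware:

```python
# Minimal local smoke test (not part of this commit); the image path is hypothetical.
from app import process_images  # importing app.py loads the processor and model

instruction = "Describe this image in one concise sentence."
for partial_markdown in process_images(["sample.jpg"], instruction):
    # Each yield is the accumulated Markdown so far (one section per processed image).
    print(partial_markdown)
```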
requirements.txt ADDED
@@ -0,0 +1,8 @@
+transformers>=4.49.0
+torch
+torchvision
+pillow
+accelerate
+spaces
+gradio
+qwen-vl-utils