tlam commited on
Commit
e731e19
·
verified ·
1 Parent(s): 4cd14aa

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +14 -6
  2. app.py +128 -0
  3. requirements.txt +8 -0
README.md CHANGED
@@ -1,13 +1,21 @@
1
  ---
2
- title: Captioning
3
- emoji: 🌍
4
- colorFrom: green
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.8.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Batch Image Captioning
3
+ emoji: 🖼️
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 5.6.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ suggested_hardware: zero-a10g
12
  ---
13
 
14
+ # Batch Image Captioning with Qwen2.5-VL
15
+
16
+ A lightweight, powerful, and customizable image captioning tool leveraging the `Qwen2.5-VL-3B-Instruct` model. Designed to run efficiently on Hugging Face Spaces free tier (ZeroGPU).
17
+
18
+ ## Features
19
+ - **Batch Processing**: Upload multiple images and get captions generated sequentially.
20
+ - **Custom Instructions**: Guide the model's captioning style using a custom system prompt.
21
+ - **Lightweight & Powerful**: Uses the 3B parameter Qwen2.5-VL model for fast, high-quality, and instruction-following descriptions.
app.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import torch
import gradio as gr
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import spaces

# Configuration
MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load processor (bundles the tokenizer and the image preprocessor).
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Load model.
# Fix: fp16 kernels are poorly supported on CPU — loading in float16 on a
# CPU-only host is slow and can error. Use fp16 only when a GPU is present,
# falling back to fp32 otherwise, so the Space still runs without ZeroGPU.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)
model.eval()
print("Model loaded.")
25
@spaces.GPU
def process_images(image_files, instruction):
    """Caption each uploaded image in turn with the Qwen2.5-VL model.

    Streams progress to the UI: after every image (success or failure) the
    full markdown report accumulated so far is yielded again.
    """
    if not image_files:
        yield "No images uploaded."
        return

    captions = []

    for position, uploaded in enumerate(image_files, start=1):
        try:
            # Gradio may hand us a tempfile wrapper or a plain path string.
            path = getattr(uploaded, "name", uploaded)

            # Qwen-VL conversational payload: one image plus the user prompt.
            conversation = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": path},
                        {"type": "text", "text": instruction},
                    ],
                }
            ]

            # Build the text prompt and extract vision inputs for the processor.
            prompt = processor.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(conversation)

            model_inputs = processor(
                text=[prompt],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            ).to(model.device)  # keep tensors on the same device as the model

            output_ids = model.generate(**model_inputs, max_new_tokens=256)

            # Drop the prompt tokens so only the newly generated text is decoded.
            new_token_ids = [
                full[len(prompt_ids):]
                for prompt_ids, full in zip(model_inputs.input_ids, output_ids)
            ]

            caption = processor.batch_decode(
                new_token_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0]

            captions.append(f"### Image {position}\n**Caption:** {caption}\n")
        except Exception as e:
            # Per-image boundary: report the failure and keep going.
            captions.append(
                f"### Image {position}\n**Error processing image:** {str(e)}\n"
            )

        # Re-emit the whole report so the user watches it grow.
        yield "\n---\n".join(captions)
89
+
90
# ---- UI definition ---------------------------------------------------------
with gr.Blocks(title="Batch Image Captioning") as demo:
    gr.Markdown("# 🖼️ Batch Image Captioning with Qwen2.5-VL")
    gr.Markdown(
        "Upload multiple images and provide an instruction prompt. The system uses "
        "[Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) "
        "to generate descriptions sequentially. Designed to run smoothly on Hugging Face ZeroGPU."
    )

    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column(scale=1):
            image_picker = gr.File(
                label="Upload Images",
                file_count="multiple",
                file_types=["image"],
                type="filepath",  # hand the callback temp-file paths
            )

            # Prompt that steers the captioning style; ships with a default.
            prompt_box = gr.Textbox(
                label="Instructions",
                placeholder="Describe this image in detail...",
                value="Provide a detailed, highly descriptive caption for this image focusing on lighting, composition, and subjects.",
                lines=3,
            )

            run_button = gr.Button("Generate Captions", variant="primary")

        # Right column: streaming markdown results.
        with gr.Column(scale=1):
            results_panel = gr.Markdown("Captions will appear here...", label="Results")

    # process_images is a generator, so results_panel updates incrementally.
    run_button.click(
        fn=process_images,
        inputs=[image_picker, prompt_box],
        outputs=results_panel,
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers>=4.49.0
2
+ torch
3
+ torchvision
4
+ pillow
5
+ accelerate
6
+ spaces
7
+ gradio
8
+ qwen-vl-utils