File size: 4,432 Bytes
e731e19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19ec6fa
 
 
e731e19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import torch
import gradio as gr
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import spaces

# Configuration
MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
# Target device for inference: CUDA when torch reports it, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load Processor
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Load Model (weights are requested in bfloat16).
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)
# Place the model on the selected device. Previously `device` was computed but
# never applied, so the model stayed wherever from_pretrained put it; since
# process_images sends its inputs to `model.device`, this placement decides
# where generation actually runs.
model = model.to(device).eval()
print("Model loaded.")

@spaces.GPU
def process_images(image_files, instruction):
    """
    Caption each uploaded image in turn, streaming progress to the caller.

    This is a generator. After every image (whether captioning succeeded or
    raised) it yields one Markdown string holding all sections produced so
    far, separated by horizontal rules. When nothing was uploaded it yields a
    single notice and stops.

    Args:
        image_files: Uploaded items; each is assumed to be a file path, or an
            object whose ``name`` attribute holds the path.
        instruction: Prompt text sent to the model alongside each image.

    Yields:
        The accumulated Markdown report after each processed image.
    """
    if not image_files:
        yield "No images uploaded."
        return

    divider = "\n---\n"
    sections = []

    for number, upload in enumerate(image_files, start=1):
        try:
            # Accept plain paths as well as wrappers that expose `.name`.
            image_path = getattr(upload, "name", upload)

            # Single user turn in the Qwen-VL chat layout: image, then text.
            conversation = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image_path},
                        {"type": "text", "text": instruction},
                    ],
                }
            ]

            # Render the chat prompt and collect the vision inputs it refers to.
            prompt = processor.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=True
            )
            images, videos = process_vision_info(conversation)

            # Tensorize and move to whichever device the model lives on.
            batch = processor(
                text=[prompt],
                images=images,
                videos=videos,
                padding=True,
                return_tensors="pt",
            ).to(model.device)

            full_sequences = model.generate(**batch, max_new_tokens=256)

            # Keep only the newly generated tokens by slicing off the prompt ids.
            completions = [
                sequence[len(prompt_ids):]
                for prompt_ids, sequence in zip(batch.input_ids, full_sequences)
            ]

            decoded = processor.batch_decode(
                completions,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            caption = decoded[0]

            sections.append(f"### Image {number}\n**Caption:** {caption}\n")

            # Emit the running report so the user sees progress per image.
            yield divider.join(sections)

        except Exception as exc:
            # A failure on one image is reported inline; the loop continues.
            sections.append(f"### Image {number}\n**Error processing image:** {str(exc)}\n")
            yield divider.join(sections)

# Gradio Interface Construction
# Long static strings are named up front so the layout code below stays compact.
_INTRO_MARKDOWN = (
    "Upload multiple images and provide an instruction prompt. The system uses "
    "[Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) "
    "to generate descriptions sequentially. Designed to run smoothly on Hugging Face ZeroGPU."
)
_DEFAULT_INSTRUCTION = (
    "Provide a detailed, highly descriptive caption for this image "
    "focusing on lighting, composition, and subjects."
)

with gr.Blocks(title="Batch Image Captioning") as demo:
    # Page header and short description.
    gr.Markdown("# 🖼️ Batch Image Captioning with Qwen2.5-VL")
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Left column: image upload, instruction prompt and the trigger button.
        with gr.Column(scale=1):
            input_images = gr.File(
                type="filepath",  # returns temp paths
                file_types=["image"],
                file_count="multiple",
                label="Upload Images",
            )

            # Instruction box, pre-filled with a default captioning prompt.
            instruction_textbox = gr.Textbox(
                value=_DEFAULT_INSTRUCTION,
                placeholder="Describe this image in detail...",
                label="Instructions",
                lines=3,
            )

            submit_btn = gr.Button("Generate Captions", variant="primary")

        # Right column: Markdown area that receives the streamed results.
        with gr.Column(scale=1):
            output_text = gr.Markdown("Captions will appear here...", label="Results")

    # Wire the button to the captioning generator defined in this file.
    submit_btn.click(
        fn=process_images,
        inputs=[input_images, instruction_textbox],
        outputs=output_text,
    )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()