File size: 4,800 Bytes
a14c972
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import gradio as gr
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image
import io
import base64

import spaces

# Checkpoint identifier shared by the model weights and their processor, kept
# in one place so the two can never drift apart.
MODEL_ID = "Qwen/Qwen3-VL-2B-Instruct"

# Keyword options for loading the weights: bfloat16 parameters, with device
# placement delegated to the library via device_map="auto".
_MODEL_LOAD_OPTIONS = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}

# Load the model first, then the processor that builds its inputs.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID, **_MODEL_LOAD_OPTIONS
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

def process_image(image):
    """Return ``image`` as a string reference when it is a PIL image.

    Args:
        image: A string (returned as-is), a ``PIL.Image.Image`` (encoded as
            a PNG ``data:`` URL), or any other object (returned as-is).

    Returns:
        The original value for strings and unrecognised types; otherwise a
        ``data:image/png;base64,...`` string holding the PNG-encoded image.
    """
    # Strings and anything that is not a PIL image pass through untouched.
    if isinstance(image, str) or not isinstance(image, Image.Image):
        return image

    # Serialise the image to PNG in memory, then base64-encode the bytes.
    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    encoded_png = base64.b64encode(png_buffer.getvalue()).decode()
    return "data:image/png;base64," + encoded_png

def _history_text(content):
    """Flatten the stored content of one chat message into plain text."""
    if isinstance(content, str):
        return content
    if content is None:
        return ""
    if isinstance(content, list):
        # Defensive: content may be a list of typed parts; keep the text ones.
        text_parts = []
        for part in content:
            if isinstance(part, str):
                text_parts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                text_parts.append(part["text"])
        return "\n".join(text_parts)
    return str(content)


def _replay_history(chat_history):
    """Convert stored chat history into text-only chat-template messages."""
    replayed = []
    for entry in chat_history:
        if isinstance(entry, dict):
            # "messages" format: one {"role": ..., "content": ...} per turn.
            turns = [(entry.get("role"), entry.get("content"))]
        else:
            # Legacy pair format: (user_message, assistant_message).
            user_msg, assistant_msg = entry
            turns = [("user", user_msg), ("assistant", assistant_msg)]

        for role, content in turns:
            text = _history_text(content)
            # Skip unknown roles and empty turns so the template only sees
            # well-formed user/assistant text messages.
            if role in ("user", "assistant") and text:
                replayed.append(
                    {"role": role, "content": [{"type": "text", "text": text}]}
                )
    return replayed


@spaces.GPU(duration=120)
def qwen_chat(message, image, chat_history):
    """
    Process chat message with optional image input

    The history may be in the "messages" format (a list of
    ``{"role": ..., "content": ...}`` dicts, as produced by a Chatbot created
    with ``type="messages"``) or in the legacy list of
    ``(user_message, assistant_message)`` pairs. The new turn is appended in
    the format the history already uses; an empty history gets the
    "messages" format.

    Args:
        message (str): User's text message
        image: Optional image input
        chat_history (list): Previous conversation history (may be None)

    Returns:
        tuple: Updated chat history and empty message input
    """
    history = chat_history if chat_history is not None else []

    # Nothing to send: leave the conversation untouched.
    if not message and image is None:
        return history, ""

    # Build messages list: earlier turns as text, then the current turn.
    messages = _replay_history(history)

    # Add current message with optional image
    current_content = []
    if image is not None:
        current_content.append({"type": "image", "image": image})
    if message:
        current_content.append({"type": "text", "text": message})
    messages.append({"role": "user", "content": current_content})

    # Prepare inputs
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    )
    inputs = inputs.to(model.device)

    # Generate response
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=256)

    # Decode output, dropping the prompt tokens that generate() echoes back
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    # Update chat history in the same format it arrived in
    user_text = message if message else "[Image provided]"
    uses_pairs = any(isinstance(entry, (list, tuple)) for entry in history)
    if uses_pairs:
        history.append((user_text, output_text))
    else:
        history.append({"role": "user", "content": user_text})
        history.append({"role": "assistant", "content": output_text})

    return history, ""

# Build the Gradio page: header, chat pane with image upload, message row,
# clear button, usage tips, and the event wiring that ties them together.
with gr.Blocks(title="Qwen3-VL Chat") as demo:
    # Page header.
    gr.Markdown(
        """
        # 🎨 Qwen3-VL Chat
        Chat with Qwen3-VL-2B-Instruct - A multimodal AI that can understand both text and images!
        
        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )

    # Conversation on the left (wider column), image upload on the right.
    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(
                type="messages",
                label="Chat History",
                show_copy_button=True,
                height=600,
            )

        with gr.Column(scale=1):
            image_input = gr.Image(
                type="pil",
                label="Upload Image (Optional)",
                interactive=True,
                sources=["upload", "clipboard"],
            )

    # Text entry and the send button share one row.
    with gr.Row():
        message_input = gr.Textbox(
            lines=2,
            scale=4,
            label="Message",
            placeholder="Type your message here...",
        )
        send_btn = gr.Button("Send", variant="primary", scale=1)

    with gr.Row():
        clear_btn = gr.Button("Clear Chat", variant="secondary")

    gr.Markdown(
        """
        ### Tips:
        - Upload an image to ask questions about it
        - Describe what you see or ask for analysis
        - The model can answer questions about images and text
        """
    )

    # Event handlers
    def send_message(msg, img, history):
        """Forward the current text, image and history to ``qwen_chat``."""
        return qwen_chat(msg, img, history)

    # The send button and pressing Enter in the textbox trigger the same
    # handler with the same inputs and outputs.
    chat_inputs = [message_input, image_input, chatbot]
    chat_outputs = [chatbot, message_input]
    for register in (send_btn.click, message_input.submit):
        register(send_message, inputs=chat_inputs, outputs=chat_outputs)

    # Reset the conversation, the uploaded image and the message box.
    clear_btn.click(
        lambda: ([], None, ""),
        outputs=[chatbot, image_input, message_input],
    )

# Start the Gradio server only when this file is executed as a script, so
# importing the module does not launch the UI; share=False is passed so no
# public share link is requested.
if __name__ == "__main__":
    demo.launch(share=False)