# Hugging Face Space: akhaliq/Qwen3-VL-2B-Instruct (Running on Zero)
# File size: 4,800 Bytes · revision a14c972

import gradio as gr
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image
import io
import base64

import spaces

# Hub checkpoint used for both the model weights and the matching processor.
MODEL_ID = "Qwen/Qwen3-VL-2B-Instruct"

# Load the vision-language model in bfloat16 and let `device_map="auto"`
# decide where the weights are placed.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# The processor applies the chat template and prepares text/image inputs.
processor = AutoProcessor.from_pretrained(MODEL_ID)

def process_image(image):
    """Return *image* as a base64 PNG data URI when it is a PIL image.

    Strings are returned untouched, and so is any value that is not a
    ``PIL.Image.Image`` instance.

    Args:
        image: A string, a ``PIL.Image.Image``, or any other value.

    Returns:
        A ``data:image/png;base64,...`` string for PIL images; otherwise the
        argument itself, unchanged.
    """
    # Strings are checked first so they never reach the PIL type check.
    if isinstance(image, str) or not isinstance(image, Image.Image):
        return image

    # Serialize the image to PNG in memory, then base64-encode the raw bytes.
    with io.BytesIO() as png_buffer:
        image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()

    encoded = base64.b64encode(png_bytes).decode()
    return f"data:image/png;base64,{encoded}"

def _text_from_content(content):
    """Return the plain text carried by a chat message ``content`` value.

    Strings are returned as-is. A list is treated as a sequence of parts, from
    which plain strings and ``{"type": "text", "text": ...}`` dicts are joined
    with newlines. Anything else (e.g. file payloads or components) yields "".
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        pieces = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and part.get("type") == "text":
                pieces.append(part.get("text") or "")
        return "\n".join(piece for piece in pieces if piece)
    return ""


def _history_to_messages(chat_history):
    """Convert prior chat history into chat-template messages (text only).

    Accepts both history layouts: "messages" style entries (dicts, or objects
    exposing ``role``/``content`` attributes) and legacy ``(user, assistant)``
    pairs. Entries without usable text are skipped.
    """
    messages = []
    for turn in chat_history or []:
        if isinstance(turn, (list, tuple)):
            # Legacy pair layout: (user_text, assistant_text).
            if len(turn) != 2:
                continue
            pairs = zip(("user", "assistant"), turn)
        else:
            if isinstance(turn, dict):
                role, content = turn.get("role"), turn.get("content")
            else:
                role = getattr(turn, "role", None)
                content = getattr(turn, "content", None)
            pairs = [(role, content)]

        for role, content in pairs:
            text = _text_from_content(content)
            if role in ("user", "assistant") and text:
                messages.append(
                    {"role": role, "content": [{"type": "text", "text": text}]}
                )
    return messages


@spaces.GPU(duration=120)
def qwen_chat(message, image, chat_history):
    """
    Process a chat message with an optional image and generate a reply.

    The Chatbot component in this file is created with ``type="messages"``, so
    history is expected as role/content entries; ``(user, assistant)`` pairs
    are still accepted. The new turn is appended in whichever layout the
    incoming history already uses (messages layout when it is empty).

    Args:
        message (str): User's text message (may be empty if an image is given)
        image: Optional image input, forwarded to the processor as-is
        chat_history (list): Previous conversation history (may be None/empty)

    Returns:
        tuple: Updated chat history (a new list; the argument is not mutated)
        and an empty string used to clear the message input
    """
    # Work on a copy so the caller's list is never modified in place.
    history = list(chat_history) if chat_history else []

    if not message and image is None:
        return history, ""

    # Replay the text of earlier turns; images from earlier turns are not kept.
    messages = _history_to_messages(history)

    # Add current message with optional image
    current_content = []
    if image is not None:
        current_content.append({"type": "image", "image": image})
    if message:
        current_content.append({"type": "text", "text": message})
    messages.append({"role": "user", "content": current_content})

    # Prepare inputs
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    )
    inputs = inputs.to(model.device)

    # Generate response
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=256)

    # Drop the prompt tokens so only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    # Update chat history, matching the layout already in use.
    user_text = message if message else "[Image provided]"
    if history and isinstance(history[0], (list, tuple)):
        history.append((user_text, output_text))
    else:
        history.append({"role": "user", "content": user_text})
        history.append({"role": "assistant", "content": output_text})

    return history, ""

# Build the Gradio UI: chat panel + image upload, message row, clear button.
with gr.Blocks(title="Qwen3-VL Chat") as demo:
    # Page header.
    gr.Markdown(
        """
        # 🎨 Qwen3-VL Chat
        Chat with Qwen3-VL-2B-Instruct - A multimodal AI that can understand both text and images!
        
        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )

    # Top row: conversation on the left (wider), image upload on the right.
    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(
                type="messages",
                label="Chat History",
                height=600,
                show_copy_button=True,
            )

        with gr.Column(scale=1):
            image_input = gr.Image(
                type="pil",
                label="Upload Image (Optional)",
                sources=["upload", "clipboard"],
                interactive=True,
            )

    # Message entry row.
    with gr.Row():
        message_input = gr.Textbox(
            label="Message",
            placeholder="Type your message here...",
            lines=2,
            scale=4,
        )
        send_btn = gr.Button("Send", scale=1, variant="primary")

    # Reset row.
    with gr.Row():
        clear_btn = gr.Button("Clear Chat", variant="secondary")

    gr.Markdown(
        """
        ### Tips:
        - Upload an image to ask questions about it
        - Describe what you see or ask for analysis
        - The model can answer questions about images and text
        """
    )

    def send_message(msg, img, history):
        """Forward the UI inputs to ``qwen_chat`` and return its result."""
        return qwen_chat(msg, img, history)

    # Clicking "Send" and pressing Enter in the textbox are wired identically:
    # both send (message, image, history) and update (chatbot, message box).
    for register_event in (send_btn.click, message_input.submit):
        register_event(
            send_message,
            inputs=[message_input, image_input, chatbot],
            outputs=[chatbot, message_input],
        )

    # "Clear Chat" empties the history, removes the image and blanks the textbox.
    clear_btn.click(
        lambda: ([], None, ""),
        outputs=[chatbot, image_input, message_input],
    )

if __name__ == "__main__":
    demo.launch(share=False)