import gradio as gr
import spaces
import torch
import os

from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
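
# Load the model weights once at startup so every request reuses them;
# torch_dtype="auto" keeps the checkpoint's native precision and
# device_map="auto" places the weights on the available device.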
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)
model.eval()
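
# The processor bundles the tokenizer and the image preprocessor for this checkpoint.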
processor = AutoProcessor.from_pretrained(MODEL_ID)
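

# On ZeroGPU Spaces, @spaces.GPU attaches a GPU to this function for the
# duration of each call; on regular hardware it is effectively a no-op.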
@spaces.GPU
def qwen_vl_inference(image_path: str | None, text_input: str | None = None):
    if image_path is None:
        return "Please upload an image first."
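
    # qwen_vl_utils accepts local images as file:// URIs; use an absolute
    # path so it resolves regardless of the working directory.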
    file_uri = f"file://{os.path.abspath(image_path)}"
    user_text = (text_input or "").strip() or "Describe this image."
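
    # A single user turn pairing the image with the question, in the chat
    # format the Qwen2.5-VL template expects.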
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": file_uri},
                {"type": "text", "text": user_text},
            ],
        }
    ]
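
    # Render the prompt via the chat template, pull out the vision inputs,
    # and tokenize everything into model-ready tensors.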
    chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
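
    # Move tensors onto the model's device; non-tensor entries pass through.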
    inputs = {k: (v.to(model.device) if isinstance(v, torch.Tensor) else v) for k, v in inputs.items()}
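
    # Generate, then drop the echoed prompt tokens so only the new completion
    # is decoded.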
    gen_ids = model.generate(**inputs, max_new_tokens=512)
    trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], gen_ids)]
    output = processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return output


DESCRIPTION = (
    "[Qwen2.5-VL-7B-Instruct demo](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) – "
    "upload an image and ask anything about it."
)
| | css = """ |
| | #output_text { |
| | height: 500px; |
| | overflow: auto; |
| | border: 1px solid #ccc; |
| | } |
| | """ |
| |
|
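
# Two-column layout: image and question on the left, model output on the right.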
with gr.Blocks(css=css, theme="origin") as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(label="Upload Image", type="filepath")
            text_input = gr.Textbox(label="Question")
            submit_btn = gr.Button("Submit")
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Model Output", elem_id="output_text")
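
    # With cache_examples=True, Gradio runs the example once at startup and
    # serves the cached result instead of re-invoking the model.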
    gr.Examples(
        examples=[["example.webp", "Explain this image"]],
        inputs=[input_image, text_input],
        outputs=output_text,
        fn=qwen_vl_inference,
        cache_examples=True,
        label="Try an example",
    )

    submit_btn.click(qwen_vl_inference, [input_image, text_input], [output_text])


if __name__ == "__main__":
    demo.launch()