"""Gradio demo for a fine-tuned Qwen3-VL-2B vision-language model.

Loads the model once at startup, then serves a simple image + prompt
interface. Intended to run as a Hugging Face Space (``python app.py``).
"""

import os

import gradio as gr
import torch
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

MODEL_ID = "yuyan-chen/Lep-Description-Qwen3-VL-2B-Instruct"

# HF_TOKEN is optional; needed only if the model repo is gated/private.
token = os.environ.get("HF_TOKEN")

device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(
    MODEL_ID, trust_remote_code=True, token=token
)
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # fp16 only makes sense on GPU; CPU inference needs fp32.
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",
    trust_remote_code=True,
    token=token,
)


def infer(image, text):
    """Run one round of visual question answering.

    Args:
        image: PIL image from the Gradio input (may be ``None``).
        text: User prompt string (may be empty).

    Returns:
        The model's generated answer as a string, or a short
        instruction message when the inputs are incomplete.
    """
    # Gradio delivers None / "" when the user submits without input;
    # return a hint instead of crashing in the processor.
    if image is None:
        return "Please upload an image first."
    if not text or not text.strip():
        return "Please enter a prompt."

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": text},
            ],
        }
    ]

    # BUGFIX: the processor must be given the chat template — calling
    # processor(messages, ...) directly does not format the conversation
    # for Qwen-VL models. apply_chat_template tokenizes text and images
    # together and appends the generation prompt.
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )
    # With device_map="auto" the model decides placement; follow it.
    inputs = inputs.to(model.device)

    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
        )

    # BUGFIX: generate() returns prompt + completion; slice off the
    # prompt tokens so only the model's answer is decoded.
    generated_ids = output_ids[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]


demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Image(type="pil", label="Image"),
        gr.Textbox(label="Prompt"),
    ],
    outputs=gr.Textbox(label="Response"),
    title="Qwen3-VL-2B (Finetuned)",
    description="Upload an image and ask a question.",
)

if __name__ == "__main__":
    demo.launch()