Spaces:

broadfield-dev
/

qwen3-vl-2b-instruct

Paused

App Files Files Community

qwen3-vl-2b-instruct / app.py

broadfield-dev

Update app.py

1aaaced verified 5 months ago

raw

history blame contribute delete

3.46 kB

	import gradio as gr
	import torch
	from PIL import Image
	import requests
	from io import BytesIO
	from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

	# --- Configuration ---
	MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"
	CPU_DEVICE = "cpu"

	# --- Model and Processor Loading ---
	print("Loading model and processor... This will take a few minutes on a CPU.")
	processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
	model = Qwen3VLForConditionalGeneration.from_pretrained(
	MODEL_PATH,
	trust_remote_code=True,
	dtype="auto", # Use 'auto' for dtype for better compatibility
	device_map="auto" # This is the key for CPU (and GPU) compatibility
	)
	print("Model and processor loaded successfully.")

	# --- Inference Function ---
	def process_and_generate(image_input, text_prompt):
	"""
	Processes the image and text prompt, and generates a response from the model.
	"""
	if image_input is None or not text_prompt.strip():
	return "Please provide both an image and a text prompt."

	# Convert Gradio's numpy array to a PIL Image
	pil_image = Image.fromarray(image_input)

	# Prepare the messages payload for the model
	messages = [
	{
	"role": "user",
	"content": [
	{"type": "image", "image": pil_image},
	{"type": "text", "text": text_prompt},
	],
	}
	]

	print("Processing inputs and generating response... This will be slow.")
	try:
	# Preparation for inference
	inputs = processor.apply_chat_template(
	messages,
	tokenize=True,
	add_generation_prompt=True,
	return_dict=True,
	return_tensors="pt"
	)
	inputs = inputs.to(model.device)

	# Inference: Generation of the output
	generated_ids = model.generate(**inputs, max_new_tokens=1024)

	# To get only the new tokens, we trim the input IDs from the generated IDs
	generated_ids_trimmed = [
	out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
	]

	# Decode the trimmed IDs to text
	output_text = processor.batch_decode(
	generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
	)

	# batch_decode returns a list, we return the first element
	return output_text[0]

	except Exception as e:
	return f"An error occurred during generation: {str(e)}"

	# --- Gradio Interface ---
	with gr.Blocks() as demo:
	gr.Markdown(
	"""
	# Qwen3-VL-2B-Instruct CPU Demo
	This Space runs the `Qwen/Qwen3-VL-2B-Instruct` model using the standard `transformers` library.
	Warning: Running this on a free CPU Space is very slow. Duplicate this space for solo experience.
	"""
	)

	with gr.Row():
	with gr.Column():
	image_input = gr.Image(type="numpy", label="Upload Image")
	text_prompt = gr.Textbox(label="Prompt", placeholder="e.g., Describe this image in detail.")
	submit_button = gr.Button("Generate Response")
	with gr.Column():
	output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)

	submit_button.click(
	fn=process_and_generate,
	inputs=[image_input, text_prompt],
	outputs=output_text
	)



	if __name__ == "__main__":
	demo.launch()