Hugging Face Spaces listing header (Space status: Sleeping).
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

# Select the compute device: prefer CUDA when available, otherwise CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the SmolVLM processor (tokenizer + image preprocessing) and the model.
# bfloat16 halves memory use; FlashAttention-2 only works on CUDA, so fall
# back to the default "eager" attention implementation when running on CPU.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)
| # Define the function to answer questions | |
def answer_question(image, question):
    """Answer a free-form question about an uploaded image with SmolVLM.

    Args:
        image: Image from the Gradio image input (PIL image — TODO confirm
            the component's default type against the Gradio version in use).
        question: The user's question as plain text.

    Returns:
        The decoded model output as a string (includes the prompt text,
        since the full generated sequence is decoded).
    """
    # SmolVLM-Instruct expects chat-formatted input containing an explicit
    # image placeholder; feeding the raw question string to the processor
    # omits the <image> token, so the model cannot ground its answer in the
    # picture. Build the documented chat message structure instead.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": question},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
    # Without max_new_tokens, generate() stops at the model's short default
    # length and truncates most answers.
    outputs = model.generate(**inputs, max_new_tokens=500)
    answer = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    return answer
| # Gradio interface | |
# Gradio UI: one image upload and one text box, wired to answer_question.
# The string shorthands "image"/"text" expand to the default Image and
# Textbox components.
interface = gr.Interface(
    fn=answer_question,
    inputs=["image", "text"],
    outputs="text",
    title="SmolVLM - Vision-Language Question Answering",
    description="Upload an image and ask a question to get an answer powered by SmolVLM.",
)

if __name__ == "__main__":
    # Start the local Gradio server only when executed as a script,
    # not when this module is imported.
    interface.launch()