# FAAM-demo / app.py
# Source: Hugging Face Space (KasKniesmeijer) — "Add SmolVLM with WebGPU frontend"
# Commit: cab1df1 | raw | history | blame | 1.12 kB
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
# Set the device (CPU or CUDA)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Initialize processor and model at module import time (downloads weights on
# first run; subsequent runs hit the local Hugging Face cache).
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
"HuggingFaceTB/SmolVLM-Instruct",
torch_dtype=torch.bfloat16,
# NOTE(review): flash_attention_2 requires the `flash-attn` package to be
# installed — verify the deployment image provides it, otherwise loading
# will raise on CUDA hosts. CPU falls back to the default eager attention.
_attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)
# Define the function to answer questions
def answer_question(image, question):
    """Answer a free-form question about an image using SmolVLM.

    Args:
        image: PIL image (or array Gradio converts) to reason about.
        question: Natural-language question about the image.

    Returns:
        The model's answer as a plain string (assistant turn only).
    """
    # SmolVLM is an instruct model: it expects a chat-templated prompt with
    # an explicit image placeholder, not raw text. Build the user turn and
    # let the processor insert the image token(s).
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": question},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
    # Without max_new_tokens, generate() stops at the tiny library default
    # (~20 tokens) and truncates most answers.
    outputs = model.generate(**inputs, max_new_tokens=500)
    decoded = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    # batch_decode returns prompt + completion; keep only the assistant turn
    # so the user isn't shown their own question echoed back.
    return decoded.split("Assistant:")[-1].strip()
# Wire the answering function into a minimal Gradio web UI: one image
# upload, one text box in, one text box out.
interface = gr.Interface(
    answer_question,
    inputs=["image", "text"],
    outputs="text",
    description="Upload an image and ask a question to get an answer powered by SmolVLM.",
    title="SmolVLM - Vision-Language Question Answering",
)

# Launch the server only when run as a script (not when imported, e.g. by
# a Spaces runner that calls launch itself).
if __name__ == "__main__":
    interface.launch()