Hugging Face Spaces listing header (Space status: Sleeping).
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

# Select the compute device: prefer CUDA when available, otherwise CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the SmolVLM processor (tokenizer + image preprocessing) and the model.
# bfloat16 halves memory use; FlashAttention-2 only works on CUDA, so fall
# back to the default "eager" attention implementation when running on CPU.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)
| # Define the function to answer questions | |
def answer_question(image, question):
    """Answer a free-form question about an uploaded image with SmolVLM.

    Args:
        image: Image from the Gradio image input (PIL image — TODO confirm
            the component's default type against the Gradio version in use).
        question: The user's question as plain text.

    Returns:
        The decoded model output as a string (includes the prompt text,
        since the full generated sequence is decoded).
    """
    # SmolVLM-Instruct expects chat-formatted input containing an explicit
    # image placeholder; feeding the raw question string to the processor
    # omits the <image> token, so the model cannot ground its answer in the
    # picture. Build the documented chat message structure instead.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": question},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
    # Without max_new_tokens, generate() stops at the model's short default
    # length and truncates most answers.
    outputs = model.generate(**inputs, max_new_tokens=500)
    answer = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    return answer
| # Gradio interface | |
# Gradio UI: one image upload and one text box, wired to answer_question.
# The string shorthands "image"/"text" expand to the default Image and
# Textbox components.
interface = gr.Interface(
    fn=answer_question,
    inputs=["image", "text"],
    outputs="text",
    title="SmolVLM - Vision-Language Question Answering",
    description="Upload an image and ask a question to get an answer powered by SmolVLM.",
)

if __name__ == "__main__":
    # Start the local Gradio server only when executed as a script,
    # not when this module is imported.
    interface.launch()