"""Gradio demo: ask a question about an uploaded image using DeepSeek-VL."""

import gradio as gr
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor

# Single source of truth for the checkpoint used by both the class default
# and the module-level instance below.
# NOTE(review): confirm this repo id exists on the Hugging Face Hub and that it
# can be loaded through AutoProcessor / AutoModelForVision2Seq.
MODEL_PATH = "deepseek-ai/deepseek-vl-7b"


class DeepSeekVL:
    """Pair a Hugging Face processor with a vision-to-sequence model.

    Attributes:
        device: Device string the model and the processed inputs are moved to.
        processor: Object returned by ``AutoProcessor.from_pretrained``.
        model: Object returned by ``AutoModelForVision2Seq.from_pretrained``,
            loaded with ``torch.float32`` weights and moved to ``device``.
    """

    def __init__(self, model_path=MODEL_PATH, device="cpu"):
        """Load the processor and the model for ``model_path`` onto ``device``.

        Args:
            model_path: Identifier or path passed to ``from_pretrained``.
            device: Target device for the model (default ``"cpu"``).
        """
        self.device = device
        self.processor = AutoProcessor.from_pretrained(model_path)
        self.model = AutoModelForVision2Seq.from_pretrained(
            model_path,
            torch_dtype=torch.float32,
        ).to(device)

    def generate(self, image, question, max_new_tokens=128):
        """Answer ``question`` about ``image`` and return the decoded text.

        Args:
            image: Image handed to the processor as ``images=``.
            question: Prompt text handed to the processor as ``text=``.
            max_new_tokens: Upper bound on generated tokens, forwarded to
                ``model.generate``.

        Returns:
            The first decoded sequence, with special tokens skipped.
        """
        inputs = self.processor(
            text=question,
            images=image,
            return_tensors="pt",
        ).to(self.device)

        # Inference only: gradients are never needed here.
        with torch.no_grad():
            output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)

        # One prompt goes in, so only the first decoded sequence is returned.
        # NOTE(review): depending on the model, the decoded text may also
        # contain the prompt itself - confirm and trim if necessary.
        return self.processor.batch_decode(output_ids, skip_special_tokens=True)[0]


# Initialize DeepSeek-VL model (CPU for free Spaces)
model = DeepSeekVL(model_path=MODEL_PATH, device="cpu")


def qa(image, question):
    """Run DeepSeek-VL inference: image + question -> answer.

    Args:
        image: Value supplied by the image input (configured as ``type="pil"``).
        question: Text supplied by the question textbox.

    Returns:
        The answer string produced by ``model.generate``.

    Raises:
        gr.Error: If no image was provided or the question is blank, so the
            UI shows a readable message instead of a processor/model error.
    """
    if image is None:
        raise gr.Error("Please upload an image.")
    if not question or not question.strip():
        raise gr.Error("Please enter a question.")
    return model.generate(image, question)


demo = gr.Interface(
    fn=qa,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Enter your question"),
    ],
    outputs="text",
    title="DeepSeek-VL Multimodal QA Demo",
    description="Upload an image and enter a question. Experience DeepSeek-VL's vision-language capabilities.",
)


if __name__ == "__main__":
    demo.launch()