"""Chart Analyzer: a Gradio app that captions uploaded chart/visualization images.

Uses the Apache-2.0-licensed ``nlpconnect/vit-gpt2-image-captioning``
VisionEncoderDecoder model (ViT encoder + GPT-2 decoder).
"""

import gradio as gr
import torch
from PIL import Image
from transformers import AutoTokenizer, ViTImageProcessor, VisionEncoderDecoderModel

# Model checkpoint (Apache-2.0 licensed, safe to use).
MODEL_ID = "nlpconnect/vit-gpt2-image-captioning"

# Load model, image processor, and tokenizer once at startup.
model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
feature_extractor = ViTImageProcessor.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# Inference only: disable dropout / switch norm layers to eval behavior.
model.eval()


def caption_image(image):
    """Generate a natural-language caption for a PIL image.

    Args:
        image: A ``PIL.Image.Image`` from the Gradio image input, or ``None``
            when the user submits without uploading anything.

    Returns:
        The generated caption string, or a prompt message if no image was given.
    """
    # Gradio passes None when no image was uploaded; fail gracefully.
    if image is None:
        return "Please upload an image first."

    # ViT expects 3-channel input; uploads may be RGBA (PNG) or grayscale.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Convert image to a pixel-value tensor on the model's device.
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)

    # Generate caption; no_grad avoids building an autograd graph at inference.
    with torch.no_grad():
        output_ids = model.generate(pixel_values, max_length=50, num_beams=4)

    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption.strip()


# Build Gradio app
demo = gr.Interface(
    fn=caption_image,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Chart Analyzer",
    description="Upload a chart/visualization image and get a description of it.",
)

if __name__ == "__main__":
    demo.launch()