Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer | |
| from PIL import Image | |
| import torch | |
# Hugging Face Hub checkpoint shared by the model, image processor and tokenizer.
model_name = "nlpconnect/vit-gpt2-image-captioning"

# Choose the inference device up front: CUDA when torch reports it, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the three pretrained components once at import time so that every
# request handled by generate_caption() reuses the same objects.
print("Loading model components...")
model = VisionEncoderDecoderModel.from_pretrained(model_name)
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Model loaded!")

# Move the model to the selected device; inputs are moved to the same device
# inside generate_caption().
model.to(device)
def generate_caption(image):
    """Return a text caption for a PIL image.

    Args:
        image: A PIL image, or ``None`` when nothing was uploaded.

    Returns:
        The first decoded caption with surrounding whitespace stripped, or
        the prompt ``"Please upload an image."`` when ``image`` is ``None``.
    """
    # Guard clause: nothing to caption.
    if image is None:
        return "Please upload an image."

    # Non-RGB inputs are converted so the image has 3 channels.
    rgb_image = image if image.mode == "RGB" else image.convert(mode="RGB")

    # Preprocess a batch of one image into PyTorch tensors, then move the
    # pixel tensor to the same device as the model.
    encoded = feature_extractor(images=[rgb_image], return_tensors="pt")
    pixel_batch = encoded.pixel_values.to(device)

    # Generate with num_beams=4 and max_length=16.
    generated_ids = model.generate(pixel_batch, max_length=16, num_beams=4)

    # Decode the generated ids to text, dropping special tokens; only one
    # image was submitted, so the first entry is the caption.
    captions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    first_caption = captions[0]
    return first_caption.strip()
# Input widget: hands the uploaded picture to the callback as a PIL image.
_image_input = gr.Image(type="pil", label="Upload an Image")

# Output widget: shows the string returned by the callback.
_caption_output = gr.Textbox(label="AI Caption")

# Gradio UI wiring the image input to generate_caption() and its text output.
demo = gr.Interface(
    fn=generate_caption,
    inputs=_image_input,
    outputs=_caption_output,
    title="AI Image Captioner",
    description="Upload any photo, and the AI will describe what it sees using a Vision Transformer + GPT-2 model!",
    examples=[],
    theme="default",
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()