# Captured from a Hugging Face Spaces page (Space status at capture time: Sleeping).
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image
import gradio as gr

# Hub checkpoint from which the model, image processor and tokenizer are loaded.
model_name = "aryan083/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(model_name)
# ViTImageProcessor is used here in place of the earlier ViTFeatureExtractor.
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
def predict_caption(image):
    """Generate a text caption for a single image.

    Args:
        image: The image to caption, or ``None`` when no image was supplied.
            PIL images whose mode is not RGB are converted to RGB first;
            objects without a ``mode`` attribute are passed through as-is.

    Returns:
        The decoded caption with surrounding whitespace stripped, or
        ``None`` when ``image`` is ``None``.
    """
    if image is None:
        return None

    # Grayscale / RGBA / palette images are normalised to RGB before
    # preprocessing (presumably the image processor expects 3-channel
    # input -- confirm against the checkpoint's preprocessor config).
    if getattr(image, "mode", "RGB") != "RGB":
        image = image.convert("RGB")

    # The processor takes a batch, so wrap the single image in a list.
    pixel_values = feature_extractor(images=[image], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # Sampling is enabled, so repeated calls may yield different captions.
    output_ids = model.generate(
        pixel_values,
        do_sample=True,
        max_length=16,
        num_beams=4,
        temperature=0.7,
    )

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return preds[0].strip()
# Create Gradio interface
iface = gr.Interface(
    fn=predict_caption,
    inputs=gr.Image(type="pil"),  # request the upload as a PIL image
    outputs=gr.Textbox(label="Generated Caption"),
    title="Image Captioning",
    description="Upload an image and get its description generated using ViT-GPT2",
    # examples=[["assets/example1.jpg"]]  # Add example images if you have any
)

# Start the web app.
iface.launch()