"""Caption images with the ``nlpconnect/vit-gpt2-image-captioning`` model.

Loads the pretrained vision encoder-decoder model, its tokenizer and its image
processor at import time, defines helpers to load an image from a URL or a
local path and to generate a caption for it, then captions two sample images.
"""

# Standard library
import io
import os
import urllib.parse as parse

# Web links handler
import requests
# Backend
import torch
# Image processing
from PIL import Image
from IPython.display import display
# Transformer and pretrained model
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, GPT2TokenizerFast
# Managing loading processing
from tqdm import tqdm

# Single source of truth for the checkpoint shared by model, tokenizer and processor.
MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"

# Seconds to wait for the image server; without a timeout `requests` can block forever.
REQUEST_TIMEOUT = 30

# Assign available GPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# ViT encoder-decoder model
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME).to(device)

# Corresponding tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)

# Image processor
image_processor = ViTImageProcessor.from_pretrained(MODEL_NAME)


def check_url(string):
    """Return True if *string* parses as a URL with a scheme, host and path.

    All three components must be non-empty, so ``"https://example.com"``
    (empty path) is rejected while ``"https://example.com/a.jpg"`` is accepted.

    Args:
        string: Candidate URL.

    Returns:
        bool: True when scheme, network location and path are all present;
        False otherwise, including when the value cannot be parsed.
    """
    try:
        result = parse.urlparse(string)
    except (ValueError, TypeError, AttributeError):
        # Malformed URLs raise ValueError; non-string inputs raise
        # TypeError/AttributeError. Anything else should propagate.
        return False
    return all((result.scheme, result.netloc, result.path))


def load_image(image_path):
    """Open an image from a URL or from a local file.

    Args:
        image_path: URL (as accepted by ``check_url``) or filesystem path.

    Returns:
        The opened ``PIL.Image.Image``, or ``None`` when *image_path* is
        neither a valid URL nor an existing path.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    if check_url(image_path):
        response = requests.get(image_path, timeout=REQUEST_TIMEOUT)
        # Fail on 4xx/5xx here instead of handing an error page to Pillow.
        response.raise_for_status()
        # `response.content` is the fully read, content-decoded body; wrapping
        # it in BytesIO keeps the data available for Pillow's lazy decoding.
        return Image.open(io.BytesIO(response.content))
    if os.path.exists(image_path):
        return Image.open(image_path)
    return None


def get_caption(model, image_processor, tokenizer, image_path):
    """Generate, print and return a caption for the image at *image_path*.

    Args:
        model: Captioning model exposing ``generate``.
        image_processor: Callable turning an image into model input tensors.
        tokenizer: Tokenizer used to decode the generated token ids.
        image_path: URL or local path of the image to caption.

    Returns:
        str: The first decoded caption, with special tokens removed.

    Raises:
        ValueError: If no image could be loaded from *image_path*.
    """
    image = load_image(image_path)
    if image is None:
        raise ValueError(
            f"Could not load an image from {image_path!r}: "
            "not a valid URL or an existing file"
        )

    # Grayscale, RGBA and palette images are converted to RGB before
    # preprocessing, as in the model card's reference usage.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Preprocess the image and move the tensors to the module-level device
    inputs = image_processor(image, return_tensors="pt").to(device)

    # Generate caption token ids
    output = model.generate(**inputs)

    # Decode the output; take the first (only) sequence in the batch
    caption = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    print(caption)
    return caption


# Loading URLs
url = "https://images.pexels.com/photos/101667/pexels-photo-101667.jpeg?auto=compress&cs=tinysrgb&w=600"
urlNew = "https://images.pexels.com/photos/406014/pexels-photo-406014.jpeg?auto=compress&cs=tinysrgb&w=600"

# Display image
display(load_image(url))

# Display captions
get_caption(model, image_processor, tokenizer, url)
get_caption(model, image_processor, tokenizer, urlNew)