# ashwml's picture
# Upload 233 files
# 5aa312d
# Web links Handler
import requests
# Backend
import torch
# Image Processing
from PIL import Image
from IPython.display import display
# Transformer and Pretrained Model
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, GPT2TokenizerFast
# Progress bars for long-running loops
from tqdm import tqdm
# One checkpoint supplies the model weights, the tokenizer and the image
# processor, so the identifier is spelled out only once.
_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Vision encoder-decoder captioning model, moved onto the selected device.
model = VisionEncoderDecoderModel.from_pretrained(_CHECKPOINT)
model = model.to(device)

# GPT-2 tokenizer used to turn generated token ids back into text.
tokenizer = GPT2TokenizerFast.from_pretrained(_CHECKPOINT)

# Image processor that prepares images as model inputs.
image_processor = ViTImageProcessor.from_pretrained(_CHECKPOINT)
import urllib.parse as parse
import os
def check_url(string):
    """Return True when *string* parses as a URL with scheme, host and path.

    All three components must be non-empty, so a bare host such as
    ``"https://example.com"`` (empty path) is reported as not a URL, as is
    a plain filesystem path such as ``"images/cat.jpg"``.

    Args:
        string: Candidate URL.

    Returns:
        bool: True if the scheme, network location and path are all present;
        False otherwise, including when the value cannot be parsed.
    """
    try:
        result = parse.urlparse(string)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed URL (e.g. an unterminated IPv6 literal).
        # TypeError / AttributeError: the argument is not string-like.
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
        return False
    return all([result.scheme, result.netloc, result.path])
def load_image(image_path, timeout=30):
    """Open an image from a URL or from a local file.

    Args:
        image_path: A URL (anything ``check_url`` accepts) or a filesystem
            path to an existing file.
        timeout: Seconds to wait for the server when downloading; without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The opened ``PIL.Image.Image``.

    Raises:
        requests.RequestException: The download failed, timed out, or the
            server answered with an HTTP error status.
        FileNotFoundError: ``image_path`` is neither a URL nor an existing
            path (previously this case fell through and returned ``None``).
    """
    if check_url(image_path):
        # Function-scope import keeps this block self-contained.
        from io import BytesIO

        response = requests.get(image_path, timeout=timeout)
        # Fail here on 4xx/5xx instead of handing an error page to PIL.
        response.raise_for_status()
        # Buffer the full body so the connection is released before decoding.
        return Image.open(BytesIO(response.content))
    if os.path.exists(image_path):
        return Image.open(image_path)
    raise FileNotFoundError(
        f"{image_path!r} is neither a valid URL nor an existing file path"
    )
def get_caption(model, image_processor, tokenizer, image_path):
    """Generate, print and return a caption for one image.

    Args:
        model: Captioning model exposing ``generate`` and ``device``.
        image_processor: Callable that turns an image into model inputs
            (called with ``return_tensors="pt"``).
        tokenizer: Tokenizer whose ``batch_decode`` maps generated ids to text.
        image_path: URL or local path, resolved through ``load_image``.

    Returns:
        str: The first decoded caption, with special tokens removed.

    Raises:
        FileNotFoundError: No image could be loaded from ``image_path``.
    """
    image = load_image(image_path)
    # Guard against a loader that yields nothing, so the failure names the
    # offending path instead of surfacing deep inside the image processor.
    if image is None:
        raise FileNotFoundError(f"Could not load an image from {image_path!r}")

    # Greyscale, palette and RGBA images are converted to three-channel RGB
    # before preprocessing, as in the checkpoint's reference usage.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Place the inputs on the device of the model that was actually passed
    # in, rather than relying on a module-level device setting.
    inputs = image_processor(image, return_tensors="pt").to(model.device)

    # Generate token ids, then decode the first (only) sequence.
    output_ids = model.generate(**inputs)
    caption = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
    print(caption)
    return caption
# Sample images to caption.
url = "https://images.pexels.com/photos/101667/pexels-photo-101667.jpeg?auto=compress&cs=tinysrgb&w=600"
urlNew = "https://images.pexels.com/photos/406014/pexels-photo-406014.jpeg?auto=compress&cs=tinysrgb&w=600"

# Show the first sample image.
display(load_image(url))

# Caption both samples in order; get_caption prints each caption itself.
for image_url in (url, urlNew):
    get_caption(model, image_processor, tokenizer, image_url)