JerryAnto committed
Commit 501270c · 1 Parent(s): 845ac15

Upload app.py

Files changed (1)
  1. app.py +97 -0
app.py ADDED
@@ -0,0 +1,97 @@
+ # -*- coding: utf-8 -*-
+ """caption.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/17BgQi1eU254RKp6BKOdC-Kfr1LqIwKmj
+
+ ## Image Caption Generator
+
+ In Colab, PyTorch and PIL come preinstalled, so you only need to install **transformers** from Hugging Face.
+ """
+
+ #!pip install transformers
+
+ #from google.colab import drive
+ #drive.mount('/content/drive')
+
+ from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
+ import torch
+ from PIL import Image
+
+ # Load the pretrained ViT-encoder / GPT-2-decoder captioning model and its preprocessing utilities.
+ model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+ feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+ tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+
+ # Run on GPU when available, otherwise fall back to CPU.
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ # Generation settings: cap caption length and decode with beam search.
+ max_length = 16
+ num_beams = 4
+ gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
+
+ def predict_step(image_paths):
+     images = []
+     for image_path in image_paths:
+         i_image = Image.open(image_path)
+         if i_image.mode != "RGB":
+             i_image = i_image.convert(mode="RGB")
+
+         images.append(i_image)
+
+     # Batch-preprocess the images into pixel tensors and move them to the model's device.
+     pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
+     pixel_values = pixel_values.to(device)
+
+     output_ids = model.generate(pixel_values, **gen_kwargs)
+
+     preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+     preds = [pred.strip() for pred in preds]
+     return preds
+
+ #predict_step(['/content/drive/MyDrive/caption generator/horses.png'])
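+ # predict_step also handles a batch of local files in one call, e.g.
+ # (hypothetical paths, reusing the example images listed further below):
+ # predict_step(["horses.png", "persons.jpeg"])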
+
+ def caption_generator(img):
+     # Gradio passes the uploaded image in as a PIL.Image, so no file I/O is needed here.
+     if img.mode != "RGB":
+         img = img.convert(mode="RGB")
+     pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values
+     generated_ids = model.generate(pixel_values.to(device), num_beams=5)
+     generated_sentences = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+     return generated_sentences[0].strip()
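+
+ # Quick local sanity check for caption_generator (hypothetical: assumes the
+ # example file "horses.png" sits next to app.py):
+ # print(caption_generator(Image.open("horses.png")))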
+
+ import gradio as gr
+
+ inputs = [
+     gr.inputs.Image(type="pil", label="Original Image")
+ ]
+
+ outputs = [
+     gr.outputs.Textbox(label="Caption")
+ ]
+
+ title = "Image Captioning using ViT + GPT2"
+ description = "ViT and GPT2 are used to generate an image caption for the uploaded image. The COCO dataset was used for training. This image captioning model might have some biases that we couldn't identify during our stress testing, so if you find any bias (gender, race and so on) please use the `Flag` button to flag the image with bias."
+ article = " <a href='https://huggingface.co/sachin/vit2distilgpt2'>Model Repo on Hugging Face Model Hub</a>"
+ examples = [
+     ["horses.png"],
+     ["persons.jpeg"],
+     ["football_player"]
+ ]
+
+ gr.Interface(
+     caption_generator,  # takes the PIL image supplied by the Image input above
+     inputs,
+     outputs,
+     title=title,
+     description=description,
+     article=article,
+     examples=examples,
+     theme="huggingface",
+ ).launch(debug=True, enable_queue=True)