{ "cells": [ { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\Srujan Jujare\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer\n", "import torch\n", "from PIL import Image" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "VisionEncoderDecoderModel(\n", " (encoder): ViTModel(\n", " (embeddings): ViTEmbeddings(\n", " (patch_embeddings): ViTPatchEmbeddings(\n", " (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))\n", " )\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (encoder): ViTEncoder(\n", " (layer): ModuleList(\n", " (0-11): 12 x ViTLayer(\n", " (attention): ViTAttention(\n", " (attention): ViTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (output): ViTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (intermediate): ViTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): ViTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " )\n", " )\n", " )\n", " (layernorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (pooler): ViTPooler(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (activation): Tanh()\n", " )\n", " )\n", " (decoder): GPT2LMHeadModel(\n", " (transformer): GPT2Model(\n", " (wte): Embedding(50257, 768)\n", " (wpe): Embedding(1024, 768)\n", " (drop): Dropout(p=0.1, inplace=False)\n", " (h): ModuleList(\n", " (0-11): 12 x GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (crossattention): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (q_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_cross_attn): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", " )\n", ")" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": 
[ "model = VisionEncoderDecoderModel.from_pretrained(\"nlpconnect/vit-gpt2-image-captioning\")\n", "feature_extractor = ViTImageProcessor.from_pretrained(\"nlpconnect/vit-gpt2-image-captioning\")\n", "tokenizer = AutoTokenizer.from_pretrained(\"nlpconnect/vit-gpt2-image-captioning\")\n", "\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "model.to(device)\n" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "max_length = 16\n", "num_beams = 4\n", "gen_kwargs = {\"max_length\": max_length, \"num_beams\": num_beams}\n", "def predict_step(image_paths):\n", " images = []\n", " for image_path in image_paths:\n", " i_image = Image.open(image_path)\n", " if i_image.mode != \"RGB\":\n", " i_image = i_image.convert(mode=\"RGB\")\n", "\n", " images.append(i_image)\n", "\n", " pixel_values = feature_extractor(images=images, return_tensors=\"pt\").pixel_values\n", " pixel_values = pixel_values.to(device)\n", "\n", " output_ids = model.generate(pixel_values, **gen_kwargs)\n", "\n", " preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)\n", " preds = [pred.strip() for pred in preds]\n", " return preds" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.\n", "You may ignore this warning if your `pad_token_id` (50256) is identical to the `bos_token_id` (50256), `eos_token_id` (50256), or the `sep_token_id` (None), and your input is not padded.\n" ] }, { "data": { "text/plain": [ "['a clock on a dashboard of a car']" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predict_step(['D:\\\\Validation\\\\Class 2\\\\i17.jpg'])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }