Spaces:
Sleeping
Sleeping
File size: 3,202 Bytes
c1596ac | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 | import os
import json
import random
from PIL import Image
from torch.utils.data import Dataset
import torch
class CaptionDataset(Dataset):
def __init__(
self,
json_path,
image_dir,
w2i,
tokenizer: callable,
split='train',
transform=None,
max_len=30,
train_num_caption=1,
debug=False,
use_subword=False,
sp_model_path="tokenizer.model"
):
with open(json_path, 'r') as f:
self.data = json.load(f)
# 디버깅용
if debug:
self.data= self.data[:10]
if split == "val":
self.is_val = True
else:
self.is_val = False
self.image_dir = image_dir
self.w2i = w2i
self.transform = transform
self.max_len = max_len
self.tokenizer = tokenizer
self.train_num_caption = train_num_caption
self.use_subword = use_subword
if self.use_subword:
import sentencepiece as spm
self.sp = spm.SentencePieceProcessor()
self.sp.load(sp_model_path)
def __len__(self):
return len(self.data)
def encode_caption(self, caption):
if self.use_subword:
words = self.sp.encode(caption.lower(), out_type=str)
tokens = (
[self.w2i["<sos>"]] +
[self.w2i.get(w, self.w2i["<unk>"]) for w in words] +
[self.w2i["<eos>"]]
)
else:
words = self.tokenizer(caption)
tokens = (
[self.w2i["<sos>"]] +
[self.w2i.get(w, self.w2i["<unk>"]) for w in words] +
[self.w2i["<eos>"]]
)
# truncation
if len(tokens) > self.max_len:
tokens = (tokens[:self.max_len - 1])
tokens.append(self.w2i["<eos>"])
else:
tokens += ([self.w2i["<pad>"]] * (self.max_len - len(tokens)))
return torch.tensor(tokens, dtype=torch.long)
def __getitem__(self, index):
data = self.data[index]
file_name = data["file_name"]
image_path = os.path.join(self.image_dir, file_name)
image = Image.open(image_path).convert('RGB')
if self.transform:
image = self.transform(image)
captions = data["captions"]
captions = captions[:5] # 캡션 5개 초과시 5개까지만 씀
while len(captions) < 5: # 캡션 5개 보다 부족할 시 마지막 캡션 복제해서 씀
captions.append(captions[-1])
# validation
if self.is_val:
caption = random.choice(captions)
tokens = (self.encode_caption(caption))
return image, tokens, captions, file_name
# train
selected_captions = (random.sample(captions, k=self.train_num_caption))
images = []
token_list = []
for caption in selected_captions:
images.append(image)
token_list.append(self.encode_caption(caption))
images = torch.stack(images)
tokens = torch.stack(token_list)
return images, tokens
|