import json

import torch
from torch.utils.data import Dataset


class English2HindiDataset(Dataset):
    def __init__(
        self, data, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len
    ):
        super().__init__()
        self.seq_len = seq_len
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.data = data
        self.sos_token = torch.tensor(
            [tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64
        )
        self.eos_token = torch.tensor(
            [tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64
        )
        self.pad_token = torch.tensor(
            [tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64
        )
        # Why int64? When passing token indices to an nn.Embedding layer,
        # PyTorch expects torch.int64 (a.k.a. torch.long).
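
    # A tiny illustration of that dtype requirement (a sketch, not part of the
    # training pipeline; the sizes are arbitrary):
    #
    #   emb = torch.nn.Embedding(num_embeddings=10, embedding_dim=4)
    #   emb(torch.tensor([1, 2, 3], dtype=torch.int64))  # OK
    #   emb(torch.tensor([1.0, 2.0, 3.0]))               # RuntimeError: indices must be an integer tensor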

    def __len__(self):
        return len(self.data)

    def causal_mask(self, size):
        # Ones strictly above the diagonal mark "future" positions; comparing
        # with 0 flips this into a boolean mask that is True on and below the
        # diagonal, so each position can attend only to itself and the past.
        mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
        return mask == 0
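
    # For example, causal_mask(3) yields (a sketch of the expected output):
    #
    #   tensor([[[ True, False, False],
    #            [ True,  True, False],
    #            [ True,  True,  True]]])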

    def __getitem__(self, idx):
        trans_pairs = self.data[idx]
        src_text = trans_pairs["en_text"]
        tgt_text = trans_pairs["hi_text"]
        # Each item is a dict holding the English source and the Hindi target text.
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids
        # The two tokenizers encode each string into a list of token ids.
        # For a sentence such as "I am Sai", the encoder input becomes:
        #   <SOS> I am Sai <EOS>
        # Because sentences have variable lengths, every sequence is padded to a
        # fixed seq_len (here chosen as the length of the longest sentence in the
        # dataset plus 30), so that all examples share the same shape.
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        # -2 because the encoder input also gets <SOS> and <EOS>.
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1
        # -1 only: the decoder input gets just <SOS>, while the label gets just <EOS>.
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is longer than seq_len")
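        # Worked example (hypothetical numbers): with seq_len = 10, a 5-token
        # source needs 10 - 5 - 2 = 3 [PAD] tokens, and a 6-token target needs
        # 10 - 6 - 1 = 3 [PAD] tokens in both decoder_input and label.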
        encoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(enc_input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.full(
                    (enc_num_padding_tokens,), self.pad_token.item(), dtype=torch.int64
                ),
            ],
            dim=0,
        )
        decoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                torch.full(
                    (dec_num_padding_tokens,), self.pad_token.item(), dtype=torch.int64
                ),
            ],
            dim=0,
        )
        # dim=0 because every piece is 1-D, so they are simply concatenated end to end.
        label = torch.cat(
            [
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.full(
                    (dec_num_padding_tokens,), self.pad_token.item(), dtype=torch.int64
                ),
            ],
            dim=0,
        )
        # The label is always in the target language: it is the decoder sequence
        # shifted by one, dropping <SOS> and ending with <EOS>, so at every step
        # the model is trained to predict the next token.
        # The encoder mask hides [PAD] positions; its shape (1, 1, seq_len)
        # broadcasts over attention heads and query positions.
        encoder_mask = (
            (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int()
        )  # (1, 1, seq_len)
        # The decoder mask combines two constraints: a position may be attended to
        # only if it is not padding AND not in the future. The padding mask
        # (1, seq_len) broadcasts against the causal mask (1, seq_len, seq_len),
        # giving a single (1, seq_len, seq_len) mask.
        decoder_mask = (decoder_input != self.pad_token).unsqueeze(0).int() & self.causal_mask(
            decoder_input.size(0)
        )  # (1, seq_len) & (1, seq_len, seq_len) -> (1, seq_len, seq_len)
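        # Tiny illustration (hypothetical 3-token decoder input [<SOS>, w1, [PAD]]):
        #   padding mask -> [1, 1, 0]
        #   causal mask  -> [[1, 0, 0],      combined -> [[1, 0, 0],
        #                    [1, 1, 0],                   [1, 1, 0],
        #                    [1, 1, 1]]                   [1, 1, 0]]
        # The [PAD] column is masked in every row, and each row sees only the past.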
        return {
            "encoder_input": encoder_input,  # (seq_len,)
            "decoder_input": decoder_input,  # (seq_len,)
            "encoder_mask": encoder_mask,    # (1, 1, seq_len)
            "decoder_mask": decoder_mask,    # (1, seq_len, seq_len)
            "label": label,                  # (seq_len,)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }


class English2HindiDatasetTest(Dataset):
    def __init__(self, json_path):
        super().__init__()
        # The test split is loaded directly from a JSON file of translation pairs.
        with open(json_path, "r", encoding="utf-8") as f:
            self.data = json.load(f)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Returns the raw pair dict; tokenization is left to the caller at test time.
        trans_pairs = self.data[idx]
        return trans_pairs
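

if __name__ == "__main__":
    # Minimal smoke test -- a sketch, not part of the original module. It assumes
    # the Hugging Face `tokenizers` package is installed and trains throwaway
    # word-level tokenizers on a one-sentence toy corpus; the corpus, special
    # tokens, and seq_len below are placeholder choices.
    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import WordLevelTrainer

    def build_tokenizer(sentences):
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"])
        tokenizer.train_from_iterator(sentences, trainer=trainer)
        return tokenizer

    toy_data = [{"en_text": "I am Sai", "hi_text": "मैं साई हूँ"}]
    tok_src = build_tokenizer([d["en_text"] for d in toy_data])
    tok_tgt = build_tokenizer([d["hi_text"] for d in toy_data])

    ds = English2HindiDataset(toy_data, tok_src, tok_tgt, "en", "hi", seq_len=10)
    item = ds[0]
    print(item["encoder_input"])       # ids for <SOS> I am Sai <EOS> plus [PAD]s
    print(item["decoder_mask"].shape)  # torch.Size([1, 10, 10])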