# gpt2-finetuned / utils.py
# (kyrylokumar — uploaded via huggingface_hub, commit 4d898ee verified)
from math import inf
# from utils import *
from h11 import Data
import torch
import torch.nn as nn
import numpy as np
import torch.utils
import torch.utils.data
from torch.utils.data import DataLoader, Dataset
# from utils import MyDataset, custom_collate
from torch.nn.utils.rnn import pad_sequence,pad_packed_sequence,pack_padded_sequence
import wandb
import torch.nn.functional as F
import einops
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import GPT2TokenizerFast
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import re
class CNNDataset(Dataset):
    # Dataset over a CNN/DailyMail DataFrame with 'article' and 'highlights'
    # columns. Pre-tokenizes the first `max_len` rows with the GPT-2 fast
    # tokenizer and stores the resulting id tensors back into the DataFrame,
    # so __getitem__ only concatenates cached tensors.
    def __init__(self, df, max_length = 1000, max_len=21000, test_ds=False):
        """Tokenize and cache the first `max_len` rows of `df`.

        Args:
            df: pandas DataFrame with 'article' and 'highlights' string columns.
            max_length: total token budget per example (article + summary).
            max_len: number of rows of `df` actually used (dataset length).
            test_ds: if True, __getitem__ returns (prompt, reference) instead
                of (input_ids, labels).
        """
        super().__init__()
        self.df = df
        self.max_len = max_len
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        self.max_length = max_length
        self.test_ds = test_ds
        # Extend the GPT-2 vocab; '[PAD]' is registered before '[START]'
        # (presumably ids 50257 and 50258 — properly_pad below pads with
        # 50257, which matches this order; confirm against tokenizer length).
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[START]'})
        # 1-element id tensors used as separators when assembling examples.
        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]
        self.start = self.tokenizer("[START]", return_tensors="pt").input_ids[0]
        print(len(self.tokenizer), self.start, "Pad")
        # Pre-tokenize and cache: replace raw strings in df with id tensors.
        for index in range(max_len):
            x, y = self.df['article'][index], self.df['highlights'][index]
            # Collapse tabs/newlines/carriage returns to spaces before tokenizing.
            x, y = re.sub(r'[\t\n\r]', ' ', x) , re.sub(r'[\t\n\r]', ' ', y)
            # Summary is capped at 256 tokens; the article gets the rest of
            # the budget.
            y = self.tokenizer(y, return_tensors="pt", max_length=256,truncation=True).input_ids[0]
            # NOTE(review): since y is truncated to <=256 tokens,
            # max(y.shape[0], 256+24) always evaluates to 280 — presumably
            # headroom for the summary plus special tokens; confirm intent.
            x = self.tokenizer(x, return_tensors="pt", max_length=self.max_length-max(y.shape[0], 256+24), truncation=True).input_ids[0]
            self.df.loc[index, 'article'], self.df.loc[index, 'highlights'] = x,y

    def __len__(self, ):
        # Only the first max_len rows were pre-tokenized in __init__.
        return self.max_len

    def __getitem__(self, index):
        x, y = self.df['article'][index], self.df['highlights'][index]
        # Check if middle self.eot is needed
        # print(x, self.eot)
        if self.test_ds:
            # Test split: prompt "<|endoftext|> article [START]" and
            # reference "summary <|endoftext|>", kept separate for generation.
            return torch.cat([self.eot, x, self.start]), torch.cat([y, self.eot])
        # Training: the model input is the full concatenated sequence.
        x = torch.cat([self.eot, x, self.start, y, self.eot])
        y = torch.cat([y, self.eot])
        # Labels: same length as x. Context positions get the pad id, and the
        # summary tokens (incl. trailing eot) are written one position to the
        # left of where they sit in x — next-token prediction targets.
        y_final = torch.ones(x.shape[0], dtype=torch.long)
        y_final[-y.shape[0]-1:-1] = y
        y_final[:-y.shape[0]-1] = self.pad
        # NOTE(review): y_final[-1] keeps its initial value of 1 (a real token
        # id) rather than the pad id — confirm this is intended.
        return x, y_final
def properly_pad(context):
    """Pad a list of 1-D token-id tensors into one rectangular batch.

    Bug fix: the original implementation sorted the tensors by descending
    length before padding. custom_collate pads inputs and targets with two
    *independent* calls, and ``torch.argsort`` is not a stable sort, so
    equal-length sequences could be ordered differently in the two calls,
    silently misaligning (input, target) pairs within a batch. Sorting is
    only needed for ``pack_padded_sequence`` (unused here); ``pad_sequence``
    itself does not require it, so the original batch order is preserved.

    Args:
        context: list of 1-D ``torch.LongTensor`` of varying lengths.

    Returns:
        2-D tensor of shape (batch, max_len), right-padded with 50257
        (the '[PAD]' id added by CNNDataset).
    """
    return pad_sequence(sequences=context, batch_first=True, padding_value=50257)
def custom_collate(batch):
    """Collate (input, target) pairs into two padded rectangular batches.

    Args:
        batch: list of (input_ids, label_ids) 1-D tensor pairs from CNNDataset.

    Returns:
        Tuple of two 2-D tensors: padded inputs and padded targets.
    """
    inputs = [pair[0] for pair in batch]
    targets = [pair[1] for pair in batch]
    return properly_pad(inputs), properly_pad(targets)
def import_data(bs=4, fraction=0.1):
    """Build train/val/test DataLoaders over the local CNN/DailyMail CSVs.

    Args:
        bs: batch size for the train and validation loaders (test uses 1).
        fraction: fraction of each split's nominal row count to keep
            (21000 train / 6000 val / 300 test).

    Returns:
        (train_loader, val_loader, test_loader)
    """
    raw_train = pd.read_csv('./cnn_dailymail/train.csv')
    raw_val = pd.read_csv('./cnn_dailymail/validation.csv')
    raw_test = pd.read_csv('./cnn_dailymail/test.csv')
    print('Loaded data')

    ds_train = CNNDataset(raw_train, max_len=int(21000*fraction))
    ds_val = CNNDataset(raw_val, max_len=int(fraction*6000))
    ds_test = CNNDataset(raw_test, max_len=int(fraction*300), test_ds=True)

    train_loader = DataLoader(ds_train, batch_size=bs, num_workers=7, collate_fn=custom_collate)
    val_loader = DataLoader(ds_val, batch_size=bs, num_workers=7, collate_fn=custom_collate)
    test_loader = DataLoader(ds_test, batch_size=1, num_workers=7, collate_fn=custom_collate)
    return train_loader, val_loader, test_loader
if __name__ == '__main__':
    # Quick sanity check of the tokenizer/vocab layout used by CNNDataset.
    tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
    # Consistency fix: CNNDataset registers '[PAD]' before '[START]'. The
    # original omitted the pad token here, so '[PAD]' tokenized into plain
    # word pieces and the id layout printed below did not match the
    # vocabulary the model is actually trained with.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.add_special_tokens({'cls_token': '[START]'})
    eot = tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
    pad = tokenizer("[PAD]", return_tensors="pt").input_ids[0]
    start = tokenizer("[START]", return_tensors="pt").input_ids[0]
    # Decode around the original/extended vocab boundary (GPT-2 base ends at
    # 50256) to see which string each added id maps to.
    print(tokenizer.decode([1, 2, 50256]))
    print(tokenizer.decode([1, 2, 50257]))
    print(tokenizer('[START]'))
    # dl_train, dl_val, dl_test = import_data()
    # for x, y in dl_train:
    #     print(x.shape, y.shape)
    #     break