|
|
from math import inf |
|
|
|
|
|
from h11 import Data |
|
|
import torch |
|
|
import torch.nn as nn |
|
|
import numpy as np |
|
|
import torch.utils |
|
|
import torch.utils.data |
|
|
|
|
|
from torch.utils.data import DataLoader, Dataset |
|
|
|
|
|
from torch.nn.utils.rnn import pad_sequence,pad_packed_sequence,pack_padded_sequence |
|
|
import wandb |
|
|
import torch.nn.functional as F |
|
|
|
|
|
import einops |
|
|
import pandas as pd |
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
from transformers import GPT2TokenizerFast |
|
|
import os |
|
|
os.environ["TOKENIZERS_PARALLELISM"] = "false" |
|
|
import re |
|
|
|
|
|
class CNNDataset(Dataset):
    """CNN/DailyMail summarization dataset, pre-tokenized with GPT-2's tokenizer.

    Each row of ``df`` must have an ``article`` and a ``highlights`` column.
    All rows in ``[0, max_len)`` are tokenized once in ``__init__`` and the
    resulting id tensors are written back into ``df`` in place (the caller's
    DataFrame is mutated).
    """

    def __init__(self, df, max_length = 1000, max_len=21000, test_ds=False):
        """
        Args:
            df: pandas DataFrame with 'article' and 'highlights' string columns.
            max_length: total token budget used to truncate articles (see loop).
            max_len: number of rows of ``df`` to use; also the dataset length.
            test_ds: if True, __getitem__ returns (prompt, reference) instead of
                a (input, shifted-target) training pair.
        """
        super().__init__()
        self.df = df
        self.max_len = max_len
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        self.max_length = max_length
        self.test_ds = test_ds
        # Extend GPT-2's 50257-token vocabulary with two special tokens:
        # '[PAD]' (id 50257) and '[START]' (id 50258).
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[START]'})

        # Single-element 1-D id tensors for the three special markers.
        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]
        self.start = self.tokenizer("[START]", return_tensors="pt").input_ids[0]
        print(len(self.tokenizer), self.start, "Pad")

        # Pre-tokenize every row once, replacing the text columns with tensors.
        for index in range(max_len):
            x, y = self.df['article'][index], self.df['highlights'][index]
            # Flatten tabs/newlines to spaces before tokenization.
            x, y = re.sub(r'[\t\n\r]', ' ', x) , re.sub(r'[\t\n\r]', ' ', y)
            # Summary is capped at 256 tokens.
            y = self.tokenizer(y, return_tensors="pt", max_length=256,truncation=True).input_ids[0]
            # Article gets the remaining budget.
            # NOTE(review): since y.shape[0] <= 256, max(y.shape[0], 256+24) is
            # always 280, so the article cap is effectively max_length - 280
            # regardless of the summary's actual length — confirm intended.
            x = self.tokenizer(x, return_tensors="pt", max_length=self.max_length-max(y.shape[0], 256+24), truncation=True).input_ids[0]
            self.df.loc[index, 'article'], self.df.loc[index, 'highlights'] = x,y

    def __len__(self, ):
        # Dataset length is the number of pre-tokenized rows, not len(df).
        return self.max_len

    def __getitem__(self, index):
        """Return one example.

        test_ds=True: (prompt ``[EOT, article, START]``, reference ``[y, EOT]``).
        Otherwise: (full sequence ``[EOT, article, START, y, EOT]``, target
        vector of the same length for next-token prediction).
        """
        x, y = self.df['article'][index], self.df['highlights'][index]

        if self.test_ds:
            return torch.cat([self.eot, x, self.start]), torch.cat([y, self.eot])

        x = torch.cat([self.eot, x, self.start, y, self.eot])
        y = torch.cat([y, self.eot])

        # Build the per-position target: summary tokens are placed one step
        # earlier than their position in x (teacher forcing — at the '[START]'
        # position the target is the first summary token), and every position
        # before the summary is masked with the pad id.
        y_final = torch.ones(x.shape[0], dtype=torch.long)
        y_final[-y.shape[0]-1:-1] = y
        y_final[:-y.shape[0]-1] = self.pad
        # NOTE(review): the final position y_final[-1] is left at its initial
        # value 1 (a real token id) rather than the pad id — confirm this is
        # intended and not an off-by-one in the masking.
        return x, y_final
|
|
|
|
|
def properly_pad(context):
    """Pad a list of 1-D token-id tensors to a common length.

    Each tensor keeps its original position in the batch. This matters because
    ``custom_collate`` calls this function separately on the contexts and the
    targets of a batch: rows must stay aligned across the two calls.

    Fix: the previous version sorted the tensors by length (descending) before
    padding. ``pad_sequence`` does not require sorted input, and because
    contexts and targets were sorted *independently* by their own lengths, the
    (context, target) pairs of a batch could end up in different row orders —
    silently corrupting the supervision signal. The sort is removed.

    Args:
        context: list of 1-D ``torch.Tensor`` of token ids.

    Returns:
        A ``(batch, max_len)`` long tensor, right-padded with 50257
        (the id of the added '[PAD]' token).
    """
    return pad_sequence(sequences=context, batch_first=True, padding_value=50257)
|
|
|
|
|
def custom_collate(batch):
    """Collate (context, target) pairs into two padded batch tensors.

    Args:
        batch: iterable of ``(context, target)`` pairs of 1-D id tensors,
            as produced by ``CNNDataset.__getitem__``.

    Returns:
        Tuple ``(padded_contexts, padded_targets)``, each of shape
        ``(batch, max_len)``.
    """
    contexts = [pair[0] for pair in batch]
    targets = [pair[1] for pair in batch]
    return properly_pad(contexts), properly_pad(targets)
|
|
|
|
|
def import_data(bs=4, fraction=0.1):
    """Load the CNN/DailyMail CSV splits and wrap them in DataLoaders.

    Args:
        bs: batch size for the train and validation loaders (test uses 1).
        fraction: fraction of each split's nominal size (21000 / 6000 / 300
            rows) to actually tokenize and serve.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    train_df = pd.read_csv('./cnn_dailymail/train.csv')
    val_df = pd.read_csv('./cnn_dailymail/validation.csv')
    test_df = pd.read_csv('./cnn_dailymail/test.csv')

    print('Loaded data')

    train_ds = CNNDataset(train_df, max_len=int(21000*fraction))
    val_ds = CNNDataset(val_df, max_len=int(fraction*6000))
    # The test split returns (prompt, reference) pairs for generation.
    test_ds = CNNDataset(test_df, max_len=int(fraction*300), test_ds=True)

    train_loader = DataLoader(train_ds, batch_size=bs, num_workers=7, collate_fn=custom_collate)
    test_loader = DataLoader(test_ds, batch_size=1, num_workers=7, collate_fn=custom_collate)
    val_loader = DataLoader(val_ds, batch_size=bs, num_workers=7, collate_fn=custom_collate)

    return train_loader, val_loader, test_loader
|
|
|
|
|
if __name__ == '__main__':
    # Quick manual sanity check of the tokenizer and its added special tokens.
    tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")

    # NOTE(review): unlike CNNDataset.__init__, no '[PAD]' pad_token is added
    # here, so tokenizer("[PAD]") below is split into ordinary sub-tokens
    # rather than mapping to a single special id — confirm intended.
    tokenizer.add_special_tokens({'cls_token': '[START]'})

    # Single-element id tensors for the marker strings.
    eot =tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
    pad =tokenizer("[PAD]", return_tensors="pt").input_ids[0]
    start =tokenizer("[START]", return_tensors="pt").input_ids[0]

    # 50256 is GPT-2's <|endoftext|>; 50257 is the first added special token.
    print(tokenizer.decode([1, 2, 50256]))
    print(tokenizer.decode([1, 2, 50257]))
    print(tokenizer('[START]'))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|