import os
import re

import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from transformers import GPT2TokenizerFast

# Keep tokenizers single-threaded so they don't deadlock when the
# DataLoader forks worker processes.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

class CNNDataset(Dataset):
    def __init__(self, df, max_length=1000, max_len=21000, test_ds=False):
        super().__init__()
        self.max_len = max_len        # number of examples to keep
        self.max_length = max_length  # total token budget per example
        self.test_ds = test_ds
        self.tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
        # Extend the GPT-2 vocab: [PAD] gets id 50257, [START] gets id 50258.
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[START]'})

        self.eot = self.tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
        self.pad = self.tokenizer("[PAD]", return_tensors="pt").input_ids[0]
        self.start = self.tokenizer("[START]", return_tensors="pt").input_ids[0]

        # Pre-tokenize everything once, storing the tensors in plain lists
        # (assigning tensors into DataFrame cells via .loc is fragile).
        self.articles, self.highlights = [], []
        for index in range(max_len):
            x, y = df['article'][index], df['highlights'][index]
            x, y = re.sub(r'[\t\n\r]', ' ', x), re.sub(r'[\t\n\r]', ' ', y)
            y = self.tokenizer(y, return_tensors="pt", max_length=256,
                               truncation=True).input_ids[0]
            # y is truncated to 256 tokens, so max(len(y), 256 + 24) is always
            # 280: reserve 280 tokens for the summary plus special tokens.
            x = self.tokenizer(x, return_tensors="pt",
                               max_length=self.max_length - (256 + 24),
                               truncation=True).input_ids[0]
            self.articles.append(x)
            self.highlights.append(y)

    def __len__(self):
        return self.max_len

    def __getitem__(self, index):
        x, y = self.articles[index], self.highlights[index]

        if self.test_ds:
            # Inference: prompt is "<|endoftext|> article [START]"; the
            # reference summary is returned separately.
            return torch.cat([self.eot, x, self.start]), torch.cat([y, self.eot])

        # Training: one sequence "<|endoftext|> article [START] summary <|endoftext|>".
        x = torch.cat([self.eot, x, self.start, y, self.eot])
        y = torch.cat([y, self.eot])

        # Targets are x shifted left by one, with every position before the
        # summary set to the [PAD] id so the loss only covers summary tokens.
        y_final = torch.full((x.shape[0],), self.pad.item(), dtype=torch.long)
        y_final[-y.shape[0] - 1:-1] = y
        return x, y_final
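
# Hypothetical helper (not part of the original pipeline): decode one example
# back to text to eyeball the "<|endoftext|> article [START] summary
# <|endoftext|>" layout produced by __getitem__.
def preview_example(ds, index=0):
    x, _ = ds[index]
    print(ds.tokenizer.decode(x.tolist()))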

def properly_pad(sequences):
    """Right-pad a list of 1-D tensors to a common length.

    50257 is the [PAD] id after the vocab extension above."""
    return pad_sequence(sequences=sequences, batch_first=True, padding_value=50257)


def custom_collate(batch):
    # Sort the (context, target) pairs together by context length, longest
    # first; sorting the two lists independently could scramble the pairing.
    batch = sorted(batch, key=lambda pair: pair[0].shape[0], reverse=True)
    context = [a for a, _ in batch]
    target = [b for _, b in batch]
    return properly_pad(context), properly_pad(target)
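
# Worked example with made-up tensors: custom_collate pads a batch of
# variable-length (context, target) pairs into two (batch, max_len) tensors,
# filling with 50257 ([PAD]):
#   batch = [(torch.tensor([0, 1, 2]), torch.tensor([3, 4, 5])),
#            (torch.tensor([6]), torch.tensor([7]))]
#   custom_collate(batch)
#   -> (tensor([[0, 1, 2], [6, 50257, 50257]]),
#       tensor([[3, 4, 5], [7, 50257, 50257]]))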

def import_data(bs=4, fraction=0.1):
    df_train = pd.read_csv('./cnn_dailymail/train.csv')
    df_val = pd.read_csv('./cnn_dailymail/validation.csv')
    df_test = pd.read_csv('./cnn_dailymail/test.csv')

    print('Loaded data')

    ds_train = CNNDataset(df_train, max_len=int(21000 * fraction))
    ds_val = CNNDataset(df_val, max_len=int(6000 * fraction))
    ds_test = CNNDataset(df_test, max_len=int(300 * fraction), test_ds=True)

    dl_train = DataLoader(ds_train, batch_size=bs, num_workers=7, collate_fn=custom_collate)
    dl_val = DataLoader(ds_val, batch_size=bs, num_workers=7, collate_fn=custom_collate)
    dl_test = DataLoader(ds_test, batch_size=1, num_workers=7, collate_fn=custom_collate)

    return dl_train, dl_val, dl_test
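
# Hedged usage sketch: how these loaders would typically be consumed. `model`
# and the ignore_index choice are assumptions, not part of this file.
#   dl_train, dl_val, dl_test = import_data(bs=4, fraction=0.1)
#   for x, y in dl_train:
#       # x, y: (batch, seq_len) LongTensors; y is already shifted by one and
#       # PAD-masked, so the loss can ignore id 50257.
#       logits = model(x)
#       loss = torch.nn.functional.cross_entropy(
#           logits.transpose(1, 2), y, ignore_index=50257)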

if __name__ == '__main__':
    # Quick sanity check of the tokenizer setup used by CNNDataset.
    tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.add_special_tokens({'cls_token': '[START]'})

    eot = tokenizer("<|endoftext|>", return_tensors="pt").input_ids[0]
    pad = tokenizer("[PAD]", return_tensors="pt").input_ids[0]
    start = tokenizer("[START]", return_tensors="pt").input_ids[0]

    print(tokenizer.decode([1, 2, 50256]))  # ends with <|endoftext|>
    print(tokenizer.decode([1, 2, 50257]))  # ends with [PAD]
    print(tokenizer('[START]'))

    # Smoke test (needs ./cnn_dailymail/*.csv):
    # dl_train, dl_val, dl_test = import_data()
    # for x, y in dl_train:
    #     print(x.shape, y.shape)
    #     break