import random
import string

import numpy as np
import tensorflow as tf

from pipes import const
from pipes import utils

class SequenceLoader:
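    """Loads one language's raw corpus and splits it into train/val sets."""
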
    def __init__(self):
        self.sequence_dict = None
        self.shuffled_sequences = None
        self.shuffled_indices = None
        self.sequences = None
        self.max_seq_length = None
        self.vocab = None
        self.lang = None

    def pack(self):
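        """Read the raw corpus for self.lang and build an 80/20 train/val split.

        The shuffled index order is generated once and reused for every
        language, so parallel source/target sentences land in the same split.
        """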
        self.sequences = utils.read_file("{}/raw/{}.txt".format(const.data_dir, self.lang))

        examples_count = len(self.sequences)
        split_index = int(examples_count * 0.80)

        if self.shuffled_indices is None:
            self.shuffled_indices = list(range(examples_count))
            random.shuffle(self.shuffled_indices)

        self.shuffled_sequences = [self.sequences[i] for i in self.shuffled_indices]

        self.sequence_dict = dict(
            train=self.shuffled_sequences[:split_index],
            val=self.shuffled_sequences[split_index:],
            count=examples_count,
        )

    def get_dict(self):
        return self.sequence_dict

    def set_lang(self, lang):
        self.lang = lang


def serialize(src_seq, tar_seq):
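    """Shape one batch into ((encoder_input, decoder_input), decoder_target).

    The target is shifted by one position: the decoder is fed the sequence
    without its last token and trained to predict the sequence without its
    first token (teacher forcing).
    """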
    tar_seq_in = tf.convert_to_tensor(tar_seq[:, :-1])
    tar_seq_out = tf.convert_to_tensor(tar_seq[:, 1:])
    src_seq = tf.convert_to_tensor(src_seq)
    return (src_seq, tar_seq_in), tar_seq_out


def remove_punctuation_from_seq(seq):
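    """Strip English and Bangla punctuation and collapse repeated whitespace."""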
    english_punctuations = string.punctuation
    bangla_punctuations = "৷-–—’‘৳…।"
    all_punctuations = english_punctuations + bangla_punctuations
    cleaned_seq = ''.join([char for char in seq if char not in all_punctuations])
    cleaned_seq = cleaned_seq.strip()
    cleaned_seq = ' '.join(cleaned_seq.split())
    return cleaned_seq


def add_start_end_tags_seq(sequence):
    return '<SOS> ' + sequence + ' <EOS>'


def pad_sequence(sequence, max_seq_len, padding_token=0):
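    """Truncate to max_seq_len, then right-pad with padding_token.

    Note that truncation drops the trailing <EOS> tag on sequences longer
    than max_seq_len.
    """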
    padded_sequence = sequence[:max_seq_len] + [padding_token] * (max_seq_len - len(sequence))
    return padded_sequence


class SequenceProcessor:
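    """Cleans, tags, tokenizes, and pads the sequences of one language."""
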
    def __init__(self, _dataset_dict):
        self.max_seq_len = 0
        self.lang = None
        self.dataset_dict = _dataset_dict
        self.vocab = None
        self.word_index = None

    def remove_punctuation(self):
        for split in ("train", "val"):
            seqs = self.dataset_dict[self.lang][split]
            for i in range(len(seqs)):
                seqs[i] = remove_punctuation_from_seq(seqs[i])

    def build_vocab(self):
        vocab = set()
        for split in ("train", "val"):
            for seq in self.dataset_dict[self.lang][split]:
                vocab.update(seq.split())

        self.vocab = sorted(vocab)
        # Word ids start at 1 so that id 0 stays reserved for the padding
        # token used by pad_sequence; otherwise padding would collide with
        # the first vocabulary entry.
        self.word_index = {word: index + 1 for index, word in enumerate(self.vocab)}
        self.dataset_dict[self.lang]["vocab"] = self.vocab
        # +1 accounts for the reserved padding id 0.
        self.dataset_dict[self.lang]["vocab_size"] = len(self.vocab) + 1

    def add_start_end_tags(self):
        for split in ("train", "val"):
            seqs = self.dataset_dict[self.lang][split]
            for i in range(len(seqs)):
                seqs[i] = add_start_end_tags_seq(seqs[i])
                self.max_seq_len = max(len(seqs[i].split()), self.max_seq_len)

        # Longest tagged sequence seen across both splits.
        self.dataset_dict[self.lang]["max_seq_len"] = self.max_seq_len

    def tokenize(self):
        # A dict lookup is O(1) per word; list.index would rescan the whole
        # vocabulary for every token.
        for split in ("train", "val"):
            seqs = self.dataset_dict[self.lang][split]
            for i in range(len(seqs)):
                seqs[i] = [self.word_index[word] for word in seqs[i].split()]

    def pad(self, max_seq_len=const.MAX_SEQ_LEN):
        # Both splits must be padded to the same fixed length, or train and
        # val batches end up with different shapes.
        for split in ("train", "val"):
            seqs = self.dataset_dict[self.lang][split]
            for i in range(len(seqs)):
                seqs[i] = pad_sequence(sequence=seqs[i], max_seq_len=max_seq_len)

    def set_lang(self, lang):
        self.lang = lang
        self.max_seq_len = 0

    def get_dict(self):
        return self.dataset_dict


class Dataset:
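    """Builds tf.data pipelines for a (source, target) language pair."""
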
    def __init__(self, langs):
        self.langs = langs
        self.dataset_dict = {}

    def pack(self):
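        # One loader instance is reused so every language shares the same
        # shuffle order (see SequenceLoader.pack).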
        seq_loader = SequenceLoader()
        for lang in self.langs:
            seq_loader.set_lang(lang)
            seq_loader.pack()
            self.dataset_dict[lang] = seq_loader.get_dict()

    def process(self):
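        # Order matters: punctuation is removed before tagging so the
        # <SOS>/<EOS> markers are not stripped, tags are added before the
        # vocab is built so they receive ids, and tokenizing precedes padding.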
        seq_processor = SequenceProcessor(self.dataset_dict)
        for lang in self.langs:
            seq_processor.set_lang(lang)
            seq_processor.remove_punctuation()
            seq_processor.add_start_end_tags()
            seq_processor.build_vocab()
            seq_processor.tokenize()
            seq_processor.pad()
        self.dataset_dict = seq_processor.get_dict()

    def pull(self):
        src_lang_train_seqs = np.array(self.dataset_dict[self.langs[0]]["train"])
        tar_lang_train_seqs = np.array(self.dataset_dict[self.langs[1]]["train"])

        src_lang_val_seqs = np.array(self.dataset_dict[self.langs[0]]["val"])
        tar_lang_val_seqs = np.array(self.dataset_dict[self.langs[1]]["val"])

        train_ds = (tf.data.Dataset
                    .from_tensor_slices((src_lang_train_seqs, tar_lang_train_seqs))
                    .shuffle(const.BUFFER_SIZE)
                    .batch(const.BATCH_SIZE))

        # The validation split is evaluated in full, so it needs no shuffle.
        val_ds = (tf.data.Dataset
                  .from_tensor_slices((src_lang_val_seqs, tar_lang_val_seqs))
                  .batch(const.BATCH_SIZE))

        # serialize slices along the time axis, so it must run after batching,
        # when each element is a 2-D (batch, time) tensor.
        train_ds = train_ds.map(serialize, num_parallel_calls=tf.data.AUTOTUNE)
        val_ds = val_ds.map(serialize, num_parallel_calls=tf.data.AUTOTUNE)

        return train_ds, val_ds

    def get_dict(self):
        return self.dataset_dict


if __name__ == "__main__":
    dataset_object = Dataset(const.langs)
    dataset_object.pack()

    # Persist the raw train/val split before processing mutates it in place.
    dataset_dict = dataset_object.get_dict()
    utils.save_dict("{}/dataset.txt".format(const.data_dir), dataset_dict)

    dataset_object.process()
    trainset, valset = dataset_object.pull()

    print(utils.load_dict("{}/dataset.txt".format(const.data_dir)))