File size: 3,484 Bytes
9ec3d0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import math 
import numpy as np
import pandas as pd

class Tokenizer:
    """Word-level caption tokenizer.

    Builds a vocabulary from a corpus of caption strings: each caption is
    lowercased, stripped of non-alphabetic characters, wrapped in BOS/EOS
    markers; words rarer than ``min_freq`` become UNK; every caption is
    right-padded with PAD to the longest caption length.

    NOTE(review): attribute names ``vocap``/``vocap_size``/``leng_std`` are
    misspelled but kept as-is for backward compatibility with callers.
    """

    def __init__(self, captions: list[str]):
        self.captions: list[str] = captions
        # A word must appear at least this many times to stay in the vocabulary.
        self.min_freq = 2
        self.BOS = "<BOS>"
        self.EOS = "<EOS>"
        self.PAD = "<PAD>"
        self.UNK = "<UNK>"
        self.build(captions)  # populates captions, tokenized_captions, stats

    def normalize(self, text: str) -> list[str]:
        """Lowercase *text*, keep only alphabetic characters within each
        whitespace-separated word, and wrap the result in BOS/EOS.

        BUGFIX: words that become empty after cleaning (e.g. "123", "!!!")
        are now dropped instead of yielding empty-string tokens.
        """
        kept = []
        for word in text.split():
            cleaned = "".join(ch for ch in word.lower() if ch.isalpha())
            if cleaned:
                kept.append(cleaned)
        return [self.BOS] + kept + [self.EOS]

    def build(self, captions: list[str]):
        """Normalize the corpus, build the vocabulary and index mappings,
        compute length statistics, and store padded id sequences in
        ``self.tokenized_captions``.
        """
        # 1) Normalize every caption.
        self.captions = [self.normalize(sentence) for sentence in captions]

        # 2) Count token frequencies over the normalized corpus.
        freq: dict[str, int] = {}
        for cap in self.captions:
            for tok in cap:
                freq[tok] = freq.get(tok, 0) + 1

        # 3) Replace rare words with UNK.
        # BUGFIX: use >= so words occurring exactly min_freq times are kept
        # (the old `>` comparison silently required min_freq + 1 occurrences).
        self.captions = [
            [w if freq[w] >= self.min_freq else self.UNK for w in cap]
            for cap in self.captions
        ]

        # 4) Vocabulary: special tokens first (PAD at index 0), remaining
        # words sorted — deterministic across runs, unlike the previous
        # list(set(...)) ordering under hash randomization.
        special_tokens = [self.PAD, self.UNK, self.BOS, self.EOS]
        words = sorted(
            {w for cap in self.captions for w in cap} - set(special_tokens)
        )
        self.vocap = special_tokens + words
        self.vocap_size = len(self.vocap)

        # 5) Length statistics. Guard the empty corpus, which previously
        # crashed np.max on a zero-length array; cast numpy scalars to
        # plain Python numbers so they don't leak to callers.
        if self.captions:
            seqs = np.array([len(cap) for cap in self.captions])
        else:
            seqs = np.zeros(1, dtype=int)
        self.max_seq_len = int(np.max(seqs))
        self.min_seq_len = int(np.min(seqs))
        self.avg_len = float(np.mean(seqs))
        self.leng_std = float(np.std(seqs))
        # Heuristic length budget: mean + 2 std, rounded up, doubled.
        self.recommended_seq_len = math.ceil(self.avg_len + 2 * self.leng_std) * 2

        # 6) Right-pad every caption to the longest caption length.
        self.captions = [
            cap + [self.PAD] * (self.max_seq_len - len(cap))
            for cap in self.captions
        ]

        # 7) Token <-> id mappings and the final id sequences.
        self.char2idx = {t: i for i, t in enumerate(self.vocap)}
        self.idx2char = {i: t for t, i in self.char2idx.items()}
        self.tokenized_captions = [
            [self.char2idx[w] for w in cap] for cap in self.captions
        ]

    def set_vocab(self, vocap: list[str]):
        """Replace the vocabulary (e.g. one loaded from disk) and rebuild
        the index mappings. Returns self for chaining. Does NOT re-tokenize
        ``self.captions``.
        """
        self.vocap = vocap
        self.char2idx = {t: i for i, t in enumerate(self.vocap)}
        self.idx2char = {i: t for t, i in self.char2idx.items()}
        self.vocap_size = len(self.char2idx)
        return self

    def encode(self, text: str) -> list[int]:
        """Normalize *text* and map each token to its id; out-of-vocabulary
        tokens map to the UNK id. No padding or truncation is applied.
        """
        unk_id = self.char2idx[self.UNK]
        return [self.char2idx.get(t, unk_id) for t in self.normalize(text)]

    def decode(self, ids: list[int]) -> str:
        """Map ids back to a space-joined string: PAD and BOS are skipped,
        decoding stops at the first EOS, unknown ids render as UNK.
        """
        pad = self.char2idx.get(self.PAD)
        bos = self.char2idx.get(self.BOS)
        eos = self.char2idx.get(self.EOS)
        words = []
        for i in ids:
            # An int id never equals None, so absent specials are harmless.
            if i == pad or i == bos:
                continue
            if i == eos:
                break
            words.append(self.idx2char.get(i, self.UNK))
        return " ".join(words)