Spaces:
Sleeping
Sleeping
File size: 3,484 Bytes
9ec3d0b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
import math
import numpy as np
import pandas as pd
class Tokenizer:
    """Word-level tokenizer for a caption corpus.

    Builds a vocabulary from the given captions, replaces rare words with
    <UNK>, right-pads every caption to the longest length, and converts
    between raw text and integer id sequences.

    Attributes populated by build():
        captions            -- normalized, UNK-filtered, PAD-padded token lists
        tokenized_captions  -- the same captions as lists of vocabulary ids
        vocap / vocap_size  -- vocabulary list and its size (historical
                               spelling kept for caller compatibility)
        char2idx / idx2char -- token <-> id lookup tables
        max_seq_len, min_seq_len, avg_len, leng_std, recommended_seq_len
                            -- corpus length statistics
    """

    def __init__(self, captions: list[str]):
        self.captions: list[str] = captions
        # Words occurring fewer than `min_freq` times are mapped to <UNK>.
        self.min_freq = 2
        self.BOS = "<BOS>"  # beginning-of-sentence marker
        self.EOS = "<EOS>"  # end-of-sentence marker
        self.PAD = "<PAD>"  # padding token (id 0 after build)
        self.UNK = "<UNK>"  # unknown / rare-word token
        self.build(captions)  # populates captions, tokenized_captions, vocab

    def normalize(self, text: str) -> list[str]:
        """Lowercase, strip non-alphabetic characters, wrap in BOS/EOS.

        Bug fix: tokens that become empty after stripping (e.g. "123",
        "!!!") are dropped instead of emitting empty-string tokens.
        """
        kept: list[str] = []
        for word in text.split():
            cleaned = "".join(ch for ch in word.lower() if ch.isalpha())
            if cleaned:  # skip tokens with no alphabetic characters
                kept.append(cleaned)
        return [self.BOS] + kept + [self.EOS]

    def build(self, captions: list[str]):
        """Normalize the corpus and build the vocabulary and id mappings."""
        self.captions = [self.normalize(sentence) for sentence in captions]

        # Count token frequencies across the whole corpus.
        freq: dict[str, int] = {}
        for cap in self.captions:
            for t in cap:
                freq[t] = freq.get(t, 0) + 1

        special_tokens = [self.PAD, self.UNK, self.BOS, self.EOS]
        special = set(special_tokens)

        # Replace rare words with <UNK>.  Special tokens are always kept
        # (previously BOS/EOS could be UNK'd on a tiny corpus).
        # Bug fix: the threshold is inclusive (freq >= min_freq), matching
        # the conventional meaning of "minimum frequency"; the old strict >
        # silently raised the cutoff by one.
        self.captions = [
            [w if (w in special or freq[w] >= self.min_freq) else self.UNK
             for w in cap]
            for cap in self.captions
        ]

        # Deterministic vocabulary: special tokens first (PAD gets id 0, the
        # usual convention for padding masks), then the corpus words sorted.
        # Bug fix: the old list(set(...)) order depended on hash
        # randomization, so token ids changed from run to run.
        corpus_words = sorted({w for cap in self.captions for w in cap} - special)
        self.vocap = special_tokens + corpus_words
        self.vocap_size = len(self.vocap)

        # Corpus length statistics.  Guard the empty corpus, where
        # np.max / np.min would raise on a zero-length array.
        seqs = np.array([len(cap) for cap in self.captions])
        if seqs.size:
            self.max_seq_len = int(np.max(seqs))
            self.min_seq_len = int(np.min(seqs))
            self.avg_len = float(np.mean(seqs))
            self.leng_std = float(np.std(seqs))
        else:
            self.max_seq_len = self.min_seq_len = 0
            self.avg_len = self.leng_std = 0.0
        # Heuristic budget: mean + 2*std covers ~95% of lengths; the final
        # *2 doubles that for headroom (kept from the original heuristic).
        self.recommended_seq_len = math.ceil(self.avg_len + 2 * self.leng_std) * 2

        # Right-pad every caption to the longest one.
        self.captions = [cap + [self.PAD] * (self.max_seq_len - len(cap))
                         for cap in self.captions]

        self.char2idx = {t: i for i, t in enumerate(self.vocap)}
        self.idx2char = {i: t for t, i in self.char2idx.items()}
        self.tokenized_captions = [[self.char2idx[w] for w in cap]
                                   for cap in self.captions]

    def set_vocab(self, vocap: list[str]):
        """Replace the vocabulary (e.g. one loaded from disk) and rebuild the
        id mappings.  Returns self so calls can be chained."""
        self.vocap = vocap
        self.char2idx = {t: i for i, t in enumerate(self.vocap)}
        self.idx2char = {i: t for t, i in self.char2idx.items()}
        self.vocap_size = len(self.char2idx)
        return self

    def encode(self, text: str) -> list[int]:
        """Normalize *text* and map each token to its id (<UNK> id for OOV)."""
        toks = self.normalize(text)
        unk_id = self.char2idx[self.UNK]
        return [self.char2idx.get(t, unk_id) for t in toks]

    def decode(self, ids: list[int]) -> str:
        """Map ids back to text, skipping PAD/BOS and stopping at EOS."""
        words: list[str] = []
        pad = self.char2idx.get(self.PAD)
        bos = self.char2idx.get(self.BOS)
        eos = self.char2idx.get(self.EOS)
        for i in ids:
            if pad is not None and i == pad:
                continue
            if bos is not None and i == bos:
                continue
            if eos is not None and i == eos:
                break  # anything after EOS is padding/garbage
            words.append(self.idx2char.get(i, self.UNK))
        return " ".join(words)
|