image_captioner / src /modules /tokenizer.py
mevlt01001's picture
Upload 7 files
9ec3d0b verified
import math
import numpy as np
import pandas as pd
class Tokenizer:
    """Word-level tokenizer for image captions.

    Builds a vocabulary from a caption corpus, maps rare words
    (frequency <= ``min_freq``) to ``<UNK>``, right-pads every caption to
    the longest caption length with ``<PAD>``, and provides
    ``encode``/``decode`` between text and index sequences.
    """

    def __init__(self, captions: list[str]):
        self.captions: list[str] = captions
        # A word must occur strictly more than min_freq times to stay in the vocab.
        self.min_freq = 2
        self.BOS = "<BOS>"
        self.EOS = "<EOS>"
        self.PAD = "<PAD>"
        self.UNK = "<UNK>"
        self.build(captions)  # populates captions, vocab and tokenized_captions

    def normalize(self, text: str) -> list[str]:
        """Lowercase, strip non-alphabetic characters and wrap in <BOS>/<EOS>.

        BUGFIX: words that contain no alphabetic character at all
        (e.g. "123", "--") are dropped entirely instead of becoming
        empty-string tokens in the vocabulary.
        """
        kept = []
        for word in text.split():
            cleaned = "".join(ch for ch in word.lower() if ch.isalpha())
            if cleaned:
                kept.append(cleaned)
        return [self.BOS] + kept + [self.EOS]

    def build(self, captions: list[str]):
        """Normalize the corpus, build the vocabulary, pad and index every caption."""
        special_tokens = [self.PAD, self.UNK, self.BOS, self.EOS]
        self.captions = [self.normalize(sentence) for sentence in captions]

        # Word frequencies over the normalized corpus (special tokens included).
        freq = {}
        for cap in self.captions:
            for tok in cap:
                freq[tok] = freq.get(tok, 0) + 1

        # Replace rare words with <UNK>.
        # BUGFIX: special tokens are exempt from the cutoff — on a tiny corpus
        # <BOS>/<EOS> could fall below min_freq and be mapped to <UNK>.
        specials = set(special_tokens)
        self.captions = [
            [w if w in specials or freq[w] > self.min_freq else self.UNK for w in cap]
            for cap in self.captions
        ]

        # Vocabulary always contains the special tokens.
        # (Attribute names `vocap`/`vocap_size` kept for caller compatibility.)
        self.vocap = list(set(special_tokens + [w for cap in self.captions for w in cap]))
        self.vocap_size = len(self.vocap)

        # Length statistics as plain Python numbers (not numpy scalars).
        # The `or [0]` guard keeps an empty corpus from crashing np.max.
        lengths = np.array([len(cap) for cap in self.captions] or [0])
        self.max_seq_len = int(np.max(lengths))
        self.min_seq_len = int(np.min(lengths))
        self.avg_len = float(np.mean(lengths))
        self.leng_std = float(np.std(lengths))
        # Heuristic training length: roughly twice (mean + 2 std).
        self.recommended_seq_len = math.ceil(self.avg_len + 2 * self.leng_std) * 2

        # Right-pad every caption to the longest length, then index.
        self.captions = [cap + [self.PAD] * (self.max_seq_len - len(cap)) for cap in self.captions]
        self.char2idx = {t: i for i, t in enumerate(self.vocap)}
        self.idx2char = {i: t for t, i in self.char2idx.items()}
        self.tokenized_captions = [[self.char2idx[w] for w in cap] for cap in self.captions]

    def set_vocab(self, vocap: list[str]):
        """Replace the vocabulary (e.g. loaded from a checkpoint) and rebuild lookups.

        Returns self so the call can be chained.
        """
        self.vocap = vocap
        self.char2idx = {t: i for i, t in enumerate(self.vocap)}
        self.idx2char = {i: t for t, i in self.char2idx.items()}
        self.vocap_size = len(self.char2idx)
        return self

    def encode(self, text: str) -> list[int]:
        """Normalize *text* and map each token to its index (unknown words -> <UNK> id)."""
        unk_id = self.char2idx[self.UNK]
        return [self.char2idx.get(t, unk_id) for t in self.normalize(text)]

    def decode(self, ids: list[int]) -> str:
        """Map indices back to words, skipping <PAD>/<BOS> and stopping at <EOS>."""
        words = []
        pad = self.char2idx.get(self.PAD)
        bos = self.char2idx.get(self.BOS)
        eos = self.char2idx.get(self.EOS)
        for i in ids:
            if pad is not None and i == pad:
                continue
            if bos is not None and i == bos:
                continue
            if eos is not None and i == eos:
                break
            words.append(self.idx2char.get(i, self.UNK))
        return " ".join(words)