import math
from collections import Counter

import numpy as np
import pandas as pd


class Tokenizer:
    """Word-level tokenizer for caption text.

    Building the tokenizer normalizes every caption, replaces rare words
    with ``UNK``, pads all captions to the longest one and maps each token
    to an integer id.

    Attributes set by :meth:`build`:
        captions: normalized, UNK-replaced and padded token lists.
        tokenized_captions: ``captions`` with every token mapped to its id.
        vocap: vocabulary list; the list index of a token is its id.
        vocap_size: number of entries in ``vocap``.
        char2idx / idx2char: token -> id and id -> token lookups.
        max_seq_len / min_seq_len: longest / shortest caption length in
            tokens (BOS and EOS included, padding excluded).
        avg_len / leng_std: mean and standard deviation of those lengths.
        recommended_seq_len: ``ceil(avg_len + 2 * leng_std) * 2``.
    """

    def __init__(self, captions: list[str], min_freq: int = 2):
        """Build the vocabulary and the encoded corpus from raw captions.

        Args:
            captions: raw caption strings; may be empty when the vocabulary
                is going to be supplied afterwards through ``set_vocab``.
            min_freq: frequency threshold; a word is kept only if it occurs
                strictly more than ``min_freq`` times in the corpus.
        """
        self.captions: list[list[str]] = []
        self.min_freq = min_freq
        # The special tokens must be distinct from each other, otherwise they
        # share a single id and decode() cannot tell padding, sentence
        # boundaries and unknown words apart. normalize() only emits
        # lower-case alphabetic words, so user text can never collide with
        # these bracketed markers.
        self.BOS = "<BOS>"
        self.EOS = "<EOS>"
        self.PAD = "<PAD>"
        self.UNK = "<UNK>"
        self.build(captions)

    def normalize(self, text: str) -> list[str]:
        """Split ``text`` into lower-case alphabetic words wrapped in BOS/EOS.

        Non-alphabetic characters are removed from every word; words that
        become empty (numbers, bare punctuation) are dropped so they do not
        produce empty tokens.
        """
        words = []
        for raw in text.split():
            cleaned = "".join(ch for ch in raw.lower() if ch.isalpha())
            if cleaned:
                words.append(cleaned)
        return [self.BOS, *words, self.EOS]

    def build(self, captions: list[str]):
        """(Re)build vocabulary, length statistics and the encoded corpus.

        Args:
            captions: raw caption strings.
        """
        normalized = [self.normalize(sentence) for sentence in captions]
        special_tokens = [self.PAD, self.UNK, self.BOS, self.EOS]
        protected = set(special_tokens)

        # Rare words collapse to UNK. Special tokens are exempt: in a tiny
        # corpus BOS/EOS would otherwise fall under the threshold themselves.
        freq = Counter(tok for cap in normalized for tok in cap)
        filtered = [
            [
                tok if tok in protected or freq[tok] > self.min_freq else self.UNK
                for tok in cap
            ]
            for cap in normalized
        ]

        # Special tokens first, then the remaining words in sorted order, so
        # ids are identical on every run (set iteration order is not).
        words = sorted({tok for cap in filtered for tok in cap} - protected)
        self.vocap = special_tokens + words
        self.vocap_size = len(self.vocap)

        lengths = [len(cap) for cap in filtered]
        if lengths:
            self.max_seq_len = max(lengths)
            self.min_seq_len = min(lengths)
            self.avg_len = np.mean(lengths)
            self.leng_std = np.std(lengths)
        else:
            # Empty corpus: keep the tokenizer usable with zeroed statistics.
            self.max_seq_len = 0
            self.min_seq_len = 0
            self.avg_len = 0.0
            self.leng_std = 0.0
        self.recommended_seq_len = (
            math.ceil(self.avg_len + 2 * self.leng_std) * 2
        )

        # Right-pad every caption to the longest one.
        self.captions = [
            cap + [self.PAD] * (self.max_seq_len - len(cap)) for cap in filtered
        ]

        self.char2idx = {tok: idx for idx, tok in enumerate(self.vocap)}
        self.idx2char = {idx: tok for tok, idx in self.char2idx.items()}
        self.tokenized_captions = [
            [self.char2idx[tok] for tok in cap] for cap in self.captions
        ]

    def set_vocab(self, vocap: list[str]) -> "Tokenizer":
        """Replace the vocabulary with ``vocap`` and rebuild the lookups.

        ``tokenized_captions`` is not re-encoded. The vocabulary has to
        contain ``self.UNK`` for :meth:`encode` to work.

        Returns:
            ``self``, so the call can be chained.
        """
        self.vocap = vocap
        self.char2idx = {tok: idx for idx, tok in enumerate(self.vocap)}
        self.idx2char = {idx: tok for tok, idx in self.char2idx.items()}
        self.vocap_size = len(self.char2idx)
        return self

    def encode(self, text: str) -> list[int]:
        """Normalize ``text`` and map its tokens to ids (unknown -> UNK id).

        Raises:
            KeyError: if the current vocabulary has no ``UNK`` entry.
        """
        unk_id = self.char2idx[self.UNK]
        return [self.char2idx.get(tok, unk_id) for tok in self.normalize(text)]

    def decode(self, ids: list[int]) -> str:
        """Turn ids back into a space-joined string.

        PAD and BOS ids are skipped, decoding stops at the first EOS id and
        ids missing from the vocabulary are rendered as ``UNK``.
        """
        pad = self.char2idx.get(self.PAD)
        bos = self.char2idx.get(self.BOS)
        eos = self.char2idx.get(self.EOS)
        words = []
        for i in ids:
            if pad is not None and i == pad:
                continue
            if bos is not None and i == bos:
                continue
            if eos is not None and i == eos:
                break
            words.append(self.idx2char.get(i, self.UNK))
        return " ".join(words)