image_captioner / src /modules /tokenizer.py
mevlt01001's picture
Upload 7 files
9ec3d0b verified
import math
import numpy as np
import pandas as pd
class Tokenizer:
    """Word-level tokenizer for image captions.

    Builds a vocabulary from a caption corpus, maps rare words
    (frequency <= ``min_freq``) to ``<UNK>``, right-pads every caption to
    the longest caption length with ``<PAD>``, and provides
    ``encode``/``decode`` between text and index sequences.
    """

    def __init__(self, captions: list[str]):
        self.captions: list[str] = captions
        # A word must occur strictly more than min_freq times to stay in the vocab.
        self.min_freq = 2
        self.BOS = "<BOS>"
        self.EOS = "<EOS>"
        self.PAD = "<PAD>"
        self.UNK = "<UNK>"
        self.build(captions)  # populates captions, vocab and tokenized_captions

    def normalize(self, text: str) -> list[str]:
        """Lowercase, strip non-alphabetic characters and wrap in <BOS>/<EOS>.

        BUGFIX: words that contain no alphabetic character at all
        (e.g. "123", "--") are dropped entirely instead of becoming
        empty-string tokens in the vocabulary.
        """
        kept = []
        for word in text.split():
            cleaned = "".join(ch for ch in word.lower() if ch.isalpha())
            if cleaned:
                kept.append(cleaned)
        return [self.BOS] + kept + [self.EOS]

    def build(self, captions: list[str]):
        """Normalize the corpus, build the vocabulary, pad and index every caption."""
        special_tokens = [self.PAD, self.UNK, self.BOS, self.EOS]
        self.captions = [self.normalize(sentence) for sentence in captions]

        # Word frequencies over the normalized corpus (special tokens included).
        freq = {}
        for cap in self.captions:
            for tok in cap:
                freq[tok] = freq.get(tok, 0) + 1

        # Replace rare words with <UNK>.
        # BUGFIX: special tokens are exempt from the cutoff — on a tiny corpus
        # <BOS>/<EOS> could fall below min_freq and be mapped to <UNK>.
        specials = set(special_tokens)
        self.captions = [
            [w if w in specials or freq[w] > self.min_freq else self.UNK for w in cap]
            for cap in self.captions
        ]

        # Vocabulary always contains the special tokens.
        # (Attribute names `vocap`/`vocap_size` kept for caller compatibility.)
        self.vocap = list(set(special_tokens + [w for cap in self.captions for w in cap]))
        self.vocap_size = len(self.vocap)

        # Length statistics as plain Python numbers (not numpy scalars).
        # The `or [0]` guard keeps an empty corpus from crashing np.max.
        lengths = np.array([len(cap) for cap in self.captions] or [0])
        self.max_seq_len = int(np.max(lengths))
        self.min_seq_len = int(np.min(lengths))
        self.avg_len = float(np.mean(lengths))
        self.leng_std = float(np.std(lengths))
        # Heuristic training length: roughly twice (mean + 2 std).
        self.recommended_seq_len = math.ceil(self.avg_len + 2 * self.leng_std) * 2

        # Right-pad every caption to the longest length, then index.
        self.captions = [cap + [self.PAD] * (self.max_seq_len - len(cap)) for cap in self.captions]
        self.char2idx = {t: i for i, t in enumerate(self.vocap)}
        self.idx2char = {i: t for t, i in self.char2idx.items()}
        self.tokenized_captions = [[self.char2idx[w] for w in cap] for cap in self.captions]

    def set_vocab(self, vocap: list[str]):
        """Replace the vocabulary (e.g. loaded from a checkpoint) and rebuild lookups.

        Returns self so the call can be chained.
        """
        self.vocap = vocap
        self.char2idx = {t: i for i, t in enumerate(self.vocap)}
        self.idx2char = {i: t for t, i in self.char2idx.items()}
        self.vocap_size = len(self.char2idx)
        return self

    def encode(self, text: str) -> list[int]:
        """Normalize *text* and map each token to its index (unknown words -> <UNK> id)."""
        unk_id = self.char2idx[self.UNK]
        return [self.char2idx.get(t, unk_id) for t in self.normalize(text)]

    def decode(self, ids: list[int]) -> str:
        """Map indices back to words, skipping <PAD>/<BOS> and stopping at <EOS>."""
        words = []
        pad = self.char2idx.get(self.PAD)
        bos = self.char2idx.get(self.BOS)
        eos = self.char2idx.get(self.EOS)
        for i in ids:
            if pad is not None and i == pad:
                continue
            if bos is not None and i == bos:
                continue
            if eos is not None and i == eos:
                break
            words.append(self.idx2char.get(i, self.UNK))
        return " ".join(words)