import os

# Anchor the process working directory at this file's own directory
# (symlinks resolved) so relative paths used later refer to the source
# location rather than wherever the interpreter was launched from.
_this_file = os.path.realpath(__file__)
current_dir = os.path.dirname(_this_file)
os.chdir(current_dir)
class PerCharTokenizer:
    """Character-level tokenizer for DNA sequences.

    The base vocabulary holds the four nucleotide bases ('A', 'T', 'G', 'C')
    plus newline, space, and the special tokens 'P' (padding), 'M' (mask)
    and 'U' (unknown).

    Attributes:
        chars (list): all characters currently in the vocabulary.
        vocab_size (int): current vocabulary size; kept in sync when
            encode() learns new characters.
        string_to_index (dict): character -> integer id.
        index_to_string (dict): integer id -> character.
    """

    def __init__(self):
        self.chars = ['\n', 'A', 'T', 'G', 'C', 'P', 'M', 'U', ' ']
        self.vocab_size = len(self.chars)
        self.string_to_index = {ch: i for i, ch in enumerate(self.chars)}
        self.index_to_string = {i: ch for i, ch in enumerate(self.chars)}

    def encode(self, string):
        """Encode a string into a list of integer token ids.

        Characters not yet in the vocabulary are appended to it on the fly.
        NOTE(review): this diverges from the documented 'U' unknown-token
        design — unseen characters grow the vocab instead of mapping to 'U'.
        Preserved here for backward compatibility.

        Args:
            string (str): text to encode.

        Returns:
            list: one integer id per input character.
        """
        encoded = []
        for char in string:
            if char not in self.string_to_index:
                # Learn the new character and keep every vocabulary view
                # consistent. Bug fix: `chars` and `vocab_size` were
                # previously left stale when the lookup dicts grew.
                new_index = len(self.string_to_index)
                self.string_to_index[char] = new_index
                self.index_to_string[new_index] = char
                self.chars.append(char)
                self.vocab_size = len(self.chars)
            encoded.append(self.string_to_index[char])
        return encoded

    def decode(self, integer):
        """Decode a list of integer token ids back into a string.

        Ids with no vocabulary entry are silently skipped (best-effort
        decode, matching the original behavior).

        Args:
            integer (list): integer token ids to decode.

        Returns:
            str: the decoded characters joined together.
        """
        return ''.join(
            self.index_to_string[i] for i in integer if i in self.index_to_string
        )