Spaces:
Sleeping
Sleeping
File size: 1,253 Bytes
d83076b b25aa2d d83076b b25aa2d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import os
class SimpleTokenizer:
    """Word-level tokenizer that builds a vocabulary incrementally.

    Tokens are produced by ``nltk.tokenize.word_tokenize``. Indices 0 and 1
    are reserved for the ``<pad>`` and ``<unk>`` special tokens; every new
    token seen by :meth:`update_vocab` is assigned the next free index.
    """

    def __init__(self):
        """Initialize with only the reserved <pad> (0) and <unk> (1) tokens."""
        self.vocab = set()                        # all known tokens (excl. specials)
        self.stoi = {'<pad>': 0, '<unk>': 1}      # token -> index
        self.itos = {0: '<pad>', 1: '<unk>'}      # index -> token
        self.vocab_size = 2                       # next index to assign

    def update_vocab(self, text):
        """Add every previously unseen token in *text* to the vocabulary.

        Tokens are assigned indices in first-occurrence order. The original
        implementation iterated ``set(tokens) - self.vocab``, whose ordering
        is nondeterministic across interpreter runs (string hash
        randomization), so the same corpus could yield different index
        assignments run-to-run. Iterating the token list directly makes the
        mapping deterministic while preserving the same interface.
        """
        for token in word_tokenize(text):
            if token not in self.vocab:
                index = self.vocab_size
                self.vocab.add(token)
                self.stoi[token] = index
                self.itos[index] = token
                self.vocab_size += 1

    def encode(self, text):
        """Return the list of indices for *text*; unknown tokens map to <unk>."""
        unk = self.stoi['<unk>']
        return [self.stoi.get(token, unk) for token in word_tokenize(text)]

    def decode(self, indices):
        """Return the space-joined tokens for *indices*; unknown indices map to '<unk>'."""
        return ' '.join(self.itos.get(index, '<unk>') for index in indices)