Spaces:
Sleeping
Sleeping
Update tokenizer.py
Browse files- tokenizer.py +41 -38
tokenizer.py
CHANGED
|
@@ -1,38 +1,41 @@
|
|
| 1 |
-
import nltk
|
| 2 |
-
nltk.download('punkt')
|
| 3 |
-
|
| 4 |
-
from nltk.tokenize import word_tokenize
|
| 5 |
-
import os
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
class SimpleTokenizer:
|
| 9 |
-
"""
|
| 10 |
-
A simple tokenizer class that builds a vocabulary from the given text and encodes/decodes text into indices.
|
| 11 |
-
"""
|
| 12 |
-
|
| 13 |
-
def __init__(self
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
self.
|
| 17 |
-
self.
|
| 18 |
-
self.
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import nltk
|
| 2 |
+
nltk.download('punkt')
|
| 3 |
+
|
| 4 |
+
from nltk.tokenize import word_tokenize
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class SimpleTokenizer:
    """Word-level tokenizer with an incrementally built vocabulary.

    Text is split with NLTK's ``word_tokenize``; every distinct token is
    mapped to an integer index. Indices 0 and 1 are reserved for the
    ``<pad>`` and ``<unk>`` special tokens.

    Attributes:
        vocab: Set of tokens added so far (special tokens are not included).
        stoi: Token -> index mapping, including the special tokens.
        itos: Index -> token mapping, the inverse of ``stoi``.
        vocab_size: Number of entries in ``stoi``; also the next free index.
    """

    def __init__(self):
        """Initialize the tokenizer with only the special tokens."""
        # Report the installed NLTK version on construction.
        print(nltk.__version__)
        self.vocab = set()
        self.stoi = {'<pad>': 0, '<unk>': 1}
        self.itos = {0: '<pad>', 1: '<unk>'}
        self.vocab_size = 2  # Starting with <pad> and <unk> tokens

    def update_vocab(self, text):
        """Tokenize ``text`` and add every unseen token to the vocabulary.

        New tokens receive consecutive indices in order of first appearance,
        so the same sequence of calls always yields the same mapping.

        Args:
            text: Raw string to tokenize with ``word_tokenize``.
        """
        self._add_tokens(word_tokenize(text))

    def _add_tokens(self, tokens):
        """Register each not-yet-known token from ``tokens``, in order."""
        for token in tokens:
            # Checking ``stoi`` (not ``vocab``) also protects the reserved
            # <pad>/<unk> entries from being reassigned a new index.
            if token in self.stoi:
                continue
            index = self.vocab_size
            self.vocab.add(token)
            self.stoi[token] = index
            self.itos[index] = token
            self.vocab_size = index + 1

    def encode(self, text):
        """Encode ``text`` into a list of vocabulary indices.

        Args:
            text: Raw string to tokenize with ``word_tokenize``.

        Returns:
            One index per token; tokens missing from the vocabulary map to
            the index of ``<unk>``.
        """
        unk_index = self.stoi['<unk>']
        return [self.stoi.get(word, unk_index) for word in word_tokenize(text)]

    def decode(self, indices):
        """Decode an iterable of indices back into a space-joined string.

        Args:
            indices: Iterable of integer indices.

        Returns:
            The tokens joined by single spaces; indices that are not in the
            vocabulary are rendered as ``<unk>``.
        """
        return ' '.join(self.itos.get(index, '<unk>') for index in indices)
|
| 41 |
+
|