Spaces:
Sleeping
Sleeping
Update tokenizer.py
Browse files
- tokenizer.py +2 -4
tokenizer.py
CHANGED
|
@@ -11,7 +11,6 @@ class SimpleTokenizer:
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
def __init__(self):
|
| 14 |
- print(nltk.__version__)
|
| 15 |
"""Initialize the tokenizer with an empty vocabulary."""
|
| 16 |
self.vocab = set()
|
| 17 |
self.stoi = {'<pad>': 0, '<unk>': 1}
|
|
@@ -22,7 +21,7 @@ class SimpleTokenizer:
|
|
| 22 |
"""Update vocabulary with new text."""
|
| 23 |
tokens = word_tokenize(text)
|
| 24 |
new_tokens = set(tokens) - self.vocab
|
| 25 |
-
|
| 26 |
for token in new_tokens:
|
| 27 |
index = self.vocab_size
|
| 28 |
self.vocab.add(token)
|
|
@@ -37,5 +36,4 @@ class SimpleTokenizer:
|
|
| 37 |
|
| 38 |
def decode(self, indices):
|
| 39 |
"""Decode the list of indices back into text."""
|
| 40 |
- return ' '.join([self.itos.get(index, '<unk>') for index in indices])
|
| 41 |
-
|
|
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
def __init__(self):
|
|
|
|
| 14 |
"""Initialize the tokenizer with an empty vocabulary."""
|
| 15 |
self.vocab = set()
|
| 16 |
self.stoi = {'<pad>': 0, '<unk>': 1}
|
|
|
|
| 21 |
"""Update vocabulary with new text."""
|
| 22 |
tokens = word_tokenize(text)
|
| 23 |
new_tokens = set(tokens) - self.vocab
|
| 24 |
+
|
| 25 |
for token in new_tokens:
|
| 26 |
index = self.vocab_size
|
| 27 |
self.vocab.add(token)
|
|
|
|
| 36 |
|
| 37 |
def decode(self, indices):
|
| 38 |
"""Decode the list of indices back into text."""
|
| 39 |
+ return ' '.join([self.itos.get(index, '<unk>') for index in indices])
|
|
|