edwjin committed on
Commit
b25aa2d
·
verified ·
1 Parent(s): 727198a

Update tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +2 -4
tokenizer.py CHANGED
@@ -11,7 +11,6 @@ class SimpleTokenizer:
11
  """
12
 
13
  def __init__(self):
14
- print(nltk.__version__)
15
  """Initialize the tokenizer with an empty vocabulary."""
16
  self.vocab = set()
17
  self.stoi = {'<pad>': 0, '<unk>': 1}
@@ -22,7 +21,7 @@ class SimpleTokenizer:
22
  """Update vocabulary with new text."""
23
  tokens = word_tokenize(text)
24
  new_tokens = set(tokens) - self.vocab
25
-
26
  for token in new_tokens:
27
  index = self.vocab_size
28
  self.vocab.add(token)
@@ -37,5 +36,4 @@ class SimpleTokenizer:
37
 
38
  def decode(self, indices):
39
  """Decode the list of indices back into text."""
40
- return ' '.join([self.itos.get(index, '<unk>') for index in indices])
41
-
 
11
  """
12
 
13
  def __init__(self):
 
14
  """Initialize the tokenizer with an empty vocabulary."""
15
  self.vocab = set()
16
  self.stoi = {'<pad>': 0, '<unk>': 1}
 
21
  """Update vocabulary with new text."""
22
  tokens = word_tokenize(text)
23
  new_tokens = set(tokens) - self.vocab
24
+
25
  for token in new_tokens:
26
  index = self.vocab_size
27
  self.vocab.add(token)
 
36
 
37
  def decode(self, indices):
38
  """Decode the list of indices back into text."""
39
+ return ' '.join([self.itos.get(index, '<unk>') for index in indices])