edwjin committed on
Commit
d83076b
·
verified ·
1 Parent(s): 7ba111a

Update tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +41 -38
tokenizer.py CHANGED
@@ -1,38 +1,41 @@
1
- import nltk
2
- nltk.download('punkt')
3
-
4
- from nltk.tokenize import word_tokenize
5
- import os
6
-
7
-
8
- class SimpleTokenizer:
9
- """
10
- A simple tokenizer class that builds a vocabulary from the given text and encodes/decodes text into indices.
11
- """
12
-
13
- def __init__(self, text):
14
- """Initialize the tokenizer with the initial text to build vocabulary."""
15
- self.vocab = set()
16
- self.stoi = {}
17
- self.itos = {}
18
- self.build_vocab(text)
19
-
20
- def build_vocab(self, text):
21
- """Build vocabulary from the given text."""
22
- tokens = word_tokenize(text)
23
- self.vocab = set(tokens)
24
- self.vocab_size = len(self.vocab) + 2
25
- self.stoi = {word: i for i, word in enumerate(self.vocab, start=2)}
26
- self.stoi['<pad>'] = 0
27
- self.stoi['<unk>'] = 1
28
- self.itos = {i: word for word, i in self.stoi.items()}
29
-
30
- def encode(self, text):
31
- """Encode the text into a list of indices."""
32
- tokens = word_tokenize(text)
33
- return [self.stoi.get(word, self.stoi['<unk>']) for word in tokens]
34
-
35
- def decode(self, indices):
36
- """Decode the list of indices back into text."""
37
- return ' '.join([self.itos.get(index, '<unk>') for index in indices])
38
-
 
 
 
 
1
+ import nltk
2
+ nltk.download('punkt')
3
+
4
+ from nltk.tokenize import word_tokenize
5
+ import os
6
+
7
+
8
class SimpleTokenizer:
    """
    A simple word-level tokenizer that incrementally builds a vocabulary
    from text and encodes/decodes text to and from lists of indices.

    Indices 0 and 1 are reserved for the ``<pad>`` and ``<unk>`` special
    tokens; regular tokens start at index 2.
    """

    def __init__(self):
        """Initialize the tokenizer with an empty vocabulary."""
        # NOTE: removed leftover debug `print(nltk.__version__)`; it also
        # prevented this docstring from being a real docstring.
        self.vocab = set()                    # known word tokens (specials excluded)
        self.stoi = {'<pad>': 0, '<unk>': 1}  # token -> index
        self.itos = {0: '<pad>', 1: '<unk>'}  # index -> token
        self.vocab_size = 2  # starting with <pad> and <unk> tokens

    def update_vocab(self, text):
        """Tokenize *text* and add any previously unseen tokens to the vocabulary.

        Each new token is assigned the next free index. New tokens are
        processed in sorted order so that index assignment is deterministic
        across runs (plain set iteration order is not stable).
        """
        tokens = word_tokenize(text)
        new_tokens = set(tokens) - self.vocab
        for token in sorted(new_tokens):  # sorted: reproducible indices
            index = self.vocab_size
            self.vocab.add(token)
            self.stoi[token] = index
            self.itos[index] = token
            self.vocab_size += 1

    def encode(self, text):
        """Encode *text* into a list of indices; unknown tokens map to <unk>."""
        tokens = word_tokenize(text)
        unk = self.stoi['<unk>']  # hoisted out of the comprehension
        return [self.stoi.get(word, unk) for word in tokens]

    def decode(self, indices):
        """Decode a list of indices back into a space-joined string.

        Indices outside the vocabulary render as ``<unk>``.
        """
        return ' '.join(self.itos.get(index, '<unk>') for index in indices)