edwjin committed on
Commit
d83076b
·
verified ·
1 Parent(s): 7ba111a

Update tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +41 -38
tokenizer.py CHANGED
@@ -1,38 +1,41 @@
1
- import nltk
2
- nltk.download('punkt')
3
-
4
- from nltk.tokenize import word_tokenize
5
- import os
6
-
7
-
8
- class SimpleTokenizer:
9
- """
10
- A simple tokenizer class that builds a vocabulary from the given text and encodes/decodes text into indices.
11
- """
12
-
13
- def __init__(self, text):
14
- """Initialize the tokenizer with the initial text to build vocabulary."""
15
- self.vocab = set()
16
- self.stoi = {}
17
- self.itos = {}
18
- self.build_vocab(text)
19
-
20
- def build_vocab(self, text):
21
- """Build vocabulary from the given text."""
22
- tokens = word_tokenize(text)
23
- self.vocab = set(tokens)
24
- self.vocab_size = len(self.vocab) + 2
25
- self.stoi = {word: i for i, word in enumerate(self.vocab, start=2)}
26
- self.stoi['<pad>'] = 0
27
- self.stoi['<unk>'] = 1
28
- self.itos = {i: word for word, i in self.stoi.items()}
29
-
30
- def encode(self, text):
31
- """Encode the text into a list of indices."""
32
- tokens = word_tokenize(text)
33
- return [self.stoi.get(word, self.stoi['<unk>']) for word in tokens]
34
-
35
- def decode(self, indices):
36
- """Decode the list of indices back into text."""
37
- return ' '.join([self.itos.get(index, '<unk>') for index in indices])
38
-
 
 
 
 
1
+ import nltk
2
+ nltk.download('punkt')
3
+
4
+ from nltk.tokenize import word_tokenize
5
+ import os
6
+
7
+
8
class SimpleTokenizer:
    """
    A simple word-level tokenizer that incrementally builds a vocabulary
    from text and encodes/decodes text to and from lists of indices.

    Indices 0 and 1 are reserved for the ``<pad>`` and ``<unk>`` special
    tokens; regular tokens start at index 2.
    """

    def __init__(self):
        """Initialize the tokenizer with an empty vocabulary."""
        # NOTE: removed leftover debug `print(nltk.__version__)`; it also
        # prevented this docstring from being a real docstring.
        self.vocab = set()                    # known word tokens (specials excluded)
        self.stoi = {'<pad>': 0, '<unk>': 1}  # token -> index
        self.itos = {0: '<pad>', 1: '<unk>'}  # index -> token
        self.vocab_size = 2  # starting with <pad> and <unk> tokens

    def update_vocab(self, text):
        """Tokenize *text* and add any previously unseen tokens to the vocabulary.

        Each new token is assigned the next free index. New tokens are
        processed in sorted order so that index assignment is deterministic
        across runs (plain set iteration order is not stable).
        """
        tokens = word_tokenize(text)
        new_tokens = set(tokens) - self.vocab
        for token in sorted(new_tokens):  # sorted: reproducible indices
            index = self.vocab_size
            self.vocab.add(token)
            self.stoi[token] = index
            self.itos[index] = token
            self.vocab_size += 1

    def encode(self, text):
        """Encode *text* into a list of indices; unknown tokens map to <unk>."""
        tokens = word_tokenize(text)
        unk = self.stoi['<unk>']  # hoisted out of the comprehension
        return [self.stoi.get(word, unk) for word in tokens]

    def decode(self, indices):
        """Decode a list of indices back into a space-joined string.

        Indices outside the vocabulary render as ``<unk>``.
        """
        return ' '.join(self.itos.get(index, '<unk>') for index in indices)