Update tokenizer.py
tokenizer.py  CHANGED  (+175 -113)

Before this commit (lines removed by the commit are prefixed with -; lines without a prefix are unchanged context):

@@ -1,21 +1,28 @@
-"""Tokenizer for Veda Programming Assistant"""
 
 import json
 import re
-from typing import List, Dict, Optional
 
 class VedaTokenizer:
-    """
 
     def __init__(self, vocab_size: int = 8000):
         self.vocab_size = vocab_size
         self.token_to_idx: Dict[str, int] = {}
         self.idx_to_token: Dict[int, str] = {}
 
         special = [
             "<PAD>", "<UNK>", "<START>", "<END>",
             "<CODE>", "<ENDCODE>",

@@ -26,143 +33,190 @@ class VedaTokenizer:
             self.token_to_idx[token] = idx
             self.idx_to_token[idx] = token
 
         idx = len(special)
         for i in range(32, 127):
             char = chr(i)
-        for char in ["\n", "\t"]:
-            self.token_to_idx[char] = idx
-            self.idx_to_token[idx] = char
-            idx += 1
 
         self.base_vocab_size = idx
 
     def fit(self, texts: List[str]):
-        """
 
         for text in texts:
             for word in words:
 
                 break
 
-        print(f"
 
     def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
-        """Encode text"""
         encoded = []
 
-                encoded.append(self.token_to_idx[
             else:
 
         if max_length:
-            if len(encoded)
-                encoded += [0] * (max_length - len(encoded))
-            else:
                 encoded = encoded[:max_length]
 
         return encoded
 
-    def _tokenize(self, text: str) -> List[str]:
-        """Tokenize text"""
-        tokens = []
-        parts = re.split(r'(\s+)', text)
-
-        for part in parts:
-            if not part:
-                continue
-            if part.isspace():
-                for char in part:
-                    tokens.append(char)
-            elif part in self.token_to_idx:
-                tokens.append(part)
-            else:
-                i = 0
-                while i < len(part):
-                    matched = False
-                    for length in range(min(len(part) - i, 20), 0, -1):
-                        substr = part[i:i+length]
-                        if substr in self.token_to_idx:
-                            tokens.append(substr)
-                            i += length
-                            matched = True
-                            break
-                    if not matched:
-                        tokens.append(part[i])
-                        i += 1
-
-        return tokens
-
     def decode(self, indices: List[int]) -> str:
         """Decode indices to text"""
-        prev = ""
         for idx in indices:
-            if token in ["<PAD>", "<UNK>", "<START>", "<END>", "<USER>", "<ASSISTANT>"]:
-                continue
-
-            if token == "<CODE>":
-                result.append("\n```python\n")
-                prev = "\n"
-                continue
-            if token == "<ENDCODE>":
-                result.append("\n```\n")
-                prev = "\n"
-                continue
-
-            if not result:
-                result.append(token)
-            elif token in "\n\t":
-                result.append(token)
-            elif token in ".,;:!?()[]{}":
-                result.append(token)
-            elif prev in "(\n\t[{":
-                result.append(token)
-            elif len(prev) > 0 and prev[-1].isalnum() and len(token) > 0 and token[0].isalnum():
-                result.append(" " + token)
-            else:
-                result.append(token)
-
-            prev = token
 
-        return "".join(
 
     def save(self, path: str):
         """Save tokenizer"""
         with open(path, 'w') as f:
-            json.dump(
-                'vocab_size': self.vocab_size,
-                'token_to_idx': self.token_to_idx,
-                'idx_to_token': {str(k): v for k, v in self.idx_to_token.items()},
-                'base_vocab_size': self.base_vocab_size
-            }, f, indent=2)
 
     def load(self, path: str):
         """Load tokenizer"""

@@ -172,6 +226,14 @@ class VedaTokenizer:
         self.token_to_idx = data['token_to_idx']
         self.idx_to_token = {int(k): v for k, v in data['idx_to_token'].items()}
         self.base_vocab_size = data.get('base_vocab_size', 100)
 
     @property
     def vocabulary_size(self) -> int:

After this commit (lines added by the commit are prefixed with +; lines without a prefix are unchanged context):

@@ -1,21 +1,28 @@
+"""Subword Tokenizer (BPE-like) for Veda Programming Assistant"""
 
 import json
 import re
+from typing import List, Dict, Optional, Tuple
 
 class VedaTokenizer:
+    """
+    Subword tokenizer that learns common subwords/phrases.
+    Better than word-level or char-level tokenization.
+    """
 
     def __init__(self, vocab_size: int = 8000):
         self.vocab_size = vocab_size
         self.token_to_idx: Dict[str, int] = {}
         self.idx_to_token: Dict[int, str] = {}
+
+        # Base vocabulary (special tokens + ASCII)
+        self._init_base_vocab()
+
+        # Merges for subwords (pair -> new_token)
+        self.merges: Dict[Tuple[str, str], str] = {}
+
+    def _init_base_vocab(self):
+        """Initialize base vocabulary"""
         special = [
             "<PAD>", "<UNK>", "<START>", "<END>",
             "<CODE>", "<ENDCODE>",

@@ -26,143 +33,190 @@ class VedaTokenizer:
             self.token_to_idx[token] = idx
             self.idx_to_token[idx] = token
 
+        # ASCII characters as base tokens
         idx = len(special)
+        # Printable ASCII range
         for i in range(32, 127):
             char = chr(i)
+            if char not in self.token_to_idx:
+                self.token_to_idx[char] = idx
+                self.idx_to_token[idx] = char
+                idx += 1
 
+        # Common whitespace
+        for char in ["\n", "\t", " "]:  # spaces for indentation
+            if char not in self.token_to_idx:
+                self.token_to_idx[char] = idx
+                self.idx_to_token[idx] = char
+                idx += 1
+
         self.base_vocab_size = idx
 
+    def _get_stats(self, vocab: Dict[Tuple[str, ...], int]) -> Dict[Tuple[str, str], int]:
+        """Count frequency of adjacent pairs"""
+        pairs = {}
+        for word_tuple, freq in vocab.items():
+            for i in range(len(word_tuple) - 1):
+                pair = (word_tuple[i], word_tuple[i+1])
+                pairs[pair] = pairs.get(pair, 0) + freq
+        return pairs
+
+    def _merge_vocab(self, pair: Tuple[str, str], vocab: Dict[Tuple[str, ...], int]) -> Dict[Tuple[str, ...], int]:
+        """Merge all occurrences of pair in vocabulary"""
+        new_vocab = {}
+        bigram = pair
+        new_token = "".join(pair)
+
+        for word, freq in vocab.items():
+            new_word = []
+            i = 0
+            while i < len(word):
+                if i < len(word) - 1 and word[i] == bigram[0] and word[i+1] == bigram[1]:
+                    new_word.append(new_token)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_vocab[tuple(new_word)] = freq
+
+        return new_vocab
+
     def fit(self, texts: List[str]):
+        """Train BPE tokenizer on texts"""
+        # Pre-tokenize into words to avoid merging across word boundaries
+        # This regex splits by whitespace but keeps punctuation
+        # Also handles code symbols better
+        word_counts = {}
 
         for text in texts:
+            # Simple pre-tokenization for code
+            words = re.findall(r'[a-zA-Z0-9_]+|[^\s\w]', text)
             for word in words:
+                # Convert word to tuple of characters
+                token_tuple = tuple(c for c in word)
+                word_counts[token_tuple] = word_counts.get(token_tuple, 0) + 1
+
+        # BPE training loop
+        vocab = word_counts
+        num_merges = self.vocab_size - self.base_vocab_size
 
+        print(f"Training BPE tokenizer (target vocab: {self.vocab_size})...")
 
+        for i in range(num_merges):
+            pairs = self._get_stats(vocab)
+            if not pairs:
                 break
+
+            # Find most frequent pair
+            best_pair = max(pairs, key=pairs.get)
+
+            # Stop if pair frequency is too low (e.g., 1)
+            if pairs[best_pair] < 2:
+                break
+
+            # Merge pair
+            vocab = self._merge_vocab(best_pair, vocab)
+
+            # Add new token to vocabulary
+            new_token = "".join(best_pair)
+            self.merges[best_pair] = new_token
+
+            idx = len(self.token_to_idx)
+            self.token_to_idx[new_token] = idx
+            self.idx_to_token[idx] = new_token
+
+            if (i + 1) % 100 == 0:
+                print(f"BPE merge {i+1}/{num_merges}: '{best_pair[0]}' + '{best_pair[1]}' -> '{new_token}'")
 
+        print(f"BPE training complete. Final vocab size: {len(self.token_to_idx)}")
+
+    def _tokenize_word(self, word: str) -> List[str]:
+        """Tokenize a single word using learned merges"""
+        if word in self.token_to_idx:
+            return [word]
+
+        # Start with characters
+        tokens = list(word)
+
+        # Apply merges iteratively
+        # Note: In a real BPE implementation we would apply in order of priority
+        # Here we do a simpler greedy application based on length
+        while True:
+            merged = False
+            i = 0
+            new_tokens = []
+
+            while i < len(tokens) - 1:
+                pair = (tokens[i], tokens[i+1])
+                pair_str = "".join(pair)
+
+                # Check if this pair forms a known token
+                if pair_str in self.token_to_idx:
+                    new_tokens.append(pair_str)
+                    i += 2
+                    merged = True
+                else:
+                    new_tokens.append(tokens[i])
+                    i += 1
+
+            if i < len(tokens):
+                new_tokens.append(tokens[i])
+
+            if not merged:
+                break
+
+            tokens = new_tokens
+
+        return tokens
+
     def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
+        """Encode text to token indices"""
+        # Pre-tokenize same way as training
+        words = re.findall(r'[a-zA-Z0-9_]+|[^\s\w]|\s+', text)
         encoded = []
 
+        for word in words:
+            if word in self.token_to_idx:
+                encoded.append(self.token_to_idx[word])
             else:
+                # Apply BPE
+                subwords = self._tokenize_word(word)
+                for sw in subwords:
+                    encoded.append(self.token_to_idx.get(sw, self.token_to_idx["<UNK>"]))
 
+        # Truncate or Pad
         if max_length:
+            if len(encoded) > max_length:
                 encoded = encoded[:max_length]
+            elif len(encoded) < max_length:
+                encoded += [self.token_to_idx["<PAD>"]] * (max_length - len(encoded))
 
         return encoded
 
     def decode(self, indices: List[int]) -> str:
         """Decode indices to text"""
+        tokens = []
         for idx in indices:
+            # Skip special tokens if needed, but usually we decode them
+            # and let post-processing handle cleanup
+            if idx in self.idx_to_token:
+                token = self.idx_to_token[idx]
+                if token not in ["<PAD>", "<UNK>", "<START>", "<END>"]:
+                    tokens.append(token)
 
+        return "".join(tokens)
 
     def save(self, path: str):
         """Save tokenizer"""
+        data = {
+            'vocab_size': self.vocab_size,
+            'token_to_idx': self.token_to_idx,
+            'idx_to_token': {str(k): v for k, v in self.idx_to_token.items()},
+            'base_vocab_size': self.base_vocab_size,
+            'merges': {f"{p[0]}|{p[1]}": m for p, m in self.merges.items()}
+        }
         with open(path, 'w') as f:
+            json.dump(data, f, indent=2)
 
     def load(self, path: str):
         """Load tokenizer"""

@@ -172,6 +226,14 @@ class VedaTokenizer:
         self.token_to_idx = data['token_to_idx']
         self.idx_to_token = {int(k): v for k, v in data['idx_to_token'].items()}
         self.base_vocab_size = data.get('base_vocab_size', 100)
+
+        # Load merges
+        if 'merges' in data:
+            self.merges = {}
+            for k, v in data['merges'].items():
+                p = k.split('|')
+                if len(p) == 2:
+                    self.merges[(p[0], p[1])] = v
 
     @property
     def vocabulary_size(self) -> int:
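
For reference, a minimal usage sketch of the new class (not part of the commit). It assumes the file above is importable as tokenizer.py; the toy corpus, the smaller vocab_size, and the output file name are illustrative only.

from tokenizer import VedaTokenizer

# Toy corpus for illustration only.
corpus = [
    "def add(a, b):\n    return a + b",
    "def sub(a, b):\n    return a - b",
    "print(add(1, 2))",
]

tok = VedaTokenizer(vocab_size=500)   # small target vocab for the toy corpus
tok.fit(corpus)                       # learns BPE merges from pair frequencies

ids = tok.encode("def add(a, b):", max_length=32)   # BPE-encode, then pad with <PAD>
print(ids)
print(tok.decode(ids))                # special tokens are skipped when decoding

tok.save("veda_tokenizer.json")       # merges are serialized as "left|right" keys
tok.load("veda_tokenizer.json")

Because fit() stops merging as soon as the most frequent pair occurs fewer than twice, the final vocabulary on a corpus this small stays well below the 500-token target.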