Update tokenizer.py
tokenizer.py  CHANGED  (+135 -68)
@@ -1,114 +1,181 @@
 import json
 import re
 from typing import List, Dict, Optional

 class VedaTokenizer:
-    """..."""

-    def __init__(self, vocab_size: int = ...):
         self.vocab_size = vocab_size
-        self.word_to_idx = {}
-        self.idx_to_word = {}
-        self._init_special_tokens()
-
-    def _init_special_tokens(self):
-        """Initialize special tokens"""
-        special_tokens = ["<PAD>", "<UNK>", "<START>", "<END>", "<NL>", "<INDENT>"]
-        for idx, token in enumerate(special_tokens):
-            self.word_to_idx[token] = idx
-            self.idx_to_word[idx] = token

-    def _tokenize(self, text):
-        """..."""
-        ...
-        tokens = re.findall(pattern, text)
-        return [t for t in tokens if t.strip()]

     def fit(self, texts: List[str]):
-        """Build vocabulary ..."""
         word_freq = {}
         for text in texts:
-            ...

         sorted_words = sorted(word_freq.items(), key=lambda x: -x[1])
-        start_idx = len(self.word_to_idx)

-        ...

-        print(f"Vocabulary: {len(self.word_to_idx)} tokens")

     def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
-        """Encode text ..."""
-        tokens = self._tokenize(text)
-        encoded = [...]

         if max_length:
             if len(encoded) < max_length:
-                encoded += [0] * (max_length - len(encoded))
             else:
                 encoded = encoded[:max_length]
         return encoded

-    def decode(self, indices):
-        """..."""
         tokens = []
-        for idx in indices:
-            if idx in self.idx_to_word:
-                token = self.idx_to_word[idx]
-                if token == "<PAD>":
-                    continue
-                elif token == "<NL>":
-                    tokens.append('\n')
-                elif token == "<INDENT>":
-                    tokens.append(' ')
-                elif token in ["<UNK>", "<START>", "<END>"]:
-                    continue
-                else:
-                    tokens.append(token)

         result = []
-        ...
                 result.append(token)
-            elif token ...
                 result.append(token)
-            elif token in ...
                 result.append(token)
-            elif ...
                 result.append(token)
-            elif token ...
-                result.append(...)
             else:
-                result.append(...)

-        return ...

     def save(self, path: str):
-        """Save tokenizer"""
-        data = {
-            'vocab_size': self.vocab_size,
-            'word_to_idx': self.word_to_idx,
-            'idx_to_word': {str(k): v for k, v in self.idx_to_word.items()}
-        }
         with open(path, 'w') as f:
-            json.dump(data, f)

     def load(self, path: str):
-        """Load tokenizer"""
         with open(path, 'r') as f:
             data = json.load(f)
         self.vocab_size = data['vocab_size']
-        self.word_to_idx = data['word_to_idx']
-        self.idx_to_word = {int(k): v for k, v in data['idx_to_word'].items()}

     @property
     def vocabulary_size(self) -> int:
-        return len(self.word_to_idx)
+"""Tokenizer - MODIFIED for conversations"""
+
 import json
 import re
 from typing import List, Dict, Optional

 class VedaTokenizer:
+    """Tokenizer with conversation support"""

+    def __init__(self, vocab_size: int = 8000):
         self.vocab_size = vocab_size
+        self.token_to_idx: Dict[str, int] = {}
+        self.idx_to_token: Dict[int, str] = {}
+        self._init_vocab()

+    def _init_vocab(self):
+        """Initialize vocabulary with conversation tokens"""
+        # Special tokens - ADDED conversation tokens
+        special = [
+            "<PAD>", "<UNK>", "<START>", "<END>",
+            "<CODE>", "<ENDCODE>",  # For code blocks
+            "<USER>", "<ASSISTANT>"  # For conversation
+        ]
+
+        for idx, token in enumerate(special):
+            self.token_to_idx[token] = idx
+            self.idx_to_token[idx] = token
+
+        # ASCII characters
+        idx = len(special)
+        for i in range(32, 127):
+            char = chr(i)
+            self.token_to_idx[char] = idx
+            self.idx_to_token[idx] = char
+            idx += 1
+
+        # Whitespace
+        for char in ["\n", "\t"]:
+            self.token_to_idx[char] = idx
+            self.idx_to_token[idx] = char
+            idx += 1

+        self.base_vocab_size = idx

     def fit(self, texts: List[str]):
+        """Build vocabulary"""
         word_freq = {}
+
         for text in texts:
+            words = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]*|[0-9]+|[^\s]', text)
+            for word in words:
+                word_freq[word] = word_freq.get(word, 0) + 1

         sorted_words = sorted(word_freq.items(), key=lambda x: -x[1])

+        idx = self.base_vocab_size
+        for word, _ in sorted_words:
+            if idx >= self.vocab_size:
+                break
+            if word not in self.token_to_idx and len(word) <= 25:
+                self.token_to_idx[word] = idx
+                self.idx_to_token[idx] = word
+                idx += 1

+        print(f"Vocabulary: {len(self.token_to_idx)} tokens")

     def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
+        """Encode text"""
+        tokens = self._tokenize(text)
+        encoded = []
+
+        for token in tokens:
+            if token in self.token_to_idx:
+                encoded.append(self.token_to_idx[token])
+            else:
+                for char in token:
+                    encoded.append(self.token_to_idx.get(char, 1))

         if max_length:
             if len(encoded) < max_length:
+                encoded += [0] * (max_length - len(encoded))
             else:
                 encoded = encoded[:max_length]
+
         return encoded

+    def _tokenize(self, text: str) -> List[str]:
+        """Tokenize text"""
         tokens = []
+        parts = re.split(r'(\s+)', text)

+        for part in parts:
+            if not part:
+                continue
+            if part.isspace():
+                for char in part:
+                    tokens.append(char)
+            elif part in self.token_to_idx:
+                tokens.append(part)
+            else:
+                i = 0
+                while i < len(part):
+                    matched = False
+                    for length in range(min(len(part) - i, 20), 0, -1):
+                        substr = part[i:i+length]
+                        if substr in self.token_to_idx:
+                            tokens.append(substr)
+                            i += length
+                            matched = True
+                            break
+                    if not matched:
+                        tokens.append(part[i])
+                        i += 1
+
+        return tokens
+
+    def decode(self, indices: List[int]) -> str:
+        """Decode indices to text - MODIFIED for conversation tokens"""
         result = []
+        prev = ""
+
+        for idx in indices:
+            if idx == 0:  # PAD
+                continue
+            if idx not in self.idx_to_token:
+                continue
+
+            token = self.idx_to_token[idx]
+
+            # Skip special tokens in output
+            if token in ["<PAD>", "<UNK>", "<START>", "<END>", "<USER>", "<ASSISTANT>"]:
+                continue
+
+            # Handle code blocks
+            if token == "<CODE>":
+                result.append("\n```python\n")
+                prev = "\n"
+                continue
+            if token == "<ENDCODE>":
+                result.append("\n```\n")
+                prev = "\n"
+                continue
+
+            # Smart joining
+            if not result:
                 result.append(token)
+            elif token in "\n\t":
                 result.append(token)
+            elif token in ".,;:!?()[]{}":
                 result.append(token)
+            elif prev in "(\n\t[{":
                 result.append(token)
+            elif prev.isalnum() and len(token) > 0 and token[0].isalnum():
+                result.append(" " + token)
             else:
+                result.append(token)
+
+            prev = token

+        return "".join(result)

     def save(self, path: str):
         with open(path, 'w') as f:
+            json.dump({
+                'vocab_size': self.vocab_size,
+                'token_to_idx': self.token_to_idx,
+                'idx_to_token': {str(k): v for k, v in self.idx_to_token.items()},
+                'base_vocab_size': self.base_vocab_size
+            }, f, indent=2)

     def load(self, path: str):
         with open(path, 'r') as f:
             data = json.load(f)
         self.vocab_size = data['vocab_size']
+        self.token_to_idx = data['token_to_idx']
+        self.idx_to_token = {int(k): v for k, v in data['idx_to_token'].items()}
+        self.base_vocab_size = data.get('base_vocab_size', 100)

     @property
     def vocabulary_size(self) -> int:
+        return len(self.token_to_idx)
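
Below is a minimal usage sketch of the updated tokenizer, not part of the commit. It assumes tokenizer.py is importable from the working directory; the training strings (sample_texts) and the file name veda_tokenizer.json are made up for illustration.

    # Illustrative round trip with the updated VedaTokenizer (hypothetical data).
    from tokenizer import VedaTokenizer

    sample_texts = [
        "<USER> print hello <ASSISTANT> <CODE> print('hello') <ENDCODE>",
        "def add(a, b): return a + b",
    ]

    tok = VedaTokenizer(vocab_size=8000)
    tok.fit(sample_texts)                 # builds the word-level vocab on top of the ASCII/whitespace base

    ids = tok.encode(sample_texts[0], max_length=64)   # right-pads with 0 (<PAD>)
    text = tok.decode(ids)                # drops special tokens, wraps <CODE>...<ENDCODE> as a code block
    print(text)

    tok.save("veda_tokenizer.json")       # stores token_to_idx, idx_to_token and base_vocab_size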