Spaces:
Running
Running
File size: 5,794 Bytes
ffd2cda 89d56eb 44c948e ffd2cda 44c948e 89d56eb 44c948e 89d56eb 44c948e 89d56eb 44c948e 89d56eb ffd2cda 89d56eb 44c948e 89d56eb 44c948e 89d56eb 44c948e 89d56eb 44c948e 89d56eb 44c948e 8eda8fc 89d56eb 44c948e 89d56eb 44c948e 89d56eb 44c948e 89d56eb 44c948e 89d56eb 44c948e 89d56eb 44c948e 89d56eb 44c948e 89d56eb ffd2cda 44c948e 89d56eb ffd2cda 89d56eb 8eda8fc 89d56eb 8eda8fc 89d56eb 8eda8fc 89d56eb 44c948e ffd2cda 89d56eb 44c948e 89d56eb 44c948e 89d56eb 44c948e ffd2cda 44c948e 89d56eb 44c948e ffd2cda 44c948e 89d56eb 44c948e 89d56eb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
"""Tokenizer for Veda Programming Assistant"""
import json
import re
from typing import List, Dict, Optional
class VedaTokenizer:
    """Hybrid word/character tokenizer with conversation and code markers.

    The vocabulary always contains a fixed base set (special tokens,
    printable ASCII characters, newline and tab).  ``fit`` extends it with
    the most frequent words of a corpus.  Text is encoded by greedy
    longest-match against the vocabulary, falling back to single characters,
    and whitespace is kept as explicit tokens so that decoding can simply
    concatenate tokens.
    """

    # Reserved tokens; the position in this tuple is the token id.
    SPECIAL_TOKENS = (
        "<PAD>", "<UNK>", "<START>", "<END>",
        "<CODE>", "<ENDCODE>",
        "<USER>", "<ASSISTANT>",
    )
    PAD_ID = 0
    UNK_ID = 1

    # Longest token ``fit`` may learn.  ``_tokenize`` uses the same bound for
    # its match window so that every learned token is actually reachable.
    MAX_TOKEN_LENGTH = 25

    # Identifiers, digit runs, or any single non-whitespace character.
    _WORD_RE = re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*|[0-9]+|[^\s]')
    # Capturing group keeps the whitespace runs in the split result.
    _WHITESPACE_SPLIT_RE = re.compile(r'(\s+)')

    # Special tokens that produce no output when decoding.
    _SILENT_TOKENS = frozenset(
        {"<PAD>", "<UNK>", "<START>", "<END>", "<USER>", "<ASSISTANT>"}
    )
    # Special tokens rendered as Markdown code fences when decoding.
    _RENDERED_TOKENS = {
        "<CODE>": "\n```python\n",
        "<ENDCODE>": "\n```\n",
    }

    def __init__(self, vocab_size: int = 8000):
        """Create a tokenizer holding only the base vocabulary.

        Args:
            vocab_size: Upper bound (exclusive) on the ids ``fit`` may assign.
        """
        self.vocab_size = vocab_size
        self.token_to_idx: Dict[str, int] = {}
        self.idx_to_token: Dict[int, str] = {}
        self._init_vocab()

    @classmethod
    def _base_tokens(cls) -> List[str]:
        """Return the built-in tokens in id order."""
        printable_ascii = [chr(code) for code in range(32, 127)]
        return [*cls.SPECIAL_TOKENS, *printable_ascii, "\n", "\t"]

    def _init_vocab(self):
        """Register the base tokens and record how many there are."""
        base_tokens = self._base_tokens()
        for idx, token in enumerate(base_tokens):
            self.token_to_idx[token] = idx
            self.idx_to_token[idx] = token
        self.base_vocab_size = len(base_tokens)

    def fit(self, texts: List[str]):
        """Extend the vocabulary with the most frequent words in ``texts``.

        Words are identifiers, digit runs, or single non-whitespace
        characters.  They are added in descending frequency order (ties keep
        first-seen order) until ``vocab_size`` is reached; words longer than
        ``MAX_TOKEN_LENGTH`` or already known are skipped.  Existing ids are
        never reassigned, so calling ``fit`` again (or after ``load``) only
        appends new tokens.

        Args:
            texts: Corpus to collect word frequencies from.
        """
        word_freq: Dict[str, int] = {}
        for text in texts:
            for word in self._WORD_RE.findall(text):
                word_freq[word] = word_freq.get(word, 0) + 1

        # sorted() is stable, so equally frequent words keep first-seen order.
        ranked = sorted(word_freq.items(), key=lambda item: -item[1])

        # Continue after the highest id in use so that a repeated fit never
        # hands out an id that already belongs to another token.
        next_idx = max(
            self.base_vocab_size,
            max(self.idx_to_token, default=-1) + 1,
        )
        for word, _ in ranked:
            if next_idx >= self.vocab_size:
                break
            if word in self.token_to_idx or len(word) > self.MAX_TOKEN_LENGTH:
                continue
            self.token_to_idx[word] = next_idx
            self.idx_to_token[next_idx] = word
            next_idx += 1

        print(f"Vocabulary: {len(self.token_to_idx)} tokens")

    def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
        """Convert ``text`` to a list of token ids.

        Args:
            text: Text to encode.
            max_length: When truthy, the result is right-padded with the PAD
                id or truncated to exactly this many ids.  ``None`` (or 0)
                returns the ids unchanged.

        Returns:
            The token ids; characters missing from the vocabulary map to the
            UNK id.
        """
        encoded: List[int] = []
        for token in self._tokenize(text):
            if token in self.token_to_idx:
                encoded.append(self.token_to_idx[token])
            else:
                # Unknown token: fall back to one id per character.
                encoded.extend(
                    self.token_to_idx.get(char, self.UNK_ID) for char in token
                )

        if max_length:
            if len(encoded) < max_length:
                encoded += [self.PAD_ID] * (max_length - len(encoded))
            else:
                encoded = encoded[:max_length]
        return encoded

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into vocabulary tokens and single characters.

        Whitespace runs are emitted one character at a time.  Every other
        chunk is used whole if it is in the vocabulary; otherwise it is
        consumed left to right, taking the longest vocabulary entry (up to
        ``MAX_TOKEN_LENGTH`` characters) at each position, or a single
        character when nothing matches.
        """
        tokens: List[str] = []
        for part in self._WHITESPACE_SPLIT_RE.split(text):
            if not part:
                continue
            if part.isspace():
                tokens.extend(part)
                continue
            if part in self.token_to_idx:
                tokens.append(part)
                continue

            pos = 0
            while pos < len(part):
                window = min(len(part) - pos, self.MAX_TOKEN_LENGTH)
                for length in range(window, 0, -1):
                    candidate = part[pos:pos + length]
                    if candidate in self.token_to_idx:
                        tokens.append(candidate)
                        pos += length
                        break
                else:
                    # Nothing matched, not even the single character.
                    tokens.append(part[pos])
                    pos += 1
        return tokens

    def decode(self, indices: List[int]) -> str:
        """Convert token ids back to text.

        Padding, unknown ids and the conversation markers are dropped;
        ``<CODE>`` and ``<ENDCODE>`` are rendered as Markdown code fences.
        All other tokens are concatenated verbatim: ``encode`` already emits
        whitespace as explicit tokens, so inserting extra spaces here would
        corrupt words that were split into several tokens.

        Args:
            indices: Token ids to decode.

        Returns:
            The decoded text.
        """
        pieces: List[str] = []
        for idx in indices:
            if idx == self.PAD_ID:
                continue
            token = self.idx_to_token.get(idx)
            if token is None or token in self._SILENT_TOKENS:
                continue
            pieces.append(self._RENDERED_TOKENS.get(token, token))
        return "".join(pieces)

    def save(self, path: str):
        """Write the vocabulary to ``path`` as JSON.

        Args:
            path: Destination file; it is overwritten if it exists.
        """
        payload = {
            'vocab_size': self.vocab_size,
            'token_to_idx': self.token_to_idx,
            # JSON object keys must be strings.
            'idx_to_token': {str(k): v for k, v in self.idx_to_token.items()},
            'base_vocab_size': self.base_vocab_size
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2)

    def load(self, path: str):
        """Replace the vocabulary with the one stored at ``path``.

        Args:
            path: JSON file previously written by ``save``.

        Raises:
            KeyError: If a required field is missing from the file.
        """
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.vocab_size = data['vocab_size']
        self.token_to_idx = data['token_to_idx']
        self.idx_to_token = {int(k): v for k, v in data['idx_to_token'].items()}
        # Files without this field fall back to the size of the built-in
        # vocabulary.
        self.base_vocab_size = data.get(
            'base_vocab_size', len(self._base_tokens())
        )

    @property
    def vocabulary_size(self) -> int:
        """Number of tokens currently in the vocabulary."""
        return len(self.token_to_idx)