"""Tokenizer - MODIFIED for conversations"""
import json
import re
from typing import List, Dict, Optional
class VedaTokenizer:
    """Word-level tokenizer with character-level fallback.

    The base vocabulary holds special tokens (conversation roles and code
    delimiters included), all printable ASCII characters, and newline/tab.
    ``fit`` extends it with the most frequent words of a corpus, up to
    ``vocab_size`` entries; anything else is encoded character by character.
    """

    def __init__(self, vocab_size: int = 8000):
        """Create a tokenizer whose vocabulary may grow to *vocab_size*."""
        self.vocab_size = vocab_size
        self.token_to_idx: Dict[str, int] = {}
        self.idx_to_token: Dict[int, str] = {}
        self._init_vocab()

    def _init_vocab(self):
        """Seed the vocabulary with special, ASCII, and whitespace tokens."""
        # Special tokens come first so that <PAD> = 0 and <UNK> = 1 — the ids
        # encode() relies on for padding and unknown-character fallback.
        special = [
            "<PAD>", "<UNK>", "<START>", "<END>",
            "<CODE>", "<ENDCODE>",    # code-block delimiters
            "<USER>", "<ASSISTANT>",  # conversation roles
        ]
        for idx, token in enumerate(special):
            self.token_to_idx[token] = idx
            self.idx_to_token[idx] = token
        # Printable ASCII (codes 32..126) guarantees any ASCII input can be
        # encoded character by character.
        idx = len(special)
        for code in range(32, 127):
            char = chr(code)
            self.token_to_idx[char] = idx
            self.idx_to_token[idx] = char
            idx += 1
        # Structural whitespace kept as standalone tokens.
        for char in ["\n", "\t"]:
            self.token_to_idx[char] = idx
            self.idx_to_token[idx] = char
            idx += 1
        # Everything below this index is the fixed base vocabulary; fit()
        # appends learned words starting here.  (8 + 95 + 2 = 105.)
        self.base_vocab_size = idx

    def fit(self, texts: List[str]):
        """Extend the vocabulary with the most frequent words in *texts*.

        A "word" is an identifier, a run of digits, or a single non-space
        character.  Words longer than 25 characters are skipped (they are
        left to character fallback); words are added most-frequent-first
        until the vocabulary reaches ``vocab_size`` entries.
        """
        word_freq: Dict[str, int] = {}
        for text in texts:
            # Identifiers, integer literals, or any single non-space char.
            for word in re.findall(r'[a-zA-Z_][a-zA-Z0-9_]*|[0-9]+|[^\s]', text):
                word_freq[word] = word_freq.get(word, 0) + 1
        # Most frequent first; sorted() is stable, so ties keep corpus order.
        sorted_words = sorted(word_freq.items(), key=lambda item: -item[1])
        idx = self.base_vocab_size
        for word, _ in sorted_words:
            if idx >= self.vocab_size:
                break  # vocabulary is full
            if word not in self.token_to_idx and len(word) <= 25:
                self.token_to_idx[word] = idx
                self.idx_to_token[idx] = word
                idx += 1
        print(f"Vocabulary: {len(self.token_to_idx)} tokens")

    def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
        """Encode *text* into a list of token ids.

        Out-of-vocabulary tokens fall back to per-character ids, with <UNK>
        for characters that are themselves unknown.  If *max_length* is
        given, the result is padded with <PAD> or truncated to exactly that
        length.
        """
        # Look the control ids up by name instead of hard-coding 0/1, so the
        # code stays correct if the special-token layout ever changes.
        pad_id = self.token_to_idx["<PAD>"]
        unk_id = self.token_to_idx["<UNK>"]
        encoded: List[int] = []
        for token in self._tokenize(text):
            if token in self.token_to_idx:
                encoded.append(self.token_to_idx[token])
            else:
                # Unknown token: fall back to its individual characters.
                encoded.extend(self.token_to_idx.get(char, unk_id)
                               for char in token)
        if max_length:
            if len(encoded) < max_length:
                encoded += [pad_id] * (max_length - len(encoded))
            else:
                encoded = encoded[:max_length]
        return encoded

    def _tokenize(self, text: str) -> List[str]:
        """Split *text* into vocabulary tokens (greedy longest match)."""
        tokens: List[str] = []
        # Capturing split keeps the whitespace runs in the output.
        for part in re.split(r'(\s+)', text):
            if not part:
                continue
            if part.isspace():
                # Whitespace is emitted one character at a time.
                tokens.extend(part)
            elif part in self.token_to_idx:
                tokens.append(part)
            else:
                # Greedy longest-match against the vocabulary (window <= 20,
                # matching fit()'s 25-char cap closely enough in practice),
                # falling back to single characters.
                i = 0
                while i < len(part):
                    for length in range(min(len(part) - i, 20), 0, -1):
                        substr = part[i:i + length]
                        if substr in self.token_to_idx:
                            tokens.append(substr)
                            i += length
                            break
                    else:  # no substring matched, not even one character
                        tokens.append(part[i])
                        i += 1
        return tokens

    def decode(self, indices: List[int]) -> str:
        """Decode token ids back to text, dropping control tokens.

        <CODE>/<ENDCODE> are rendered as markdown code fences; the other
        special tokens produce no output.  A space is inserted between two
        consecutive alphanumeric tokens to restore word boundaries.
        """
        result: List[str] = []
        prev = ""
        for idx in indices:
            if idx == 0:  # <PAD> — padding carries no text
                continue
            token = self.idx_to_token.get(idx)
            if token is None:
                continue  # id outside the known vocabulary
            # Conversation/control tokens are invisible in the output.
            if token in ["<PAD>", "<UNK>", "<START>", "<END>",
                         "<USER>", "<ASSISTANT>"]:
                continue
            # Code delimiters become markdown fences.
            if token == "<CODE>":
                result.append("\n```python\n")
                prev = "\n"
                continue
            if token == "<ENDCODE>":
                result.append("\n```\n")
                prev = "\n"
                continue
            # Smart joining: punctuation and whitespace attach directly; a
            # space is inserted only between two alphanumeric tokens.
            if not result:
                result.append(token)
            elif token in "\n\t":
                result.append(token)
            elif token in ".,;:!?()[]{}":
                result.append(token)
            elif prev in "(\n\t[{":
                result.append(token)
            elif prev.isalnum() and len(token) > 0 and token[0].isalnum():
                result.append(" " + token)
            else:
                result.append(token)
            prev = token
        return "".join(result)

    def save(self, path: str):
        """Serialize the vocabulary to *path* as JSON (UTF-8)."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump({
                'vocab_size': self.vocab_size,
                'token_to_idx': self.token_to_idx,
                # JSON object keys must be strings.
                'idx_to_token': {str(k): v
                                 for k, v in self.idx_to_token.items()},
                'base_vocab_size': self.base_vocab_size,
            }, f, indent=2)

    def load(self, path: str):
        """Restore a vocabulary previously written by :meth:`save`."""
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.vocab_size = data['vocab_size']
        self.token_to_idx = data['token_to_idx']
        self.idx_to_token = {int(k): v
                             for k, v in data['idx_to_token'].items()}
        # Older files may lack this field; fall back to the freshly built
        # base size (the previous hard-coded 100 did not match the actual
        # 8 specials + 95 ASCII + 2 whitespace = 105).
        self.base_vocab_size = data.get('base_vocab_size',
                                        self.base_vocab_size)

    @property
    def vocabulary_size(self) -> int:
        """Number of tokens currently in the vocabulary."""
        return len(self.token_to_idx)