import json
import re
from typing import List, Dict, Optional


class VedaTokenizer:
    """Custom tokenizer for the Veda Programming LLM."""

    def __init__(self, vocab_size: int = 10000):
        self.vocab_size = vocab_size
        self.word_to_idx: Dict[str, int] = {}
        self.idx_to_word: Dict[int, str] = {}

        # Special tokens
        self.pad_token = "<PAD>"
        self.unk_token = "<UNK>"
        self.start_token = "<START>"
        self.end_token = "<END>"
        self.newline_token = "<NEWLINE>"
        self.indent_token = "<INDENT>"

        self._init_special_tokens()

    def _init_special_tokens(self):
        """Reserve the lowest indices for the special tokens."""
        special_tokens = [
            self.pad_token,
            self.unk_token,
            self.start_token,
            self.end_token,
            self.newline_token,
            self.indent_token,
        ]
        for idx, token in enumerate(special_tokens):
            self.word_to_idx[token] = idx
            self.idx_to_word[idx] = token

    def _tokenize_code(self, text: str) -> List[str]:
        """Tokenize code with special handling for programming constructs."""
        # Whitespace structure is matched as part of the pattern rather than
        # substituted into the text first; otherwise comments would swallow
        # the injected <NEWLINE> markers and the sentinel tokens themselves
        # would be split apart by the regex.
        pattern = r'''
            \n|                 # Newlines (mapped to <NEWLINE>)
            \t|                 # Tabs (mapped to <INDENT>)
            [ ]{4}|             # Four-space indents (mapped to <INDENT>)
            \d+\.\d+|           # Floats
            \d+|                # Integers
            [a-zA-Z_]\w*|       # Identifiers
            "[^"]*"|            # Double-quoted strings
            '[^']*'|            # Single-quoted strings
            \#[^\n]*|           # Comments (stop at end of line)
            ==|!=|<=|>=|        # Comparison operators
            \+=|-=|\*=|/=|      # Augmented assignment operators
            ->|=>|              # Arrow operators
            \S                  # Any other single character
        '''
        whitespace_map = {
            '\n': self.newline_token,
            '\t': self.indent_token,
            '    ': self.indent_token,
        }
        tokens = re.findall(pattern, text, re.VERBOSE)
        return [whitespace_map.get(token, token) for token in tokens]
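
    # Illustrative example (an addition, not from the original file):
    # _tokenize_code("def f(x):\n    return x + 1") yields
    # ['def', 'f', '(', 'x', ')', ':', '<NEWLINE>', '<INDENT>',
    #  'return', 'x', '+', '1'].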

    def fit(self, texts: List[str]):
        """Build the vocabulary from a corpus of texts."""
        word_freq: Dict[str, int] = {}
        for text in texts:
            for token in self._tokenize_code(text):
                word_freq[token] = word_freq.get(token, 0) + 1

        # Sort by descending frequency and keep the top vocab_size entries,
        # leaving room for the special tokens already registered.
        sorted_words = sorted(word_freq.items(), key=lambda x: -x[1])
        start_idx = len(self.word_to_idx)
        for idx, (word, _) in enumerate(sorted_words[:self.vocab_size - start_idx]):
            actual_idx = idx + start_idx
            self.word_to_idx[word] = actual_idx
            self.idx_to_word[actual_idx] = word
        print(f"Vocabulary built with {len(self.word_to_idx)} tokens")

    def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
        """Encode text to token indices, padding or truncating to max_length."""
        tokens = self._tokenize_code(text)
        unk_idx = self.word_to_idx[self.unk_token]
        encoded = [self.word_to_idx.get(token, unk_idx) for token in tokens]
        if max_length:
            if len(encoded) < max_length:
                encoded += [self.word_to_idx[self.pad_token]] * (max_length - len(encoded))
            else:
                encoded = encoded[:max_length]
        return encoded
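
    # Illustrative example (an addition, not from the original file): after
    # fit(), encoding "x = 1" with max_length=6 yields three token indices
    # followed by three <PAD> indices; longer inputs are truncated instead.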

    def decode(self, indices: List[int]) -> str:
        """Decode token indices back to text."""
        tokens = []
        for idx in indices:
            if idx in self.idx_to_word:
                token = self.idx_to_word[idx]
                if token == self.pad_token:
                    continue
                elif token == self.newline_token:
                    tokens.append('\n')
                elif token == self.indent_token:
                    tokens.append('    ')
                else:
                    tokens.append(token)

        # Join tokens, omitting the separating space around whitespace tokens,
        # before closing punctuation, and after opening brackets.
        result = []
        for i, token in enumerate(tokens):
            prev = tokens[i - 1] if i > 0 else ''
            if (not result
                    or token in ('\n', '    ')
                    or prev in ('\n', '    ')
                    or token in '.,;:)]}'
                    or prev in ('(', '[', '{')):
                result.append(token)
            else:
                result.append(' ' + token)
        return ''.join(result).strip()

    def save(self, path: str):
        """Save the tokenizer vocabulary to a JSON file."""
        data = {
            'vocab_size': self.vocab_size,
            'word_to_idx': self.word_to_idx,
            # JSON object keys must be strings, so stringify the indices.
            'idx_to_word': {str(k): v for k, v in self.idx_to_word.items()},
        }
        with open(path, 'w') as f:
            json.dump(data, f)

    def load(self, path: str):
        """Load a tokenizer vocabulary from a JSON file."""
        with open(path, 'r') as f:
            data = json.load(f)
        self.vocab_size = data['vocab_size']
        self.word_to_idx = data['word_to_idx']
        # Restore the integer keys that JSON serialization stringified.
        self.idx_to_word = {int(k): v for k, v in data['idx_to_word'].items()}

    @property
    def vocabulary_size(self) -> int:
        return len(self.word_to_idx)
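

# A minimal usage sketch (an addition, not part of the original file) showing
# the intended fit -> encode -> decode round trip on a toy corpus; the corpus
# strings and file name below are illustrative assumptions.
if __name__ == "__main__":
    corpus = [
        "def add(a, b):\n    return a + b\n",
        "x = 3.14  # store a float\n",
    ]
    tokenizer = VedaTokenizer(vocab_size=1000)
    tokenizer.fit(corpus)

    encoded = tokenizer.encode(corpus[0], max_length=32)
    print(encoded)
    print(tokenizer.decode(encoded))

    # Round-trip the vocabulary through disk.
    tokenizer.save("veda_tokenizer.json")
    tokenizer.load("veda_tokenizer.json")
    print(f"{tokenizer.vocabulary_size} tokens in vocabulary")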