import json
import re
from typing import Dict, List, Optional


class VedaTokenizer:
    """Custom tokenizer for the Veda Programming LLM."""

    def __init__(self, vocab_size: int = 10000):
        self.vocab_size = vocab_size
        self.word_to_idx: Dict[str, int] = {}
        self.idx_to_word: Dict[int, str] = {}

        # Special tokens (exact spellings are placeholders; any strings that
        # cannot collide with real code tokens will work)
        self.pad_token = "<PAD>"
        self.unk_token = "<UNK>"
        self.start_token = "<START>"
        self.end_token = "<END>"
        self.newline_token = "<NEWLINE>"
        self.indent_token = "<INDENT>"

        self._init_special_tokens()

    def _init_special_tokens(self):
        """Reserve the lowest vocabulary indices for the special tokens."""
        special_tokens = [
            self.pad_token, self.unk_token, self.start_token,
            self.end_token, self.newline_token, self.indent_token,
        ]
        for idx, token in enumerate(special_tokens):
            self.word_to_idx[token] = idx
            self.idx_to_word[idx] = token

    def _tokenize_code(self, text: str) -> List[str]:
        """Tokenize code with special handling for programming constructs."""
        # Replace newlines and indentation with their special tokens
        text = text.replace('\n', f' {self.newline_token} ')
        text = text.replace('\t', f' {self.indent_token} ')
        text = text.replace('    ', f' {self.indent_token} ')  # four-space indents

        # Tokenize with a regex. Special tokens are matched first so they are not
        # split apart, and comments stop at the injected newline token instead of
        # running to the end of the text.
        newline = re.escape(self.newline_token)
        pattern = rf'''
            <[A-Z]+>|                    # special tokens injected above
            \d+\.\d+|                    # floats
            \d+|                         # integers
            [a-zA-Z_]\w*|                # identifiers and keywords
            "[^"]*"|                     # double-quoted strings
            '[^']*'|                     # single-quoted strings
            \#(?:(?!\s*{newline}).)*|    # comments, up to the next newline token
            ==|!=|<=|>=|                 # comparison operators
            \+=|-=|\*=|/=|               # augmented assignment operators
            ->|=>|                       # arrow operators
            \S                           # any other single character
        '''
        return re.findall(pattern, text, re.VERBOSE)

    def fit(self, texts: List[str]):
        """Build the vocabulary from a corpus of texts."""
        word_freq: Dict[str, int] = {}
        for text in texts:
            for token in self._tokenize_code(text):
                word_freq[token] = word_freq.get(token, 0) + 1

        # Sort by frequency and keep the most common tokens, skipping tokens
        # already registered (e.g. the special tokens)
        sorted_words = sorted(word_freq.items(), key=lambda x: -x[1])
        start_idx = len(self.word_to_idx)
        new_words = [word for word, _ in sorted_words if word not in self.word_to_idx]
        for idx, word in enumerate(new_words[:self.vocab_size - start_idx]):
            actual_idx = idx + start_idx
            self.word_to_idx[word] = actual_idx
            self.idx_to_word[actual_idx] = word

        print(f"Vocabulary built with {len(self.word_to_idx)} tokens")

    def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
        """Encode text to token indices, padding or truncating to max_length."""
        tokens = self._tokenize_code(text)
        encoded = [self.word_to_idx.get(token, self.word_to_idx[self.unk_token])
                   for token in tokens]
        if max_length is not None:
            if len(encoded) < max_length:
                encoded += [self.word_to_idx[self.pad_token]] * (max_length - len(encoded))
            else:
                encoded = encoded[:max_length]
        return encoded

    def decode(self, indices: List[int]) -> str:
        """Decode token indices back to text."""
        tokens = []
        for idx in indices:
            if idx in self.idx_to_word:
                token = self.idx_to_word[idx]
                if token == self.pad_token:
                    continue
                elif token == self.newline_token:
                    tokens.append('\n')
                elif token == self.indent_token:
                    tokens.append('    ')
                else:
                    tokens.append(token)

        # Join tokens, suppressing the space before closing punctuation and
        # after opening brackets
        result = []
        for i, token in enumerate(tokens):
            if token in '.,;:)]}' or (i > 0 and tokens[i - 1] in '([{'):
                result.append(token)
            else:
                result.append(' ' + token if result else token)
        return ''.join(result).strip()

    def save(self, path: str):
        """Save the tokenizer vocabulary to a JSON file."""
        data = {
            'vocab_size': self.vocab_size,
            'word_to_idx': self.word_to_idx,
            'idx_to_word': {str(k): v for k, v in self.idx_to_word.items()},
        }
        with open(path, 'w') as f:
            json.dump(data, f)

    def load(self, path: str):
        """Load the tokenizer vocabulary from a JSON file."""
        with open(path, 'r') as f:
            data = json.load(f)
        self.vocab_size = data['vocab_size']
        self.word_to_idx = data['word_to_idx']
        self.idx_to_word = {int(k): v for k, v in data['idx_to_word'].items()}

    @property
    def vocabulary_size(self) -> int:
        return len(self.word_to_idx)
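

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): fit on a tiny corpus, round-trip
    # a snippet through encode/decode, then save and reload the vocabulary.
    # The sample code strings and the file path are hypothetical placeholders,
    # not part of any real training setup.
    corpus = [
        "def add(a, b):\n    return a + b\n",
        "def sub(a, b):\n    return a - b  # difference\n",
    ]
    tokenizer = VedaTokenizer(vocab_size=100)
    tokenizer.fit(corpus)

    ids = tokenizer.encode("def add(a, b):\n    return a + b\n", max_length=32)
    print("Encoded:", ids)
    print("Decoded:", tokenizer.decode(ids))
    print("Vocabulary size:", tokenizer.vocabulary_size)

    # Persist and reload the vocabulary into a fresh instance
    tokenizer.save("veda_tokenizer.json")
    restored = VedaTokenizer()
    restored.load("veda_tokenizer.json")
    assert restored.decode(ids) == tokenizer.decode(ids)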