import json
import re
from typing import List, Dict, Optional

import numpy as np


class VedaTokenizer:
    """Custom tokenizer for Veda Programming LLM"""

    def __init__(self, vocab_size: int = 10000):
        self.vocab_size = vocab_size
        self.word_to_idx: Dict[str, int] = {}
        self.idx_to_word: Dict[int, str] = {}
        # Special tokens
        self.pad_token = "<PAD>"
        self.unk_token = "<UNK>"
        self.start_token = "<START>"
        self.end_token = "<END>"
        self.newline_token = "<NEWLINE>"
        self.indent_token = "<INDENT>"
        self._init_special_tokens()

    def _init_special_tokens(self):
        """Initialize special tokens"""
        special_tokens = [
            self.pad_token,
            self.unk_token,
            self.start_token,
            self.end_token,
            self.newline_token,
            self.indent_token,
        ]
        for idx, token in enumerate(special_tokens):
            self.word_to_idx[token] = idx
            self.idx_to_word[idx] = token
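
    # Illustrative note: after __init__, the first six indices are reserved,
    # e.g. word_to_idx["<PAD>"] == 0 and word_to_idx["<INDENT>"] == 5.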

    def _tokenize_code(self, text: str) -> List[str]:
        """Tokenize code with special handling for programming constructs.

        Newlines and indentation are matched by the pattern itself and mapped
        to special tokens afterwards; substituting marker strings into the
        text first would let the regex split the markers apart, and would let
        the comment pattern run past the end of its line.
        """
        pattern = r'''
            \#[^\n]*        | # Comments (stop at end of line)
            \d+\.\d+        | # Floats
            \d+             | # Integers
            [a-zA-Z_]\w*    | # Identifiers
            "[^"\n]*"       | # Double-quoted strings
            '[^'\n]*'       | # Single-quoted strings
            ==|!=|<=|>=     | # Comparison operators
            \+=|-=|\*=|/=   | # Augmented assignment operators
            ->|=>           | # Arrow operators
            \n              | # Newlines
            \t|[ ]{4}       | # Indentation: a tab or four spaces
            \S                # Any other single character
        '''
        tokens = []
        for tok in re.findall(pattern, text, re.VERBOSE):
            if tok == '\n':
                tokens.append(self.newline_token)
            elif tok in ('\t', '    '):
                tokens.append(self.indent_token)
            else:
                tokens.append(tok)
        return tokens
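
    # Illustrative example: _tokenize_code("x = 1\n# done\n") yields
    # ['x', '=', '1', '<NEWLINE>', '# done', '<NEWLINE>'].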

    def fit(self, texts: List[str]):
        """Build vocabulary from texts"""
        word_freq: Dict[str, int] = {}
        for text in texts:
            for token in self._tokenize_code(text):
                word_freq[token] = word_freq.get(token, 0) + 1
        # Sort by descending frequency and fill the remaining vocabulary
        # slots, skipping tokens that are already registered (otherwise the
        # special tokens would be assigned a second, conflicting index)
        sorted_words = sorted(word_freq.items(), key=lambda x: -x[1])
        for word, _ in sorted_words:
            if len(self.word_to_idx) >= self.vocab_size:
                break
            if word in self.word_to_idx:
                continue
            idx = len(self.word_to_idx)
            self.word_to_idx[word] = idx
            self.idx_to_word[idx] = word
        print(f"Vocabulary built with {len(self.word_to_idx)} tokens")

    def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
        """Encode text to token indices"""
        tokens = self._tokenize_code(text)
        encoded = [self.word_to_idx.get(token, self.word_to_idx[self.unk_token])
                   for token in tokens]
        if max_length:
            if len(encoded) < max_length:
                encoded += [self.word_to_idx[self.pad_token]] * (max_length - len(encoded))
            else:
                encoded = encoded[:max_length]
        return encoded

    def decode(self, indices: List[int]) -> str:
        """Decode token indices back to text"""
        tokens = []
        for idx in indices:
            if idx in self.idx_to_word:
                token = self.idx_to_word[idx]
                if token == self.pad_token:
                    continue
                elif token == self.newline_token:
                    tokens.append('\n')
                elif token == self.indent_token:
                    tokens.append('    ')
                else:
                    tokens.append(token)
        # Join tokens, omitting the space before punctuation and closing
        # brackets and after opening brackets; this heuristic only
        # approximates the original spacing
        result = []
        for i, token in enumerate(tokens):
            if not result:
                result.append(token)
            elif token in '.,;:)]}' or tokens[i - 1] in '([{':
                result.append(token)
            else:
                result.append(' ' + token)
        return ''.join(result).strip()
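
    # Illustrative example: decoding the indices for the tokens
    # ['def', 'f', '(', 'x', ')', ':'] re-joins them as "def f (x):".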

    def save(self, path: str):
        """Save tokenizer to file"""
        data = {
            'vocab_size': self.vocab_size,
            'word_to_idx': self.word_to_idx,
            # JSON object keys must be strings, so stringify the indices
            'idx_to_word': {str(k): v for k, v in self.idx_to_word.items()}
        }
        with open(path, 'w') as f:
            json.dump(data, f)

    def load(self, path: str):
        """Load tokenizer from file"""
        with open(path, 'r') as f:
            data = json.load(f)
        self.vocab_size = data['vocab_size']
        self.word_to_idx = data['word_to_idx']
        self.idx_to_word = {int(k): v for k, v in data['idx_to_word'].items()}

    def vocabulary_size(self) -> int:
        return len(self.word_to_idx)
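

# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal round-trip demo on a tiny made-up corpus. The corpus, the
# file name, and the exact token IDs are assumptions for illustration.
if __name__ == "__main__":
    corpus = [
        "def add(a, b):\n    return a + b\n",
        "x = add(1, 2)  # call the helper\n",
    ]
    tokenizer = VedaTokenizer(vocab_size=100)
    tokenizer.fit(corpus)

    ids = tokenizer.encode("def add(a, b):", max_length=16)
    print(ids)                    # 8 token indices followed by 8 <PAD> indices
    print(tokenizer.decode(ids))  # approximate reconstruction of the source

    # Round-trip the vocabulary through disk
    tokenizer.save("veda_tokenizer.json")
    restored = VedaTokenizer()
    restored.load("veda_tokenizer.json")
    assert restored.vocabulary_size() == tokenizer.vocabulary_size()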