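"""Custom word-level tokenizer for the Veda Programming LLM.

A regex-based tokenizer that maps structural whitespace (newlines,
indentation) to special tokens and supports JSON save/load of the
vocabulary.
"""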
import json
import re
from typing import List, Dict, Optional

class VedaTokenizer:
    """Custom tokenizer for Veda Programming LLM"""
    
    def __init__(self, vocab_size: int = 10000):
        self.vocab_size = vocab_size
        self.word_to_idx: Dict[str, int] = {}
        self.idx_to_word: Dict[int, str] = {}
        
        # Special tokens
        self.pad_token = "<PAD>"
        self.unk_token = "<UNK>"
        self.start_token = "<START>"
        self.end_token = "<END>"
        self.newline_token = "<NEWLINE>"
        self.indent_token = "<INDENT>"
        
        self._init_special_tokens()
    
    def _init_special_tokens(self):
        """Initialize special tokens"""
        special_tokens = [
            self.pad_token, 
            self.unk_token, 
            self.start_token, 
            self.end_token,
            self.newline_token,
            self.indent_token
        ]
        for idx, token in enumerate(special_tokens):
            self.word_to_idx[token] = idx
            self.idx_to_word[idx] = token
    
    def _tokenize_code(self, text: str) -> List[str]:
        """Tokenize code with special handling for programming constructs"""
        # Insert structural tokens. The newline character itself is kept
        # (placed before the token) so the comment pattern below still
        # terminates at the end of the line instead of swallowing the
        # rest of the text.
        text = text.replace('\n', f'\n {self.newline_token} ')
        text = text.replace('\t', f' {self.indent_token} ')
        text = text.replace('    ', f' {self.indent_token} ')
        
        # Tokenize with a verbose regex. Special tokens are matched first
        # so that <NEWLINE> is not split into '<', 'NEWLINE', '>'.
        pattern = r'''
            <[A-Z]+>|           # Special tokens (<NEWLINE>, <INDENT>, ...)
            \d+\.\d+|           # Floats
            \d+|                # Integers
            [a-zA-Z_]\w*|       # Identifiers
            \"[^\"]*\"|         # Double quoted strings
            \'[^\']*\'|         # Single quoted strings
            \#[^\n]*|           # Comments (terminated by the kept newline)
            ==|!=|<=|>=|        # Comparison operators
            \+=|-=|\*=|/=|      # Augmented assignment operators
            ->|=>|              # Arrow operators
            \S                  # Any other single character
        '''
        tokens = re.findall(pattern, text, re.VERBOSE)
        return tokens
    
    def fit(self, texts: List[str]):
        """Build vocabulary from texts"""
        word_freq: Dict[str, int] = {}
        
        for text in texts:
            for token in self._tokenize_code(text):
                word_freq[token] = word_freq.get(token, 0) + 1
        
        # Most frequent tokens first; skip tokens already in the vocabulary
        # (e.g. the special tokens) so they are not assigned a second index.
        sorted_words = sorted(word_freq.items(), key=lambda x: -x[1])
        for word, _ in sorted_words:
            if len(self.word_to_idx) >= self.vocab_size:
                break
            if word in self.word_to_idx:
                continue
            idx = len(self.word_to_idx)
            self.word_to_idx[word] = idx
            self.idx_to_word[idx] = word
        
        print(f"Vocabulary built with {len(self.word_to_idx)} tokens")
    
    def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
        """Encode text to token indices"""
        tokens = self._tokenize_code(text)
        encoded = [self.word_to_idx.get(token, self.word_to_idx[self.unk_token]) 
                   for token in tokens]
        
        if max_length is not None:
            if len(encoded) < max_length:
                encoded += [self.word_to_idx[self.pad_token]] * (max_length - len(encoded))
            else:
                encoded = encoded[:max_length]
        
        return encoded
    
    def decode(self, indices: List[int]) -> str:
        """Decode token indices back to text"""
        tokens = []
        for idx in indices:
            if idx not in self.idx_to_word:
                continue
            token = self.idx_to_word[idx]
            if token == self.pad_token:
                continue
            elif token == self.newline_token:
                tokens.append('\n')
            elif token == self.indent_token:
                tokens.append('    ')
            else:
                tokens.append(token)
        
        # Join tokens, omitting the space before closing punctuation and
        # newlines, and after opening brackets and newlines, so the output
        # resembles normal code layout.
        result = []
        for i, token in enumerate(tokens):
            if (not result
                    or token == '\n'
                    or token in '.,;:)]}'
                    or tokens[i - 1] in '([{'
                    or tokens[i - 1] == '\n'):
                result.append(token)
            else:
                result.append(' ' + token)
        
        return ''.join(result).strip()
    
    def save(self, path: str):
        """Save tokenizer to file"""
        data = {
            'vocab_size': self.vocab_size,
            'word_to_idx': self.word_to_idx,
            'idx_to_word': {str(k): v for k, v in self.idx_to_word.items()}
        }
        with open(path, 'w') as f:
            json.dump(data, f)
    
    def load(self, path: str):
        """Load tokenizer from file"""
        with open(path, 'r') as f:
            data = json.load(f)
        self.vocab_size = data['vocab_size']
        self.word_to_idx = data['word_to_idx']
        self.idx_to_word = {int(k): v for k, v in data['idx_to_word'].items()}
    
    @property
    def vocabulary_size(self) -> int:
        return len(self.word_to_idx)
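

# Minimal usage sketch: fit a vocabulary on a tiny corpus, round-trip a
# snippet through encode/decode, and save the tokenizer to disk. The
# sample corpus and the output path are illustrative placeholders, not
# part of the class itself.
if __name__ == "__main__":
    corpus = [
        "def add(a, b):\n    return a + b\n",
        "x = 3.14  # a float constant\n",
    ]

    tokenizer = VedaTokenizer(vocab_size=1000)
    tokenizer.fit(corpus)

    encoded = tokenizer.encode("def add(a, b):\n    return a + b", max_length=32)
    print(encoded)
    print(tokenizer.decode(encoded))

    tokenizer.save("veda_tokenizer.json")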