vedaco committed
Commit 44c948e (verified)
Parent: 5b1197f

Create tokenizer.py

Files changed (1):
  tokenizer.py +142 -0
tokenizer.py ADDED
import json
import re
from typing import Dict, List, Optional


class VedaTokenizer:
    """Custom tokenizer for Veda Programming LLM"""

    def __init__(self, vocab_size: int = 10000):
        self.vocab_size = vocab_size
        self.word_to_idx: Dict[str, int] = {}
        self.idx_to_word: Dict[int, str] = {}

        # Special tokens
        self.pad_token = "<PAD>"
        self.unk_token = "<UNK>"
        self.start_token = "<START>"
        self.end_token = "<END>"
        self.newline_token = "<NEWLINE>"
        self.indent_token = "<INDENT>"

        self._init_special_tokens()

    def _init_special_tokens(self):
        """Reserve the lowest indices for the special tokens"""
        special_tokens = [
            self.pad_token,
            self.unk_token,
            self.start_token,
            self.end_token,
            self.newline_token,
            self.indent_token,
        ]
        for idx, token in enumerate(special_tokens):
            self.word_to_idx[token] = idx
            self.idx_to_word[idx] = token

    def _tokenize_code(self, text: str) -> List[str]:
        """Tokenize code with special handling for programming constructs.

        Runs line by line so a comment can never swallow text past its own
        line, and so indentation maps to <INDENT> tokens directly instead of
        splicing marker strings into the text (where the regex would split
        them apart).
        """
        pattern = r'''
            \d+\.\d+          # Floats
            | \d+             # Integers
            | [a-zA-Z_]\w*    # Identifiers
            | "[^"]*"         # Double-quoted strings
            | '[^']*'         # Single-quoted strings
            | \#[^\n]*        # Comments
            | ==|!=|<=|>=     # Comparison operators
            | \+=|-=|\*=|/=   # Augmented assignment operators
            | ->|=>           # Arrow operators
            | \S              # Any other single character
        '''
        tokens: List[str] = []
        for line in text.split('\n'):
            body = line.lstrip(' \t')
            indent = line[:len(line) - len(body)]
            # One <INDENT> per tab or per run of four leading spaces
            tokens.extend([self.indent_token]
                          * (indent.count('\t') + indent.count('    ')))
            tokens.extend(re.findall(pattern, body, re.VERBOSE))
            tokens.append(self.newline_token)
        if tokens:
            tokens.pop()  # Drop the <NEWLINE> emitted after the final line
        return tokens

    def fit(self, texts: List[str]):
        """Build vocabulary from texts"""
        word_freq: Dict[str, int] = {}
        for text in texts:
            for token in self._tokenize_code(text):
                word_freq[token] = word_freq.get(token, 0) + 1

        # Sort by descending frequency and fill the vocabulary up to vocab_size
        sorted_words = sorted(word_freq.items(), key=lambda x: -x[1])
        for word, _ in sorted_words:
            if word in self.word_to_idx:
                continue  # Special tokens already have reserved indices
            if len(self.word_to_idx) >= self.vocab_size:
                break
            idx = len(self.word_to_idx)
            self.word_to_idx[word] = idx
            self.idx_to_word[idx] = word

        print(f"Vocabulary built with {len(self.word_to_idx)} tokens")

    def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
        """Encode text to token indices, padding or truncating to max_length"""
        unk_idx = self.word_to_idx[self.unk_token]
        encoded = [self.word_to_idx.get(token, unk_idx)
                   for token in self._tokenize_code(text)]

        if max_length is not None:
            if len(encoded) < max_length:
                encoded += [self.word_to_idx[self.pad_token]] * (max_length - len(encoded))
            else:
                encoded = encoded[:max_length]

        return encoded

    def decode(self, indices: List[int]) -> str:
        """Decode token indices back to text"""
        tokens: List[str] = []
        for idx in indices:
            token = self.idx_to_word.get(idx)
            if token is None or token == self.pad_token:
                continue
            if token == self.newline_token:
                tokens.append('\n')
            elif token == self.indent_token:
                tokens.append('    ')
            else:
                tokens.append(token)

        # Join tokens, omitting the space before closing punctuation and
        # after opening brackets, newlines, and indentation
        result: List[str] = []
        for i, token in enumerate(tokens):
            prev = tokens[i - 1] if i else ''
            no_space = (
                not result
                or token == '\n'
                or token in '.,;:)]}'
                or prev in '([{'
                or prev in ('\n', '    ')
            )
            result.append(token if no_space else ' ' + token)
        return ''.join(result).strip()

    def save(self, path: str):
        """Save tokenizer vocabulary to a JSON file"""
        data = {
            'vocab_size': self.vocab_size,
            'word_to_idx': self.word_to_idx,
            # JSON keys must be strings; converted back to int in load()
            'idx_to_word': {str(k): v for k, v in self.idx_to_word.items()},
        }
        with open(path, 'w') as f:
            json.dump(data, f)

    def load(self, path: str):
        """Load tokenizer vocabulary from a JSON file"""
        with open(path, 'r') as f:
            data = json.load(f)
        self.vocab_size = data['vocab_size']
        self.word_to_idx = data['word_to_idx']
        self.idx_to_word = {int(k): v for k, v in data['idx_to_word'].items()}

    @property
    def vocabulary_size(self) -> int:
        return len(self.word_to_idx)
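
A quick round-trip sketch of how the new class is meant to be used; the corpus, the max_length value, and the file path below are made up for illustration:

tokenizer = VedaTokenizer(vocab_size=10000)

# Hypothetical two-snippet corpus, just to exercise fit()
tokenizer.fit([
    "def add(a, b):\n    return a + b\n",
    "x = 3.14  # a float\n",
])

ids = tokenizer.encode("def add(a, b):\n    return a + b", max_length=32)
print(ids)                    # Indices, padded with <PAD> up to length 32
print(tokenizer.decode(ids))  # Approximate reconstruction of the source

tokenizer.save("veda_tokenizer.json")  # Path is arbitrary

Note that decode() is lossy by design: spacing is re-synthesized from the bracket and punctuation heuristic rather than preserved from the input, so only newlines and four-space indentation survive a round trip exactly.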