vedaco committed · verified
Commit 89d56eb · 1 Parent(s): ada3e2e

Update tokenizer.py

Files changed (1): tokenizer.py (+135 -68)
tokenizer.py CHANGED
@@ -1,114 +1,181 @@
 import json
 import re
 from typing import List, Dict, Optional
 
 class VedaTokenizer:
-    """Custom tokenizer for Veda Programming LLM"""
 
-    def __init__(self, vocab_size: int = 5000):
         self.vocab_size = vocab_size
-        self.word_to_idx: Dict[str, int] = {}
-        self.idx_to_word: Dict[int, str] = {}
-        self._init_special_tokens()
-
-    def _init_special_tokens(self):
-        """Initialize special tokens"""
-        special_tokens = ["<PAD>", "<UNK>", "<START>", "<END>", "<NL>", "<INDENT>"]
-        for idx, token in enumerate(special_tokens):
-            self.word_to_idx[token] = idx
-            self.idx_to_word[idx] = token
 
-    def _tokenize_code(self, text: str) -> List[str]:
-        """Tokenize code"""
-        text = text.replace('\n', ' <NL> ')
-        text = text.replace('\t', ' <INDENT> ')
-        text = text.replace('    ', ' <INDENT> ')
-
-        pattern = r'[a-zA-Z_]\w*|[0-9]+\.?[0-9]*|\"[^\"]*\"|\'[^\']*\'|==|!=|<=|>=|[^\s]'
-        tokens = re.findall(pattern, text)
-        return [t for t in tokens if t.strip()]
 
     def fit(self, texts: List[str]):
-        """Build vocabulary from texts"""
         word_freq = {}
         for text in texts:
-            for token in self._tokenize_code(text):
-                word_freq[token] = word_freq.get(token, 0) + 1
 
         sorted_words = sorted(word_freq.items(), key=lambda x: -x[1])
-        start_idx = len(self.word_to_idx)
 
-        for idx, (word, _) in enumerate(sorted_words[:self.vocab_size - start_idx]):
-            actual_idx = idx + start_idx
-            self.word_to_idx[word] = actual_idx
-            self.idx_to_word[actual_idx] = word
 
-        print(f"Vocabulary: {len(self.word_to_idx)} tokens")
 
     def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
-        """Encode text to indices"""
-        tokens = self._tokenize_code(text)
-        encoded = [self.word_to_idx.get(t, 1) for t in tokens]  # 1 = UNK
 
         if max_length:
             if len(encoded) < max_length:
-                encoded += [0] * (max_length - len(encoded))  # 0 = PAD
             else:
                 encoded = encoded[:max_length]
         return encoded
 
-    def decode(self, indices: List[int]) -> str:
-        """Decode indices to text"""
         tokens = []
-        for idx in indices:
-            if idx in self.idx_to_word:
-                token = self.idx_to_word[idx]
-                if token == "<PAD>":
-                    continue
-                elif token == "<NL>":
-                    tokens.append('\n')
-                elif token == "<INDENT>":
-                    tokens.append('    ')
-                elif token in ["<UNK>", "<START>", "<END>"]:
-                    continue
-                else:
-                    tokens.append(token)
 
         result = []
-        for i, token in enumerate(tokens):
-            if token in '\n':
                 result.append(token)
-            elif token == '    ':
                 result.append(token)
-            elif token in '.,;:)]}':
                 result.append(token)
-            elif i > 0 and tokens[i-1] in '([{':
                 result.append(token)
-            elif token in '([{':
-                result.append(' ' + token if result and result[-1] not in '\n ' else token)
             else:
-                result.append(' ' + token if result and result[-1] not in '\n ' else token)
 
-        return ''.join(result).strip()
 
     def save(self, path: str):
-        """Save tokenizer"""
-        data = {
-            'vocab_size': self.vocab_size,
-            'word_to_idx': self.word_to_idx,
-            'idx_to_word': {str(k): v for k, v in self.idx_to_word.items()}
-        }
         with open(path, 'w') as f:
-            json.dump(data, f)
 
     def load(self, path: str):
-        """Load tokenizer"""
         with open(path, 'r') as f:
             data = json.load(f)
         self.vocab_size = data['vocab_size']
-        self.word_to_idx = data['word_to_idx']
-        self.idx_to_word = {int(k): v for k, v in data['idx_to_word'].items()}
 
     @property
     def vocabulary_size(self) -> int:
-        return len(self.word_to_idx)
+"""Tokenizer - MODIFIED for conversations"""
+
 import json
 import re
 from typing import List, Dict, Optional
 
 class VedaTokenizer:
+    """Tokenizer with conversation support"""
 
+    def __init__(self, vocab_size: int = 8000):
         self.vocab_size = vocab_size
+        self.token_to_idx: Dict[str, int] = {}
+        self.idx_to_token: Dict[int, str] = {}
+        self._init_vocab()
 
+    def _init_vocab(self):
+        """Initialize vocabulary with conversation tokens"""
+        # Special tokens - ADDED conversation tokens
+        special = [
+            "<PAD>", "<UNK>", "<START>", "<END>",
+            "<CODE>", "<ENDCODE>",   # For code blocks
+            "<USER>", "<ASSISTANT>"  # For conversation
+        ]
+
+        for idx, token in enumerate(special):
+            self.token_to_idx[token] = idx
+            self.idx_to_token[idx] = token
+
+        # ASCII characters
+        idx = len(special)
+        for i in range(32, 127):
+            char = chr(i)
+            self.token_to_idx[char] = idx
+            self.idx_to_token[idx] = char
+            idx += 1
+
+        # Whitespace
+        for char in ["\n", "\t"]:
+            self.token_to_idx[char] = idx
+            self.idx_to_token[idx] = char
+            idx += 1
+
+        self.base_vocab_size = idx
 
     def fit(self, texts: List[str]):
+        """Build vocabulary"""
         word_freq = {}
+
         for text in texts:
+            words = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]*|[0-9]+|[^\s]', text)
+            for word in words:
+                word_freq[word] = word_freq.get(word, 0) + 1
 
         sorted_words = sorted(word_freq.items(), key=lambda x: -x[1])
 
+        idx = self.base_vocab_size
+        for word, _ in sorted_words:
+            if idx >= self.vocab_size:
+                break
+            if word not in self.token_to_idx and len(word) <= 25:
+                self.token_to_idx[word] = idx
+                self.idx_to_token[idx] = word
+                idx += 1
 
+        print(f"Vocabulary: {len(self.token_to_idx)} tokens")
 
     def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
+        """Encode text"""
+        tokens = self._tokenize(text)
+        encoded = []
+
+        for token in tokens:
+            if token in self.token_to_idx:
+                encoded.append(self.token_to_idx[token])
+            else:
+                for char in token:
+                    encoded.append(self.token_to_idx.get(char, 1))
 
         if max_length:
             if len(encoded) < max_length:
+                encoded += [0] * (max_length - len(encoded))
             else:
                 encoded = encoded[:max_length]
+
         return encoded
 
+    def _tokenize(self, text: str) -> List[str]:
+        """Tokenize text"""
         tokens = []
+        parts = re.split(r'(\s+)', text)
 
+        for part in parts:
+            if not part:
+                continue
+            if part.isspace():
+                for char in part:
+                    tokens.append(char)
+            elif part in self.token_to_idx:
+                tokens.append(part)
+            else:
+                i = 0
+                while i < len(part):
+                    matched = False
+                    for length in range(min(len(part) - i, 20), 0, -1):
+                        substr = part[i:i+length]
+                        if substr in self.token_to_idx:
+                            tokens.append(substr)
+                            i += length
+                            matched = True
+                            break
+                    if not matched:
+                        tokens.append(part[i])
+                        i += 1
+
+        return tokens
+
+    def decode(self, indices: List[int]) -> str:
+        """Decode indices to text - MODIFIED for conversation tokens"""
         result = []
+        prev = ""
+
+        for idx in indices:
+            if idx == 0:  # PAD
+                continue
+            if idx not in self.idx_to_token:
+                continue
+
+            token = self.idx_to_token[idx]
+
+            # Skip special tokens in output
+            if token in ["<PAD>", "<UNK>", "<START>", "<END>", "<USER>", "<ASSISTANT>"]:
+                continue
+
+            # Handle code blocks
+            if token == "<CODE>":
+                result.append("\n```python\n")
+                prev = "\n"
+                continue
+            if token == "<ENDCODE>":
+                result.append("\n```\n")
+                prev = "\n"
+                continue
+
+            # Smart joining
+            if not result:
                 result.append(token)
+            elif token in "\n\t":
                 result.append(token)
+            elif token in ".,;:!?()[]{}":
                 result.append(token)
+            elif prev in "(\n\t[{":
                 result.append(token)
+            elif prev.isalnum() and len(token) > 0 and token[0].isalnum():
+                result.append(" " + token)
             else:
+                result.append(token)
+
+            prev = token
 
+        return "".join(result)
 
     def save(self, path: str):
         with open(path, 'w') as f:
+            json.dump({
+                'vocab_size': self.vocab_size,
+                'token_to_idx': self.token_to_idx,
+                'idx_to_token': {str(k): v for k, v in self.idx_to_token.items()},
+                'base_vocab_size': self.base_vocab_size
+            }, f, indent=2)
 
     def load(self, path: str):
         with open(path, 'r') as f:
             data = json.load(f)
         self.vocab_size = data['vocab_size']
+        self.token_to_idx = data['token_to_idx']
+        self.idx_to_token = {int(k): v for k, v in data['idx_to_token'].items()}
+        self.base_vocab_size = data.get('base_vocab_size', 100)
 
     @property
     def vocabulary_size(self) -> int:
+        return len(self.token_to_idx)
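
A quick round-trip sketch of how the updated tokenizer appears intended to be used. This is not part of the commit: the sample strings, the filename, and the assumption that training texts embed the <USER>/<ASSISTANT>/<CODE>/<ENDCODE> markers verbatim are all illustrative.

```python
# Illustrative only: assumes tokenizer.py is importable as-is and that
# conversation markers appear literally in the training texts.
from tokenizer import VedaTokenizer

texts = [
    "<USER> write a greeting function <ASSISTANT> Sure. "
    "<CODE> def greet():\n    print('hi') <ENDCODE>",
    "<USER> what does len() do? <ASSISTANT> It returns the number of items in a container.",
]

tok = VedaTokenizer(vocab_size=8000)
tok.fit(texts)                       # adds frequent words on top of the base vocab built in __init__

ids = tok.encode(texts[0], max_length=128)   # unknown words fall back to per-character ids; 0-padded
print(tok.decode(ids))               # <USER>/<ASSISTANT> are dropped, <CODE>/<ENDCODE> become fenced code blocks

tok.save("veda_tokenizer.json")      # hypothetical path; stores token_to_idx / idx_to_token / base_vocab_size
```

Because every printable ASCII character is in the base vocabulary, encode can always fall back to character level, so only non-ASCII input ends up mapped to <UNK>.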