vedaco committed · verified
Commit ffd2cda · Parent: 6677818

Update tokenizer.py

Files changed (1):
  1. tokenizer.py +9 -12
tokenizer.py CHANGED
@@ -1,9 +1,10 @@
-"""Tokenizer - MODIFIED for conversations"""
+"""Tokenizer for Veda Programming Assistant"""
 
 import json
 import re
 from typing import List, Dict, Optional
 
+
 class VedaTokenizer:
     """Tokenizer with conversation support"""
 
@@ -15,18 +16,16 @@ class VedaTokenizer:
 
     def _init_vocab(self):
         """Initialize vocabulary with conversation tokens"""
-        # Special tokens - ADDED conversation tokens
         special = [
             "<PAD>", "<UNK>", "<START>", "<END>",
-            "<CODE>", "<ENDCODE>",  # For code blocks
-            "<USER>", "<ASSISTANT>"  # For conversation
+            "<CODE>", "<ENDCODE>",
+            "<USER>", "<ASSISTANT>"
         ]
 
         for idx, token in enumerate(special):
             self.token_to_idx[token] = idx
             self.idx_to_token[idx] = token
 
-        # ASCII characters
         idx = len(special)
         for i in range(32, 127):
             char = chr(i)
@@ -34,7 +33,6 @@
             self.idx_to_token[idx] = char
             idx += 1
 
-        # Whitespace
         for char in ["\n", "\t"]:
             self.token_to_idx[char] = idx
             self.idx_to_token[idx] = char
@@ -115,23 +113,21 @@
         return tokens
 
     def decode(self, indices: List[int]) -> str:
-        """Decode indices to text - MODIFIED for conversation tokens"""
+        """Decode indices to text"""
         result = []
         prev = ""
 
         for idx in indices:
-            if idx == 0:  # PAD
+            if idx == 0:
                 continue
             if idx not in self.idx_to_token:
                 continue
 
             token = self.idx_to_token[idx]
 
-            # Skip special tokens in output
             if token in ["<PAD>", "<UNK>", "<START>", "<END>", "<USER>", "<ASSISTANT>"]:
                 continue
 
-            # Handle code blocks
             if token == "<CODE>":
                 result.append("\n```python\n")
                 prev = "\n"
@@ -141,7 +137,6 @@
                 prev = "\n"
                 continue
 
-            # Smart joining
             if not result:
                 result.append(token)
             elif token in "\n\t":
@@ -150,7 +145,7 @@
                 result.append(token)
             elif prev in "(\n\t[{":
                 result.append(token)
-            elif prev.isalnum() and len(token) > 0 and token[0].isalnum():
+            elif len(prev) > 0 and prev[-1].isalnum() and len(token) > 0 and token[0].isalnum():
                 result.append(" " + token)
             else:
                 result.append(token)
@@ -160,6 +155,7 @@
         return "".join(result)
 
     def save(self, path: str):
+        """Save tokenizer"""
         with open(path, 'w') as f:
             json.dump({
                 'vocab_size': self.vocab_size,
@@ -169,6 +165,7 @@
             }, f, indent=2)
 
     def load(self, path: str):
+        """Load tokenizer"""
         with open(path, 'r') as f:
             data = json.load(f)
         self.vocab_size = data['vocab_size']
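
The one behavioral change in this commit is the smart-joining guard in decode(): prev holds the entire previous token, and str.isalnum() returns False for any string containing a non-alphanumeric character, so the old guard silently dropped the space after mixed tokens. A minimal standalone sketch of the difference, assuming nothing beyond the two conditions visible in the diff (join_old and join_new are illustrative names, not functions from this repo):

```python
# Sketch of the guard change in decode()'s smart joining.
# join_old/join_new isolate only the condition this commit rewrites.

def join_old(prev: str, token: str) -> str:
    # Pre-commit guard: the *entire* previous token must be alphanumeric.
    if prev.isalnum() and len(token) > 0 and token[0].isalnum():
        return " " + token
    return token

def join_new(prev: str, token: str) -> str:
    # Post-commit guard: only the *last character* of the previous token
    # has to be alphanumeric.
    if len(prev) > 0 and prev[-1].isalnum() and len(token) > 0 and token[0].isalnum():
        return " " + token
    return token

# "can't" contains an apostrophe, so "can't".isalnum() is False and the
# old guard skipped the space between two adjacent words:
print(repr(join_old("can't", "stop")))  # 'stop'   (space lost)
print(repr(join_new("can't", "stop")))  # ' stop'  (space kept)
```

Checking prev[-1] instead of the whole string keeps the spacing decision at the character boundary; the explicit len(prev) > 0 test is needed because prev[-1] would raise an IndexError on an empty string, whereas "".isalnum() merely returned False.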