flpelerin committed on
Commit
2735988
·
1 Parent(s): 8ee9950

Update file tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +16 -4
tokenizer.py CHANGED
@@ -159,8 +159,17 @@ class Tokenizer:
159
  subprocess.run(['make'], cwd=c_dir)
160
 
161
 
162
- def c_run(self, c_dir, c_data, c_vocab):
163
- subprocess.run(['./a.out', c_data, c_vocab], cwd=c_dir)
 
 
 
 
 
 
 
 
 
164
 
165
 
166
  def c_encode(self, text):
@@ -169,6 +178,7 @@ class Tokenizer:
169
 
170
  c_vocab = c_dir + 'tokenizer.bin'
171
  c_data = c_dir + 'dataset.txt'
 
172
 
173
  with open(c_data, 'w') as f:
174
  f.write(text)
@@ -176,6 +186,8 @@ class Tokenizer:
176
 
177
  self.to_file(c_vocab)
178
  self.c_compile(c_dir)
179
- self.c_run(c_dir, c_data, c_vocab)
 
 
180
 
181
- return [1, 2, 3, 4]
 
159
  subprocess.run(['make'], cwd=c_dir)
160
 
161
 
162
def c_run(self, c_dir, c_data, c_out):
    """Run the compiled C tokenizer binary on the input file.

    Executes ./a.out inside c_dir, passing the input text file (c_data)
    and the output path (c_out) as arguments.
    NOTE(review): the return code is not checked — a failing a.out is
    silently ignored and the caller may read a stale/missing output file;
    consider check=True if that is not intentional.
    """
    cmd = ['./a.out', c_data, c_out]
    subprocess.run(cmd, cwd=c_dir)
164
+
165
+
166
def load_binary_file(file_path):
    """Read a file of packed 16-bit unsigned integers and return them as a list.

    The file is treated as a flat array of uint16_t values (2 bytes each,
    native byte order — matching what the C tokenizer writes on the same
    host). Any trailing odd byte is ignored by the floor division.

    NOTE(review): defined without `self` yet the diff shows it being called
    unqualified from c_encode — confirm it is (or should be) a module-level
    helper rather than an instance method.

    :param file_path: path to the binary file produced by the C tokenizer
    :return: list of ints in [0, 65535]
    """
    with open(file_path, 'rb') as file:
        data = file.read()
    # uint16_t is 2 bytes; ignore any trailing odd byte.
    num_values = len(data) // 2
    # Use a count-prefixed format ('<n>H') instead of 'H' * n: identical
    # semantics per the struct docs, but avoids building an O(n)-character
    # format string for large files.
    values = struct.unpack(f'{num_values}H', data[:num_values * 2])
    return list(values)
173
 
174
 
175
  def c_encode(self, text):
 
178
 
179
  c_vocab = c_dir + 'tokenizer.bin'
180
  c_data = c_dir + 'dataset.txt'
181
+ c_out = c_dir + 'dataset.bin'
182
 
183
  with open(c_data, 'w') as f:
184
  f.write(text)
 
186
 
187
  self.to_file(c_vocab)
188
  self.c_compile(c_dir)
189
+ self.c_run(c_dir, c_data, c_out)
190
+
191
+ ids = load_binary_file(c_out)
192
 
193
+ return ids