Update file tokenizer.py
Browse files- tokenizer.py +16 -4
tokenizer.py
CHANGED
|
@@ -159,8 +159,17 @@ class Tokenizer:
|
|
| 159 |
subprocess.run(['make'], cwd=c_dir)
|
| 160 |
|
| 161 |
|
| 162 |
-
def c_run(self, c_dir, c_data,
|
| 163 |
-
subprocess.run(['./a.out', c_data,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
|
| 166 |
def c_encode(self, text):
|
|
@@ -169,6 +178,7 @@ class Tokenizer:
|
|
| 169 |
|
| 170 |
c_vocab = c_dir + 'tokenizer.bin'
|
| 171 |
c_data = c_dir + 'dataset.txt'
|
|
|
|
| 172 |
|
| 173 |
with open(c_data, 'w') as f:
|
| 174 |
f.write(text)
|
|
@@ -176,6 +186,8 @@ class Tokenizer:
|
|
| 176 |
|
| 177 |
self.to_file(c_vocab)
|
| 178 |
self.c_compile(c_dir)
|
| 179 |
-
self.c_run(c_dir, c_data,
|
|
|
|
|
|
|
| 180 |
|
| 181 |
-
return
|
|
|
|
| 159 |
subprocess.run(['make'], cwd=c_dir)
|
| 160 |
|
| 161 |
|
| 162 |
+
def c_run(self, c_dir, c_data, c_out):
    """Run the compiled C tokenizer binary over a data file.

    Args:
        c_dir: directory containing the compiled ``a.out`` binary
            (used as the subprocess working directory).
        c_data: path to the input text file passed to the binary.
        c_out: path where the binary writes its binary token output.

    Raises:
        subprocess.CalledProcessError: if the binary exits non-zero.
            Previously a failure was silently ignored, letting later
            steps read a stale or missing output file.
    """
    # check=True surfaces a failed tokenizer run immediately instead of
    # continuing with bad output.
    subprocess.run(['./a.out', c_data, c_out], cwd=c_dir, check=True)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def load_binary_file(file_path):
    """Read a flat binary file of 2-byte unsigned ints into a list.

    The file is assumed to be a contiguous array of C ``uint16_t``
    values in native byte order, as written by the C tokenizer.

    Args:
        file_path: path to the binary file to decode.

    Returns:
        list[int]: the decoded 16-bit unsigned values, in file order
        (empty list for an empty file).

    Raises:
        struct.error: if the file length is not a multiple of 2
            (the truncated count no longer matches the buffer size).
    """
    # NOTE(review): defined without `self` yet appears inside the
    # Tokenizer class diff, and c_encode calls it unqualified — if it
    # lives in the class it needs @staticmethod or module scope; confirm
    # against the full file.
    with open(file_path, 'rb') as file:
        data = file.read()
    # uint16_t is 2 bytes; use a count-prefixed format ("123H") instead
    # of repeating 'H' once per value, which built an O(n)-char string.
    num_values = len(data) // 2
    values = struct.unpack('%dH' % num_values, data)
    return list(values)
|
| 173 |
|
| 174 |
|
| 175 |
def c_encode(self, text):
|
|
|
|
| 178 |
|
| 179 |
c_vocab = c_dir + 'tokenizer.bin'
|
| 180 |
c_data = c_dir + 'dataset.txt'
|
| 181 |
+
c_out = c_dir + 'dataset.bin'
|
| 182 |
|
| 183 |
with open(c_data, 'w') as f:
|
| 184 |
f.write(text)
|
|
|
|
| 186 |
|
| 187 |
self.to_file(c_vocab)
|
| 188 |
self.c_compile(c_dir)
|
| 189 |
+
self.c_run(c_dir, c_data, c_out)
|
| 190 |
+
|
| 191 |
+
ids = load_binary_file(c_out)
|
| 192 |
|
| 193 |
+
return ids
|