Update file tokenizer.py
Browse files- tokenizer.py +16 -4
tokenizer.py
CHANGED
|
@@ -159,8 +159,17 @@ class Tokenizer:
|
|
| 159 |
subprocess.run(['make'], cwd=c_dir)
|
| 160 |
|
| 161 |
|
| 162 |
-
def c_run(self, c_dir, c_data,
|
| 163 |
-
subprocess.run(['./a.out', c_data,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
|
| 166 |
def c_encode(self, text):
|
|
@@ -169,6 +178,7 @@ class Tokenizer:
|
|
| 169 |
|
| 170 |
c_vocab = c_dir + 'tokenizer.bin'
|
| 171 |
c_data = c_dir + 'dataset.txt'
|
|
|
|
| 172 |
|
| 173 |
with open(c_data, 'w') as f:
|
| 174 |
f.write(text)
|
|
@@ -176,6 +186,8 @@ class Tokenizer:
|
|
| 176 |
|
| 177 |
self.to_file(c_vocab)
|
| 178 |
self.c_compile(c_dir)
|
| 179 |
-
self.c_run(c_dir, c_data,
|
|
|
|
|
|
|
| 180 |
|
| 181 |
-
return
|
|
|
|
| 159 |
subprocess.run(['make'], cwd=c_dir)
|
| 160 |
|
| 161 |
|
| 162 |
+
def c_run(self, c_dir, c_data, c_out):
    """Run the compiled C tokenizer binary over a data file.

    Args:
        c_dir: directory containing the compiled ``a.out`` binary
            (used as the subprocess working directory).
        c_data: path to the input text file passed to the binary.
        c_out: path where the binary writes its binary token output.

    Raises:
        subprocess.CalledProcessError: if the binary exits non-zero.
            Previously a failure was silently ignored, letting later
            steps read a stale or missing output file.
    """
    # check=True surfaces a failed tokenizer run immediately instead of
    # continuing with bad output.
    subprocess.run(['./a.out', c_data, c_out], cwd=c_dir, check=True)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def load_binary_file(file_path):
    """Read a flat binary file of 2-byte unsigned ints into a list.

    The file is assumed to be a contiguous array of C ``uint16_t``
    values in native byte order, as written by the C tokenizer.

    Args:
        file_path: path to the binary file to decode.

    Returns:
        list[int]: the decoded 16-bit unsigned values, in file order
        (empty list for an empty file).

    Raises:
        struct.error: if the file length is not a multiple of 2
            (the truncated count no longer matches the buffer size).
    """
    # NOTE(review): defined without `self` yet appears inside the
    # Tokenizer class diff, and c_encode calls it unqualified — if it
    # lives in the class it needs @staticmethod or module scope; confirm
    # against the full file.
    with open(file_path, 'rb') as file:
        data = file.read()
    # uint16_t is 2 bytes; use a count-prefixed format ("123H") instead
    # of repeating 'H' once per value, which built an O(n)-char string.
    num_values = len(data) // 2
    values = struct.unpack('%dH' % num_values, data)
    return list(values)
|
| 173 |
|
| 174 |
|
| 175 |
def c_encode(self, text):
|
|
|
|
| 178 |
|
| 179 |
c_vocab = c_dir + 'tokenizer.bin'
|
| 180 |
c_data = c_dir + 'dataset.txt'
|
| 181 |
+
c_out = c_dir + 'dataset.bin'
|
| 182 |
|
| 183 |
with open(c_data, 'w') as f:
|
| 184 |
f.write(text)
|
|
|
|
| 186 |
|
| 187 |
self.to_file(c_vocab)
|
| 188 |
self.c_compile(c_dir)
|
| 189 |
+
self.c_run(c_dir, c_data, c_out)
|
| 190 |
+
|
| 191 |
+
ids = load_binary_file(c_out)
|
| 192 |
|
| 193 |
+
return ids
|