Update 2 files
Browse files
- /tokenizer.py
- /trainer.cli.py
- tokenizer.py +3 -0
- trainer.cli.py +1 -1
tokenizer.py
CHANGED
|
@@ -147,6 +147,9 @@ class Tokenizer:
|
|
| 147 |
|
| 148 |
|
| 149 |
def c_compile(self):
|
|
|
|
|
|
|
|
|
|
| 150 |
# Get the path of the current Python script
|
| 151 |
script_dir = os.path.dirname(__file__)
|
| 152 |
|
|
|
|
| 147 |
|
| 148 |
|
| 149 |
def c_compile(self):
|
| 150 |
+
import os
|
| 151 |
+
import subprocess
|
| 152 |
+
|
| 153 |
# Get the path of the current Python script
|
| 154 |
script_dir = os.path.dirname(__file__)
|
| 155 |
|
trainer.cli.py
CHANGED
|
@@ -28,7 +28,7 @@ if __name__ == '__main__':
|
|
| 28 |
dataset = Dataset(config.dataset)
|
| 29 |
|
| 30 |
tokenizer = Tokenizer()
|
| 31 |
-
tokenizer.train(dataset.text, max_length=config.tokenizer.max_length)
|
| 32 |
ids = tokenizer.c_encode(dataset.text)
|
| 33 |
|
| 34 |
|
|
|
|
| 28 |
dataset = Dataset(config.dataset)
|
| 29 |
|
| 30 |
tokenizer = Tokenizer()
|
| 31 |
+
#tokenizer.train(dataset.text, max_length=config.tokenizer.max_length)
|
| 32 |
ids = tokenizer.c_encode(dataset.text)
|
| 33 |
|
| 34 |
|