Upload folder using huggingface_hub
Browse files- config.toml +32 -0
- run.sh +27 -0
config.toml
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
model_name = "test"
n_layers = 2
d_model = 512
d_mlp = 2048
d_head = 64
n_heads = 8
attn_only = false
layer_norm_eps = 1e-05
init_range = 0.02
n_ctx = 1024
d_vocab = 48262
dataset_name = "eoinf/unprocessed-c4-code-test"
tokenizer_name = "NeelNanda/gpt-neox-tokenizer-digits"
seed = 10
device = "cuda"
use_bfloat16_matmul = false
batch_size_per_device = 32
n_devices = 1
batches_per_step = 1
max_tokens = 200000000
lr_hidden = 0.002
lr_vector = 0.001
lr_schedule = "constant_with_warmup"
warmup_tokens = 30000000
weight_decay = 0.05
grad_norm_clip = 1.0
train_loss_moving_average_beta = 0.99
log_interval = 25
save_checkpoints = true
checkpoint_interval = 500
checkpoint_interval_ratio = 1.10
save_log_checkpoints = true
run.sh
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
# Launch transformer training from config.toml in the current directory.
#
# Usage:
#   ./run.sh            resume from latest_checkpoint.pt when it exists,
#                       otherwise start training from scratch
#   ./run.sh restart    force a fresh training run, ignoring any checkpoint

# Check if "restart" argument is passed to force normal training
if [ "$1" = "restart" ]; then
    echo "Force restart: Running normal training ..."
    python -c "
import os
from toy_models.models.trainer import train_transformer_from_config
current_dir = os.getcwd()
train_transformer_from_config('config.toml', current_dir)
"
else
    # Resume from a checkpoint when one exists, otherwise train from scratch
    python -c "
import os
from pathlib import Path
from toy_models.models.trainer import train_transformer_from_config, restart_from_checkpoint
current_dir = os.getcwd()
# Look for the single rolling checkpoint file in the working directory
# (note: this is one fixed file, not a scan of a checkpoints/ directory).
latest_checkpoint = Path('latest_checkpoint.pt')
if latest_checkpoint.exists():
    print(f'Found checkpoint: {latest_checkpoint}. Restarting from checkpoint...')
    restart_from_checkpoint(current_dir)
else:
    print('Starting training from beginning ...')
    # NOTE(review): the restart branch calls
    # train_transformer_from_config('config.toml', current_dir) with two
    # arguments, while this call passes only current_dir -- confirm the
    # trainer's signature; one of the two calls is likely wrong.
    train_transformer_from_config(current_dir)
"
fi