eoinf committed on
Commit
86afed9
·
verified ·
1 Parent(s): d74cf85

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. config.toml +32 -0
  2. run.sh +27 -0
config.toml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
model_name = "test"
n_layers = 2
d_model = 512
d_mlp = 2048
d_head = 64
n_heads = 8
attn_only = false
layer_norm_eps = 1e-05
init_range = 0.02
n_ctx = 1024
d_vocab = 48262
dataset_name = "eoinf/unprocessed-c4-code-test"
tokenizer_name = "NeelNanda/gpt-neox-tokenizer-digits"
seed = 10
device = "cuda"
use_bfloat16_matmul = false
batch_size_per_device = 32
n_devices = 1
batches_per_step = 1
max_tokens = 200000000
lr_hidden = 0.002
lr_vector = 0.001
lr_schedule = "constant_with_warmup"
warmup_tokens = 30000000
weight_decay = 0.05
grad_norm_clip = 1.0
train_loss_moving_average_beta = 0.99
log_interval = 25
save_checkpoints = true
checkpoint_interval = 500
checkpoint_interval_ratio = 1.10
save_log_checkpoints = true
run.sh ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Launch transformer training for the config in the current directory.
#
# Usage:
#   ./run.sh           resume from latest_checkpoint.pt if present, else train fresh
#   ./run.sh restart   ignore any existing checkpoint and train from scratch
#
# Both paths import project module toy_models.models.trainer and read
# config.toml from the working directory.

# "restart" forces a fresh training run regardless of existing checkpoints.
if [ "$1" = "restart" ]; then
    echo "Force restart: Running normal training ..."
    python -c "
import os
from toy_models.models.trainer import train_transformer_from_config
current_dir = os.getcwd()
train_transformer_from_config('config.toml', current_dir)
"
else
    # Resume from a checkpoint when one exists, otherwise start fresh.
    python -c "
import os
from pathlib import Path
from toy_models.models.trainer import train_transformer_from_config, restart_from_checkpoint
current_dir = os.getcwd()
# latest_checkpoint.pt in the working directory marks a resumable run.
latest_checkpoint = Path('latest_checkpoint.pt')
if latest_checkpoint.exists():
    print(f'Found checkpoint: {latest_checkpoint}. Restarting from checkpoint...')
    restart_from_checkpoint(current_dir)
else:
    print('Starting training from beginning ...')
    # FIX: pass the config path explicitly, matching the two-argument call in
    # the restart branch above. The original called
    # train_transformer_from_config(current_dir) with a single argument,
    # which is inconsistent with the other call site and would either raise
    # or treat the directory as the config path. TODO confirm against the
    # trainer's signature in toy_models.models.trainer.
    train_transformer_from_config('config.toml', current_dir)
"
fi