# Model selection: the model configuration to train and the tokenizer paired with it.
[model]
# Identifier of the model configuration.
# NOTE(review): looks like a Hugging Face Hub repo id; confirm how the loader resolves it.
config = "fla-hub/hamilton-350M-15B"
# Tokenizer identifier. It points at a different repo than `config`, so the
# tokenizer is not taken from the model repo itself.
tokenizer_path = "mistralai/Mistral-7B-v0.1"

# Job-level settings for this run.
[job]
# Output folder for the run.
# NOTE(review): presumably the base directory for other relative folders in this file; confirm.
dump_folder = "exp"
# NOTE(review): presumably prints the resolved arguments at startup; confirm.
print_args = true

# Training loop, parallelism layout, and data-loading settings.
[training]
# Batch geometry. seq_len and context_len are set to the same value here.
# NOTE(review): batch_size is presumably per device, and the lengths in tokens; confirm.
batch_size = 2
seq_len = 2048
context_len = 2048
gradient_accumulation_steps = 1
# Total number of training steps.
steps = 20480
# NOTE(review): presumably the gradient-clipping norm threshold; confirm.
max_norm = 1.0
# NOTE(review): presumably skips updates whose loss/gradients are NaN or Inf; confirm.
skip_nan_inf = true
# Parallelism degrees.
# NOTE(review): -1 for the shard degree presumably means "derive from the
# available devices"; confirm against the trainer.
data_parallel_replicate_degree = 1
data_parallel_shard_degree = -1
tensor_parallel_degree = 1
compile = false
# Dataset selection.
dataset = "SlimPajama-627B"
dataset_name = "default"
# Data-loader settings.
# NOTE(review): the names match torch.utils.data.DataLoader arguments and are
# presumably forwarded to it; confirm.
num_workers = 32
pin_memory = false
persistent_workers = false
prefetch_factor = 2
# Random seed for the run.
seed = 42
# NOTE(review): presumably toggles variable-length sequence batching; confirm.
varlen = false

# Optimizer selection and hyperparameters.
[optimizer]
name = "AdamW"
# Very small epsilon.
# NOTE(review): presumably the optimizer's denominator stabilizer; confirm it is intentional.
eps = 1e-15
# Learning rate.
# NOTE(review): presumably the peak value reached after warmup; confirm.
lr = 3e-4

# Learning-rate schedule.
[lr_scheduler]
# Number of warmup steps.
warmup_steps = 1024
decay_type = "cosine"
# NOTE(review): 0.1 is much larger than the optimizer lr in this file (3e-4),
# so this is presumably a fraction of the peak lr rather than an absolute
# learning rate; confirm against the scheduler implementation.
lr_min = 0.1

# Checkpoint saving.
[checkpoint]
enable_checkpoint = true
# Checkpoint folder name.
# NOTE(review): presumably resolved relative to the job's dump folder; confirm.
folder = "checkpoint"
# Save cadence: `interval` is counted in units of `interval_type`.
interval_type = "steps"
interval = 2048
# NOTE(review): false presumably keeps optimizer/trainer state in the
# checkpoint in addition to model weights; confirm.
model_weights_only = false
export_dtype = "float32"
async_mode = "disabled"

# Profiler settings. Profiling is switched off in this config.
[profiling]
enable_profiling = false
# NOTE(review): the two keys below presumably only take effect when
# enable_profiling is true; confirm.
save_traces_folder = "profile_trace"
profile_freq = 256

# Metrics logging.
[metrics]
# NOTE(review): presumably the logging interval in training steps; confirm.
log_freq = 32
# NOTE(review): presumably requires Weights & Biases credentials in the
# environment when enabled; confirm.
enable_wandb = true

# Experimental parallelism settings. Both degrees are set to 1 here.
# NOTE(review): a degree of 1 presumably leaves the feature disabled; confirm.
[experimental]
context_parallel_degree = 1
pipeline_parallel_degree = 1

# Float8 options. Both switches are off in this config.
[float8]
enable_fsdp_float8_all_gather = false
precompute_float8_dynamic_scale_for_fsdp = false

# Activation checkpointing mode.
# NOTE(review): "none" presumably disables activation checkpointing; confirm
# the accepted values against the trainer.
[activation_checkpoint]
mode = "none"