# Provenance (retained from the Hugging Face model-page scrape, commented out
# so this file parses as valid TOML):
#   Tags: Text Generation · Safetensors · Danish · English · llama
#   Path: dfm-decoder-open-v0-7b-pt / stage3 / open-stage3.toml
#   Uploader: peter-sk — "Super-squash branch 'main' using huggingface_hub"
#   Commit: 255c557
# model: Llama-3 architecture, 7B "Comma" flavor, with the Common Pile
# comma-v0.1-2t tokenizer.
model_name = "llama3"
flavor = "Comma7B"
tokenizer_name = "common-pile/comma-v0.1-2t"
# job
job_name = "munin-7b-open-stage3"
# NOTE(review): wandb_project is set but enable_wandb is false, so the
# project name is presumably inert until logging is re-enabled — confirm.
wandb_project = "munin-7b-open-stage3"
enable_wandb = false
# parallelism
# Single node; parameters sharded across 8 data-parallel ranks, no replication.
num_nodes = 1
data_parallel_shard_degree = 8
data_parallel_replicate_degree = 1
# training settings
# NOTE(review): effective tokens/step would be
# train_batch_size * gradient_accumulation_steps * shard_degree * seq_len
# if train_batch_size is per-rank — confirm against maester's semantics.
train_batch_size = 8
gradient_accumulation_steps = 2
gradient_accumulation_sync_each_step = true
seq_len = 4096
train_num_steps = 18926 # 37852 // 2
# Schedule spans the whole run with no constant plateau:
# warmup_steps + cooldown_steps = 500 + 18426 = 18926 = train_num_steps.
scheduler = "linear_warmup_constant_sqrt_decay"
warmup_steps = 500
cooldown_steps = 18426
checkpoint_interval = 1000
# Warm-start from the final stage-2 checkpoint (step-18926); this stage-3
# run then trains for another 18926 steps.
forced_load_path = "/work/training/maester/jobs/munin-7b-open-stage2/checkpoints/step-18926/"
compile = true
enable_cut_cross_entropy = false
# Activation checkpointing is off; selective_ac_option is presumably
# ignored while ac_mode = "none" — verify in the consumer's config handling.
ac_mode = "none"
selective_ac_option = "op"
[dataset]
# NOTE(review): bos=2 / eos=1 is the reverse of the common Llama convention
# (bos=1, eos=2) — confirm these ids against the Comma tokenizer's config
# before assuming a typo.
bos_token = 2
eos_token = 1
# Single training shard; dataset_weights below must correspond 1:1 with
# the entries of this list.
data_dirs = [
"/work/production/data/munin-open-dyna-0-of-1-cp-2-of-16-train/",
]
# NOTE(review): weight is a string rather than a float — presumably the
# consumer parses a (space-separated) weight string, one weight per
# data_dirs entry; confirm before changing the type to a native float/array.
dataset_weights = "1.0"
[opt_cfg] # must specify *all* fields here, will not merge with defaults
# Optimizer hyperparameters (Adam-family, judging by betas/eps/fused).
# Low LR (1e-5) is consistent with continued pretraining from a checkpoint.
lr = 1e-5
betas = [0.9, 0.95]
weight_decay = 0.1
# NOTE(review): eps = 1e-9 is tighter than the common 1e-8 Adam default —
# confirm this is intentional.
eps = 1e-9
fused = true # fused optimizer kernel (single-device, CUDA-style fast path)