{ "dump_dir": "/checkpoint/comem/barlaso/amaia_dumps/ar_scale_1M_64_1.5e-4", "seed": 42, "batch_size": 4, "grad_acc_steps": 1, "loss_reduction": "mean", "seq_len": 4096, "checkpoint_freq": 8000, "eval_freq": 8000, "gc_collect_freq": 1000, "logging_freq": 10, "logging_acc_freq": -1, "probe_freq": null, "probe_dump_tensors": [], "warn_thresh_curr_iter_seconds": 10, "warn_thresh_data_load_seconds": 10, "steps": 1000000, "data": { "sources": [ { "path": "/checkpoint/comem/barlaso/data/simpleqa/wiki_scale/amaia", "type": "pretrain", "weight": 94.4 }, { "path": "/datasets/llm/pretraining/llama2/github_oss_with_stack", "type": "pretrain", "weight": 27.0 }, { "path": "/datasets/llm/pretraining/llama2/b3g", "type": "pretrain", "weight": 0.3 }, { "path": "/datasets/llm/pretraining/llama2/arxiv", "type": "pretrain", "weight": 0.4 }, { "path": "/datasets/llm/pretraining/llama2/stackexchange", "type": "pretrain", "weight": 0.7 }, { "path": "/datasets/llm/pretraining/llama2/wikipedia", "type": "pretrain", "weight": 1.0 }, { "path": "/datasets/llm/pretraining/llama2/edouard_cc_20220927_new", "type": "pretrain", "weight": 10.0 }, { "path": "/datasets/llm/pretraining/dclm", "type": "pretrain", "weight": 55.0 } ], "loader": "new", "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "shuffle_buffer_size": 64, "tokenizer": { "name": "tiktoken", "path": "/checkpoint/comem/barlaso/amaia/Meta-Llama-3.1-8B/cl_toplang_128k.tiktoken" } }, "optim": { "lr": 0.00015, "weight_decay": 0.1, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "fused_optimizer": true, "clip": 1.0, "scheduler": "cosine", "warmup": 2000, "cooldown": 2000, "lr_min_ratio": 0.01, "cycle_length": 1.0, "cosine_theta": 1.0, "exp_factor": 0.5, "n_steps": null }, "model": { "dim": 4096, "n_layers": 32, "n_heads": 32, "n_kv_heads": 8, "max_seq_len": 4096, "vocab_size": 128256, "ffn_dim_multiplier": 1.3, "ffn_exp": 4, "multiple_of": 1024, "norm_eps": 1e-05, "pos_embed_impl": "rope", "rope_theta": 500000.0, "scaled_rope": { "scale_factor": 8, "old_context_len": 8192, "low_freq_factor": 1, "high_freq_factor": 4, "use_attn_scale": false }, "attn_impl": "flex_attention", "attn_bias_type": "doc_causal", "weight_tying": false, "init_method": "current_depth", "init_base_std": null, "seed": 42 }, "distributed": { "dp_size": 512, "tp_size": 1, "dp_type": "fsdp", "model_dtype": "bf16", "vocab_parallel": false, "loss_parallel": false, "compile": true, "ac_mode": "none", "selective_ac_option": 2, "partitioner_ac_budget": 0.99, "fp8_recipe": "rowwise", "fp8_filter": "layers\\.", "fp8_healing": null, "async_tp": false }, "setup": { "spawn_method": "forkserver", "torch_init_timeout": 600, "cuda_matmul_allow_tf32": true, "cuda_allow_bf16_reduced_precision_reduction": true, "autograd_detect_anomaly": false }, "logging": { "enable_tensorboard": true, "enable_wandb": false, "enable_otel": false, "wandb": { "project": "activereading", "group": null, "job_type": null, "entity": null, "name": null, "resume": null, "fork_from_step": null, "disable_on_init_failure": false } }, "profiling": { "run": false, "trace_folder": "profiling", "mem_warmup": 100, "mem_steps": 2, "profile_warmup": 102, "profile_steps": 2 }, "checkpoint": { "path": "/checkpoint/comem/barlaso/amaia_dumps/ar_scale_1M_64_1.5e-4/checkpoints", "keep_latest": 1, "keep_eval_checkpoints": true, "compress": false }, "continue_from": { "checkpoint_dir": "/checkpoint/comem/barlaso/amaia/Meta-Llama-3.1-8B", "reload_optim": false, "reload_dataloader": false, "reload_train_state": false, "validation_mode": "raise" }, "eval_on_gpus": 8, "eval": { "dataset_dir": "/datasets/llm/eval", "tasks": "hellaswag,nq,simpleqa_wiki,simpleqa_wiki_nll", "task_args": null, "ppl": null, "predictor": "llama_predictor", "predictor_config": { "checkpoint_dir": "", "tp_size": 1, "dp_type": null, "compile": false, "model_dtype": "bf16", "device": "cuda", "batch_size": 128, "generation_batch_size": null, "auto_batch_size": true }, "temperature": 0.0, "top_k": 0, "top_p": 0.0, "seed": 42, "dump_dir": "", "metric_log_dir": "", "tb_log_dir": null, "no_resume": false, "max_samples": null, "show_progress": false, "log_to_tb": false, "global_step": null, "disable_metric_logging": false, "logging": null, "checkpoint_dir": "", "tp_size": 1, "dp_type": null, "compile": false, "model_dtype": "bf16", "device": "cuda", "batch_size": 128, "generation_batch_size": null } }