{
  "dump_dir": "/checkpoint/comem/barlaso/amaia_dumps/ar_scale_1M_64_1.5e-4",
  "seed": 42,
  "batch_size": 4,
  "grad_acc_steps": 1,
  "loss_reduction": "mean",
  "seq_len": 4096,
  "checkpoint_freq": 8000,
  "eval_freq": 8000,
  "gc_collect_freq": 1000,
  "logging_freq": 10,
  "logging_acc_freq": -1,
  "probe_freq": null,
  "probe_dump_tensors": [],
  "warn_thresh_curr_iter_seconds": 10,
  "warn_thresh_data_load_seconds": 10,
  "steps": 1000000,
  "data": {
    "sources": [
      {
        "path": "/checkpoint/comem/barlaso/data/simpleqa/wiki_scale/amaia",
        "type": "pretrain",
        "weight": 94.4
      },
      {
        "path": "/datasets/llm/pretraining/llama2/github_oss_with_stack",
        "type": "pretrain",
        "weight": 27.0
      },
      {
        "path": "/datasets/llm/pretraining/llama2/b3g",
        "type": "pretrain",
        "weight": 0.3
      },
      {
        "path": "/datasets/llm/pretraining/llama2/arxiv",
        "type": "pretrain",
        "weight": 0.4
      },
      {
        "path": "/datasets/llm/pretraining/llama2/stackexchange",
        "type": "pretrain",
        "weight": 0.7
      },
      {
        "path": "/datasets/llm/pretraining/llama2/wikipedia",
        "type": "pretrain",
        "weight": 1.0
      },
      {
        "path": "/datasets/llm/pretraining/llama2/edouard_cc_20220927_new",
        "type": "pretrain",
        "weight": 10.0
      },
      {
        "path": "/datasets/llm/pretraining/dclm",
        "type": "pretrain",
        "weight": 55.0
      }
    ],
    "loader": "new",
    "seed": 42,
    "add_bos": true,
    "add_eos": true,
    "load_async": true,
    "shuffle_buffer_size": 64,
    "tokenizer": {
      "name": "tiktoken",
      "path": "/checkpoint/comem/barlaso/amaia/Meta-Llama-3.1-8B/cl_toplang_128k.tiktoken"
    }
  },
  "optim": {
    "lr": 0.00015,
    "weight_decay": 0.1,
    "epsilon": 1e-08,
    "beta1": 0.9,
    "beta2": 0.95,
    "fused_optimizer": true,
    "clip": 1.0,
    "scheduler": "cosine",
    "warmup": 2000,
    "cooldown": 2000,
    "lr_min_ratio": 0.01,
    "cycle_length": 1.0,
    "cosine_theta": 1.0,
    "exp_factor": 0.5,
    "n_steps": null
  },
  "model": {
    "dim": 4096,
    "n_layers": 32,
    "n_heads": 32,
    "n_kv_heads": 8,
    "max_seq_len": 4096,
    "vocab_size": 128256,
    "ffn_dim_multiplier": 1.3,
    "ffn_exp": 4,
    "multiple_of": 1024,
    "norm_eps": 1e-05,
    "pos_embed_impl": "rope",
    "rope_theta": 500000.0,
    "scaled_rope": {
      "scale_factor": 8,
      "old_context_len": 8192,
      "low_freq_factor": 1,
      "high_freq_factor": 4,
      "use_attn_scale": false
    },
    "attn_impl": "flex_attention",
    "attn_bias_type": "doc_causal",
    "weight_tying": false,
    "init_method": "current_depth",
    "init_base_std": null,
    "seed": 42
  },
  "distributed": {
    "dp_size": 512,
    "tp_size": 1,
    "dp_type": "fsdp",
    "model_dtype": "bf16",
    "vocab_parallel": false,
    "loss_parallel": false,
    "compile": true,
    "ac_mode": "none",
    "selective_ac_option": 2,
    "partitioner_ac_budget": 0.99,
    "fp8_recipe": "rowwise",
    "fp8_filter": "layers\\.",
    "fp8_healing": null,
    "async_tp": false
  },
  "setup": {
    "spawn_method": "forkserver",
    "torch_init_timeout": 600,
    "cuda_matmul_allow_tf32": true,
    "cuda_allow_bf16_reduced_precision_reduction": true,
    "autograd_detect_anomaly": false
  },
  "logging": {
    "enable_tensorboard": true,
    "enable_wandb": false,
    "enable_otel": false,
    "wandb": {
      "project": "activereading",
      "group": null,
      "job_type": null,
      "entity": null,
      "name": null,
      "resume": null,
      "fork_from_step": null,
      "disable_on_init_failure": false
    }
  },
  "profiling": {
    "run": false,
    "trace_folder": "profiling",
    "mem_warmup": 100,
    "mem_steps": 2,
    "profile_warmup": 102,
    "profile_steps": 2
  },
  "checkpoint": {
    "path": "/checkpoint/comem/barlaso/amaia_dumps/ar_scale_1M_64_1.5e-4/checkpoints",
    "keep_latest": 1,
    "keep_eval_checkpoints": true,
    "compress": false
  },
  "continue_from": {
    "checkpoint_dir": "/checkpoint/comem/barlaso/amaia/Meta-Llama-3.1-8B",
    "reload_optim": false,
    "reload_dataloader": false,
    "reload_train_state": false,
    "validation_mode": "raise"
  },
  "eval_on_gpus": 8,
  "eval": {
    "dataset_dir": "/datasets/llm/eval",
    "tasks": "hellaswag,nq,simpleqa_wiki,simpleqa_wiki_nll",
    "task_args": null,
    "ppl": null,
    "predictor": "llama_predictor",
    "predictor_config": {
      "checkpoint_dir": "",
      "tp_size": 1,
      "dp_type": null,
      "compile": false,
      "model_dtype": "bf16",
      "device": "cuda",
      "batch_size": 128,
      "generation_batch_size": null,
      "auto_batch_size": true
    },
    "temperature": 0.0,
    "top_k": 0,
    "top_p": 0.0,
    "seed": 42,
    "dump_dir": "",
    "metric_log_dir": "",
    "tb_log_dir": null,
    "no_resume": false,
    "max_samples": null,
    "show_progress": false,
    "log_to_tb": false,
    "global_step": null,
    "disable_metric_logging": false,
    "logging": null,
    "checkpoint_dir": "",
    "tp_size": 1,
    "dp_type": null,
    "compile": false,
    "model_dtype": "bf16",
    "device": "cuda",
    "batch_size": 128,
    "generation_batch_size": null
  }
}