| {"name": "stem_llama3_1B", "dump_dir": "/checkpoints-fsx/beidchen-sandbox/STEM/logs/lm1b-midtrain-stem-100B", "seed": 777, "model_type": "llama", "grad_acc_steps": 1, "gc_collect_freq": 1000, "probe_freq": 100, "steps": 200000, "data": {"root_dir": "/dev/shm/dolmino-mix_shuffled", "sources": {"CommonCrawlHQ": 22.5, "Code_Main": 20.0, "Math_Main": 18.06, "InstQA_Main": 11.1, "Reasoning_Traces": 7.49, "STEM_Heavy": 10.0, "Flashcards_RCQA": 8.9}, "batch_size": 4, "seq_len": 4096, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": {"name": "tiktoken", "path": "/checkpoints-fsx/beidchen-sandbox/stem/Llama-3.2-1B/original/tokenizer.model"}}, "optim": {"lr": 0.0001, "weight_decay": 0.1, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 2000, "lr_min_ratio": 0.01, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5}, "model": {"dim": 2048, "n_layers": 16, "head_dim": 64, "n_heads": 32, "n_kv_heads": 8, "ffn_dim_multiplier": 1.5, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 500000.0, "rope_scaling": {"factor": 8.0, "low_freq_factor": 1.0, "high_freq_factor": 4.0, "original_max_position_embeddings": 8192}, "init_base_std": 0.02, "init_std_factor": "disabled", "max_seqlen": 4096, "seed": 42, "vocab_size": 128256, "weight_tying": true, "sliding_window": null, "stem_layers": [2, 6, 10, 14], "stem_embedding_dim": 8192}, "distributed": {"dp_shard": 1, "dp_replicate": 32, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": false, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver", "stem_parallel_size": 8}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 10000, "keep": 1}, "eval": {"every": 10000, "keep": 1}, "path": "/checkpoints-fsx/beidchen-sandbox/STEM/logs/lm1b-midtrain-stem-100B/checkpoints", "init_ckpt_path": "/dev/shm/Llama-1B-stem-init", "continue_training_from_init": false}, "profiling": {"run": true, "trace_folder": "profiling", "mem_warmup": 100, "mem_steps": 2, "profile_warmup": 102, "profile_steps": 2}, "logging": {"freq": 10, "acc_freq": null, "wandb": {"job_type": null, "dir": null, "project": "stem", "entity": null, "tags": null, "group": null, "name": "stem_llama3_1B", "notes": null, "config_exclude_keys": null, "config_include_keys": null, "anonymous": null, "mode": null, "allow_val_change": null, "resume": null, "force": null, "tensorboard": null, "sync_tensorboard": null, "monitor_gym": null, "save_code": null, "id": null, "fork_from": null, "resume_from": null}}, "async_eval_gpus": null, "eval": {"generator": {"max_tokens": 4096, "dtype": "bf16", "temperature": 1.0, "top_p": 0.95}, "harness": {"tasks": [{"task": "hellaswag", "dataset_path": "/checkpoints-fsx/beidchen-sandbox/stem/eval_data/hellaswag"}, {"task": "boolq", "dataset_path": "/checkpoints-fsx/beidchen-sandbox/stem/eval_data/super_glue"}, {"task": "piqa", "dataset_path": "/checkpoints-fsx/beidchen-sandbox/stem/eval_data/piqa"}, {"task": "winogrande", "dataset_path": "/checkpoints-fsx/beidchen-sandbox/stem/eval_data/winogrande"}, {"task": "openbookqa", "dataset_path": "/checkpoints-fsx/beidchen-sandbox/stem/eval_data/openbookqa"}, {"task": "arc_easy", "dataset_path": "/checkpoints-fsx/beidchen-sandbox/stem/eval_data/ai2_arc"}, {"task": "arc_challenge", "dataset_path": "/checkpoints-fsx/beidchen-sandbox/stem/eval_data/ai2_arc"}, {"task": "race", "dataset_path": "/checkpoints-fsx/beidchen-sandbox/stem/eval_data/race"}, {"task": "gsm8k", "dataset_path": "/checkpoints-fsx/beidchen-sandbox/stem/eval_data/gsm8k"}, {"task": "mmlu", "dataset_path": "/checkpoints-fsx/beidchen-sandbox/stem/eval_data/mmlu"}], "limit": 1000}, "validation": {"max_steps": 100}}, "stem_lr": 0.0005, "stem_weight_decay": 0.0} |