Add files using upload-large-folder tool
Browse files- checkpoints/0000050000/params.json +1 -0
- checkpoints/0000050000/train_state_00000.json +1 -0
- checkpoints/0000050000/train_state_00001.json +1 -0
- checkpoints/0000050000/train_state_00002.json +1 -0
- checkpoints/0000050000/train_state_00003.json +1 -0
- checkpoints/0000050000/train_state_00004.json +1 -0
- checkpoints/0000050000/train_state_00005.json +1 -0
- checkpoints/0000050000/train_state_00006.json +1 -0
- checkpoints/0000050000/train_state_00007.json +1 -0
- checkpoints/0000050000/train_state_00008.json +1 -0
- checkpoints/0000050000/train_state_00010.json +1 -0
- checkpoints/0000050000/train_state_00013.json +1 -0
- checkpoints/0000050000/train_state_00014.json +1 -0
- checkpoints/0000050000/train_state_00015.json +1 -0
- checkpoints/0000050000/train_state_00017.json +1 -0
- checkpoints/0000050000/train_state_00018.json +1 -0
- checkpoints/0000050000/train_state_00019.json +1 -0
- checkpoints/0000050000/train_state_00020.json +1 -0
- config.yaml +179 -0
- evals/0000050000/config.yaml +62 -0
- evals/0000050000/results.json +1 -0
- metrics.eval.jsonl +1 -0
- train.log +0 -0
- wandb/debug-internal.log +0 -0
- wandb/debug.log +53 -0
- wandb/run-20260429_011802-2wmkezq3/files/media/html/memory_trace_50_79effaa90bfee7eb3207.html +0 -0
- wandb/run-20260429_011802-2wmkezq3/files/media/html/profile_trace_51_ae282608c6eeb7f48826.html +1 -0
- wandb/run-20260429_011802-2wmkezq3/files/output.log +0 -0
- wandb/run-20260429_011802-2wmkezq3/files/requirements.txt +177 -0
- wandb/run-20260429_011802-2wmkezq3/files/wandb-metadata.json +155 -0
- wandb/run-20260429_011802-2wmkezq3/logs/debug-core.log +11 -0
- wandb/run-20260429_011802-2wmkezq3/logs/debug-internal.log +0 -0
- wandb/run-20260429_011802-2wmkezq3/logs/debug.log +19 -0
- wandb/run-20260429_141040-a48q7rq3/files/output.log +1 -0
- wandb/run-20260429_141040-a48q7rq3/files/requirements.txt +199 -0
- wandb/run-20260429_141040-a48q7rq3/files/wandb-metadata.json +155 -0
- wandb/run-20260429_141040-a48q7rq3/logs/debug-core.log +10 -0
- wandb/run-20260429_141040-a48q7rq3/logs/debug-internal.log +9 -0
- wandb/run-20260429_141040-a48q7rq3/logs/debug.log +19 -0
- wandb/run-20260429_141040-a48q7rq3/run-a48q7rq3.wandb +0 -0
- wandb/run-20260429_153552-r20yn80u/files/config.yaml +359 -0
- wandb/run-20260429_153552-r20yn80u/files/media/html/memory_trace_15050_79effaa90bfee7eb3207.html +0 -0
- wandb/run-20260429_153552-r20yn80u/files/media/html/profile_trace_15051_ae282608c6eeb7f48826.html +1 -0
- wandb/run-20260429_153552-r20yn80u/files/output.log +0 -0
- wandb/run-20260429_153552-r20yn80u/files/requirements.txt +199 -0
- wandb/run-20260429_153552-r20yn80u/files/wandb-metadata.json +155 -0
- wandb/run-20260429_153552-r20yn80u/files/wandb-summary.json +1 -0
- wandb/run-20260429_153552-r20yn80u/logs/debug-core.log +15 -0
- wandb/run-20260429_153552-r20yn80u/logs/debug-internal.log +0 -0
- wandb/run-20260429_153552-r20yn80u/logs/debug.log +53 -0
checkpoints/0000050000/params.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"name": "olmo2_1B_midfine", "dump_dir": "/home/xun/rsadhukh/STEM/logs/midfine_base_final", "seed": 777, "model_type": "olmo3", "stem_up_proj_layers": [], "grad_acc_steps": 2, "gc_collect_freq": 1000, "probe_freq": 100, "steps": 50000, "stage_steps": null, "data": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "node_local": false, "batch_size": 8, "seq_len": 4096, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": {"name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "track_packed_source_mixture": true, "packed_source_counts": {"code-meta-reasoning": 10977463, "common_crawl-high-quality": 536940263, "cranecode": 238637058, "cranemath": 134353949, "dolmino-math": 255343043, "dolmino_1-flan": 119319719, "gemini-reasoning-traces": 5969666, "general_reasoning_mix": 44625695, "math-meta-reasoning": 9068782, "megamatt": 41284629, "nemotron-synth-qa": 119319374, "olmocr_science_pdfs": 119319342, "openthoughts2": 29838881, "program_verifiable": 3819403, "qwq-reasoning-traces": 44631652, "reddit_to_flashcards": 140789161, "stack_edu": 238638082, "stem-heavy-crawl": 119319916, "tinymath-mind": 21478051, "tinymath-pot": 5727426, "tulu-3-sft": 26250698, "wiki_to_rcqa": 71590884}}, "optim": {"lr": 7.44e-05, "weight_decay": 0.1, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "linear", "warmup": 0, "lr_min_ratio": 0.0, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5, "initial_token_offset": 0, "global_final_step": null}, "model": {"dim": 2048, "n_layers": 16, "head_dim": 128, "n_heads": 16, "n_kv_heads": 16, "ffn_dim_multiplier": 1.5, "multiple_of": 256, "norm_eps": 1e-06, "rope_theta": 500000.0, "rope_scaling": null, "init_base_std": 0.02, "init_std_factor": "disabled", "max_seqlen": 4096, "seed": 42, "vocab_size": 100352, "weight_tying": false, "sliding_window": null}, "distributed": {"dp_shard": 1, "dp_replicate": 32, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": false, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver", "stem_parallel_size": 8}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 5000, "keep": 1}, "eval": {"every": 100000, "keep": 1}, "path": "/home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints", "init_ckpt_path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/", "continue_training_from_init": true, "legacy_init_ckpt_lm_transformer": false, "merge_lm_optim_seed_ckpt_path": null}, "profiling": {"run": true, "trace_folder": "profiling", "mem_warmup": 100, "mem_steps": 2, "profile_warmup": 102, "profile_steps": 2}, "logging": {"freq": 10, "acc_freq": null, "wandb": {"job_type": null, "dir": null, "project": "stem", "entity": null, "tags": null, "group": null, "name": "olmo2_1B_midfine", "notes": null, "config_exclude_keys": null, "config_include_keys": null, "anonymous": null, "mode": null, "allow_val_change": null, "resume": null, "force": null, "tensorboard": null, "sync_tensorboard": null, "monitor_gym": null, "save_code": null, "id": null, "fork_from": null, "resume_from": null}}, "async_eval_gpus": null, "eval": {"generator": {"max_tokens": 16384, "dtype": "bf16", "temperature": 1.0, "top_p": 0.95}, "harness": {"tasks": [{"task": "hellaswag", "dataset_path": "/data/rsadhukh/eval_data/hellaswag"}, {"task": "boolq", "dataset_path": "/data/rsadhukh/eval_data/super_glue"}, {"task": "piqa", "dataset_path": "/data/rsadhukh/eval_data/piqa"}, {"task": "winogrande", "dataset_path": "/data/rsadhukh/eval_data/winogrande"}, {"task": "openbookqa", "dataset_path": "/data/rsadhukh/eval_data/openbookqa"}, {"task": "arc_easy", "dataset_path": "/data/rsadhukh/eval_data/ai2_arc"}, {"task": "arc_challenge", "dataset_path": "/data/rsadhukh/eval_data/ai2_arc"}], "confirm_run_unsafe_code": true, "batch_size": 64}, "validation": null}}
|
checkpoints/0000050000/train_state_00000.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1030, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.00.jsonl", "position": 169184122, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.00.jsonl", "position": 103555217, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.00.jsonl", "position": 121315811, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.00.jsonl", "position": 93158712, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.00.jsonl", "position": 61186565, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.00.jsonl", "position": 5174957, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.00.jsonl", "position": 1546250, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.00.jsonl", "position": 1255920958, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.00.jsonl", "position": 26279313, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.00.jsonl", "position": 49452440, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.00.jsonl", "position": 2391376, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.00.jsonl", "position": 4049828, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.00.jsonl", "position": 601143, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.00.jsonl", "position": 14984817, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.00.jsonl", "position": 7215763, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.00.jsonl", "position": 19007656, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.00.jsonl", "position": 2371004, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.00.jsonl", "position": 9629373, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.00.jsonl", "position": 57425970, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.00.jsonl", "position": 180096848, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.00.jsonl", "position": 69872688, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.00.jsonl", "position": 503556493, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 132764577257282494100248622808569340063, "inc": 199517438996687927661581397869791268041}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717362, "common_crawl-high-quality": 524179168, "cranecode": 232968050, "cranemath": 131159704, "dolmino-math": 249275298, "dolmino_1-flan": 116484402, "gemini-reasoning-traces": 5826646, "general_reasoning_mix": 43565000, "math-meta-reasoning": 8853292, "megamatt": 40304231, "nemotron-synth-qa": 116482462, "olmocr_science_pdfs": 116491847, "openthoughts2": 29121250, "program_verifiable": 3727783, "qwq-reasoning-traces": 43578408, "reddit_to_flashcards": 137439257, "stack_edu": 232970173, "stem-heavy-crawl": 116483563, "tinymath-mind": 20967277, "tinymath-pot": 5591237, "tulu-3-sft": 25625221, "wiki_to_rcqa": 69888715}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 190856181678508905526281042577254913524, "inc": 203371896531876761410193603683292290457}, "has_uint32": 0, "uinteger": 2500812634}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00001.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 985, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.01.jsonl", "position": 168720688, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.01.jsonl", "position": 110618352, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.01.jsonl", "position": 119384353, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.01.jsonl", "position": 85448056, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.01.jsonl", "position": 58608170, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.01.jsonl", "position": 6474706, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.01.jsonl", "position": 2024232, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.01.jsonl", "position": 1255733371, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.01.jsonl", "position": 25598945, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.01.jsonl", "position": 49983995, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.01.jsonl", "position": 2213392, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.01.jsonl", "position": 5370488, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.01.jsonl", "position": 1099892, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.01.jsonl", "position": 12654115, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.01.jsonl", "position": 7726124, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.01.jsonl", "position": 11163765, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.01.jsonl", "position": 2018846, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.01.jsonl", "position": 11701266, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.01.jsonl", "position": 63132979, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.01.jsonl", "position": 177519305, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.01.jsonl", "position": 62396313, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.01.jsonl", "position": 476242817, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 312829929264520745690347284204771532019, "inc": 70355339299095406607494999455309704455}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10716459, "common_crawl-high-quality": 524177711, "cranecode": 232968196, "cranemath": 131161732, "dolmino-math": 249277186, "dolmino_1-flan": 116484725, "gemini-reasoning-traces": 5828249, "general_reasoning_mix": 43564962, "math-meta-reasoning": 8853356, "megamatt": 40303546, "nemotron-synth-qa": 116483943, "olmocr_science_pdfs": 116489141, "openthoughts2": 29120744, "program_verifiable": 3727513, "qwq-reasoning-traces": 43571770, "reddit_to_flashcards": 137442567, "stack_edu": 232968216, "stem-heavy-crawl": 116484314, "tinymath-mind": 20967123, "tinymath-pot": 5591618, "tulu-3-sft": 25627737, "wiki_to_rcqa": 69889583}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 85749820953316004262550710266339839890, "inc": 170649501745871541162088638073684795575}, "has_uint32": 1, "uinteger": 2932988183}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00002.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 555, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.02.jsonl", "position": 176228084, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.02.jsonl", "position": 129632377, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.02.jsonl", "position": 122739527, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.02.jsonl", "position": 91737533, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.02.jsonl", "position": 56597994, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.02.jsonl", "position": 5028790, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.02.jsonl", "position": 1803908, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.02.jsonl", "position": 1255796112, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.02.jsonl", "position": 24967558, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.02.jsonl", "position": 50283192, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.02.jsonl", "position": 1967757, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.02.jsonl", "position": 4589237, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.02.jsonl", "position": 1002074, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.02.jsonl", "position": 16371244, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.02.jsonl", "position": 6490169, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.02.jsonl", "position": 13450598, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.02.jsonl", "position": 2831925, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.02.jsonl", "position": 8059609, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.02.jsonl", "position": 60613291, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.02.jsonl", "position": 177954094, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.02.jsonl", "position": 71330956, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.02.jsonl", "position": 521030735, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 242607930559933803328842844375603027451, "inc": 121235836284466329520550355651886291025}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10718222, "common_crawl-high-quality": 524181486, "cranecode": 232968996, "cranemath": 131164862, "dolmino-math": 249275920, "dolmino_1-flan": 116484004, "gemini-reasoning-traces": 5826345, "general_reasoning_mix": 43566403, "math-meta-reasoning": 8852843, "megamatt": 40303557, "nemotron-synth-qa": 116482321, "olmocr_science_pdfs": 116490151, "openthoughts2": 29123805, "program_verifiable": 3728848, "qwq-reasoning-traces": 43566871, "reddit_to_flashcards": 137442228, "stack_edu": 232968086, "stem-heavy-crawl": 116482679, "tinymath-mind": 20967403, "tinymath-pot": 5591193, "tulu-3-sft": 25626062, "wiki_to_rcqa": 69888536}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 250410953641482978242998027896627465990, "inc": 90616052583066224228118909460993474701}, "has_uint32": 0, "uinteger": 3635088313}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00003.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 169, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.03.jsonl", "position": 170654791, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.03.jsonl", "position": 91809052, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.03.jsonl", "position": 110455016, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.03.jsonl", "position": 97889578, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.03.jsonl", "position": 57431495, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.03.jsonl", "position": 5536452, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.03.jsonl", "position": 1466681, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.03.jsonl", "position": 1255744355, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.03.jsonl", "position": 25367057, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.03.jsonl", "position": 49940637, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.03.jsonl", "position": 2845827, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.03.jsonl", "position": 4144517, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.03.jsonl", "position": 844780, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.03.jsonl", "position": 7450160, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.03.jsonl", "position": 8479977, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.03.jsonl", "position": 13761925, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.03.jsonl", "position": 1860938, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.03.jsonl", "position": 14422945, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.03.jsonl", "position": 55080209, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.03.jsonl", "position": 174271581, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.03.jsonl", "position": 67406397, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.03.jsonl", "position": 501500696, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 1517453642602235172476618770163698185, "inc": 125151366308830146461603246033367959405}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717133, "common_crawl-high-quality": 524176962, "cranecode": 232969091, "cranemath": 131160607, "dolmino-math": 249277028, "dolmino_1-flan": 116482573, "gemini-reasoning-traces": 5825222, "general_reasoning_mix": 43565173, "math-meta-reasoning": 8853780, "megamatt": 40303862, "nemotron-synth-qa": 116484017, "olmocr_science_pdfs": 116485559, "openthoughts2": 29128678, "program_verifiable": 3728185, "qwq-reasoning-traces": 43572973, "reddit_to_flashcards": 137444086, "stack_edu": 232968559, "stem-heavy-crawl": 116483325, "tinymath-mind": 20967592, "tinymath-pot": 5591073, "tulu-3-sft": 25626621, "wiki_to_rcqa": 69889108}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 225979724602204095822997413923518164685, "inc": 50378060617174794402292948380426361097}, "has_uint32": 1, "uinteger": 4253124320}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00004.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1060, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.04.jsonl", "position": 169275751, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.04.jsonl", "position": 96547876, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.04.jsonl", "position": 109398714, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.04.jsonl", "position": 93408469, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.04.jsonl", "position": 61104233, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.04.jsonl", "position": 5458783, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.04.jsonl", "position": 1761347, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.04.jsonl", "position": 1255648101, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.04.jsonl", "position": 26651625, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.04.jsonl", "position": 50919354, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.04.jsonl", "position": 1786151, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.04.jsonl", "position": 3303958, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.04.jsonl", "position": 1017483, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.04.jsonl", "position": 13149844, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.04.jsonl", "position": 6798775, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.04.jsonl", "position": 16963025, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.04.jsonl", "position": 2481608, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.04.jsonl", "position": 13933845, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.04.jsonl", "position": 60123187, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.04.jsonl", "position": 172086600, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.04.jsonl", "position": 73441735, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.04.jsonl", "position": 474397686, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 299509270770031916446716497279612719220, "inc": 316940505624043840809154587289841925383}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10718107, "common_crawl-high-quality": 524179029, "cranecode": 232968763, "cranemath": 131160913, "dolmino-math": 249275587, "dolmino_1-flan": 116483658, "gemini-reasoning-traces": 5827106, "general_reasoning_mix": 43565076, "math-meta-reasoning": 8853501, "megamatt": 40304265, "nemotron-synth-qa": 116484560, "olmocr_science_pdfs": 116484274, "openthoughts2": 29124875, "program_verifiable": 3730007, "qwq-reasoning-traces": 43566573, "reddit_to_flashcards": 137435461, "stack_edu": 232979110, "stem-heavy-crawl": 116484556, "tinymath-mind": 20966966, "tinymath-pot": 5591964, "tulu-3-sft": 25626834, "wiki_to_rcqa": 69889131}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 236078094047468840435218590199867614684, "inc": 93583685013543266762176536914137130671}, "has_uint32": 0, "uinteger": 2213378322}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00005.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 517, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.05.jsonl", "position": 165270704, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.05.jsonl", "position": 110875001, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.05.jsonl", "position": 109407333, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.05.jsonl", "position": 93106183, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.05.jsonl", "position": 57342920, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.05.jsonl", "position": 5863139, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.05.jsonl", "position": 1397544, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.05.jsonl", "position": 1255942889, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.05.jsonl", "position": 26325478, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.05.jsonl", "position": 49875226, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.05.jsonl", "position": 2278746, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.05.jsonl", "position": 5572015, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.05.jsonl", "position": 1001385, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.05.jsonl", "position": 11253557, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.05.jsonl", "position": 9052417, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.05.jsonl", "position": 13987777, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.05.jsonl", "position": 506245, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.05.jsonl", "position": 12615069, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.05.jsonl", "position": 58437931, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.05.jsonl", "position": 177327447, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.05.jsonl", "position": 63448265, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.05.jsonl", "position": 498508745, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 304150823691374556939996871776077939637, "inc": 76256721842129589702307426791162152551}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717480, "common_crawl-high-quality": 524188091, "cranecode": 232966972, "cranemath": 131160476, "dolmino-math": 249274497, "dolmino_1-flan": 116483319, "gemini-reasoning-traces": 5830287, "general_reasoning_mix": 43565202, "math-meta-reasoning": 8853796, "megamatt": 40304571, "nemotron-synth-qa": 116482836, "olmocr_science_pdfs": 116487868, "openthoughts2": 29124066, "program_verifiable": 3727892, "qwq-reasoning-traces": 43566903, "reddit_to_flashcards": 137438488, "stack_edu": 232972672, "stem-heavy-crawl": 116483038, "tinymath-mind": 20967246, "tinymath-pot": 5591344, "tulu-3-sft": 25625385, "wiki_to_rcqa": 69888430}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 128588098932200963179261404019648935844, "inc": 22268798220195505414013091073155496089}, "has_uint32": 1, "uinteger": 4091980020}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00006.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 483, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.06.jsonl", "position": 175513919, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.06.jsonl", "position": 119913136, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.06.jsonl", "position": 113088935, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.06.jsonl", "position": 92242241, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.06.jsonl", "position": 56182271, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.06.jsonl", "position": 4485607, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.06.jsonl", "position": 1756960, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.06.jsonl", "position": 1255696572, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.06.jsonl", "position": 25259631, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.06.jsonl", "position": 50553295, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.06.jsonl", "position": 2585746, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.06.jsonl", "position": 3188909, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.06.jsonl", "position": 955594, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.06.jsonl", "position": 13384344, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.06.jsonl", "position": 7133854, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.06.jsonl", "position": 13017552, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.06.jsonl", "position": 1774383, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.06.jsonl", "position": 10681856, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.06.jsonl", "position": 59558626, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.06.jsonl", "position": 175162987, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.06.jsonl", "position": 61980645, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.06.jsonl", "position": 511837250, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 88967541265145310930449644189407446893, "inc": 88395164212990118791130299107903034087}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717747, "common_crawl-high-quality": 524179506, "cranecode": 232968095, "cranemath": 131162388, "dolmino-math": 249275950, "dolmino_1-flan": 116484296, "gemini-reasoning-traces": 5824972, "general_reasoning_mix": 43566328, "math-meta-reasoning": 8853803, "megamatt": 40303908, "nemotron-synth-qa": 116483858, "olmocr_science_pdfs": 116485538, "openthoughts2": 29122821, "program_verifiable": 3727710, "qwq-reasoning-traces": 43572466, "reddit_to_flashcards": 137444658, "stack_edu": 232967739, "stem-heavy-crawl": 116484718, "tinymath-mind": 20967234, "tinymath-pot": 5591654, "tulu-3-sft": 25626431, "wiki_to_rcqa": 69889073}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 48653964429580767134480002661016468688, "inc": 159974370505239494345331372639210830695}, "has_uint32": 1, "uinteger": 837376111}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00007.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 230, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.07.jsonl", "position": 178993908, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.07.jsonl", "position": 93661101, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.07.jsonl", "position": 113088199, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.07.jsonl", "position": 96214147, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.07.jsonl", "position": 59690564, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.07.jsonl", "position": 5700038, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.07.jsonl", "position": 1612363, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.07.jsonl", "position": 1255868845, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.07.jsonl", "position": 26986400, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.07.jsonl", "position": 51968967, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.07.jsonl", "position": 1631747, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.07.jsonl", "position": 4546589, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.07.jsonl", "position": 582704, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.07.jsonl", "position": 13174586, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.07.jsonl", "position": 9796597, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.07.jsonl", "position": 11680804, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.07.jsonl", "position": 2684246, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.07.jsonl", "position": 7643072, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.07.jsonl", "position": 61990705, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.07.jsonl", "position": 180390710, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.07.jsonl", "position": 66016263, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.07.jsonl", "position": 495406359, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 337829025199356394220275822052077951457, "inc": 322195181278880022317637923384652014617}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717886, "common_crawl-high-quality": 524176392, "cranecode": 232967839, "cranemath": 131160724, "dolmino-math": 249273811, "dolmino_1-flan": 116483260, "gemini-reasoning-traces": 5824211, "general_reasoning_mix": 43567443, "math-meta-reasoning": 8854450, "megamatt": 40304218, "nemotron-synth-qa": 116482875, "olmocr_science_pdfs": 116495386, "openthoughts2": 29128949, "program_verifiable": 3729274, "qwq-reasoning-traces": 43572187, "reddit_to_flashcards": 137437008, "stack_edu": 232969699, "stem-heavy-crawl": 116483337, "tinymath-mind": 20966971, "tinymath-pot": 5591018, "tulu-3-sft": 25625528, "wiki_to_rcqa": 69888680}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 211283518072739736885050693825571609459, "inc": 334003994887173083725390277461237639581}, "has_uint32": 0, "uinteger": 2129520558}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00008.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 155, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.08.jsonl", "position": 181847675, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.08.jsonl", "position": 118620031, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.08.jsonl", "position": 115152774, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.08.jsonl", "position": 93485013, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.08.jsonl", "position": 62089139, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.08.jsonl", "position": 5940802, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.08.jsonl", "position": 1836807, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.08.jsonl", "position": 1255656187, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.08.jsonl", "position": 24977680, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.08.jsonl", "position": 51685664, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.08.jsonl", "position": 1408077, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.08.jsonl", "position": 4149030, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.08.jsonl", "position": 721073, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.08.jsonl", "position": 9031490, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.08.jsonl", "position": 7370162, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.08.jsonl", "position": 13978000, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.08.jsonl", "position": 781819, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.08.jsonl", "position": 4089669, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.08.jsonl", "position": 60967384, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.08.jsonl", "position": 174206698, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.08.jsonl", "position": 63204528, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.08.jsonl", "position": 516359564, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 222457326697416819089385188638000029889, "inc": 232333011630326481733766413984974244825}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10716611, "common_crawl-high-quality": 524178499, "cranecode": 232966609, "cranemath": 131162093, "dolmino-math": 249275842, "dolmino_1-flan": 116483883, "gemini-reasoning-traces": 5825253, "general_reasoning_mix": 43564737, "math-meta-reasoning": 8852922, "megamatt": 40302803, "nemotron-synth-qa": 116482437, "olmocr_science_pdfs": 116496133, "openthoughts2": 29125776, "program_verifiable": 3729065, "qwq-reasoning-traces": 43573173, "reddit_to_flashcards": 137440414, "stack_edu": 232967665, "stem-heavy-crawl": 116483824, "tinymath-mind": 20966813, "tinymath-pot": 5591042, "tulu-3-sft": 25626768, "wiki_to_rcqa": 69888859}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 299032876526771153828468208281902936020, "inc": 137127179917161464848280992898870611721}, "has_uint32": 1, "uinteger": 1502052356}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00010.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 13543, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.10.jsonl", "position": 175265944, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.10.jsonl", "position": 121743979, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.10.jsonl", "position": 111160799, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.10.jsonl", "position": 93444517, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.10.jsonl", "position": 59454435, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.10.jsonl", "position": 6012630, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.10.jsonl", "position": 1465622, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.10.jsonl", "position": 1255699065, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.10.jsonl", "position": 26284095, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.10.jsonl", "position": 49220860, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.10.jsonl", "position": 2012332, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.10.jsonl", "position": 5742655, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.10.jsonl", "position": 957674, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.10.jsonl", "position": 10241213, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.10.jsonl", "position": 5957290, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.10.jsonl", "position": 11461296, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.10.jsonl", "position": 2050981, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.10.jsonl", "position": 12589568, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.10.jsonl", "position": 60080169, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.10.jsonl", "position": 181118161, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.10.jsonl", "position": 59983809, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.10.jsonl", "position": 494733451, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 28093547970186331396263766593751657933, "inc": 262455344491392898496039790594093439699}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717569, "common_crawl-high-quality": 524180383, "cranecode": 232965732, "cranemath": 131160185, "dolmino-math": 249274229, "dolmino_1-flan": 116483372, "gemini-reasoning-traces": 5825079, "general_reasoning_mix": 43567081, "math-meta-reasoning": 8854578, "megamatt": 40302644, "nemotron-synth-qa": 116482333, "olmocr_science_pdfs": 116483332, "openthoughts2": 29125195, "program_verifiable": 3729421, "qwq-reasoning-traces": 43575502, "reddit_to_flashcards": 137439506, "stack_edu": 232965170, "stem-heavy-crawl": 116483438, "tinymath-mind": 20966814, "tinymath-pot": 5591233, "tulu-3-sft": 25626416, "wiki_to_rcqa": 69888621}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 235473873037057258417520760620699177061, "inc": 320333070685036219503813667660565218295}, "has_uint32": 1, "uinteger": 405884923}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00013.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1800, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.13.jsonl", "position": 174087877, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.13.jsonl", "position": 108225052, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.13.jsonl", "position": 127021689, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.13.jsonl", "position": 95316638, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.13.jsonl", "position": 54466205, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.13.jsonl", "position": 6133281, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.13.jsonl", "position": 1654547, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.13.jsonl", "position": 1255766540, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.13.jsonl", "position": 25647381, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.13.jsonl", "position": 50238500, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.13.jsonl", "position": 1699188, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.13.jsonl", "position": 5159402, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.13.jsonl", "position": 715353, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.13.jsonl", "position": 14753381, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.13.jsonl", "position": 10014180, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.13.jsonl", "position": 12622795, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.13.jsonl", "position": 2335795, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.13.jsonl", "position": 9495528, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.13.jsonl", "position": 58413331, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.13.jsonl", "position": 183520843, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.13.jsonl", "position": 64289959, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.13.jsonl", "position": 485303490, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 132025057233364719154679098857642617910, "inc": 74474439248560930919804404709578726689}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717316, "common_crawl-high-quality": 524178631, "cranecode": 232968594, "cranemath": 131161250, "dolmino-math": 249275030, "dolmino_1-flan": 116484307, "gemini-reasoning-traces": 5826836, "general_reasoning_mix": 43568343, "math-meta-reasoning": 8852935, "megamatt": 40305038, "nemotron-synth-qa": 116483779, "olmocr_science_pdfs": 116493800, "openthoughts2": 29123973, "program_verifiable": 3730237, "qwq-reasoning-traces": 43566124, "reddit_to_flashcards": 137438937, "stack_edu": 232966670, "stem-heavy-crawl": 116483776, "tinymath-mind": 20967369, "tinymath-pot": 5591158, "tulu-3-sft": 25626283, "wiki_to_rcqa": 69889190}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 314334276976028320770252057746092726092, "inc": 116345837432125979447787307728243085409}, "has_uint32": 0, "uinteger": 2351727945}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00014.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 68, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.14.jsonl", "position": 176758368, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.14.jsonl", "position": 93132936, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.14.jsonl", "position": 112642230, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.14.jsonl", "position": 97219845, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.14.jsonl", "position": 55568604, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.14.jsonl", "position": 5491293, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.14.jsonl", "position": 1876968, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.14.jsonl", "position": 1255805570, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.14.jsonl", "position": 26267980, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.14.jsonl", "position": 51643736, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.14.jsonl", "position": 2264880, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.14.jsonl", "position": 5453053, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.14.jsonl", "position": 852646, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.14.jsonl", "position": 13208096, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.14.jsonl", "position": 11275043, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.14.jsonl", "position": 14055259, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.14.jsonl", "position": 2172150, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.14.jsonl", "position": 14771471, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.14.jsonl", "position": 58513543, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.14.jsonl", "position": 182524110, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.14.jsonl", "position": 71903506, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.14.jsonl", "position": 519920795, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 255381197467711588087437858821645240560, "inc": 116685831757775029071911556784912414583}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717638, "common_crawl-high-quality": 524178128, "cranecode": 232967970, "cranemath": 131161394, "dolmino-math": 249275748, "dolmino_1-flan": 116483297, "gemini-reasoning-traces": 5827283, "general_reasoning_mix": 43563885, "math-meta-reasoning": 8854132, "megamatt": 40302842, "nemotron-synth-qa": 116482750, "olmocr_science_pdfs": 116495217, "openthoughts2": 29123641, "program_verifiable": 3727738, "qwq-reasoning-traces": 43576648, "reddit_to_flashcards": 137442725, "stack_edu": 232964692, "stem-heavy-crawl": 116481841, "tinymath-mind": 20967997, "tinymath-pot": 5591077, "tulu-3-sft": 25625761, "wiki_to_rcqa": 69888904}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 239470938684036100765165062773953078142, "inc": 94545970901749447227174325208480182179}, "has_uint32": 1, "uinteger": 2722351717}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00015.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 179, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.15.jsonl", "position": 173713486, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.15.jsonl", "position": 110463185, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.15.jsonl", "position": 106563803, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.15.jsonl", "position": 94464152, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.15.jsonl", "position": 58075665, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.15.jsonl", "position": 6015268, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.15.jsonl", "position": 1941291, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.15.jsonl", "position": 1255938623, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.15.jsonl", "position": 25431252, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.15.jsonl", "position": 50923899, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.15.jsonl", "position": 2723627, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.15.jsonl", "position": 3919932, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.15.jsonl", "position": 1182999, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.15.jsonl", "position": 9116381, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.15.jsonl", "position": 5035635, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.15.jsonl", "position": 15851392, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.15.jsonl", "position": 915669, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.15.jsonl", "position": 15889044, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.15.jsonl", "position": 61176154, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.15.jsonl", "position": 187577205, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.15.jsonl", "position": 52985604, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.15.jsonl", "position": 491485676, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 210310075468178302712232204289748778423, "inc": 76775115875861029591577098614554519321}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10716262, "common_crawl-high-quality": 524177843, "cranecode": 232966787, "cranemath": 131161342, "dolmino-math": 249274848, "dolmino_1-flan": 116483615, "gemini-reasoning-traces": 5825673, "general_reasoning_mix": 43565036, "math-meta-reasoning": 8853315, "megamatt": 40303314, "nemotron-synth-qa": 116482593, "olmocr_science_pdfs": 116484893, "openthoughts2": 29125702, "program_verifiable": 3729674, "qwq-reasoning-traces": 43566333, "reddit_to_flashcards": 137441925, "stack_edu": 232979321, "stem-heavy-crawl": 116484351, "tinymath-mind": 20966990, "tinymath-pot": 5591522, "tulu-3-sft": 25630934, "wiki_to_rcqa": 69888924}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 188849628256972546641584293984929704290, "inc": 277855105455093796469568551045482401825}, "has_uint32": 0, "uinteger": 1607433859}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00017.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 126, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.17.jsonl", "position": 168585760, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.17.jsonl", "position": 116391016, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.17.jsonl", "position": 118429992, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.17.jsonl", "position": 96777587, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.17.jsonl", "position": 56467618, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.17.jsonl", "position": 5487486, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.17.jsonl", "position": 1544757, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.17.jsonl", "position": 1256025172, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.17.jsonl", "position": 26272420, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.17.jsonl", "position": 49064652, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.17.jsonl", "position": 2197504, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.17.jsonl", "position": 4911378, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.17.jsonl", "position": 1108850, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.17.jsonl", "position": 10501019, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.17.jsonl", "position": 8507102, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.17.jsonl", "position": 10235169, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.17.jsonl", "position": 1319148, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.17.jsonl", "position": 8509743, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.17.jsonl", "position": 58515449, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.17.jsonl", "position": 181009735, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.17.jsonl", "position": 70561045, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.17.jsonl", "position": 487618469, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 281159287891267573308938063003501825062, "inc": 219509020097628037272727839178453712549}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717920, "common_crawl-high-quality": 524180536, "cranecode": 232968796, "cranemath": 131162408, "dolmino-math": 249275706, "dolmino_1-flan": 116483614, "gemini-reasoning-traces": 5826187, "general_reasoning_mix": 43565263, "math-meta-reasoning": 8852773, "megamatt": 40303411, "nemotron-synth-qa": 116483810, "olmocr_science_pdfs": 116491197, "openthoughts2": 29122305, "program_verifiable": 3730339, "qwq-reasoning-traces": 43570599, "reddit_to_flashcards": 137441957, "stack_edu": 232967706, "stem-heavy-crawl": 116482954, "tinymath-mind": 20966899, "tinymath-pot": 5591355, "tulu-3-sft": 25625852, "wiki_to_rcqa": 69889663}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 306088215367541730463663193923924069886, "inc": 223203856372306606023862000991848715983}, "has_uint32": 1, "uinteger": 2015769118}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00018.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 169, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.18.jsonl", "position": 177593162, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.18.jsonl", "position": 112024423, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.18.jsonl", "position": 108674929, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.18.jsonl", "position": 92783847, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.18.jsonl", "position": 60339824, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.18.jsonl", "position": 5374110, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.18.jsonl", "position": 1714677, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.18.jsonl", "position": 1255742819, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.18.jsonl", "position": 27000590, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.18.jsonl", "position": 47595014, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.18.jsonl", "position": 2168729, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.18.jsonl", "position": 4173716, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.18.jsonl", "position": 1079768, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.18.jsonl", "position": 10977216, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.18.jsonl", "position": 13318672, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.18.jsonl", "position": 19588889, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.18.jsonl", "position": 2880445, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.18.jsonl", "position": 9643767, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.18.jsonl", "position": 56497729, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.18.jsonl", "position": 171682864, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.18.jsonl", "position": 69648695, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.18.jsonl", "position": 510468579, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 176598571205868443551507785009731189475, "inc": 156889124676145633533243121458300926517}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717811, "common_crawl-high-quality": 524179086, "cranecode": 232967481, "cranemath": 131160917, "dolmino-math": 249275198, "dolmino_1-flan": 116483571, "gemini-reasoning-traces": 5826880, "general_reasoning_mix": 43566901, "math-meta-reasoning": 8852813, "megamatt": 40303435, "nemotron-synth-qa": 116483180, "olmocr_science_pdfs": 116488728, "openthoughts2": 29125477, "program_verifiable": 3728122, "qwq-reasoning-traces": 43578173, "reddit_to_flashcards": 137441529, "stack_edu": 232965423, "stem-heavy-crawl": 116483337, "tinymath-mind": 20966662, "tinymath-pot": 5591014, "tulu-3-sft": 25626319, "wiki_to_rcqa": 69889150}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 232054951445483104053298414178273037645, "inc": 97364009206775596287829320185459903201}, "has_uint32": 1, "uinteger": 2029988552}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00019.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 480, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.19.jsonl", "position": 174993083, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.19.jsonl", "position": 112064730, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.19.jsonl", "position": 102536595, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.19.jsonl", "position": 92591090, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.19.jsonl", "position": 56681747, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.19.jsonl", "position": 5018474, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.19.jsonl", "position": 1468262, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.19.jsonl", "position": 1255849812, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.19.jsonl", "position": 26997909, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.19.jsonl", "position": 49796011, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.19.jsonl", "position": 2174527, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.19.jsonl", "position": 3828069, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.19.jsonl", "position": 1103250, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.19.jsonl", "position": 12133736, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.19.jsonl", "position": 6510047, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.19.jsonl", "position": 16007350, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.19.jsonl", "position": 1033850, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.19.jsonl", "position": 7648502, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.19.jsonl", "position": 58685525, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.19.jsonl", "position": 171979147, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.19.jsonl", "position": 63462364, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.19.jsonl", "position": 471848585, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 286677816895253057066229023720408382481, "inc": 43481686913609198584180156906717904807}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717007, "common_crawl-high-quality": 524178701, "cranecode": 232968255, "cranemath": 131159952, "dolmino-math": 249274890, "dolmino_1-flan": 116483358, "gemini-reasoning-traces": 5829935, "general_reasoning_mix": 43569350, "math-meta-reasoning": 8854296, "megamatt": 40304210, "nemotron-synth-qa": 116483911, "olmocr_science_pdfs": 116484675, "openthoughts2": 29122154, "program_verifiable": 3727795, "qwq-reasoning-traces": 43574624, "reddit_to_flashcards": 137440587, "stack_edu": 232968680, "stem-heavy-crawl": 116483944, "tinymath-mind": 20967742, "tinymath-pot": 5591349, "tulu-3-sft": 25626269, "wiki_to_rcqa": 69889212}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 123854162493351245575415109320812074887, "inc": 87592583178118397041772301245082262155}, "has_uint32": 1, "uinteger": 1529574907}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
checkpoints/0000050000/train_state_00020.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 433, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.20.jsonl", "position": 167915599, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.20.jsonl", "position": 122851356, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.20.jsonl", "position": 111224126, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.20.jsonl", "position": 100992607, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.20.jsonl", "position": 61350136, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.20.jsonl", "position": 4966952, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.20.jsonl", "position": 1796014, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.20.jsonl", "position": 1255686843, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.20.jsonl", "position": 25418292, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.20.jsonl", "position": 50324020, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.20.jsonl", "position": 1803998, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.20.jsonl", "position": 5460263, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.20.jsonl", "position": 778003, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.20.jsonl", "position": 6434548, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.20.jsonl", "position": 6012861, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.20.jsonl", "position": 19097313, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.20.jsonl", "position": 2046188, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.20.jsonl", "position": 7941533, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.20.jsonl", "position": 59795006, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.20.jsonl", "position": 178358604, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.20.jsonl", "position": 63159426, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.20.jsonl", "position": 478691348, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 27459530917289031682824315422237314089, "inc": 295199436789328193413091494604713432465}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10716765, "common_crawl-high-quality": 524176807, "cranecode": 232969472, "cranemath": 131161012, "dolmino-math": 249272989, "dolmino_1-flan": 116483318, "gemini-reasoning-traces": 5827400, "general_reasoning_mix": 43567618, "math-meta-reasoning": 8854817, "megamatt": 40303390, "nemotron-synth-qa": 116482269, "olmocr_science_pdfs": 116491557, "openthoughts2": 29121964, "program_verifiable": 3727666, "qwq-reasoning-traces": 43580768, "reddit_to_flashcards": 137440787, "stack_edu": 232966003, "stem-heavy-crawl": 116483881, "tinymath-mind": 20966829, "tinymath-pot": 5591205, "tulu-3-sft": 25625678, "wiki_to_rcqa": 69888748}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 231369944016304886428559638266269115931, "inc": 107548519163465652363222144173017049179}, "has_uint32": 1, "uinteger": 1707974445}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
config.yaml
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: olmo2_1B_midfine
|
| 2 |
+
dump_dir: /home/xun/rsadhukh/STEM/logs/midfine_base_final
|
| 3 |
+
seed: 777
|
| 4 |
+
model_type: olmo3
|
| 5 |
+
stem_up_proj_layers: []
|
| 6 |
+
grad_acc_steps: 2
|
| 7 |
+
gc_collect_freq: 1000
|
| 8 |
+
probe_freq: 100
|
| 9 |
+
steps: 50000
|
| 10 |
+
stage_steps: null
|
| 11 |
+
data:
|
| 12 |
+
root_dir: /home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/
|
| 13 |
+
sources:
|
| 14 |
+
cranecode: 10.0
|
| 15 |
+
stack_edu: 10.0
|
| 16 |
+
cranemath: 5.63
|
| 17 |
+
dolmino-math: 10.7
|
| 18 |
+
megamatt: 1.73
|
| 19 |
+
tinymath-mind: 0.9
|
| 20 |
+
tinymath-pot: 0.24
|
| 21 |
+
reddit_to_flashcards: 5.9
|
| 22 |
+
wiki_to_rcqa: 3.0
|
| 23 |
+
nemotron-synth-qa: 5.0
|
| 24 |
+
math-meta-reasoning: 0.38
|
| 25 |
+
code-meta-reasoning: 0.46
|
| 26 |
+
program_verifiable: 0.16
|
| 27 |
+
qwq-reasoning-traces: 1.87
|
| 28 |
+
openthoughts2: 1.25
|
| 29 |
+
general_reasoning_mix: 1.87
|
| 30 |
+
gemini-reasoning-traces: 0.25
|
| 31 |
+
tulu-3-sft: 1.1
|
| 32 |
+
dolmino_1-flan: 5.0
|
| 33 |
+
olmocr_science_pdfs: 5.0
|
| 34 |
+
stem-heavy-crawl: 5.0
|
| 35 |
+
common_crawl-high-quality: 22.5
|
| 36 |
+
node_local: false
|
| 37 |
+
batch_size: 8
|
| 38 |
+
seq_len: 4096
|
| 39 |
+
n_views: 2
|
| 40 |
+
seed: 42
|
| 41 |
+
add_bos: true
|
| 42 |
+
add_eos: true
|
| 43 |
+
load_async: true
|
| 44 |
+
prefetch_size: 1024
|
| 45 |
+
tokenizer:
|
| 46 |
+
name: huggingface
|
| 47 |
+
path: /data/rsadhukh/checkpoints/olmo2-1b-base-token4T/
|
| 48 |
+
track_packed_source_mixture: true
|
| 49 |
+
packed_source_counts: null
|
| 50 |
+
optim:
|
| 51 |
+
lr: 7.44e-05
|
| 52 |
+
weight_decay: 0.1
|
| 53 |
+
epsilon: 1.0e-08
|
| 54 |
+
beta1: 0.9
|
| 55 |
+
beta2: 0.95
|
| 56 |
+
clip: 1.0
|
| 57 |
+
scheduler: linear
|
| 58 |
+
warmup: 0
|
| 59 |
+
lr_min_ratio: 0.0
|
| 60 |
+
cycle_length: 1.0
|
| 61 |
+
cosine_theta: 1.0
|
| 62 |
+
annealing_step: 1000
|
| 63 |
+
decay_fraction: 0.1
|
| 64 |
+
exp_factor: 0.5
|
| 65 |
+
initial_token_offset: 0
|
| 66 |
+
global_final_step: null
|
| 67 |
+
model:
|
| 68 |
+
dim: 2048
|
| 69 |
+
n_layers: 16
|
| 70 |
+
head_dim: 128
|
| 71 |
+
n_heads: 16
|
| 72 |
+
n_kv_heads: 16
|
| 73 |
+
ffn_dim_multiplier: 1.5
|
| 74 |
+
multiple_of: 256
|
| 75 |
+
norm_eps: 1.0e-06
|
| 76 |
+
rope_theta: 500000.0
|
| 77 |
+
rope_scaling: null
|
| 78 |
+
init_base_std: 0.02
|
| 79 |
+
init_std_factor: disabled
|
| 80 |
+
max_seqlen: 4096
|
| 81 |
+
seed: 42
|
| 82 |
+
vocab_size: 100352
|
| 83 |
+
weight_tying: false
|
| 84 |
+
sliding_window: null
|
| 85 |
+
distributed:
|
| 86 |
+
dp_shard: 1
|
| 87 |
+
dp_replicate: 32
|
| 88 |
+
tp_size: 1
|
| 89 |
+
selective_activation_checkpointing: false
|
| 90 |
+
compile: true
|
| 91 |
+
fsdp_type: full_shard
|
| 92 |
+
model_dtype: bf16
|
| 93 |
+
float8_recipe: null
|
| 94 |
+
float8_filter: layers\.[0-9]+\.
|
| 95 |
+
matmul_allow_tf32: false
|
| 96 |
+
detect_anomaly: false
|
| 97 |
+
compile_cache_size_limit: 8
|
| 98 |
+
spawn_method: forkserver
|
| 99 |
+
stem_parallel_size: 8
|
| 100 |
+
env:
|
| 101 |
+
MKL_SERVICE_FORCE_INTEL: GNU
|
| 102 |
+
OMP_NUM_THREADS: '1'
|
| 103 |
+
MKL_NUM_THREADS: '1'
|
| 104 |
+
ENABLE_INTRA_NODE_COMM: '1'
|
| 105 |
+
TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
|
| 106 |
+
NCCL_IB_TIMEOUT: '22'
|
| 107 |
+
NCCL_DEBUG: INFO
|
| 108 |
+
TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
|
| 109 |
+
checkpoint:
|
| 110 |
+
dump:
|
| 111 |
+
every: 5000
|
| 112 |
+
keep: 1
|
| 113 |
+
eval:
|
| 114 |
+
every: 100000
|
| 115 |
+
keep: 1
|
| 116 |
+
path: /home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints
|
| 117 |
+
init_ckpt_path: /data/rsadhukh/checkpoints/olmo2-1b-base-token4T/
|
| 118 |
+
continue_training_from_init: true
|
| 119 |
+
legacy_init_ckpt_lm_transformer: false
|
| 120 |
+
merge_lm_optim_seed_ckpt_path: null
|
| 121 |
+
profiling:
|
| 122 |
+
run: true
|
| 123 |
+
trace_folder: profiling
|
| 124 |
+
mem_warmup: 100
|
| 125 |
+
mem_steps: 2
|
| 126 |
+
profile_warmup: 102
|
| 127 |
+
profile_steps: 2
|
| 128 |
+
logging:
|
| 129 |
+
freq: 10
|
| 130 |
+
acc_freq: null
|
| 131 |
+
wandb:
|
| 132 |
+
job_type: null
|
| 133 |
+
dir: null
|
| 134 |
+
project: stem
|
| 135 |
+
entity: null
|
| 136 |
+
tags: null
|
| 137 |
+
group: null
|
| 138 |
+
name: olmo2_1B_midfine
|
| 139 |
+
notes: null
|
| 140 |
+
config_exclude_keys: null
|
| 141 |
+
config_include_keys: null
|
| 142 |
+
anonymous: null
|
| 143 |
+
mode: null
|
| 144 |
+
allow_val_change: null
|
| 145 |
+
resume: null
|
| 146 |
+
force: null
|
| 147 |
+
tensorboard: null
|
| 148 |
+
sync_tensorboard: null
|
| 149 |
+
monitor_gym: null
|
| 150 |
+
save_code: null
|
| 151 |
+
id: null
|
| 152 |
+
fork_from: null
|
| 153 |
+
resume_from: null
|
| 154 |
+
async_eval_gpus: null
|
| 155 |
+
eval:
|
| 156 |
+
generator:
|
| 157 |
+
max_tokens: 16384
|
| 158 |
+
dtype: bf16
|
| 159 |
+
temperature: 1.0
|
| 160 |
+
top_p: 0.95
|
| 161 |
+
harness:
|
| 162 |
+
tasks:
|
| 163 |
+
- task: hellaswag
|
| 164 |
+
dataset_path: /data/rsadhukh/eval_data/hellaswag
|
| 165 |
+
- task: boolq
|
| 166 |
+
dataset_path: /data/rsadhukh/eval_data/super_glue
|
| 167 |
+
- task: piqa
|
| 168 |
+
dataset_path: /data/rsadhukh/eval_data/piqa
|
| 169 |
+
- task: winogrande
|
| 170 |
+
dataset_path: /data/rsadhukh/eval_data/winogrande
|
| 171 |
+
- task: openbookqa
|
| 172 |
+
dataset_path: /data/rsadhukh/eval_data/openbookqa
|
| 173 |
+
- task: arc_easy
|
| 174 |
+
dataset_path: /data/rsadhukh/eval_data/ai2_arc
|
| 175 |
+
- task: arc_challenge
|
| 176 |
+
dataset_path: /data/rsadhukh/eval_data/ai2_arc
|
| 177 |
+
confirm_run_unsafe_code: true
|
| 178 |
+
batch_size: 64
|
| 179 |
+
validation: null
|
evals/0000050000/config.yaml
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: evals
|
| 2 |
+
dump_dir: /home/xun/rsadhukh/STEM/logs/midfine_base_final/evals/0000050000
|
| 3 |
+
metric_log_dir: /home/xun/rsadhukh/STEM/logs/midfine_base_final
|
| 4 |
+
ckpt_dir: /home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints/0000050000
|
| 5 |
+
tokenizer_path: null
|
| 6 |
+
tokenizer_name: null
|
| 7 |
+
model_type: olmo3
|
| 8 |
+
generator:
|
| 9 |
+
temperature: 1.0
|
| 10 |
+
top_p: 0.95
|
| 11 |
+
top_k: null
|
| 12 |
+
max_gen_len: 512
|
| 13 |
+
max_tokens: 16384
|
| 14 |
+
max_prompt_len: null
|
| 15 |
+
until: []
|
| 16 |
+
compile_prefilling: false
|
| 17 |
+
reduce_generation_overhead: false
|
| 18 |
+
show_progress: false
|
| 19 |
+
dtype: bf16
|
| 20 |
+
device: cuda
|
| 21 |
+
harness:
|
| 22 |
+
tasks:
|
| 23 |
+
- task: hellaswag
|
| 24 |
+
dataset_path: /data/rsadhukh/eval_data/hellaswag
|
| 25 |
+
- task: boolq
|
| 26 |
+
dataset_path: /data/rsadhukh/eval_data/super_glue
|
| 27 |
+
- task: piqa
|
| 28 |
+
dataset_path: /data/rsadhukh/eval_data/piqa
|
| 29 |
+
- task: winogrande
|
| 30 |
+
dataset_path: /data/rsadhukh/eval_data/winogrande
|
| 31 |
+
- task: openbookqa
|
| 32 |
+
dataset_path: /data/rsadhukh/eval_data/openbookqa
|
| 33 |
+
- task: arc_easy
|
| 34 |
+
dataset_path: /data/rsadhukh/eval_data/ai2_arc
|
| 35 |
+
- task: arc_challenge
|
| 36 |
+
dataset_path: /data/rsadhukh/eval_data/ai2_arc
|
| 37 |
+
num_fewshot: null
|
| 38 |
+
device: null
|
| 39 |
+
use_cache: null
|
| 40 |
+
cache_requests: false
|
| 41 |
+
rewrite_requests_cache: false
|
| 42 |
+
delete_requests_cache: false
|
| 43 |
+
limit: null
|
| 44 |
+
bootstrap_iters: 100000
|
| 45 |
+
check_integrity: false
|
| 46 |
+
write_out: false
|
| 47 |
+
log_samples: true
|
| 48 |
+
system_instruction: null
|
| 49 |
+
apply_chat_template: false
|
| 50 |
+
fewshot_as_multiturn: false
|
| 51 |
+
gen_kwargs: null
|
| 52 |
+
verbosity: INFO
|
| 53 |
+
predict_only: false
|
| 54 |
+
random_seed: 0
|
| 55 |
+
numpy_random_seed: 1234
|
| 56 |
+
torch_random_seed: 1234
|
| 57 |
+
fewshot_random_seed: 1234
|
| 58 |
+
batch_size: 64
|
| 59 |
+
confirm_run_unsafe_code: true
|
| 60 |
+
validation: null
|
| 61 |
+
wandb: null
|
| 62 |
+
global_step: 50000
|
evals/0000050000/results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"arc_challenge": {"alias": "arc_challenge", "acc,none": 0.42235494880546076, "acc_stderr,none": 0.014434138713379983, "acc_norm,none": 0.45051194539249145, "acc_norm_stderr,none": 0.014539646098471627}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7563131313131313, "acc_stderr,none": 0.00880917174472056, "acc_norm,none": 0.7567340067340067, "acc_norm_stderr,none": 0.00880400984686553}, "boolq": {"alias": "boolq", "acc,none": 0.6938837920489297, "acc_stderr,none": 0.008060817222724517}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4953196574387572, "acc_stderr,none": 0.004989562798280521, "acc_norm,none": 0.6694881497709619, "acc_norm_stderr,none": 0.00469436096892941}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.272, "acc_stderr,none": 0.019920483209566072, "acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665813}, "piqa": {"alias": "piqa", "acc,none": 0.7486398258977149, "acc_stderr,none": 0.010121156016819262, "acc_norm,none": 0.7464635473340587, "acc_norm_stderr,none": 0.010150090834551784}, "winogrande": {"alias": "winogrande", "acc,none": 0.665351223362273, "acc_stderr,none": 0.013261823629558363}}, "versions": {"arc_challenge": 1.0, "arc_easy": 1.0, "boolq": 2.0, "hellaswag": 1.0, "openbookqa": 1.0, "piqa": 1.0, "winogrande": 1.0}, "n-shot": {"arc_challenge": 0, "arc_easy": 0, "boolq": 0, "hellaswag": 0, "openbookqa": 0, "piqa": 0, "winogrande": 0}, "higher_is_better": {"arc_challenge": {"acc": true, "acc_norm": true}, "arc_easy": {"acc": true, "acc_norm": true}, "boolq": {"acc": true}, "hellaswag": {"acc": true, "acc_norm": true}, "openbookqa": {"acc": true, "acc_norm": true}, "piqa": {"acc": true, "acc_norm": true}, "winogrande": {"acc": true}}, "n-samples": {"hellaswag": {"original": 10042, "effective": 10042}, "boolq": {"original": 3270, "effective": 3270}, "piqa": {"original": 1838, "effective": 1838}, "winogrande": {"original": 1267, "effective": 1267}, "openbookqa": {"original": 500, "effective": 500}, "arc_easy": {"original": 2376, "effective": 2376}, "arc_challenge": {"original": 1172, "effective": 1172}}, "git_hash": "1620cbc4", "date": 1777526814.9856765, "pretty_env_info": "PyTorch version: 2.8.0+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 12.3.0-1ubuntu1~22.04.2) 12.3.0\nClang version: 14.0.0-1ubuntu1.1\nCMake version: version 3.22.1\nLibc version: glibc-2.35\n\nPython version: 3.11.9 (main, Nov 10 2025, 02:08:09) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-131-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 13.0.88\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 580.95.05\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 176\nOn-line CPU(s) list: 0-175\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8468V\nCPU family: 6\nModel: 143\nThread(s) per core: 2\nCore(s) per socket: 44\nSocket(s): 2\nStepping: 8\nBogoMIPS: 4800.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq vmx ssse3 fma cx16 pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch cpuid_fault invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 wbnoinvd arat avx512vbmi umip pku ospke waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid bus_lock_detect cldemote movdiri movdir64b fsrm md_clear serialize tsxldtrk avx512_fp16 arch_capabilities\nVirtualization: VT-x\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 4.1 MiB (88 instances)\nL1i cache: 2.8 MiB (88 instances)\nL2 cache: 176 MiB (88 instances)\nL3 cache: 195 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-87\nNUMA node1 CPU(s): 88-175\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; RSB filling; PBRSB-eIBRS SW sequence; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Mitigation; TSX disabled\n\nVersions of relevant libraries:\n[pip3] numpy==2.4.4\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.3\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.8.0\n[pip3] triton==3.4.0\n[conda] _anaconda_depends 2025.12 py313_mkl_0\n[conda] blas 1.0 mkl\n[conda] mkl 2025.0.0 hacee8c2_941\n[conda] mkl-service 2.5.2 py313hacdc0fc_0\n[conda] mkl_fft 2.1.1 py313h57662e1_0\n[conda] mkl_random 1.3.0 py313h23c847b_0\n[conda] numpy 2.3.5 py313h08c6c3d_0\n[conda] numpy-base 2.3.5 py313h00548fb_0\n[conda] numpydoc 1.9.0 py313h06a4308_0", "transformers_version": "5.1.0", "lm_eval_version": "0.4.11"}
|
metrics.eval.jsonl
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"created_at": "2026-04-30T05:27:56.628379", "global_step": 50000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.42235494880546076, "acc_stderr,none": 0.014434138713379983, "acc_norm,none": 0.45051194539249145, "acc_norm_stderr,none": 0.014539646098471627}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7563131313131313, "acc_stderr,none": 0.00880917174472056, "acc_norm,none": 0.7567340067340067, "acc_norm_stderr,none": 0.00880400984686553}, "boolq": {"alias": "boolq", "acc,none": 0.6938837920489297, "acc_stderr,none": 0.008060817222724517}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4953196574387572, "acc_stderr,none": 0.004989562798280521, "acc_norm,none": 0.6694881497709619, "acc_norm_stderr,none": 0.00469436096892941}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.272, "acc_stderr,none": 0.019920483209566072, "acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665813}, "piqa": {"alias": "piqa", "acc,none": 0.7486398258977149, "acc_stderr,none": 0.010121156016819262, "acc_norm,none": 0.7464635473340587, "acc_norm_stderr,none": 0.010150090834551784}, "winogrande": {"alias": "winogrande", "acc,none": 0.665351223362273, "acc_stderr,none": 0.013261823629558363}}
|
train.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
wandb/debug-internal.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
wandb/debug.log
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_setup.py:_flush():81] Current SDK version is 0.26.0
|
| 2 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_setup.py:_flush():81] Configure stats pid to 470303
|
| 3 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:setup_run_log_directory():721] Logging user logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_153552-r20yn80u/logs/debug.log
|
| 5 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:setup_run_log_directory():722] Logging internal logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_153552-r20yn80u/logs/debug-internal.log
|
| 6 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:init():848] calling init triggers
|
| 7 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:init():853] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'dump_dir': '/home/xun/rsadhukh/STEM/logs/midfine_base_final', 'seed': 777, 'model_type': 'olmo3', 'stem_up_proj_layers': [], 'grad_acc_steps': 2, 'gc_collect_freq': 1000, 'probe_freq': 100, 'steps': 50000, 'stage_steps': None, 'data': {'root_dir': '/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/', 'sources': {'cranecode': 10.0, 'stack_edu': 10.0, 'cranemath': 5.63, 'dolmino-math': 10.7, 'megamatt': 1.73, 'tinymath-mind': 0.9, 'tinymath-pot': 0.24, 'reddit_to_flashcards': 5.9, 'wiki_to_rcqa': 3.0, 'nemotron-synth-qa': 5.0, 'math-meta-reasoning': 0.38, 'code-meta-reasoning': 0.46, 'program_verifiable': 0.16, 'qwq-reasoning-traces': 1.87, 'openthoughts2': 1.25, 'general_reasoning_mix': 1.87, 'gemini-reasoning-traces': 0.25, 'tulu-3-sft': 1.1, 'dolmino_1-flan': 5.0, 'olmocr_science_pdfs': 5.0, 'stem-heavy-crawl': 5.0, 'common_crawl-high-quality': 22.5}, 'node_local': False, 'batch_size': 8, 'seq_len': 4096, 'n_views': 2, 'seed': 42, 'add_bos': True, 'add_eos': True, 'load_async': True, 'prefetch_size': 1024, 'tokenizer': {'name': 'huggingface', 'path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/'}, 'track_packed_source_mixture': True, 'packed_source_counts': None}, 'optim': {'lr': 7.44e-05, 'weight_decay': 0.1, 'epsilon': 1e-08, 'beta1': 0.9, 'beta2': 0.95, 'clip': 1.0, 'scheduler': 'linear', 'warmup': 0, 'lr_min_ratio': 0.0, 'cycle_length': 1.0, 'cosine_theta': 1.0, 'annealing_step': 1000, 'decay_fraction': 0.1, 'exp_factor': 0.5, 'initial_token_offset': 0, 'global_final_step': None}, 'model': {'dim': 2048, 'n_layers': 16, 'head_dim': 128, 'n_heads': 16, 'n_kv_heads': 16, 'ffn_dim_multiplier': 1.5, 'multiple_of': 256, 'norm_eps': 1e-06, 'rope_theta': 500000.0, 'rope_scaling': None, 'init_base_std': 0.02, 'init_std_factor': 'disabled', 'max_seqlen': 4096, 'seed': 42, 'vocab_size': 100352, 'weight_tying': False, 'sliding_window': None}, 'distributed': {'dp_shard': 1, 'dp_replicate': 32, 'tp_size': 1, 'selective_activation_checkpointing': False, 'compile': True, 'fsdp_type': 'full_shard', 'model_dtype': 'bf16', 'float8_recipe': None, 'float8_filter': 'layers\\.[0-9]+\\.', 'matmul_allow_tf32': False, 'detect_anomaly': False, 'compile_cache_size_limit': 8, 'spawn_method': 'forkserver', 'stem_parallel_size': 8}, 'env': {'MKL_SERVICE_FORCE_INTEL': 'GNU', 'OMP_NUM_THREADS': '1', 'MKL_NUM_THREADS': '1', 'ENABLE_INTRA_NODE_COMM': '1', 'TORCH_NCCL_AVOID_RECORD_STREAMS': '1', 'NCCL_IB_TIMEOUT': '22', 'NCCL_DEBUG': 'INFO', 'TORCH_NCCL_ASYNC_ERROR_HANDLING': '1'}, 'checkpoint': {'dump': {'every': 5000, 'keep': 1}, 'eval': {'every': 100000, 'keep': 1}, 'path': '/home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints', 'init_ckpt_path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/', 'continue_training_from_init': True, 'legacy_init_ckpt_lm_transformer': False, 'merge_lm_optim_seed_ckpt_path': None}, 'profiling': {'run': True, 'trace_folder': 'profiling', 'mem_warmup': 100, 'mem_steps': 2, 'profile_warmup': 102, 'profile_steps': 2}, 'logging': {'freq': 10, 'acc_freq': None, 'wandb': {'job_type': None, 'dir': None, 'project': 'stem', 'entity': None, 'tags': None, 'group': None, 'name': 'olmo2_1B_midfine', 'notes': None, 'config_exclude_keys': None, 'config_include_keys': None, 'anonymous': None, 'mode': None, 'allow_val_change': None, 'resume': None, 'force': None, 'tensorboard': None, 'sync_tensorboard': None, 'monitor_gym': None, 'save_code': None, 'id': None, 'fork_from': None, 'resume_from': None}}, 'async_eval_gpus': None, 'eval': {'generator': {'max_tokens': 16384, 'dtype': 'bf16', 'temperature': 1.0, 'top_p': 0.95}, 'harness': {'tasks': [{'task': 'hellaswag', 'dataset_path': '/data/rsadhukh/eval_data/hellaswag'}, {'task': 'boolq', 'dataset_path': '/data/rsadhukh/eval_data/super_glue'}, {'task': 'piqa', 'dataset_path': '/data/rsadhukh/eval_data/piqa'}, {'task': 'winogrande', 'dataset_path': '/data/rsadhukh/eval_data/winogrande'}, {'task': 'openbookqa', 'dataset_path': '/data/rsadhukh/eval_data/openbookqa'}, {'task': 'arc_easy', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}, {'task': 'arc_challenge', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}], 'confirm_run_unsafe_code': True, 'batch_size': 64}, 'validation': None}, '_wandb': {}}
|
| 9 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:init():896] starting backend
|
| 10 |
+
2026-04-29 15:35:52,400 INFO MainThread:470303 [wandb_init.py:init():911] sending inform_init request
|
| 11 |
+
2026-04-29 15:35:52,408 INFO MainThread:470303 [wandb_init.py:init():919] backend started and connected
|
| 12 |
+
2026-04-29 15:35:52,410 INFO MainThread:470303 [wandb_init.py:init():989] updated telemetry
|
| 13 |
+
2026-04-29 15:35:52,430 INFO MainThread:470303 [wandb_init.py:init():1013] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-04-29 15:35:53,838 INFO MainThread:470303 [wandb_init.py:init():1058] starting run threads in backend
|
| 15 |
+
2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_console_start():2542] atexit reg
|
| 16 |
+
2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_redirect():2391] redirect: wrap_raw
|
| 17 |
+
2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_redirect():2460] Wrapping output streams.
|
| 18 |
+
2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_redirect():2483] Redirects installed.
|
| 19 |
+
2026-04-29 15:35:54,077 INFO MainThread:470303 [wandb_init.py:init():1098] run started, returning control to user process
|
| 20 |
+
2026-04-30 05:27:57,103 INFO wandb-AsyncioManager-main:470303 [service_client.py:_forward_responses():134] Reached EOF.
|
| 21 |
+
2026-04-30 05:27:57,104 INFO wandb-AsyncioManager-main:470303 [mailbox.py:close():155] Closing mailbox, abandoning 1 handles.
|
| 22 |
+
2026-04-30 05:27:59,641 ERROR wandb-AsyncioManager-main:470303 [asyncio_manager.py:fn_wrap_exceptions():184] Uncaught exception in run_soon callback.
|
| 23 |
+
Traceback (most recent call last):
|
| 24 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_manager.py", line 182, in fn_wrap_exceptions
|
| 25 |
+
await fn()
|
| 26 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 45, in publish
|
| 27 |
+
await self._send_server_request(request)
|
| 28 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 87, in _send_server_request
|
| 29 |
+
await self._drain_writer()
|
| 30 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 96, in _drain_writer
|
| 31 |
+
await self._writer.drain()
|
| 32 |
+
File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 392, in drain
|
| 33 |
+
await self._protocol._drain_helper()
|
| 34 |
+
File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
|
| 35 |
+
raise ConnectionResetError('Connection lost')
|
| 36 |
+
ConnectionResetError: Connection lost
|
| 37 |
+
2026-04-30 05:27:59,660 ERROR wandb-AsyncioManager-main:470303 [asyncio_manager.py:fn_wrap_exceptions():184] Uncaught exception in run_soon callback.
|
| 38 |
+
Traceback (most recent call last):
|
| 39 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_manager.py", line 182, in fn_wrap_exceptions
|
| 40 |
+
await fn()
|
| 41 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 45, in publish
|
| 42 |
+
await self._send_server_request(request)
|
| 43 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 78, in _send_server_request
|
| 44 |
+
raise self._broken_exc.with_traceback(self._broken_tb)
|
| 45 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 87, in _send_server_request
|
| 46 |
+
await self._drain_writer()
|
| 47 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 96, in _drain_writer
|
| 48 |
+
await self._writer.drain()
|
| 49 |
+
File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 392, in drain
|
| 50 |
+
await self._protocol._drain_helper()
|
| 51 |
+
File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
|
| 52 |
+
raise ConnectionResetError('Connection lost')
|
| 53 |
+
ConnectionResetError: Connection lost
|
wandb/run-20260429_011802-2wmkezq3/files/media/html/memory_trace_50_79effaa90bfee7eb3207.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
wandb/run-20260429_011802-2wmkezq3/files/media/html/profile_trace_51_ae282608c6eeb7f48826.html
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
<base target="_blank"><link rel="stylesheet" type="text/css" href="https://app.wandb.ai/normalize.css" />/home/xun/rsadhukh/STEM/logs/midfine_base_final/profiling/profile_CPU_CUDA_000104/rank00_compute-node-14_1060320.1777425671449116530.pt.trace.html.gz
|
wandb/run-20260429_011802-2wmkezq3/files/output.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
wandb/run-20260429_011802-2wmkezq3/files/requirements.txt
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
DataProperty==1.1.0
|
| 2 |
+
absl-py==2.4.0
|
| 3 |
+
aiohappyeyeballs==2.6.1
|
| 4 |
+
aiohttp==3.13.5
|
| 5 |
+
aiosignal==1.4.0
|
| 6 |
+
annotated-doc==0.0.4
|
| 7 |
+
annotated-types==0.7.0
|
| 8 |
+
antlr4-python3-runtime==4.9.3
|
| 9 |
+
anyio==4.13.0
|
| 10 |
+
asttokens==3.0.1
|
| 11 |
+
async-lru==2.3.0
|
| 12 |
+
attrs==26.1.0
|
| 13 |
+
babel==2.18.0
|
| 14 |
+
bleach==6.3.0
|
| 15 |
+
blessed==1.38.0
|
| 16 |
+
blobfile==3.2.0
|
| 17 |
+
certifi==2026.2.25
|
| 18 |
+
chardet==5.2.0
|
| 19 |
+
charset-normalizer==3.4.7
|
| 20 |
+
click==8.3.2
|
| 21 |
+
colorama==0.4.6
|
| 22 |
+
comm==0.2.3
|
| 23 |
+
datasets==4.8.4
|
| 24 |
+
datatrove==0.9.0
|
| 25 |
+
debugpy==1.8.20
|
| 26 |
+
decorator==5.2.1
|
| 27 |
+
defusedxml==0.7.1
|
| 28 |
+
dill==0.4.1
|
| 29 |
+
evaluate==0.4.6
|
| 30 |
+
executing==2.2.1
|
| 31 |
+
fastjsonschema==2.21.2
|
| 32 |
+
filelock==3.28.0
|
| 33 |
+
fqdn==1.5.1
|
| 34 |
+
frozenlist==1.8.0
|
| 35 |
+
fsspec==2026.2.0
|
| 36 |
+
gitdb==4.0.12
|
| 37 |
+
GitPython==3.1.46
|
| 38 |
+
gpustat==1.1.1
|
| 39 |
+
h11==0.16.0
|
| 40 |
+
hf-xet==1.4.3
|
| 41 |
+
httpcore==1.0.9
|
| 42 |
+
httpx==0.28.1
|
| 43 |
+
huggingface_hub==1.11.0
|
| 44 |
+
humanize==4.15.0
|
| 45 |
+
idna==3.11
|
| 46 |
+
ipython_pygments_lexers==1.1.1
|
| 47 |
+
Jinja2==3.1.6
|
| 48 |
+
joblib==1.5.3
|
| 49 |
+
json5==0.14.0
|
| 50 |
+
jsonlines==4.0.0
|
| 51 |
+
jsonpointer==3.1.1
|
| 52 |
+
jupyter_core==5.9.1
|
| 53 |
+
jupyterlab_pygments==0.3.0
|
| 54 |
+
lark==1.3.1
|
| 55 |
+
lm_eval==0.4.11
|
| 56 |
+
loguru==0.7.3
|
| 57 |
+
lxml==6.1.0
|
| 58 |
+
markdown-it-py==4.0.0
|
| 59 |
+
MarkupSafe==3.0.3
|
| 60 |
+
matplotlib-inline==0.2.1
|
| 61 |
+
mbstrdecoder==1.1.4
|
| 62 |
+
mdurl==0.1.2
|
| 63 |
+
mistune==3.2.0
|
| 64 |
+
more-itertools==11.0.2
|
| 65 |
+
mpmath==1.3.0
|
| 66 |
+
msgspec==0.21.1
|
| 67 |
+
multidict==6.7.1
|
| 68 |
+
multiprocess==0.70.19
|
| 69 |
+
nest-asyncio==1.6.0
|
| 70 |
+
networkx==3.6.1
|
| 71 |
+
nltk==3.9.4
|
| 72 |
+
numpy==2.4.4
|
| 73 |
+
nvidia-cublas-cu12==12.8.4.1
|
| 74 |
+
nvidia-cuda-cupti-cu12==12.8.90
|
| 75 |
+
nvidia-cuda-nvrtc-cu12==12.8.93
|
| 76 |
+
nvidia-cuda-runtime-cu12==12.8.90
|
| 77 |
+
nvidia-cudnn-cu12==9.10.2.21
|
| 78 |
+
nvidia-cufft-cu12==11.3.3.83
|
| 79 |
+
nvidia-cufile-cu12==1.13.1.3
|
| 80 |
+
nvidia-curand-cu12==10.3.9.90
|
| 81 |
+
nvidia-cusolver-cu12==11.7.3.90
|
| 82 |
+
nvidia-cusparse-cu12==12.5.8.93
|
| 83 |
+
nvidia-cusparselt-cu12==0.7.1
|
| 84 |
+
nvidia-ml-py==13.595.45
|
| 85 |
+
nvidia-nccl-cu12==2.27.3
|
| 86 |
+
nvidia-nvjitlink-cu12==12.8.93
|
| 87 |
+
nvidia-nvtx-cu12==12.8.90
|
| 88 |
+
objprint==0.3.0
|
| 89 |
+
omegaconf==2.3.0
|
| 90 |
+
orjson==3.11.8
|
| 91 |
+
overrides==7.7.0
|
| 92 |
+
packaging==26.1
|
| 93 |
+
pandas==3.0.2
|
| 94 |
+
pandocfilters==1.5.1
|
| 95 |
+
parso==0.8.6
|
| 96 |
+
pathvalidate==3.3.1
|
| 97 |
+
pexpect==4.9.0
|
| 98 |
+
pip==26.0.1
|
| 99 |
+
platformdirs==4.9.6
|
| 100 |
+
portalocker==3.2.0
|
| 101 |
+
prometheus_client==0.25.0
|
| 102 |
+
prompt_toolkit==3.0.52
|
| 103 |
+
propcache==0.4.1
|
| 104 |
+
protobuf==7.34.1
|
| 105 |
+
psutil==7.2.2
|
| 106 |
+
ptyprocess==0.7.0
|
| 107 |
+
pure_eval==0.2.3
|
| 108 |
+
pyarrow==23.0.1
|
| 109 |
+
pycparser==3.0
|
| 110 |
+
pycryptodomex==3.23.0
|
| 111 |
+
pydantic==2.13.2
|
| 112 |
+
pydantic_core==2.46.2
|
| 113 |
+
Pygments==2.20.0
|
| 114 |
+
pynvml==13.0.1
|
| 115 |
+
pytablewriter==1.2.1
|
| 116 |
+
python-dateutil==2.9.0.post0
|
| 117 |
+
python-json-logger==4.1.0
|
| 118 |
+
pytz==2026.1.post1
|
| 119 |
+
PyYAML==6.0.3
|
| 120 |
+
pyzmq==27.1.0
|
| 121 |
+
referencing==0.37.0
|
| 122 |
+
regex==2026.4.4
|
| 123 |
+
requests==2.33.1
|
| 124 |
+
rfc3339-validator==0.1.4
|
| 125 |
+
rfc3986-validator==0.1.1
|
| 126 |
+
rfc3987-syntax==1.1.0
|
| 127 |
+
rich==15.0.0
|
| 128 |
+
rouge_score==0.1.2
|
| 129 |
+
rpds-py==0.30.0
|
| 130 |
+
sacrebleu==2.6.0
|
| 131 |
+
safetensors==0.7.0
|
| 132 |
+
scikit-learn==1.8.0
|
| 133 |
+
scipy==1.17.1
|
| 134 |
+
Send2Trash==2.1.0
|
| 135 |
+
sentencepiece==0.2.1
|
| 136 |
+
sentry-sdk==2.58.0
|
| 137 |
+
setuptools==65.5.0
|
| 138 |
+
shellingham==1.5.4
|
| 139 |
+
six==1.17.0
|
| 140 |
+
smmap==5.0.3
|
| 141 |
+
soupsieve==2.8.3
|
| 142 |
+
sqlitedict==2.1.0
|
| 143 |
+
stack-data==0.6.3
|
| 144 |
+
sympy==1.14.0
|
| 145 |
+
tabledata==1.3.4
|
| 146 |
+
tabulate==0.10.0
|
| 147 |
+
tcolorpy==0.1.7
|
| 148 |
+
terminado==0.18.1
|
| 149 |
+
threadpoolctl==3.6.0
|
| 150 |
+
tiktoken==0.12.0
|
| 151 |
+
tinycss2==1.4.0
|
| 152 |
+
tokenizers==0.22.2
|
| 153 |
+
torch==2.8.0
|
| 154 |
+
tornado==6.5.5
|
| 155 |
+
tqdm==4.67.3
|
| 156 |
+
traitlets==5.14.3
|
| 157 |
+
transformers==5.1.0
|
| 158 |
+
triton==3.4.0
|
| 159 |
+
typepy==1.3.4
|
| 160 |
+
typer==0.24.1
|
| 161 |
+
typer-slim==0.24.0
|
| 162 |
+
typing_extensions==4.15.0
|
| 163 |
+
typing-inspection==0.4.2
|
| 164 |
+
tzdata==2026.2
|
| 165 |
+
uri-template==1.3.0
|
| 166 |
+
urllib3==2.6.3
|
| 167 |
+
viztracer==1.1.1
|
| 168 |
+
wandb==0.26.0
|
| 169 |
+
wcwidth==0.6.0
|
| 170 |
+
webcolors==25.10.0
|
| 171 |
+
webencodings==0.5.1
|
| 172 |
+
websocket-client==1.9.0
|
| 173 |
+
word2number==1.1
|
| 174 |
+
xformers==0.0.32.post1
|
| 175 |
+
xxhash==3.6.0
|
| 176 |
+
yarl==1.23.0
|
| 177 |
+
zstandard==0.25.0
|
wandb/run-20260429_011802-2wmkezq3/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-131-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.9",
|
| 4 |
+
"startedAt": "2026-04-29T01:18:02.772404Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"config=apps/main/configs/olmo2_1B_midfine.yaml",
|
| 7 |
+
"dump_dir=/home/xun/rsadhukh/STEM/logs/midfine_base_final",
|
| 8 |
+
"checkpoint.init_ckpt_path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/",
|
| 9 |
+
"checkpoint.continue_training_from_init=true",
|
| 10 |
+
"checkpoint.dump.every=5000",
|
| 11 |
+
"checkpoint.eval.every=100000",
|
| 12 |
+
"checkpoint.dump.keep=1",
|
| 13 |
+
"data.root_dir=/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/",
|
| 14 |
+
"data.node_local=false",
|
| 15 |
+
"data.tokenizer.path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/",
|
| 16 |
+
"logging.wandb.name=midfine_base_final"
|
| 17 |
+
],
|
| 18 |
+
"program": "-m apps.main.train",
|
| 19 |
+
"git": {
|
| 20 |
+
"remote": "https://github.com/Infini-AI-Lab/STEM.git",
|
| 21 |
+
"commit": "7e450007299a777d774d6e2b598001cc7552c1b4"
|
| 22 |
+
},
|
| 23 |
+
"email": "rsadhukh@andrew.cmu.edu",
|
| 24 |
+
"root": "/home/xun/rsadhukh/STEM/logs/midfine_base_final",
|
| 25 |
+
"host": "compute-node-14",
|
| 26 |
+
"executable": "/home/xun/rsadhukh/STEM/stem/bin/python",
|
| 27 |
+
"cpu_count": 88,
|
| 28 |
+
"cpu_count_logical": 176,
|
| 29 |
+
"gpu": "NVIDIA H200",
|
| 30 |
+
"gpu_count": 8,
|
| 31 |
+
"disk": {
|
| 32 |
+
"/": {
|
| 33 |
+
"total": "133003395072",
|
| 34 |
+
"used": "82473435136"
|
| 35 |
+
}
|
| 36 |
+
},
|
| 37 |
+
"memory": {
|
| 38 |
+
"total": "2071474651136"
|
| 39 |
+
},
|
| 40 |
+
"gpu_nvidia": [
|
| 41 |
+
{
|
| 42 |
+
"name": "NVIDIA H200",
|
| 43 |
+
"memoryTotal": "150754820096",
|
| 44 |
+
"cudaCores": 16896,
|
| 45 |
+
"architecture": "Hopper",
|
| 46 |
+
"uuid": "GPU-f9dd2b05-45b2-e3c3-43c1-419969cf660f"
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"name": "NVIDIA H200",
|
| 50 |
+
"memoryTotal": "150754820096",
|
| 51 |
+
"cudaCores": 16896,
|
| 52 |
+
"architecture": "Hopper",
|
| 53 |
+
"uuid": "GPU-a79c6c73-bf1a-8760-8bed-c89a9a1ff315"
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"name": "NVIDIA H200",
|
| 57 |
+
"memoryTotal": "150754820096",
|
| 58 |
+
"cudaCores": 16896,
|
| 59 |
+
"architecture": "Hopper",
|
| 60 |
+
"uuid": "GPU-84d10e29-34bf-5102-3d30-eb56af3a556d"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"name": "NVIDIA H200",
|
| 64 |
+
"memoryTotal": "150754820096",
|
| 65 |
+
"cudaCores": 16896,
|
| 66 |
+
"architecture": "Hopper",
|
| 67 |
+
"uuid": "GPU-0a7948c2-0a62-1c09-524b-179e3de36a59"
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"name": "NVIDIA H200",
|
| 71 |
+
"memoryTotal": "150754820096",
|
| 72 |
+
"cudaCores": 16896,
|
| 73 |
+
"architecture": "Hopper",
|
| 74 |
+
"uuid": "GPU-d4201f0e-1b44-4327-9747-b62cca7ab4bf"
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"name": "NVIDIA H200",
|
| 78 |
+
"memoryTotal": "150754820096",
|
| 79 |
+
"cudaCores": 16896,
|
| 80 |
+
"architecture": "Hopper",
|
| 81 |
+
"uuid": "GPU-8fa707f1-9cd9-9384-c9c9-3356b1ad04ec"
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"name": "NVIDIA H200",
|
| 85 |
+
"memoryTotal": "150754820096",
|
| 86 |
+
"cudaCores": 16896,
|
| 87 |
+
"architecture": "Hopper",
|
| 88 |
+
"uuid": "GPU-b096838e-c00a-4819-b204-35fab82f7d94"
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"name": "NVIDIA H200",
|
| 92 |
+
"memoryTotal": "150754820096",
|
| 93 |
+
"cudaCores": 16896,
|
| 94 |
+
"architecture": "Hopper",
|
| 95 |
+
"uuid": "GPU-cbcadfdd-c45c-2cf6-4408-00f7bce853af"
|
| 96 |
+
}
|
| 97 |
+
],
|
| 98 |
+
"cudaVersion": "13.0",
|
| 99 |
+
"slurm": {
|
| 100 |
+
"cluster_name": "cluster",
|
| 101 |
+
"conf": "/var/spool/slurmd/conf-cache/slurm.conf",
|
| 102 |
+
"cpu_bind": "quiet,mask_cpu:0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF",
|
| 103 |
+
"cpu_bind_list": "0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF",
|
| 104 |
+
"cpu_bind_type": "mask_cpu:",
|
| 105 |
+
"cpu_bind_verbose": "quiet",
|
| 106 |
+
"cpus_on_node": "128",
|
| 107 |
+
"cpus_per_task": "128",
|
| 108 |
+
"distribution": "cyclic",
|
| 109 |
+
"gpus_on_node": "8",
|
| 110 |
+
"gtids": "0",
|
| 111 |
+
"job_cpus_per_node": "128(x4)",
|
| 112 |
+
"job_end_time": "1777598223",
|
| 113 |
+
"job_gid": "1005",
|
| 114 |
+
"job_gpus": "0,1,2,3,4,5,6,7",
|
| 115 |
+
"job_id": "29496",
|
| 116 |
+
"job_name": "stem",
|
| 117 |
+
"job_nodelist": "compute-node-[14,0,43-44]",
|
| 118 |
+
"job_num_nodes": "4",
|
| 119 |
+
"job_partition": "high",
|
| 120 |
+
"job_start_time": "1777425423",
|
| 121 |
+
"job_uid": "1005",
|
| 122 |
+
"job_user": "xun",
|
| 123 |
+
"jobid": "29496",
|
| 124 |
+
"launch_node_ipaddr": "172.27.49.7",
|
| 125 |
+
"localid": "0",
|
| 126 |
+
"nnodes": "4",
|
| 127 |
+
"nodeid": "0",
|
| 128 |
+
"nodelist": "compute-node-[14,0,43-44]",
|
| 129 |
+
"nprocs": "4",
|
| 130 |
+
"ntasks": "4",
|
| 131 |
+
"ntasks_per_node": "1",
|
| 132 |
+
"output_mode": "standard",
|
| 133 |
+
"prio_process": "0",
|
| 134 |
+
"procid": "0",
|
| 135 |
+
"srun_comm_host": "172.27.49.7",
|
| 136 |
+
"srun_comm_port": "44949",
|
| 137 |
+
"step_gpus": "0,1,2,3,4,5,6,7",
|
| 138 |
+
"step_id": "0",
|
| 139 |
+
"step_launcher_port": "44949",
|
| 140 |
+
"step_nodelist": "compute-node-[14,0,43-44]",
|
| 141 |
+
"step_num_nodes": "4",
|
| 142 |
+
"step_num_tasks": "4",
|
| 143 |
+
"step_tasks_per_node": "1(x4)",
|
| 144 |
+
"stepid": "0",
|
| 145 |
+
"submit_dir": "/home/xun/rsadhukh/STEM",
|
| 146 |
+
"submit_host": "login-node-0",
|
| 147 |
+
"task_pid": "1059988",
|
| 148 |
+
"tasks_per_node": "1(x4)",
|
| 149 |
+
"topology_addr": "compute-node-14",
|
| 150 |
+
"topology_addr_pattern": "node",
|
| 151 |
+
"tres_per_task": "cpu:128",
|
| 152 |
+
"umask": "0000"
|
| 153 |
+
},
|
| 154 |
+
"writerId": "qwoms3z2pk3wk1elrvn86tgqrzw8t43p"
|
| 155 |
+
}
|
wandb/run-20260429_011802-2wmkezq3/logs/debug-core.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-04-29T01:18:02.8977674Z","level":"INFO","msg":"main: starting server","port-filename":"/scratch/local/xun/tmp/tmp8tk991yk/port-1060320.txt","pid":1060320,"detached":false,"idle-timeout":600000000000,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2026-04-29T01:18:02.899120332Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1060320}
|
| 3 |
+
{"time":"2026-04-29T01:18:02.899097714Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/scratch/local/xun/tmp/wandb-1060320-1061249-332575222/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2026-04-29T01:18:03.07469245Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2026-04-29T01:18:03.084970348Z","level":"INFO","msg":"handleInformInit: received","streamId":"2wmkezq3","id":"1(@)"}
|
| 6 |
+
{"time":"2026-04-29T01:18:03.640108524Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"2wmkezq3","id":"1(@)"}
|
| 7 |
+
{"time":"2026-04-29T01:18:09.282362815Z","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"3qs872as87kw"}
|
| 8 |
+
{"time":"2026-04-29T09:24:06.1057087Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 9 |
+
{"time":"2026-04-29T09:24:06.106154304Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 10 |
+
{"time":"2026-04-29T09:24:06.106165217Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 11 |
+
{"time":"2026-04-29T09:24:07.791740435Z","level":"INFO","msg":"server: parent process exited, terminating service process"}
|
wandb/run-20260429_011802-2wmkezq3/logs/debug-internal.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
wandb/run-20260429_011802-2wmkezq3/logs/debug.log
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-04-29 01:18:02,801 INFO MainThread:1060320 [wandb_setup.py:_flush():81] Current SDK version is 0.26.0
|
| 2 |
+
2026-04-29 01:18:02,801 INFO MainThread:1060320 [wandb_setup.py:_flush():81] Configure stats pid to 1060320
|
| 3 |
+
2026-04-29 01:18:02,801 INFO MainThread:1060320 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-04-29 01:18:02,801 INFO MainThread:1060320 [wandb_init.py:setup_run_log_directory():721] Logging user logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_011802-2wmkezq3/logs/debug.log
|
| 5 |
+
2026-04-29 01:18:02,801 INFO MainThread:1060320 [wandb_init.py:setup_run_log_directory():722] Logging internal logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_011802-2wmkezq3/logs/debug-internal.log
|
| 6 |
+
2026-04-29 01:18:02,801 INFO MainThread:1060320 [wandb_init.py:init():848] calling init triggers
|
| 7 |
+
2026-04-29 01:18:02,802 INFO MainThread:1060320 [wandb_init.py:init():853] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'dump_dir': '/home/xun/rsadhukh/STEM/logs/midfine_base_final', 'seed': 777, 'model_type': 'olmo3', 'stem_up_proj_layers': [], 'grad_acc_steps': 2, 'gc_collect_freq': 1000, 'probe_freq': 100, 'steps': 50000, 'stage_steps': None, 'data': {'root_dir': '/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/', 'sources': {'cranecode': 10.0, 'stack_edu': 10.0, 'cranemath': 5.63, 'dolmino-math': 10.7, 'megamatt': 1.73, 'tinymath-mind': 0.9, 'tinymath-pot': 0.24, 'reddit_to_flashcards': 5.9, 'wiki_to_rcqa': 3.0, 'nemotron-synth-qa': 5.0, 'math-meta-reasoning': 0.38, 'code-meta-reasoning': 0.46, 'program_verifiable': 0.16, 'qwq-reasoning-traces': 1.87, 'openthoughts2': 1.25, 'general_reasoning_mix': 1.87, 'gemini-reasoning-traces': 0.25, 'tulu-3-sft': 1.1, 'dolmino_1-flan': 5.0, 'olmocr_science_pdfs': 5.0, 'stem-heavy-crawl': 5.0, 'common_crawl-high-quality': 22.5}, 'node_local': False, 'batch_size': 8, 'seq_len': 4096, 'n_views': 2, 'seed': 42, 'add_bos': True, 'add_eos': True, 'load_async': True, 'prefetch_size': 1024, 'tokenizer': {'name': 'huggingface', 'path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/'}, 'track_packed_source_mixture': True, 'packed_source_counts': None}, 'optim': {'lr': 7.44e-05, 'weight_decay': 0.1, 'epsilon': 1e-08, 'beta1': 0.9, 'beta2': 0.95, 'clip': 1.0, 'scheduler': 'linear', 'warmup': 0, 'lr_min_ratio': 0.0, 'cycle_length': 1.0, 'cosine_theta': 1.0, 'annealing_step': 1000, 'decay_fraction': 0.1, 'exp_factor': 0.5, 'initial_token_offset': 0, 'global_final_step': None}, 'model': {'dim': 2048, 'n_layers': 16, 'head_dim': 128, 'n_heads': 16, 'n_kv_heads': 16, 'ffn_dim_multiplier': 1.5, 'multiple_of': 256, 'norm_eps': 1e-06, 'rope_theta': 500000.0, 'rope_scaling': None, 'init_base_std': 0.02, 'init_std_factor': 'disabled', 'max_seqlen': 4096, 'seed': 42, 'vocab_size': 100352, 'weight_tying': False, 'sliding_window': None}, 'distributed': {'dp_shard': 1, 'dp_replicate': 32, 'tp_size': 1, 'selective_activation_checkpointing': False, 'compile': True, 'fsdp_type': 'full_shard', 'model_dtype': 'bf16', 'float8_recipe': None, 'float8_filter': 'layers\\.[0-9]+\\.', 'matmul_allow_tf32': False, 'detect_anomaly': False, 'compile_cache_size_limit': 8, 'spawn_method': 'forkserver', 'stem_parallel_size': 8}, 'env': {'MKL_SERVICE_FORCE_INTEL': 'GNU', 'OMP_NUM_THREADS': '1', 'MKL_NUM_THREADS': '1', 'ENABLE_INTRA_NODE_COMM': '1', 'TORCH_NCCL_AVOID_RECORD_STREAMS': '1', 'NCCL_IB_TIMEOUT': '22', 'NCCL_DEBUG': 'INFO', 'TORCH_NCCL_ASYNC_ERROR_HANDLING': '1'}, 'checkpoint': {'dump': {'every': 5000, 'keep': 1}, 'eval': {'every': 100000, 'keep': 1}, 'path': '/home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints', 'init_ckpt_path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/', 'continue_training_from_init': True, 'legacy_init_ckpt_lm_transformer': False, 'merge_lm_optim_seed_ckpt_path': None}, 'profiling': {'run': True, 'trace_folder': 'profiling', 'mem_warmup': 100, 'mem_steps': 2, 'profile_warmup': 102, 'profile_steps': 2}, 'logging': {'freq': 10, 'acc_freq': None, 'wandb': {'job_type': None, 'dir': None, 'project': 'stem', 'entity': None, 'tags': None, 'group': None, 'name': 'olmo2_1B_midfine', 'notes': None, 'config_exclude_keys': None, 'config_include_keys': None, 'anonymous': None, 'mode': None, 'allow_val_change': None, 'resume': None, 'force': None, 'tensorboard': None, 'sync_tensorboard': None, 'monitor_gym': None, 'save_code': None, 'id': None, 'fork_from': None, 'resume_from': None}}, 'async_eval_gpus': None, 'eval': {'generator': {'max_tokens': 16384, 'dtype': 'bf16', 'temperature': 1.0, 'top_p': 0.95}, 'harness': {'tasks': [{'task': 'hellaswag', 'dataset_path': '/data/rsadhukh/eval_data/hellaswag'}, {'task': 'boolq', 'dataset_path': '/data/rsadhukh/eval_data/super_glue'}, {'task': 'piqa', 'dataset_path': '/data/rsadhukh/eval_data/piqa'}, {'task': 'winogrande', 'dataset_path': '/data/rsadhukh/eval_data/winogrande'}, {'task': 'openbookqa', 'dataset_path': '/data/rsadhukh/eval_data/openbookqa'}, {'task': 'arc_easy', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}, {'task': 'arc_challenge', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}], 'confirm_run_unsafe_code': True, 'batch_size': 64}, 'validation': None}, '_wandb': {}}
|
| 9 |
+
2026-04-29 01:18:02,802 INFO MainThread:1060320 [wandb_init.py:init():896] starting backend
|
| 10 |
+
2026-04-29 01:18:03,074 INFO MainThread:1060320 [wandb_init.py:init():911] sending inform_init request
|
| 11 |
+
2026-04-29 01:18:03,083 INFO MainThread:1060320 [wandb_init.py:init():919] backend started and connected
|
| 12 |
+
2026-04-29 01:18:03,085 INFO MainThread:1060320 [wandb_init.py:init():989] updated telemetry
|
| 13 |
+
2026-04-29 01:18:03,119 INFO MainThread:1060320 [wandb_init.py:init():1013] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-04-29 01:18:04,059 INFO MainThread:1060320 [wandb_init.py:init():1058] starting run threads in backend
|
| 15 |
+
2026-04-29 01:18:04,276 INFO MainThread:1060320 [wandb_run.py:_console_start():2542] atexit reg
|
| 16 |
+
2026-04-29 01:18:04,276 INFO MainThread:1060320 [wandb_run.py:_redirect():2391] redirect: wrap_raw
|
| 17 |
+
2026-04-29 01:18:04,276 INFO MainThread:1060320 [wandb_run.py:_redirect():2460] Wrapping output streams.
|
| 18 |
+
2026-04-29 01:18:04,276 INFO MainThread:1060320 [wandb_run.py:_redirect():2483] Redirects installed.
|
| 19 |
+
2026-04-29 01:18:04,281 INFO MainThread:1060320 [wandb_init.py:init():1098] run started, returning control to user process
|
wandb/run-20260429_141040-a48q7rq3/files/output.log
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
0: INFO 26-04-29 14:10:43.068959 - 0:00:40 - Loadi
|
wandb/run-20260429_141040-a48q7rq3/files/requirements.txt
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
DataProperty==1.1.0
|
| 2 |
+
absl-py==2.4.0
|
| 3 |
+
aiohappyeyeballs==2.6.1
|
| 4 |
+
aiohttp==3.13.5
|
| 5 |
+
aiosignal==1.4.0
|
| 6 |
+
annotated-doc==0.0.4
|
| 7 |
+
annotated-types==0.7.0
|
| 8 |
+
antlr4-python3-runtime==4.9.3
|
| 9 |
+
anyio==4.13.0
|
| 10 |
+
argon2-cffi==25.1.0
|
| 11 |
+
argon2-cffi-bindings==25.1.0
|
| 12 |
+
arrow==1.4.0
|
| 13 |
+
asttokens==3.0.1
|
| 14 |
+
async-lru==2.3.0
|
| 15 |
+
attrs==26.1.0
|
| 16 |
+
babel==2.18.0
|
| 17 |
+
beautifulsoup4==4.14.3
|
| 18 |
+
bleach==6.3.0
|
| 19 |
+
blessed==1.38.0
|
| 20 |
+
blobfile==3.2.0
|
| 21 |
+
certifi==2026.2.25
|
| 22 |
+
cffi==2.0.0
|
| 23 |
+
chardet==5.2.0
|
| 24 |
+
charset-normalizer==3.4.7
|
| 25 |
+
click==8.3.2
|
| 26 |
+
colorama==0.4.6
|
| 27 |
+
comm==0.2.3
|
| 28 |
+
datasets==4.8.4
|
| 29 |
+
datatrove==0.9.0
|
| 30 |
+
debugpy==1.8.20
|
| 31 |
+
decorator==5.2.1
|
| 32 |
+
defusedxml==0.7.1
|
| 33 |
+
dill==0.4.1
|
| 34 |
+
evaluate==0.4.6
|
| 35 |
+
executing==2.2.1
|
| 36 |
+
fastjsonschema==2.21.2
|
| 37 |
+
filelock==3.28.0
|
| 38 |
+
fqdn==1.5.1
|
| 39 |
+
frozenlist==1.8.0
|
| 40 |
+
fsspec==2026.2.0
|
| 41 |
+
gitdb==4.0.12
|
| 42 |
+
GitPython==3.1.46
|
| 43 |
+
gpustat==1.1.1
|
| 44 |
+
h11==0.16.0
|
| 45 |
+
hf-xet==1.4.3
|
| 46 |
+
httpcore==1.0.9
|
| 47 |
+
httpx==0.28.1
|
| 48 |
+
huggingface_hub==1.11.0
|
| 49 |
+
humanize==4.15.0
|
| 50 |
+
idna==3.11
|
| 51 |
+
ipykernel==7.2.0
|
| 52 |
+
ipython==9.13.0
|
| 53 |
+
ipython_pygments_lexers==1.1.1
|
| 54 |
+
isoduration==20.11.0
|
| 55 |
+
jedi==0.19.2
|
| 56 |
+
Jinja2==3.1.6
|
| 57 |
+
joblib==1.5.3
|
| 58 |
+
json5==0.14.0
|
| 59 |
+
jsonlines==4.0.0
|
| 60 |
+
jsonpointer==3.1.1
|
| 61 |
+
jsonschema==4.26.0
|
| 62 |
+
jsonschema-specifications==2025.9.1
|
| 63 |
+
jupyter_client==8.8.0
|
| 64 |
+
jupyter_core==5.9.1
|
| 65 |
+
jupyter-events==0.12.1
|
| 66 |
+
jupyter-lsp==2.3.1
|
| 67 |
+
jupyter_server==2.17.0
|
| 68 |
+
jupyter_server_terminals==0.5.4
|
| 69 |
+
jupyterlab==4.5.6
|
| 70 |
+
jupyterlab_pygments==0.3.0
|
| 71 |
+
jupyterlab_server==2.28.0
|
| 72 |
+
lark==1.3.1
|
| 73 |
+
lm_eval==0.4.11
|
| 74 |
+
loguru==0.7.3
|
| 75 |
+
lxml==6.1.0
|
| 76 |
+
markdown-it-py==4.0.0
|
| 77 |
+
MarkupSafe==3.0.3
|
| 78 |
+
matplotlib-inline==0.2.1
|
| 79 |
+
mbstrdecoder==1.1.4
|
| 80 |
+
mdurl==0.1.2
|
| 81 |
+
mistune==3.2.0
|
| 82 |
+
more-itertools==11.0.2
|
| 83 |
+
mpmath==1.3.0
|
| 84 |
+
msgspec==0.21.1
|
| 85 |
+
multidict==6.7.1
|
| 86 |
+
multiprocess==0.70.19
|
| 87 |
+
nbclient==0.10.4
|
| 88 |
+
nbconvert==7.17.1
|
| 89 |
+
nbformat==5.10.4
|
| 90 |
+
nest-asyncio==1.6.0
|
| 91 |
+
networkx==3.6.1
|
| 92 |
+
nltk==3.9.4
|
| 93 |
+
notebook_shim==0.2.4
|
| 94 |
+
numpy==2.4.4
|
| 95 |
+
nvidia-cublas-cu12==12.8.4.1
|
| 96 |
+
nvidia-cuda-cupti-cu12==12.8.90
|
| 97 |
+
nvidia-cuda-nvrtc-cu12==12.8.93
|
| 98 |
+
nvidia-cuda-runtime-cu12==12.8.90
|
| 99 |
+
nvidia-cudnn-cu12==9.10.2.21
|
| 100 |
+
nvidia-cufft-cu12==11.3.3.83
|
| 101 |
+
nvidia-cufile-cu12==1.13.1.3
|
| 102 |
+
nvidia-curand-cu12==10.3.9.90
|
| 103 |
+
nvidia-cusolver-cu12==11.7.3.90
|
| 104 |
+
nvidia-cusparse-cu12==12.5.8.93
|
| 105 |
+
nvidia-cusparselt-cu12==0.7.1
|
| 106 |
+
nvidia-ml-py==13.595.45
|
| 107 |
+
nvidia-nccl-cu12==2.27.3
|
| 108 |
+
nvidia-nvjitlink-cu12==12.8.93
|
| 109 |
+
nvidia-nvtx-cu12==12.8.90
|
| 110 |
+
objprint==0.3.0
|
| 111 |
+
omegaconf==2.3.0
|
| 112 |
+
orjson==3.11.8
|
| 113 |
+
overrides==7.7.0
|
| 114 |
+
packaging==26.1
|
| 115 |
+
pandas==3.0.2
|
| 116 |
+
pandocfilters==1.5.1
|
| 117 |
+
parso==0.8.6
|
| 118 |
+
pathvalidate==3.3.1
|
| 119 |
+
pexpect==4.9.0
|
| 120 |
+
pip==26.0.1
|
| 121 |
+
platformdirs==4.9.6
|
| 122 |
+
portalocker==3.2.0
|
| 123 |
+
prometheus_client==0.25.0
|
| 124 |
+
prompt_toolkit==3.0.52
|
| 125 |
+
propcache==0.4.1
|
| 126 |
+
protobuf==7.34.1
|
| 127 |
+
psutil==7.2.2
|
| 128 |
+
ptyprocess==0.7.0
|
| 129 |
+
pure_eval==0.2.3
|
| 130 |
+
pyarrow==23.0.1
|
| 131 |
+
pycparser==3.0
|
| 132 |
+
pycryptodomex==3.23.0
|
| 133 |
+
pydantic==2.13.2
|
| 134 |
+
pydantic_core==2.46.2
|
| 135 |
+
Pygments==2.20.0
|
| 136 |
+
pynvml==13.0.1
|
| 137 |
+
pytablewriter==1.2.1
|
| 138 |
+
python-dateutil==2.9.0.post0
|
| 139 |
+
python-json-logger==4.1.0
|
| 140 |
+
pytz==2026.1.post1
|
| 141 |
+
PyYAML==6.0.3
|
| 142 |
+
pyzmq==27.1.0
|
| 143 |
+
referencing==0.37.0
|
| 144 |
+
regex==2026.4.4
|
| 145 |
+
requests==2.33.1
|
| 146 |
+
rfc3339-validator==0.1.4
|
| 147 |
+
rfc3986-validator==0.1.1
|
| 148 |
+
rfc3987-syntax==1.1.0
|
| 149 |
+
rich==15.0.0
|
| 150 |
+
rouge_score==0.1.2
|
| 151 |
+
rpds-py==0.30.0
|
| 152 |
+
sacrebleu==2.6.0
|
| 153 |
+
safetensors==0.7.0
|
| 154 |
+
scikit-learn==1.8.0
|
| 155 |
+
scipy==1.17.1
|
| 156 |
+
Send2Trash==2.1.0
|
| 157 |
+
sentencepiece==0.2.1
|
| 158 |
+
sentry-sdk==2.58.0
|
| 159 |
+
setuptools==65.5.0
|
| 160 |
+
shellingham==1.5.4
|
| 161 |
+
six==1.17.0
|
| 162 |
+
smmap==5.0.3
|
| 163 |
+
soupsieve==2.8.3
|
| 164 |
+
sqlitedict==2.1.0
|
| 165 |
+
stack-data==0.6.3
|
| 166 |
+
sympy==1.14.0
|
| 167 |
+
tabledata==1.3.4
|
| 168 |
+
tabulate==0.10.0
|
| 169 |
+
tcolorpy==0.1.7
|
| 170 |
+
terminado==0.18.1
|
| 171 |
+
threadpoolctl==3.6.0
|
| 172 |
+
tiktoken==0.12.0
|
| 173 |
+
tinycss2==1.4.0
|
| 174 |
+
tokenizers==0.22.2
|
| 175 |
+
torch==2.8.0
|
| 176 |
+
tornado==6.5.5
|
| 177 |
+
tqdm==4.67.3
|
| 178 |
+
traitlets==5.14.3
|
| 179 |
+
transformers==5.1.0
|
| 180 |
+
triton==3.4.0
|
| 181 |
+
typepy==1.3.4
|
| 182 |
+
typer==0.24.1
|
| 183 |
+
typer-slim==0.24.0
|
| 184 |
+
typing_extensions==4.15.0
|
| 185 |
+
typing-inspection==0.4.2
|
| 186 |
+
tzdata==2026.2
|
| 187 |
+
uri-template==1.3.0
|
| 188 |
+
urllib3==2.6.3
|
| 189 |
+
viztracer==1.1.1
|
| 190 |
+
wandb==0.26.0
|
| 191 |
+
wcwidth==0.6.0
|
| 192 |
+
webcolors==25.10.0
|
| 193 |
+
webencodings==0.5.1
|
| 194 |
+
websocket-client==1.9.0
|
| 195 |
+
word2number==1.1
|
| 196 |
+
xformers==0.0.32.post1
|
| 197 |
+
xxhash==3.6.0
|
| 198 |
+
yarl==1.23.0
|
| 199 |
+
zstandard==0.25.0
|
wandb/run-20260429_141040-a48q7rq3/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-131-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.9",
|
| 4 |
+
"startedAt": "2026-04-29T14:10:40.707156Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"config=apps/main/configs/olmo2_1B_midfine.yaml",
|
| 7 |
+
"dump_dir=/home/xun/rsadhukh/STEM/logs/midfine_base_final",
|
| 8 |
+
"checkpoint.init_ckpt_path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/",
|
| 9 |
+
"checkpoint.continue_training_from_init=true",
|
| 10 |
+
"checkpoint.dump.every=5000",
|
| 11 |
+
"checkpoint.eval.every=100000",
|
| 12 |
+
"checkpoint.dump.keep=1",
|
| 13 |
+
"data.root_dir=/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/",
|
| 14 |
+
"data.node_local=false",
|
| 15 |
+
"data.tokenizer.path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/",
|
| 16 |
+
"logging.wandb.name=midfine_base_final"
|
| 17 |
+
],
|
| 18 |
+
"program": "-m apps.main.train",
|
| 19 |
+
"git": {
|
| 20 |
+
"remote": "https://github.com/Infini-AI-Lab/STEM.git",
|
| 21 |
+
"commit": "7e450007299a777d774d6e2b598001cc7552c1b4"
|
| 22 |
+
},
|
| 23 |
+
"email": "rsadhukh@andrew.cmu.edu",
|
| 24 |
+
"root": "/home/xun/rsadhukh/STEM/logs/midfine_base_final",
|
| 25 |
+
"host": "compute-node-14",
|
| 26 |
+
"executable": "/home/xun/rsadhukh/STEM/stem/bin/python",
|
| 27 |
+
"cpu_count": 88,
|
| 28 |
+
"cpu_count_logical": 176,
|
| 29 |
+
"gpu": "NVIDIA H200",
|
| 30 |
+
"gpu_count": 8,
|
| 31 |
+
"disk": {
|
| 32 |
+
"/": {
|
| 33 |
+
"total": "133003395072",
|
| 34 |
+
"used": "83511459840"
|
| 35 |
+
}
|
| 36 |
+
},
|
| 37 |
+
"memory": {
|
| 38 |
+
"total": "2071474651136"
|
| 39 |
+
},
|
| 40 |
+
"gpu_nvidia": [
|
| 41 |
+
{
|
| 42 |
+
"name": "NVIDIA H200",
|
| 43 |
+
"memoryTotal": "150754820096",
|
| 44 |
+
"cudaCores": 16896,
|
| 45 |
+
"architecture": "Hopper",
|
| 46 |
+
"uuid": "GPU-f9dd2b05-45b2-e3c3-43c1-419969cf660f"
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"name": "NVIDIA H200",
|
| 50 |
+
"memoryTotal": "150754820096",
|
| 51 |
+
"cudaCores": 16896,
|
| 52 |
+
"architecture": "Hopper",
|
| 53 |
+
"uuid": "GPU-a79c6c73-bf1a-8760-8bed-c89a9a1ff315"
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"name": "NVIDIA H200",
|
| 57 |
+
"memoryTotal": "150754820096",
|
| 58 |
+
"cudaCores": 16896,
|
| 59 |
+
"architecture": "Hopper",
|
| 60 |
+
"uuid": "GPU-84d10e29-34bf-5102-3d30-eb56af3a556d"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"name": "NVIDIA H200",
|
| 64 |
+
"memoryTotal": "150754820096",
|
| 65 |
+
"cudaCores": 16896,
|
| 66 |
+
"architecture": "Hopper",
|
| 67 |
+
"uuid": "GPU-0a7948c2-0a62-1c09-524b-179e3de36a59"
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"name": "NVIDIA H200",
|
| 71 |
+
"memoryTotal": "150754820096",
|
| 72 |
+
"cudaCores": 16896,
|
| 73 |
+
"architecture": "Hopper",
|
| 74 |
+
"uuid": "GPU-d4201f0e-1b44-4327-9747-b62cca7ab4bf"
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"name": "NVIDIA H200",
|
| 78 |
+
"memoryTotal": "150754820096",
|
| 79 |
+
"cudaCores": 16896,
|
| 80 |
+
"architecture": "Hopper",
|
| 81 |
+
"uuid": "GPU-8fa707f1-9cd9-9384-c9c9-3356b1ad04ec"
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"name": "NVIDIA H200",
|
| 85 |
+
"memoryTotal": "150754820096",
|
| 86 |
+
"cudaCores": 16896,
|
| 87 |
+
"architecture": "Hopper",
|
| 88 |
+
"uuid": "GPU-b096838e-c00a-4819-b204-35fab82f7d94"
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"name": "NVIDIA H200",
|
| 92 |
+
"memoryTotal": "150754820096",
|
| 93 |
+
"cudaCores": 16896,
|
| 94 |
+
"architecture": "Hopper",
|
| 95 |
+
"uuid": "GPU-cbcadfdd-c45c-2cf6-4408-00f7bce853af"
|
| 96 |
+
}
|
| 97 |
+
],
|
| 98 |
+
"cudaVersion": "13.0",
|
| 99 |
+
"slurm": {
|
| 100 |
+
"cluster_name": "cluster",
|
| 101 |
+
"conf": "/var/spool/slurmd/conf-cache/slurm.conf",
|
| 102 |
+
"cpu_bind": "quiet,mask_cpu:0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF",
|
| 103 |
+
"cpu_bind_list": "0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF",
|
| 104 |
+
"cpu_bind_type": "mask_cpu:",
|
| 105 |
+
"cpu_bind_verbose": "quiet",
|
| 106 |
+
"cpus_on_node": "128",
|
| 107 |
+
"cpus_per_task": "128",
|
| 108 |
+
"distribution": "cyclic",
|
| 109 |
+
"gpus_on_node": "8",
|
| 110 |
+
"gtids": "0",
|
| 111 |
+
"job_cpus_per_node": "128(x4)",
|
| 112 |
+
"job_end_time": "1777644571",
|
| 113 |
+
"job_gid": "1005",
|
| 114 |
+
"job_gpus": "0,1,2,3,4,5,6,7",
|
| 115 |
+
"job_id": "29527",
|
| 116 |
+
"job_name": "stem",
|
| 117 |
+
"job_nodelist": "compute-node-[14,0,43-44]",
|
| 118 |
+
"job_num_nodes": "4",
|
| 119 |
+
"job_partition": "high",
|
| 120 |
+
"job_start_time": "1777471771",
|
| 121 |
+
"job_uid": "1005",
|
| 122 |
+
"job_user": "xun",
|
| 123 |
+
"jobid": "29527",
|
| 124 |
+
"launch_node_ipaddr": "172.27.49.7",
|
| 125 |
+
"localid": "0",
|
| 126 |
+
"nnodes": "4",
|
| 127 |
+
"nodeid": "0",
|
| 128 |
+
"nodelist": "compute-node-[14,0,43-44]",
|
| 129 |
+
"nprocs": "4",
|
| 130 |
+
"ntasks": "4",
|
| 131 |
+
"ntasks_per_node": "1",
|
| 132 |
+
"output_mode": "standard",
|
| 133 |
+
"prio_process": "0",
|
| 134 |
+
"procid": "0",
|
| 135 |
+
"srun_comm_host": "172.27.49.7",
|
| 136 |
+
"srun_comm_port": "46439",
|
| 137 |
+
"step_gpus": "0,1,2,3,4,5,6,7",
|
| 138 |
+
"step_id": "0",
|
| 139 |
+
"step_launcher_port": "46439",
|
| 140 |
+
"step_nodelist": "compute-node-[14,0,43-44]",
|
| 141 |
+
"step_num_nodes": "4",
|
| 142 |
+
"step_num_tasks": "4",
|
| 143 |
+
"step_tasks_per_node": "1(x4)",
|
| 144 |
+
"stepid": "0",
|
| 145 |
+
"submit_dir": "/home/xun/rsadhukh/STEM",
|
| 146 |
+
"submit_host": "login-node-0",
|
| 147 |
+
"task_pid": "1153992",
|
| 148 |
+
"tasks_per_node": "1(x4)",
|
| 149 |
+
"topology_addr": "compute-node-14",
|
| 150 |
+
"topology_addr_pattern": "node",
|
| 151 |
+
"tres_per_task": "cpu:128",
|
| 152 |
+
"umask": "0000"
|
| 153 |
+
},
|
| 154 |
+
"writerId": "9u2zm8go5a3uc5rtfm8a0pkw2jgq3aas"
|
| 155 |
+
}
|
wandb/run-20260429_141040-a48q7rq3/logs/debug-core.log
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-04-29T14:10:40.817255983Z","level":"INFO","msg":"main: starting server","port-filename":"/scratch/local/xun/tmp/tmpowh5z38_/port-1154327.txt","pid":1154327,"detached":false,"idle-timeout":600000000000,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2026-04-29T14:10:40.817691432Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/scratch/local/xun/tmp/wandb-1154327-1155293-1046663400/socket","Net":"unix"}}
|
| 3 |
+
{"time":"2026-04-29T14:10:40.817799249Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1154327}
|
| 4 |
+
{"time":"2026-04-29T14:10:40.993605114Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2026-04-29T14:10:41.005163272Z","level":"INFO","msg":"handleInformInit: received","streamId":"a48q7rq3","id":"1(@)"}
|
| 6 |
+
{"time":"2026-04-29T14:10:41.572316485Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"a48q7rq3","id":"1(@)"}
|
| 7 |
+
{"time":"2026-04-29T14:10:46.336160157Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 8 |
+
{"time":"2026-04-29T14:10:46.336722922Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 9 |
+
{"time":"2026-04-29T14:10:46.336728521Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 10 |
+
{"time":"2026-04-29T14:10:47.338902098Z","level":"INFO","msg":"server: parent process exited, terminating service process"}
|
wandb/run-20260429_141040-a48q7rq3/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-04-29T14:10:41.006690649Z","level":"INFO","msg":"wandb-core"}
|
| 2 |
+
{"time":"2026-04-29T14:10:41.007471115Z","level":"INFO","msg":"stream: starting","core version":"0.26.0"}
|
| 3 |
+
{"time":"2026-04-29T14:10:41.569129518Z","level":"INFO","msg":"stream: created new stream","id":"a48q7rq3"}
|
| 4 |
+
{"time":"2026-04-29T14:10:41.569186597Z","level":"INFO","msg":"handler: started"}
|
| 5 |
+
{"time":"2026-04-29T14:10:41.572303621Z","level":"INFO","msg":"stream: started"}
|
| 6 |
+
{"time":"2026-04-29T14:10:41.572316534Z","level":"INFO","msg":"writer: started","stream_id":"a48q7rq3"}
|
| 7 |
+
{"time":"2026-04-29T14:10:41.572338006Z","level":"INFO","msg":"sender: started"}
|
| 8 |
+
{"time":"2026-04-29T14:10:43.069674591Z","level":"INFO","msg":"filestream: sending request","total_files":1,"console_offset":0,"console_lines":1}
|
| 9 |
+
{"time":"2026-04-29T14:10:46.584197837Z","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
wandb/run-20260429_141040-a48q7rq3/logs/debug.log
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-04-29 14:10:40,735 INFO MainThread:1154327 [wandb_setup.py:_flush():81] Current SDK version is 0.26.0
|
| 2 |
+
2026-04-29 14:10:40,735 INFO MainThread:1154327 [wandb_setup.py:_flush():81] Configure stats pid to 1154327
|
| 3 |
+
2026-04-29 14:10:40,735 INFO MainThread:1154327 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-04-29 14:10:40,735 INFO MainThread:1154327 [wandb_init.py:setup_run_log_directory():721] Logging user logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_141040-a48q7rq3/logs/debug.log
|
| 5 |
+
2026-04-29 14:10:40,736 INFO MainThread:1154327 [wandb_init.py:setup_run_log_directory():722] Logging internal logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_141040-a48q7rq3/logs/debug-internal.log
|
| 6 |
+
2026-04-29 14:10:40,736 INFO MainThread:1154327 [wandb_init.py:init():848] calling init triggers
|
| 7 |
+
2026-04-29 14:10:40,736 INFO MainThread:1154327 [wandb_init.py:init():853] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'dump_dir': '/home/xun/rsadhukh/STEM/logs/midfine_base_final', 'seed': 777, 'model_type': 'olmo3', 'stem_up_proj_layers': [], 'grad_acc_steps': 2, 'gc_collect_freq': 1000, 'probe_freq': 100, 'steps': 50000, 'stage_steps': None, 'data': {'root_dir': '/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/', 'sources': {'cranecode': 10.0, 'stack_edu': 10.0, 'cranemath': 5.63, 'dolmino-math': 10.7, 'megamatt': 1.73, 'tinymath-mind': 0.9, 'tinymath-pot': 0.24, 'reddit_to_flashcards': 5.9, 'wiki_to_rcqa': 3.0, 'nemotron-synth-qa': 5.0, 'math-meta-reasoning': 0.38, 'code-meta-reasoning': 0.46, 'program_verifiable': 0.16, 'qwq-reasoning-traces': 1.87, 'openthoughts2': 1.25, 'general_reasoning_mix': 1.87, 'gemini-reasoning-traces': 0.25, 'tulu-3-sft': 1.1, 'dolmino_1-flan': 5.0, 'olmocr_science_pdfs': 5.0, 'stem-heavy-crawl': 5.0, 'common_crawl-high-quality': 22.5}, 'node_local': False, 'batch_size': 8, 'seq_len': 4096, 'n_views': 2, 'seed': 42, 'add_bos': True, 'add_eos': True, 'load_async': True, 'prefetch_size': 1024, 'tokenizer': {'name': 'huggingface', 'path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/'}, 'track_packed_source_mixture': True, 'packed_source_counts': None}, 'optim': {'lr': 7.44e-05, 'weight_decay': 0.1, 'epsilon': 1e-08, 'beta1': 0.9, 'beta2': 0.95, 'clip': 1.0, 'scheduler': 'linear', 'warmup': 0, 'lr_min_ratio': 0.0, 'cycle_length': 1.0, 'cosine_theta': 1.0, 'annealing_step': 1000, 'decay_fraction': 0.1, 'exp_factor': 0.5, 'initial_token_offset': 0, 'global_final_step': None}, 'model': {'dim': 2048, 'n_layers': 16, 'head_dim': 128, 'n_heads': 16, 'n_kv_heads': 16, 'ffn_dim_multiplier': 1.5, 'multiple_of': 256, 'norm_eps': 1e-06, 'rope_theta': 500000.0, 'rope_scaling': None, 'init_base_std': 0.02, 'init_std_factor': 'disabled', 'max_seqlen': 4096, 'seed': 42, 'vocab_size': 100352, 'weight_tying': False, 'sliding_window': None}, 'distributed': {'dp_shard': 1, 'dp_replicate': 32, 'tp_size': 1, 'selective_activation_checkpointing': False, 'compile': True, 'fsdp_type': 'full_shard', 'model_dtype': 'bf16', 'float8_recipe': None, 'float8_filter': 'layers\\.[0-9]+\\.', 'matmul_allow_tf32': False, 'detect_anomaly': False, 'compile_cache_size_limit': 8, 'spawn_method': 'forkserver', 'stem_parallel_size': 8}, 'env': {'MKL_SERVICE_FORCE_INTEL': 'GNU', 'OMP_NUM_THREADS': '1', 'MKL_NUM_THREADS': '1', 'ENABLE_INTRA_NODE_COMM': '1', 'TORCH_NCCL_AVOID_RECORD_STREAMS': '1', 'NCCL_IB_TIMEOUT': '22', 'NCCL_DEBUG': 'INFO', 'TORCH_NCCL_ASYNC_ERROR_HANDLING': '1'}, 'checkpoint': {'dump': {'every': 5000, 'keep': 1}, 'eval': {'every': 100000, 'keep': 1}, 'path': '/home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints', 'init_ckpt_path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/', 'continue_training_from_init': True, 'legacy_init_ckpt_lm_transformer': False, 'merge_lm_optim_seed_ckpt_path': None}, 'profiling': {'run': True, 'trace_folder': 'profiling', 'mem_warmup': 100, 'mem_steps': 2, 'profile_warmup': 102, 'profile_steps': 2}, 'logging': {'freq': 10, 'acc_freq': None, 'wandb': {'job_type': None, 'dir': None, 'project': 'stem', 'entity': None, 'tags': None, 'group': None, 'name': 'olmo2_1B_midfine', 'notes': None, 'config_exclude_keys': None, 'config_include_keys': None, 'anonymous': None, 'mode': None, 'allow_val_change': None, 'resume': None, 'force': None, 'tensorboard': None, 'sync_tensorboard': None, 'monitor_gym': None, 'save_code': None, 'id': None, 'fork_from': None, 'resume_from': None}}, 'async_eval_gpus': None, 'eval': {'generator': {'max_tokens': 16384, 'dtype': 'bf16', 'temperature': 1.0, 'top_p': 0.95}, 'harness': {'tasks': [{'task': 'hellaswag', 'dataset_path': '/data/rsadhukh/eval_data/hellaswag'}, {'task': 'boolq', 'dataset_path': '/data/rsadhukh/eval_data/super_glue'}, {'task': 'piqa', 'dataset_path': '/data/rsadhukh/eval_data/piqa'}, {'task': 'winogrande', 'dataset_path': '/data/rsadhukh/eval_data/winogrande'}, {'task': 'openbookqa', 'dataset_path': '/data/rsadhukh/eval_data/openbookqa'}, {'task': 'arc_easy', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}, {'task': 'arc_challenge', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}], 'confirm_run_unsafe_code': True, 'batch_size': 64}, 'validation': None}, '_wandb': {}}
|
| 9 |
+
2026-04-29 14:10:40,736 INFO MainThread:1154327 [wandb_init.py:init():896] starting backend
|
| 10 |
+
2026-04-29 14:10:40,993 INFO MainThread:1154327 [wandb_init.py:init():911] sending inform_init request
|
| 11 |
+
2026-04-29 14:10:41,002 INFO MainThread:1154327 [wandb_init.py:init():919] backend started and connected
|
| 12 |
+
2026-04-29 14:10:41,003 INFO MainThread:1154327 [wandb_init.py:init():989] updated telemetry
|
| 13 |
+
2026-04-29 14:10:41,025 INFO MainThread:1154327 [wandb_init.py:init():1013] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-04-29 14:10:42,750 INFO MainThread:1154327 [wandb_init.py:init():1058] starting run threads in backend
|
| 15 |
+
2026-04-29 14:10:43,064 INFO MainThread:1154327 [wandb_run.py:_console_start():2542] atexit reg
|
| 16 |
+
2026-04-29 14:10:43,064 INFO MainThread:1154327 [wandb_run.py:_redirect():2391] redirect: wrap_raw
|
| 17 |
+
2026-04-29 14:10:43,064 INFO MainThread:1154327 [wandb_run.py:_redirect():2460] Wrapping output streams.
|
| 18 |
+
2026-04-29 14:10:43,064 INFO MainThread:1154327 [wandb_run.py:_redirect():2483] Redirects installed.
|
| 19 |
+
2026-04-29 14:10:43,068 INFO MainThread:1154327 [wandb_init.py:init():1098] run started, returning control to user process
|
wandb/run-20260429_141040-a48q7rq3/run-a48q7rq3.wandb
ADDED
|
Binary file (7 Bytes). View file
|
|
|
wandb/run-20260429_153552-r20yn80u/files/config.yaml
ADDED
|
@@ -0,0 +1,359 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.26.0
|
| 4 |
+
e:
|
| 5 |
+
i4ocjyr9csg8kju0tej1pg06av2k8k96:
|
| 6 |
+
args:
|
| 7 |
+
- config=apps/main/configs/olmo2_1B_midfine.yaml
|
| 8 |
+
- dump_dir=/home/xun/rsadhukh/STEM/logs/midfine_base_final
|
| 9 |
+
- checkpoint.init_ckpt_path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/
|
| 10 |
+
- checkpoint.continue_training_from_init=true
|
| 11 |
+
- checkpoint.dump.every=5000
|
| 12 |
+
- checkpoint.eval.every=100000
|
| 13 |
+
- checkpoint.dump.keep=1
|
| 14 |
+
- data.root_dir=/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/
|
| 15 |
+
- data.node_local=false
|
| 16 |
+
- data.tokenizer.path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/
|
| 17 |
+
- logging.wandb.name=midfine_base_final
|
| 18 |
+
cpu_count: 88
|
| 19 |
+
cpu_count_logical: 176
|
| 20 |
+
cudaVersion: "13.0"
|
| 21 |
+
disk:
|
| 22 |
+
/:
|
| 23 |
+
total: "133003395072"
|
| 24 |
+
used: "103744323584"
|
| 25 |
+
email: rsadhukh@andrew.cmu.edu
|
| 26 |
+
executable: /home/xun/rsadhukh/STEM/stem/bin/python
|
| 27 |
+
git:
|
| 28 |
+
commit: 7e450007299a777d774d6e2b598001cc7552c1b4
|
| 29 |
+
remote: https://github.com/Infini-AI-Lab/STEM.git
|
| 30 |
+
gpu: NVIDIA H200
|
| 31 |
+
gpu_count: 8
|
| 32 |
+
gpu_nvidia:
|
| 33 |
+
- architecture: Hopper
|
| 34 |
+
cudaCores: 16896
|
| 35 |
+
memoryTotal: "150754820096"
|
| 36 |
+
name: NVIDIA H200
|
| 37 |
+
uuid: GPU-dbeb9076-fd61-4013-987f-938d1db8b786
|
| 38 |
+
- architecture: Hopper
|
| 39 |
+
cudaCores: 16896
|
| 40 |
+
memoryTotal: "150754820096"
|
| 41 |
+
name: NVIDIA H200
|
| 42 |
+
uuid: GPU-5b9b54c7-efcf-6e85-08ed-f6e6f61cfa7a
|
| 43 |
+
- architecture: Hopper
|
| 44 |
+
cudaCores: 16896
|
| 45 |
+
memoryTotal: "150754820096"
|
| 46 |
+
name: NVIDIA H200
|
| 47 |
+
uuid: GPU-df8b695a-d295-cf7c-ab3b-b6d764b11fdf
|
| 48 |
+
- architecture: Hopper
|
| 49 |
+
cudaCores: 16896
|
| 50 |
+
memoryTotal: "150754820096"
|
| 51 |
+
name: NVIDIA H200
|
| 52 |
+
uuid: GPU-c7480abd-eae7-8916-7803-4e033e94aaa0
|
| 53 |
+
- architecture: Hopper
|
| 54 |
+
cudaCores: 16896
|
| 55 |
+
memoryTotal: "150754820096"
|
| 56 |
+
name: NVIDIA H200
|
| 57 |
+
uuid: GPU-91d17507-e0ee-813d-211c-6dbbe87e7f52
|
| 58 |
+
- architecture: Hopper
|
| 59 |
+
cudaCores: 16896
|
| 60 |
+
memoryTotal: "150754820096"
|
| 61 |
+
name: NVIDIA H200
|
| 62 |
+
uuid: GPU-fcbf89aa-71d3-6603-a80d-9bfbd3f063a2
|
| 63 |
+
- architecture: Hopper
|
| 64 |
+
cudaCores: 16896
|
| 65 |
+
memoryTotal: "150754820096"
|
| 66 |
+
name: NVIDIA H200
|
| 67 |
+
uuid: GPU-10b440b6-fedb-33fe-cad7-b1b1dfd65816
|
| 68 |
+
- architecture: Hopper
|
| 69 |
+
cudaCores: 16896
|
| 70 |
+
memoryTotal: "150754820096"
|
| 71 |
+
name: NVIDIA H200
|
| 72 |
+
uuid: GPU-7da58531-cbc7-dedf-3a29-6540eaf04fe7
|
| 73 |
+
host: compute-node-3
|
| 74 |
+
memory:
|
| 75 |
+
total: "2071474647040"
|
| 76 |
+
os: Linux-5.15.0-131-generic-x86_64-with-glibc2.35
|
| 77 |
+
program: -m apps.main.train
|
| 78 |
+
python: CPython 3.11.9
|
| 79 |
+
root: /home/xun/rsadhukh/STEM/logs/midfine_base_final
|
| 80 |
+
slurm:
|
| 81 |
+
cluster_name: cluster
|
| 82 |
+
conf: /var/spool/slurmd/conf-cache/slurm.conf
|
| 83 |
+
cpu_bind: quiet,mask_cpu:0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF
|
| 84 |
+
cpu_bind_list: 0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF
|
| 85 |
+
cpu_bind_type: 'mask_cpu:'
|
| 86 |
+
cpu_bind_verbose: quiet
|
| 87 |
+
cpus_on_node: "128"
|
| 88 |
+
cpus_per_task: "128"
|
| 89 |
+
distribution: cyclic
|
| 90 |
+
gpus_on_node: "8"
|
| 91 |
+
gtids: "0"
|
| 92 |
+
job_cpus_per_node: 128(x4)
|
| 93 |
+
job_end_time: "1777649690"
|
| 94 |
+
job_gid: "1005"
|
| 95 |
+
job_gpus: 0,1,2,3,4,5,6,7
|
| 96 |
+
job_id: "29546"
|
| 97 |
+
job_name: stem
|
| 98 |
+
job_nodelist: compute-node-[3,7,46-47]
|
| 99 |
+
job_num_nodes: "4"
|
| 100 |
+
job_partition: high
|
| 101 |
+
job_start_time: "1777476890"
|
| 102 |
+
job_uid: "1005"
|
| 103 |
+
job_user: xun
|
| 104 |
+
jobid: "29546"
|
| 105 |
+
launch_node_ipaddr: 172.27.61.166
|
| 106 |
+
localid: "0"
|
| 107 |
+
nnodes: "4"
|
| 108 |
+
nodeid: "0"
|
| 109 |
+
nodelist: compute-node-[3,7,46-47]
|
| 110 |
+
nprocs: "4"
|
| 111 |
+
ntasks: "4"
|
| 112 |
+
ntasks_per_node: "1"
|
| 113 |
+
output_mode: standard
|
| 114 |
+
prio_process: "0"
|
| 115 |
+
procid: "0"
|
| 116 |
+
srun_comm_host: 172.27.61.166
|
| 117 |
+
srun_comm_port: "33673"
|
| 118 |
+
step_gpus: 0,1,2,3,4,5,6,7
|
| 119 |
+
step_id: "0"
|
| 120 |
+
step_launcher_port: "33673"
|
| 121 |
+
step_nodelist: compute-node-[3,7,46-47]
|
| 122 |
+
step_num_nodes: "4"
|
| 123 |
+
step_num_tasks: "4"
|
| 124 |
+
step_tasks_per_node: 1(x4)
|
| 125 |
+
stepid: "0"
|
| 126 |
+
submit_dir: /home/xun/rsadhukh/STEM
|
| 127 |
+
submit_host: login-node-0
|
| 128 |
+
task_pid: "469971"
|
| 129 |
+
tasks_per_node: 1(x4)
|
| 130 |
+
topology_addr: compute-node-3
|
| 131 |
+
topology_addr_pattern: node
|
| 132 |
+
tres_per_task: cpu:128
|
| 133 |
+
umask: "0000"
|
| 134 |
+
startedAt: "2026-04-29T15:35:52.106818Z"
|
| 135 |
+
writerId: i4ocjyr9csg8kju0tej1pg06av2k8k96
|
| 136 |
+
m: []
|
| 137 |
+
python_version: 3.11.9
|
| 138 |
+
t:
|
| 139 |
+
"1":
|
| 140 |
+
- 1
|
| 141 |
+
- 5
|
| 142 |
+
- 11
|
| 143 |
+
- 49
|
| 144 |
+
- 53
|
| 145 |
+
"2":
|
| 146 |
+
- 1
|
| 147 |
+
- 5
|
| 148 |
+
- 11
|
| 149 |
+
- 49
|
| 150 |
+
- 51
|
| 151 |
+
- 53
|
| 152 |
+
- 100
|
| 153 |
+
- 105
|
| 154 |
+
"3":
|
| 155 |
+
- 13
|
| 156 |
+
- 16
|
| 157 |
+
- 61
|
| 158 |
+
"4": 3.11.9
|
| 159 |
+
"5": 0.26.0
|
| 160 |
+
"6": 5.1.0
|
| 161 |
+
"12": 0.26.0
|
| 162 |
+
"13": linux-x86_64
|
| 163 |
+
async_eval_gpus:
|
| 164 |
+
value: null
|
| 165 |
+
checkpoint:
|
| 166 |
+
value:
|
| 167 |
+
continue_training_from_init: true
|
| 168 |
+
dump:
|
| 169 |
+
every: 5000
|
| 170 |
+
keep: 1
|
| 171 |
+
eval:
|
| 172 |
+
every: 100000
|
| 173 |
+
keep: 1
|
| 174 |
+
init_ckpt_path: /data/rsadhukh/checkpoints/olmo2-1b-base-token4T/
|
| 175 |
+
legacy_init_ckpt_lm_transformer: false
|
| 176 |
+
merge_lm_optim_seed_ckpt_path: null
|
| 177 |
+
path: /home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints
|
| 178 |
+
data:
|
| 179 |
+
value:
|
| 180 |
+
add_bos: true
|
| 181 |
+
add_eos: true
|
| 182 |
+
batch_size: 8
|
| 183 |
+
load_async: true
|
| 184 |
+
n_views: 2
|
| 185 |
+
node_local: false
|
| 186 |
+
packed_source_counts: null
|
| 187 |
+
prefetch_size: 1024
|
| 188 |
+
root_dir: /home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/
|
| 189 |
+
seed: 42
|
| 190 |
+
seq_len: 4096
|
| 191 |
+
sources:
|
| 192 |
+
code-meta-reasoning: 0.46
|
| 193 |
+
common_crawl-high-quality: 22.5
|
| 194 |
+
cranecode: 10
|
| 195 |
+
cranemath: 5.63
|
| 196 |
+
dolmino-math: 10.7
|
| 197 |
+
dolmino_1-flan: 5
|
| 198 |
+
gemini-reasoning-traces: 0.25
|
| 199 |
+
general_reasoning_mix: 1.87
|
| 200 |
+
math-meta-reasoning: 0.38
|
| 201 |
+
megamatt: 1.73
|
| 202 |
+
nemotron-synth-qa: 5
|
| 203 |
+
olmocr_science_pdfs: 5
|
| 204 |
+
openthoughts2: 1.25
|
| 205 |
+
program_verifiable: 0.16
|
| 206 |
+
qwq-reasoning-traces: 1.87
|
| 207 |
+
reddit_to_flashcards: 5.9
|
| 208 |
+
stack_edu: 10
|
| 209 |
+
stem-heavy-crawl: 5
|
| 210 |
+
tinymath-mind: 0.9
|
| 211 |
+
tinymath-pot: 0.24
|
| 212 |
+
tulu-3-sft: 1.1
|
| 213 |
+
wiki_to_rcqa: 3
|
| 214 |
+
tokenizer:
|
| 215 |
+
name: huggingface
|
| 216 |
+
path: /data/rsadhukh/checkpoints/olmo2-1b-base-token4T/
|
| 217 |
+
track_packed_source_mixture: true
|
| 218 |
+
distributed:
|
| 219 |
+
value:
|
| 220 |
+
compile: true
|
| 221 |
+
compile_cache_size_limit: 8
|
| 222 |
+
detect_anomaly: false
|
| 223 |
+
dp_replicate: 32
|
| 224 |
+
dp_shard: 1
|
| 225 |
+
float8_filter: layers\.[0-9]+\.
|
| 226 |
+
float8_recipe: null
|
| 227 |
+
fsdp_type: full_shard
|
| 228 |
+
matmul_allow_tf32: false
|
| 229 |
+
model_dtype: bf16
|
| 230 |
+
selective_activation_checkpointing: false
|
| 231 |
+
spawn_method: forkserver
|
| 232 |
+
stem_parallel_size: 8
|
| 233 |
+
tp_size: 1
|
| 234 |
+
dump_dir:
|
| 235 |
+
value: /home/xun/rsadhukh/STEM/logs/midfine_base_final
|
| 236 |
+
env:
|
| 237 |
+
value:
|
| 238 |
+
ENABLE_INTRA_NODE_COMM: "1"
|
| 239 |
+
MKL_NUM_THREADS: "1"
|
| 240 |
+
MKL_SERVICE_FORCE_INTEL: GNU
|
| 241 |
+
NCCL_DEBUG: INFO
|
| 242 |
+
NCCL_IB_TIMEOUT: "22"
|
| 243 |
+
OMP_NUM_THREADS: "1"
|
| 244 |
+
TORCH_NCCL_ASYNC_ERROR_HANDLING: "1"
|
| 245 |
+
TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
|
| 246 |
+
eval:
|
| 247 |
+
value:
|
| 248 |
+
generator:
|
| 249 |
+
dtype: bf16
|
| 250 |
+
max_tokens: 16384
|
| 251 |
+
temperature: 1
|
| 252 |
+
top_p: 0.95
|
| 253 |
+
harness:
|
| 254 |
+
batch_size: 64
|
| 255 |
+
confirm_run_unsafe_code: true
|
| 256 |
+
tasks:
|
| 257 |
+
- dataset_path: /data/rsadhukh/eval_data/hellaswag
|
| 258 |
+
task: hellaswag
|
| 259 |
+
- dataset_path: /data/rsadhukh/eval_data/super_glue
|
| 260 |
+
task: boolq
|
| 261 |
+
- dataset_path: /data/rsadhukh/eval_data/piqa
|
| 262 |
+
task: piqa
|
| 263 |
+
- dataset_path: /data/rsadhukh/eval_data/winogrande
|
| 264 |
+
task: winogrande
|
| 265 |
+
- dataset_path: /data/rsadhukh/eval_data/openbookqa
|
| 266 |
+
task: openbookqa
|
| 267 |
+
- dataset_path: /data/rsadhukh/eval_data/ai2_arc
|
| 268 |
+
task: arc_easy
|
| 269 |
+
- dataset_path: /data/rsadhukh/eval_data/ai2_arc
|
| 270 |
+
task: arc_challenge
|
| 271 |
+
validation: null
|
| 272 |
+
gc_collect_freq:
|
| 273 |
+
value: 1000
|
| 274 |
+
grad_acc_steps:
|
| 275 |
+
value: 2
|
| 276 |
+
logging:
|
| 277 |
+
value:
|
| 278 |
+
acc_freq: null
|
| 279 |
+
freq: 10
|
| 280 |
+
wandb:
|
| 281 |
+
allow_val_change: null
|
| 282 |
+
anonymous: null
|
| 283 |
+
config_exclude_keys: null
|
| 284 |
+
config_include_keys: null
|
| 285 |
+
dir: null
|
| 286 |
+
entity: null
|
| 287 |
+
force: null
|
| 288 |
+
fork_from: null
|
| 289 |
+
group: null
|
| 290 |
+
id: null
|
| 291 |
+
job_type: null
|
| 292 |
+
mode: null
|
| 293 |
+
monitor_gym: null
|
| 294 |
+
name: olmo2_1B_midfine
|
| 295 |
+
notes: null
|
| 296 |
+
project: stem
|
| 297 |
+
resume: null
|
| 298 |
+
resume_from: null
|
| 299 |
+
save_code: null
|
| 300 |
+
sync_tensorboard: null
|
| 301 |
+
tags: null
|
| 302 |
+
tensorboard: null
|
| 303 |
+
model:
|
| 304 |
+
value:
|
| 305 |
+
dim: 2048
|
| 306 |
+
ffn_dim_multiplier: 1.5
|
| 307 |
+
head_dim: 128
|
| 308 |
+
init_base_std: 0.02
|
| 309 |
+
init_std_factor: disabled
|
| 310 |
+
max_seqlen: 4096
|
| 311 |
+
multiple_of: 256
|
| 312 |
+
n_heads: 16
|
| 313 |
+
n_kv_heads: 16
|
| 314 |
+
n_layers: 16
|
| 315 |
+
norm_eps: 1e-06
|
| 316 |
+
rope_scaling: null
|
| 317 |
+
rope_theta: 500000
|
| 318 |
+
seed: 42
|
| 319 |
+
sliding_window: null
|
| 320 |
+
vocab_size: 100352
|
| 321 |
+
weight_tying: false
|
| 322 |
+
model_type:
|
| 323 |
+
value: olmo3
|
| 324 |
+
optim:
|
| 325 |
+
value:
|
| 326 |
+
annealing_step: 1000
|
| 327 |
+
beta1: 0.9
|
| 328 |
+
beta2: 0.95
|
| 329 |
+
clip: 1
|
| 330 |
+
cosine_theta: 1
|
| 331 |
+
cycle_length: 1
|
| 332 |
+
decay_fraction: 0.1
|
| 333 |
+
epsilon: 1e-08
|
| 334 |
+
exp_factor: 0.5
|
| 335 |
+
global_final_step: null
|
| 336 |
+
initial_token_offset: 0
|
| 337 |
+
lr: 7.44e-05
|
| 338 |
+
lr_min_ratio: 0
|
| 339 |
+
scheduler: linear
|
| 340 |
+
warmup: 0
|
| 341 |
+
weight_decay: 0.1
|
| 342 |
+
probe_freq:
|
| 343 |
+
value: 100
|
| 344 |
+
profiling:
|
| 345 |
+
value:
|
| 346 |
+
mem_steps: 2
|
| 347 |
+
mem_warmup: 100
|
| 348 |
+
profile_steps: 2
|
| 349 |
+
profile_warmup: 102
|
| 350 |
+
run: true
|
| 351 |
+
trace_folder: profiling
|
| 352 |
+
seed:
|
| 353 |
+
value: 777
|
| 354 |
+
stage_steps:
|
| 355 |
+
value: null
|
| 356 |
+
stem_up_proj_layers:
|
| 357 |
+
value: []
|
| 358 |
+
steps:
|
| 359 |
+
value: 50000
|
wandb/run-20260429_153552-r20yn80u/files/media/html/memory_trace_15050_79effaa90bfee7eb3207.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
wandb/run-20260429_153552-r20yn80u/files/media/html/profile_trace_15051_ae282608c6eeb7f48826.html
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
<base target="_blank"><link rel="stylesheet" type="text/css" href="https://app.wandb.ai/normalize.css" />/home/xun/rsadhukh/STEM/logs/midfine_base_final/profiling/profile_CPU_CUDA_000104/rank00_compute-node-14_1060320.1777425671449116530.pt.trace.html.gz
|
wandb/run-20260429_153552-r20yn80u/files/output.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
wandb/run-20260429_153552-r20yn80u/files/requirements.txt
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
DataProperty==1.1.0
|
| 2 |
+
absl-py==2.4.0
|
| 3 |
+
aiohappyeyeballs==2.6.1
|
| 4 |
+
aiohttp==3.13.5
|
| 5 |
+
aiosignal==1.4.0
|
| 6 |
+
annotated-doc==0.0.4
|
| 7 |
+
annotated-types==0.7.0
|
| 8 |
+
antlr4-python3-runtime==4.9.3
|
| 9 |
+
anyio==4.13.0
|
| 10 |
+
argon2-cffi==25.1.0
|
| 11 |
+
argon2-cffi-bindings==25.1.0
|
| 12 |
+
arrow==1.4.0
|
| 13 |
+
asttokens==3.0.1
|
| 14 |
+
async-lru==2.3.0
|
| 15 |
+
attrs==26.1.0
|
| 16 |
+
babel==2.18.0
|
| 17 |
+
beautifulsoup4==4.14.3
|
| 18 |
+
bleach==6.3.0
|
| 19 |
+
blessed==1.38.0
|
| 20 |
+
blobfile==3.2.0
|
| 21 |
+
certifi==2026.2.25
|
| 22 |
+
cffi==2.0.0
|
| 23 |
+
chardet==5.2.0
|
| 24 |
+
charset-normalizer==3.4.7
|
| 25 |
+
click==8.3.2
|
| 26 |
+
colorama==0.4.6
|
| 27 |
+
comm==0.2.3
|
| 28 |
+
datasets==4.8.4
|
| 29 |
+
datatrove==0.9.0
|
| 30 |
+
debugpy==1.8.20
|
| 31 |
+
decorator==5.2.1
|
| 32 |
+
defusedxml==0.7.1
|
| 33 |
+
dill==0.4.1
|
| 34 |
+
evaluate==0.4.6
|
| 35 |
+
executing==2.2.1
|
| 36 |
+
fastjsonschema==2.21.2
|
| 37 |
+
filelock==3.28.0
|
| 38 |
+
fqdn==1.5.1
|
| 39 |
+
frozenlist==1.8.0
|
| 40 |
+
fsspec==2026.2.0
|
| 41 |
+
gitdb==4.0.12
|
| 42 |
+
GitPython==3.1.46
|
| 43 |
+
gpustat==1.1.1
|
| 44 |
+
h11==0.16.0
|
| 45 |
+
hf-xet==1.4.3
|
| 46 |
+
httpcore==1.0.9
|
| 47 |
+
httpx==0.28.1
|
| 48 |
+
huggingface_hub==1.11.0
|
| 49 |
+
humanize==4.15.0
|
| 50 |
+
idna==3.11
|
| 51 |
+
ipykernel==7.2.0
|
| 52 |
+
ipython==9.13.0
|
| 53 |
+
ipython_pygments_lexers==1.1.1
|
| 54 |
+
isoduration==20.11.0
|
| 55 |
+
jedi==0.19.2
|
| 56 |
+
Jinja2==3.1.6
|
| 57 |
+
joblib==1.5.3
|
| 58 |
+
json5==0.14.0
|
| 59 |
+
jsonlines==4.0.0
|
| 60 |
+
jsonpointer==3.1.1
|
| 61 |
+
jsonschema==4.26.0
|
| 62 |
+
jsonschema-specifications==2025.9.1
|
| 63 |
+
jupyter_client==8.8.0
|
| 64 |
+
jupyter_core==5.9.1
|
| 65 |
+
jupyter-events==0.12.1
|
| 66 |
+
jupyter-lsp==2.3.1
|
| 67 |
+
jupyter_server==2.17.0
|
| 68 |
+
jupyter_server_terminals==0.5.4
|
| 69 |
+
jupyterlab==4.5.6
|
| 70 |
+
jupyterlab_pygments==0.3.0
|
| 71 |
+
jupyterlab_server==2.28.0
|
| 72 |
+
lark==1.3.1
|
| 73 |
+
lm_eval==0.4.11
|
| 74 |
+
loguru==0.7.3
|
| 75 |
+
lxml==6.1.0
|
| 76 |
+
markdown-it-py==4.0.0
|
| 77 |
+
MarkupSafe==3.0.3
|
| 78 |
+
matplotlib-inline==0.2.1
|
| 79 |
+
mbstrdecoder==1.1.4
|
| 80 |
+
mdurl==0.1.2
|
| 81 |
+
mistune==3.2.0
|
| 82 |
+
more-itertools==11.0.2
|
| 83 |
+
mpmath==1.3.0
|
| 84 |
+
msgspec==0.21.1
|
| 85 |
+
multidict==6.7.1
|
| 86 |
+
multiprocess==0.70.19
|
| 87 |
+
nbclient==0.10.4
|
| 88 |
+
nbconvert==7.17.1
|
| 89 |
+
nbformat==5.10.4
|
| 90 |
+
nest-asyncio==1.6.0
|
| 91 |
+
networkx==3.6.1
|
| 92 |
+
nltk==3.9.4
|
| 93 |
+
notebook_shim==0.2.4
|
| 94 |
+
numpy==2.4.4
|
| 95 |
+
nvidia-cublas-cu12==12.8.4.1
|
| 96 |
+
nvidia-cuda-cupti-cu12==12.8.90
|
| 97 |
+
nvidia-cuda-nvrtc-cu12==12.8.93
|
| 98 |
+
nvidia-cuda-runtime-cu12==12.8.90
|
| 99 |
+
nvidia-cudnn-cu12==9.10.2.21
|
| 100 |
+
nvidia-cufft-cu12==11.3.3.83
|
| 101 |
+
nvidia-cufile-cu12==1.13.1.3
|
| 102 |
+
nvidia-curand-cu12==10.3.9.90
|
| 103 |
+
nvidia-cusolver-cu12==11.7.3.90
|
| 104 |
+
nvidia-cusparse-cu12==12.5.8.93
|
| 105 |
+
nvidia-cusparselt-cu12==0.7.1
|
| 106 |
+
nvidia-ml-py==13.595.45
|
| 107 |
+
nvidia-nccl-cu12==2.27.3
|
| 108 |
+
nvidia-nvjitlink-cu12==12.8.93
|
| 109 |
+
nvidia-nvtx-cu12==12.8.90
|
| 110 |
+
objprint==0.3.0
|
| 111 |
+
omegaconf==2.3.0
|
| 112 |
+
orjson==3.11.8
|
| 113 |
+
overrides==7.7.0
|
| 114 |
+
packaging==26.1
|
| 115 |
+
pandas==3.0.2
|
| 116 |
+
pandocfilters==1.5.1
|
| 117 |
+
parso==0.8.6
|
| 118 |
+
pathvalidate==3.3.1
|
| 119 |
+
pexpect==4.9.0
|
| 120 |
+
pip==26.0.1
|
| 121 |
+
platformdirs==4.9.6
|
| 122 |
+
portalocker==3.2.0
|
| 123 |
+
prometheus_client==0.25.0
|
| 124 |
+
prompt_toolkit==3.0.52
|
| 125 |
+
propcache==0.4.1
|
| 126 |
+
protobuf==7.34.1
|
| 127 |
+
psutil==7.2.2
|
| 128 |
+
ptyprocess==0.7.0
|
| 129 |
+
pure_eval==0.2.3
|
| 130 |
+
pyarrow==23.0.1
|
| 131 |
+
pycparser==3.0
|
| 132 |
+
pycryptodomex==3.23.0
|
| 133 |
+
pydantic==2.13.2
|
| 134 |
+
pydantic_core==2.46.2
|
| 135 |
+
Pygments==2.20.0
|
| 136 |
+
pynvml==13.0.1
|
| 137 |
+
pytablewriter==1.2.1
|
| 138 |
+
python-dateutil==2.9.0.post0
|
| 139 |
+
python-json-logger==4.1.0
|
| 140 |
+
pytz==2026.1.post1
|
| 141 |
+
PyYAML==6.0.3
|
| 142 |
+
pyzmq==27.1.0
|
| 143 |
+
referencing==0.37.0
|
| 144 |
+
regex==2026.4.4
|
| 145 |
+
requests==2.33.1
|
| 146 |
+
rfc3339-validator==0.1.4
|
| 147 |
+
rfc3986-validator==0.1.1
|
| 148 |
+
rfc3987-syntax==1.1.0
|
| 149 |
+
rich==15.0.0
|
| 150 |
+
rouge_score==0.1.2
|
| 151 |
+
rpds-py==0.30.0
|
| 152 |
+
sacrebleu==2.6.0
|
| 153 |
+
safetensors==0.7.0
|
| 154 |
+
scikit-learn==1.8.0
|
| 155 |
+
scipy==1.17.1
|
| 156 |
+
Send2Trash==2.1.0
|
| 157 |
+
sentencepiece==0.2.1
|
| 158 |
+
sentry-sdk==2.58.0
|
| 159 |
+
setuptools==65.5.0
|
| 160 |
+
shellingham==1.5.4
|
| 161 |
+
six==1.17.0
|
| 162 |
+
smmap==5.0.3
|
| 163 |
+
soupsieve==2.8.3
|
| 164 |
+
sqlitedict==2.1.0
|
| 165 |
+
stack-data==0.6.3
|
| 166 |
+
sympy==1.14.0
|
| 167 |
+
tabledata==1.3.4
|
| 168 |
+
tabulate==0.10.0
|
| 169 |
+
tcolorpy==0.1.7
|
| 170 |
+
terminado==0.18.1
|
| 171 |
+
threadpoolctl==3.6.0
|
| 172 |
+
tiktoken==0.12.0
|
| 173 |
+
tinycss2==1.4.0
|
| 174 |
+
tokenizers==0.22.2
|
| 175 |
+
torch==2.8.0
|
| 176 |
+
tornado==6.5.5
|
| 177 |
+
tqdm==4.67.3
|
| 178 |
+
traitlets==5.14.3
|
| 179 |
+
transformers==5.1.0
|
| 180 |
+
triton==3.4.0
|
| 181 |
+
typepy==1.3.4
|
| 182 |
+
typer==0.24.1
|
| 183 |
+
typer-slim==0.24.0
|
| 184 |
+
typing_extensions==4.15.0
|
| 185 |
+
typing-inspection==0.4.2
|
| 186 |
+
tzdata==2026.2
|
| 187 |
+
uri-template==1.3.0
|
| 188 |
+
urllib3==2.6.3
|
| 189 |
+
viztracer==1.1.1
|
| 190 |
+
wandb==0.26.0
|
| 191 |
+
wcwidth==0.6.0
|
| 192 |
+
webcolors==25.10.0
|
| 193 |
+
webencodings==0.5.1
|
| 194 |
+
websocket-client==1.9.0
|
| 195 |
+
word2number==1.1
|
| 196 |
+
xformers==0.0.32.post1
|
| 197 |
+
xxhash==3.6.0
|
| 198 |
+
yarl==1.23.0
|
| 199 |
+
zstandard==0.25.0
|
wandb/run-20260429_153552-r20yn80u/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-131-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.9",
|
| 4 |
+
"startedAt": "2026-04-29T15:35:52.106818Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"config=apps/main/configs/olmo2_1B_midfine.yaml",
|
| 7 |
+
"dump_dir=/home/xun/rsadhukh/STEM/logs/midfine_base_final",
|
| 8 |
+
"checkpoint.init_ckpt_path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/",
|
| 9 |
+
"checkpoint.continue_training_from_init=true",
|
| 10 |
+
"checkpoint.dump.every=5000",
|
| 11 |
+
"checkpoint.eval.every=100000",
|
| 12 |
+
"checkpoint.dump.keep=1",
|
| 13 |
+
"data.root_dir=/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/",
|
| 14 |
+
"data.node_local=false",
|
| 15 |
+
"data.tokenizer.path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/",
|
| 16 |
+
"logging.wandb.name=midfine_base_final"
|
| 17 |
+
],
|
| 18 |
+
"program": "-m apps.main.train",
|
| 19 |
+
"git": {
|
| 20 |
+
"remote": "https://github.com/Infini-AI-Lab/STEM.git",
|
| 21 |
+
"commit": "7e450007299a777d774d6e2b598001cc7552c1b4"
|
| 22 |
+
},
|
| 23 |
+
"email": "rsadhukh@andrew.cmu.edu",
|
| 24 |
+
"root": "/home/xun/rsadhukh/STEM/logs/midfine_base_final",
|
| 25 |
+
"host": "compute-node-3",
|
| 26 |
+
"executable": "/home/xun/rsadhukh/STEM/stem/bin/python",
|
| 27 |
+
"cpu_count": 88,
|
| 28 |
+
"cpu_count_logical": 176,
|
| 29 |
+
"gpu": "NVIDIA H200",
|
| 30 |
+
"gpu_count": 8,
|
| 31 |
+
"disk": {
|
| 32 |
+
"/": {
|
| 33 |
+
"total": "133003395072",
|
| 34 |
+
"used": "103744323584"
|
| 35 |
+
}
|
| 36 |
+
},
|
| 37 |
+
"memory": {
|
| 38 |
+
"total": "2071474647040"
|
| 39 |
+
},
|
| 40 |
+
"gpu_nvidia": [
|
| 41 |
+
{
|
| 42 |
+
"name": "NVIDIA H200",
|
| 43 |
+
"memoryTotal": "150754820096",
|
| 44 |
+
"cudaCores": 16896,
|
| 45 |
+
"architecture": "Hopper",
|
| 46 |
+
"uuid": "GPU-dbeb9076-fd61-4013-987f-938d1db8b786"
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"name": "NVIDIA H200",
|
| 50 |
+
"memoryTotal": "150754820096",
|
| 51 |
+
"cudaCores": 16896,
|
| 52 |
+
"architecture": "Hopper",
|
| 53 |
+
"uuid": "GPU-5b9b54c7-efcf-6e85-08ed-f6e6f61cfa7a"
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"name": "NVIDIA H200",
|
| 57 |
+
"memoryTotal": "150754820096",
|
| 58 |
+
"cudaCores": 16896,
|
| 59 |
+
"architecture": "Hopper",
|
| 60 |
+
"uuid": "GPU-df8b695a-d295-cf7c-ab3b-b6d764b11fdf"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"name": "NVIDIA H200",
|
| 64 |
+
"memoryTotal": "150754820096",
|
| 65 |
+
"cudaCores": 16896,
|
| 66 |
+
"architecture": "Hopper",
|
| 67 |
+
"uuid": "GPU-c7480abd-eae7-8916-7803-4e033e94aaa0"
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"name": "NVIDIA H200",
|
| 71 |
+
"memoryTotal": "150754820096",
|
| 72 |
+
"cudaCores": 16896,
|
| 73 |
+
"architecture": "Hopper",
|
| 74 |
+
"uuid": "GPU-91d17507-e0ee-813d-211c-6dbbe87e7f52"
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"name": "NVIDIA H200",
|
| 78 |
+
"memoryTotal": "150754820096",
|
| 79 |
+
"cudaCores": 16896,
|
| 80 |
+
"architecture": "Hopper",
|
| 81 |
+
"uuid": "GPU-fcbf89aa-71d3-6603-a80d-9bfbd3f063a2"
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"name": "NVIDIA H200",
|
| 85 |
+
"memoryTotal": "150754820096",
|
| 86 |
+
"cudaCores": 16896,
|
| 87 |
+
"architecture": "Hopper",
|
| 88 |
+
"uuid": "GPU-10b440b6-fedb-33fe-cad7-b1b1dfd65816"
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"name": "NVIDIA H200",
|
| 92 |
+
"memoryTotal": "150754820096",
|
| 93 |
+
"cudaCores": 16896,
|
| 94 |
+
"architecture": "Hopper",
|
| 95 |
+
"uuid": "GPU-7da58531-cbc7-dedf-3a29-6540eaf04fe7"
|
| 96 |
+
}
|
| 97 |
+
],
|
| 98 |
+
"cudaVersion": "13.0",
|
| 99 |
+
"slurm": {
|
| 100 |
+
"cluster_name": "cluster",
|
| 101 |
+
"conf": "/var/spool/slurmd/conf-cache/slurm.conf",
|
| 102 |
+
"cpu_bind": "quiet,mask_cpu:0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF",
|
| 103 |
+
"cpu_bind_list": "0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF",
|
| 104 |
+
"cpu_bind_type": "mask_cpu:",
|
| 105 |
+
"cpu_bind_verbose": "quiet",
|
| 106 |
+
"cpus_on_node": "128",
|
| 107 |
+
"cpus_per_task": "128",
|
| 108 |
+
"distribution": "cyclic",
|
| 109 |
+
"gpus_on_node": "8",
|
| 110 |
+
"gtids": "0",
|
| 111 |
+
"job_cpus_per_node": "128(x4)",
|
| 112 |
+
"job_end_time": "1777649690",
|
| 113 |
+
"job_gid": "1005",
|
| 114 |
+
"job_gpus": "0,1,2,3,4,5,6,7",
|
| 115 |
+
"job_id": "29546",
|
| 116 |
+
"job_name": "stem",
|
| 117 |
+
"job_nodelist": "compute-node-[3,7,46-47]",
|
| 118 |
+
"job_num_nodes": "4",
|
| 119 |
+
"job_partition": "high",
|
| 120 |
+
"job_start_time": "1777476890",
|
| 121 |
+
"job_uid": "1005",
|
| 122 |
+
"job_user": "xun",
|
| 123 |
+
"jobid": "29546",
|
| 124 |
+
"launch_node_ipaddr": "172.27.61.166",
|
| 125 |
+
"localid": "0",
|
| 126 |
+
"nnodes": "4",
|
| 127 |
+
"nodeid": "0",
|
| 128 |
+
"nodelist": "compute-node-[3,7,46-47]",
|
| 129 |
+
"nprocs": "4",
|
| 130 |
+
"ntasks": "4",
|
| 131 |
+
"ntasks_per_node": "1",
|
| 132 |
+
"output_mode": "standard",
|
| 133 |
+
"prio_process": "0",
|
| 134 |
+
"procid": "0",
|
| 135 |
+
"srun_comm_host": "172.27.61.166",
|
| 136 |
+
"srun_comm_port": "33673",
|
| 137 |
+
"step_gpus": "0,1,2,3,4,5,6,7",
|
| 138 |
+
"step_id": "0",
|
| 139 |
+
"step_launcher_port": "33673",
|
| 140 |
+
"step_nodelist": "compute-node-[3,7,46-47]",
|
| 141 |
+
"step_num_nodes": "4",
|
| 142 |
+
"step_num_tasks": "4",
|
| 143 |
+
"step_tasks_per_node": "1(x4)",
|
| 144 |
+
"stepid": "0",
|
| 145 |
+
"submit_dir": "/home/xun/rsadhukh/STEM",
|
| 146 |
+
"submit_host": "login-node-0",
|
| 147 |
+
"task_pid": "469971",
|
| 148 |
+
"tasks_per_node": "1(x4)",
|
| 149 |
+
"topology_addr": "compute-node-3",
|
| 150 |
+
"topology_addr_pattern": "node",
|
| 151 |
+
"tres_per_task": "cpu:128",
|
| 152 |
+
"umask": "0000"
|
| 153 |
+
},
|
| 154 |
+
"writerId": "i4ocjyr9csg8kju0tej1pg06av2k8k96"
|
| 155 |
+
}
|
wandb/run-20260429_153552-r20yn80u/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"evals/piqa/acc,none":0.7486398258977149,"evals/hellaswag/acc_norm,none":0.6694881497709619,"data/source_fraction_observed/wiki_to_rcqa":0.030630072639507337,"memory/max_active_pct":46.8194972395729,"speed/wps":47602.85371169578,"evals/arc_challenge/acc_stderr,none":0.014434138713379983,"evals/boolq/acc_stderr,none":0.008060817222724517,"evals/boolq/acc,none":0.6938837920489297,"data/source_fraction_observed/megamatt":0.017663693539813355,"data/source_fraction_observed/math-meta-reasoning":0.0038804780352853807,"evals/winogrande/acc,none":0.665351223362273,"_runtime":49923.264815984,"data/source_fraction_observed/tinymath-mind":0.009189315824162799,"evals/piqa/acc_norm_stderr,none":0.010150090834551784,"evals/hellaswag/acc_stderr,none":0.004989562798280521,"memory/max_active_gib":65.45405149459839,"data/source_fraction_observed/common_crawl-high-quality":0.2297319930984784,"evals/arc_easy/acc,none":0.7563131313131313,"evals/openbookqa/acc_norm_stderr,none":0.021893529941665813,"evals/arc_challenge/acc_norm_stderr,none":0.014539646098471627,"data/source_fraction_observed/olmocr_science_pdfs":0.05105317905785341,"data/source_fraction_observed/tinymath-pot":0.0024504434987244546,"memory/power_draw":609672,"data/source_fraction_observed/stack_edu":0.10210445159604291,"evals/arc_easy/acc_norm_stderr,none":0.00880400984686553,"memory_trace":{"_type":"html-file","sha256":"79effaa90bfee7eb3207116787e0d32fc6e6609131fb886a51dbf18b98dc37e0","size":1160648,"path":"media/html/memory_trace_15050_79effaa90bfee7eb3207.html"},"data/source_fraction_observed/general_reasoning_mix":0.019094107596688604,"speed/FLOPS":4.1014138616573406e+14,"memory/max_reserved_gib":76.771484375,"evals/openbookqa/acc_stderr,none":0.019920483209566072,"speed/curr_iter_time":0.6719,"optim/total_tokens":104857600000,"speed/data_load_time":0.0866,"evals/openbookqa/acc_norm,none":0.396,"evals/winogrande/acc_stderr,none":0.013261823629558363,"data/source_fraction_observed/dolmino_1-flan":0.05105102658870274,"_wandb":{"runtime":49923},"acc_step":0,"loss/out":1.4885873794555664,"evals/hellaswag/acc,none":0.4953196574387572,"data/source_fraction_observed/cranecode":0.10210264673193477,"memory/max_reserved_pct":54.91489401645145,"profile_trace":{"_type":"html-file","sha256":"ae282608c6eeb7f488268f66f032340df98e913f5ce9638a3c2c7094ba3e8cff","size":254,"path":"media/html/profile_trace_15051_ae282608c6eeb7f48826.html"},"memory/num_alloc_retries":0,"data/source_fraction_observed/code-meta-reasoning":0.0046968633745099325,"global_step":50000,"data/source_fraction_observed/tulu-3-sft":0.011231371758863785,"evals/openbookqa/acc,none":0.272,"data/source_fraction_observed/openthoughts2":0.012764728611865424,"_step":50000,"evals/arc_easy/acc_stderr,none":0.00880917174472056,"evals/hellaswag/acc_norm_stderr,none":0.00469436096892941,"memory/num_ooms":0,"evals/piqa/acc_norm,none":0.7464635473340587,"data/source_fraction_observed/program_verifiable":0.0016341602295396964,"evals/arc_challenge/acc,none":0.42235494880546076,"data/source_fraction_observed/gemini-reasoning-traces":0.0025537766184913608,"data/source_fraction_observed/dolmino-math":0.10924986505265755,"data/source_fraction_observed/stem-heavy-crawl":0.051052169668858886,"evals/piqa/acc_stderr,none":0.010121156016819262,"optim/lr":1.4880000000014883e-09,"optim/grad_norm":0.14628654718399048,"evals/arc_challenge/acc_norm,none":0.45051194539249145,"_timestamp":1.7775268766324546e+09,"data/source_fraction_observed/qwq-reasoning-traces":0.01909534818756,"data/source_fraction_observed/reddit_to_flashcards":0.060235367293967584,"data/source_fraction_observed/nemotron-synth-qa":0.051050973802153475,"data/source_fraction_observed/cranemath":0.057483967194338154,"evals/arc_easy/acc_norm,none":0.7567340067340067}
|
wandb/run-20260429_153552-r20yn80u/logs/debug-core.log
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-04-29T15:35:52.224389621Z","level":"INFO","msg":"main: starting server","port-filename":"/scratch/local/xun/tmp/tmp4zakewjj/port-470303.txt","pid":470303,"detached":false,"idle-timeout":600000000000,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2026-04-29T15:35:52.224902018Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":470303}
|
| 3 |
+
{"time":"2026-04-29T15:35:52.224835671Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/scratch/local/xun/tmp/wandb-470303-471249-3119761670/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2026-04-29T15:35:52.400060766Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2026-04-29T15:35:52.410260917Z","level":"INFO","msg":"handleInformInit: received","streamId":"r20yn80u","id":"1(@)"}
|
| 6 |
+
{"time":"2026-04-29T15:35:52.923314075Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"r20yn80u","id":"1(@)"}
|
| 7 |
+
{"time":"2026-04-29T15:35:59.077871941Z","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"2f4w6m7xtrwf"}
|
| 8 |
+
{"time":"2026-04-30T05:27:57.103687988Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 9 |
+
{"time":"2026-04-30T05:27:57.103779861Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 10 |
+
{"time":"2026-04-30T05:27:57.104393393Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 11 |
+
{"time":"2026-04-30T05:27:57.104400088Z","level":"INFO","msg":"server is shutting down"}
|
| 12 |
+
{"time":"2026-04-30T05:27:57.104470461Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/scratch/local/xun/tmp/wandb-470303-471249-3119761670/socket","Net":"unix"}}
|
| 13 |
+
{"time":"2026-04-30T05:27:58.995066197Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 14 |
+
{"time":"2026-04-30T05:27:58.995094706Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 15 |
+
{"time":"2026-04-30T05:27:58.99510986Z","level":"INFO","msg":"server is closed"}
|
wandb/run-20260429_153552-r20yn80u/logs/debug-internal.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
wandb/run-20260429_153552-r20yn80u/logs/debug.log
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_setup.py:_flush():81] Current SDK version is 0.26.0
|
| 2 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_setup.py:_flush():81] Configure stats pid to 470303
|
| 3 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:setup_run_log_directory():721] Logging user logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_153552-r20yn80u/logs/debug.log
|
| 5 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:setup_run_log_directory():722] Logging internal logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_153552-r20yn80u/logs/debug-internal.log
|
| 6 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:init():848] calling init triggers
|
| 7 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:init():853] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'dump_dir': '/home/xun/rsadhukh/STEM/logs/midfine_base_final', 'seed': 777, 'model_type': 'olmo3', 'stem_up_proj_layers': [], 'grad_acc_steps': 2, 'gc_collect_freq': 1000, 'probe_freq': 100, 'steps': 50000, 'stage_steps': None, 'data': {'root_dir': '/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/', 'sources': {'cranecode': 10.0, 'stack_edu': 10.0, 'cranemath': 5.63, 'dolmino-math': 10.7, 'megamatt': 1.73, 'tinymath-mind': 0.9, 'tinymath-pot': 0.24, 'reddit_to_flashcards': 5.9, 'wiki_to_rcqa': 3.0, 'nemotron-synth-qa': 5.0, 'math-meta-reasoning': 0.38, 'code-meta-reasoning': 0.46, 'program_verifiable': 0.16, 'qwq-reasoning-traces': 1.87, 'openthoughts2': 1.25, 'general_reasoning_mix': 1.87, 'gemini-reasoning-traces': 0.25, 'tulu-3-sft': 1.1, 'dolmino_1-flan': 5.0, 'olmocr_science_pdfs': 5.0, 'stem-heavy-crawl': 5.0, 'common_crawl-high-quality': 22.5}, 'node_local': False, 'batch_size': 8, 'seq_len': 4096, 'n_views': 2, 'seed': 42, 'add_bos': True, 'add_eos': True, 'load_async': True, 'prefetch_size': 1024, 'tokenizer': {'name': 'huggingface', 'path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/'}, 'track_packed_source_mixture': True, 'packed_source_counts': None}, 'optim': {'lr': 7.44e-05, 'weight_decay': 0.1, 'epsilon': 1e-08, 'beta1': 0.9, 'beta2': 0.95, 'clip': 1.0, 'scheduler': 'linear', 'warmup': 0, 'lr_min_ratio': 0.0, 'cycle_length': 1.0, 'cosine_theta': 1.0, 'annealing_step': 1000, 'decay_fraction': 0.1, 'exp_factor': 0.5, 'initial_token_offset': 0, 'global_final_step': None}, 'model': {'dim': 2048, 'n_layers': 16, 'head_dim': 128, 'n_heads': 16, 'n_kv_heads': 16, 'ffn_dim_multiplier': 1.5, 'multiple_of': 256, 'norm_eps': 1e-06, 'rope_theta': 500000.0, 'rope_scaling': None, 'init_base_std': 0.02, 'init_std_factor': 'disabled', 'max_seqlen': 4096, 'seed': 42, 'vocab_size': 100352, 'weight_tying': False, 'sliding_window': None}, 'distributed': {'dp_shard': 1, 'dp_replicate': 32, 'tp_size': 1, 'selective_activation_checkpointing': False, 'compile': True, 'fsdp_type': 'full_shard', 'model_dtype': 'bf16', 'float8_recipe': None, 'float8_filter': 'layers\\.[0-9]+\\.', 'matmul_allow_tf32': False, 'detect_anomaly': False, 'compile_cache_size_limit': 8, 'spawn_method': 'forkserver', 'stem_parallel_size': 8}, 'env': {'MKL_SERVICE_FORCE_INTEL': 'GNU', 'OMP_NUM_THREADS': '1', 'MKL_NUM_THREADS': '1', 'ENABLE_INTRA_NODE_COMM': '1', 'TORCH_NCCL_AVOID_RECORD_STREAMS': '1', 'NCCL_IB_TIMEOUT': '22', 'NCCL_DEBUG': 'INFO', 'TORCH_NCCL_ASYNC_ERROR_HANDLING': '1'}, 'checkpoint': {'dump': {'every': 5000, 'keep': 1}, 'eval': {'every': 100000, 'keep': 1}, 'path': '/home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints', 'init_ckpt_path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/', 'continue_training_from_init': True, 'legacy_init_ckpt_lm_transformer': False, 'merge_lm_optim_seed_ckpt_path': None}, 'profiling': {'run': True, 'trace_folder': 'profiling', 'mem_warmup': 100, 'mem_steps': 2, 'profile_warmup': 102, 'profile_steps': 2}, 'logging': {'freq': 10, 'acc_freq': None, 'wandb': {'job_type': None, 'dir': None, 'project': 'stem', 'entity': None, 'tags': None, 'group': None, 'name': 'olmo2_1B_midfine', 'notes': None, 'config_exclude_keys': None, 'config_include_keys': None, 'anonymous': None, 'mode': None, 'allow_val_change': None, 'resume': None, 'force': None, 'tensorboard': None, 'sync_tensorboard': None, 'monitor_gym': None, 'save_code': None, 'id': None, 'fork_from': None, 'resume_from': None}}, 'async_eval_gpus': None, 'eval': {'generator': {'max_tokens': 16384, 'dtype': 'bf16', 'temperature': 1.0, 'top_p': 0.95}, 'harness': {'tasks': [{'task': 'hellaswag', 'dataset_path': '/data/rsadhukh/eval_data/hellaswag'}, {'task': 'boolq', 'dataset_path': '/data/rsadhukh/eval_data/super_glue'}, {'task': 'piqa', 'dataset_path': '/data/rsadhukh/eval_data/piqa'}, {'task': 'winogrande', 'dataset_path': '/data/rsadhukh/eval_data/winogrande'}, {'task': 'openbookqa', 'dataset_path': '/data/rsadhukh/eval_data/openbookqa'}, {'task': 'arc_easy', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}, {'task': 'arc_challenge', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}], 'confirm_run_unsafe_code': True, 'batch_size': 64}, 'validation': None}, '_wandb': {}}
|
| 9 |
+
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:init():896] starting backend
|
| 10 |
+
2026-04-29 15:35:52,400 INFO MainThread:470303 [wandb_init.py:init():911] sending inform_init request
|
| 11 |
+
2026-04-29 15:35:52,408 INFO MainThread:470303 [wandb_init.py:init():919] backend started and connected
|
| 12 |
+
2026-04-29 15:35:52,410 INFO MainThread:470303 [wandb_init.py:init():989] updated telemetry
|
| 13 |
+
2026-04-29 15:35:52,430 INFO MainThread:470303 [wandb_init.py:init():1013] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-04-29 15:35:53,838 INFO MainThread:470303 [wandb_init.py:init():1058] starting run threads in backend
|
| 15 |
+
2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_console_start():2542] atexit reg
|
| 16 |
+
2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_redirect():2391] redirect: wrap_raw
|
| 17 |
+
2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_redirect():2460] Wrapping output streams.
|
| 18 |
+
2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_redirect():2483] Redirects installed.
|
| 19 |
+
2026-04-29 15:35:54,077 INFO MainThread:470303 [wandb_init.py:init():1098] run started, returning control to user process
|
| 20 |
+
2026-04-30 05:27:57,103 INFO wandb-AsyncioManager-main:470303 [service_client.py:_forward_responses():134] Reached EOF.
|
| 21 |
+
2026-04-30 05:27:57,104 INFO wandb-AsyncioManager-main:470303 [mailbox.py:close():155] Closing mailbox, abandoning 1 handles.
|
| 22 |
+
2026-04-30 05:27:59,641 ERROR wandb-AsyncioManager-main:470303 [asyncio_manager.py:fn_wrap_exceptions():184] Uncaught exception in run_soon callback.
|
| 23 |
+
Traceback (most recent call last):
|
| 24 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_manager.py", line 182, in fn_wrap_exceptions
|
| 25 |
+
await fn()
|
| 26 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 45, in publish
|
| 27 |
+
await self._send_server_request(request)
|
| 28 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 87, in _send_server_request
|
| 29 |
+
await self._drain_writer()
|
| 30 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 96, in _drain_writer
|
| 31 |
+
await self._writer.drain()
|
| 32 |
+
File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 392, in drain
|
| 33 |
+
await self._protocol._drain_helper()
|
| 34 |
+
File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
|
| 35 |
+
raise ConnectionResetError('Connection lost')
|
| 36 |
+
ConnectionResetError: Connection lost
|
| 37 |
+
2026-04-30 05:27:59,660 ERROR wandb-AsyncioManager-main:470303 [asyncio_manager.py:fn_wrap_exceptions():184] Uncaught exception in run_soon callback.
|
| 38 |
+
Traceback (most recent call last):
|
| 39 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_manager.py", line 182, in fn_wrap_exceptions
|
| 40 |
+
await fn()
|
| 41 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 45, in publish
|
| 42 |
+
await self._send_server_request(request)
|
| 43 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 78, in _send_server_request
|
| 44 |
+
raise self._broken_exc.with_traceback(self._broken_tb)
|
| 45 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 87, in _send_server_request
|
| 46 |
+
await self._drain_writer()
|
| 47 |
+
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 96, in _drain_writer
|
| 48 |
+
await self._writer.drain()
|
| 49 |
+
File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 392, in drain
|
| 50 |
+
await self._protocol._drain_helper()
|
| 51 |
+
File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
|
| 52 |
+
raise ConnectionResetError('Connection lost')
|
| 53 |
+
ConnectionResetError: Connection lost
|