Rano23 commited on
Commit
30f3e88
·
verified ·
1 Parent(s): f359d91

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. checkpoints/0000050000/params.json +1 -0
  2. checkpoints/0000050000/train_state_00000.json +1 -0
  3. checkpoints/0000050000/train_state_00001.json +1 -0
  4. checkpoints/0000050000/train_state_00002.json +1 -0
  5. checkpoints/0000050000/train_state_00003.json +1 -0
  6. checkpoints/0000050000/train_state_00004.json +1 -0
  7. checkpoints/0000050000/train_state_00005.json +1 -0
  8. checkpoints/0000050000/train_state_00006.json +1 -0
  9. checkpoints/0000050000/train_state_00007.json +1 -0
  10. checkpoints/0000050000/train_state_00008.json +1 -0
  11. checkpoints/0000050000/train_state_00010.json +1 -0
  12. checkpoints/0000050000/train_state_00013.json +1 -0
  13. checkpoints/0000050000/train_state_00014.json +1 -0
  14. checkpoints/0000050000/train_state_00015.json +1 -0
  15. checkpoints/0000050000/train_state_00017.json +1 -0
  16. checkpoints/0000050000/train_state_00018.json +1 -0
  17. checkpoints/0000050000/train_state_00019.json +1 -0
  18. checkpoints/0000050000/train_state_00020.json +1 -0
  19. config.yaml +179 -0
  20. evals/0000050000/config.yaml +62 -0
  21. evals/0000050000/results.json +1 -0
  22. metrics.eval.jsonl +1 -0
  23. train.log +0 -0
  24. wandb/debug-internal.log +0 -0
  25. wandb/debug.log +53 -0
  26. wandb/run-20260429_011802-2wmkezq3/files/media/html/memory_trace_50_79effaa90bfee7eb3207.html +0 -0
  27. wandb/run-20260429_011802-2wmkezq3/files/media/html/profile_trace_51_ae282608c6eeb7f48826.html +1 -0
  28. wandb/run-20260429_011802-2wmkezq3/files/output.log +0 -0
  29. wandb/run-20260429_011802-2wmkezq3/files/requirements.txt +177 -0
  30. wandb/run-20260429_011802-2wmkezq3/files/wandb-metadata.json +155 -0
  31. wandb/run-20260429_011802-2wmkezq3/logs/debug-core.log +11 -0
  32. wandb/run-20260429_011802-2wmkezq3/logs/debug-internal.log +0 -0
  33. wandb/run-20260429_011802-2wmkezq3/logs/debug.log +19 -0
  34. wandb/run-20260429_141040-a48q7rq3/files/output.log +1 -0
  35. wandb/run-20260429_141040-a48q7rq3/files/requirements.txt +199 -0
  36. wandb/run-20260429_141040-a48q7rq3/files/wandb-metadata.json +155 -0
  37. wandb/run-20260429_141040-a48q7rq3/logs/debug-core.log +10 -0
  38. wandb/run-20260429_141040-a48q7rq3/logs/debug-internal.log +9 -0
  39. wandb/run-20260429_141040-a48q7rq3/logs/debug.log +19 -0
  40. wandb/run-20260429_141040-a48q7rq3/run-a48q7rq3.wandb +0 -0
  41. wandb/run-20260429_153552-r20yn80u/files/config.yaml +359 -0
  42. wandb/run-20260429_153552-r20yn80u/files/media/html/memory_trace_15050_79effaa90bfee7eb3207.html +0 -0
  43. wandb/run-20260429_153552-r20yn80u/files/media/html/profile_trace_15051_ae282608c6eeb7f48826.html +1 -0
  44. wandb/run-20260429_153552-r20yn80u/files/output.log +0 -0
  45. wandb/run-20260429_153552-r20yn80u/files/requirements.txt +199 -0
  46. wandb/run-20260429_153552-r20yn80u/files/wandb-metadata.json +155 -0
  47. wandb/run-20260429_153552-r20yn80u/files/wandb-summary.json +1 -0
  48. wandb/run-20260429_153552-r20yn80u/logs/debug-core.log +15 -0
  49. wandb/run-20260429_153552-r20yn80u/logs/debug-internal.log +0 -0
  50. wandb/run-20260429_153552-r20yn80u/logs/debug.log +53 -0
checkpoints/0000050000/params.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"name": "olmo2_1B_midfine", "dump_dir": "/home/xun/rsadhukh/STEM/logs/midfine_base_final", "seed": 777, "model_type": "olmo3", "stem_up_proj_layers": [], "grad_acc_steps": 2, "gc_collect_freq": 1000, "probe_freq": 100, "steps": 50000, "stage_steps": null, "data": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "node_local": false, "batch_size": 8, "seq_len": 4096, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": {"name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "track_packed_source_mixture": true, "packed_source_counts": {"code-meta-reasoning": 10977463, "common_crawl-high-quality": 536940263, "cranecode": 238637058, "cranemath": 134353949, "dolmino-math": 255343043, "dolmino_1-flan": 119319719, "gemini-reasoning-traces": 5969666, "general_reasoning_mix": 44625695, "math-meta-reasoning": 9068782, "megamatt": 41284629, "nemotron-synth-qa": 119319374, "olmocr_science_pdfs": 119319342, "openthoughts2": 29838881, "program_verifiable": 3819403, "qwq-reasoning-traces": 44631652, "reddit_to_flashcards": 140789161, "stack_edu": 238638082, "stem-heavy-crawl": 119319916, "tinymath-mind": 21478051, "tinymath-pot": 5727426, "tulu-3-sft": 26250698, "wiki_to_rcqa": 71590884}}, "optim": {"lr": 7.44e-05, "weight_decay": 0.1, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "linear", "warmup": 0, "lr_min_ratio": 0.0, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5, "initial_token_offset": 0, "global_final_step": null}, "model": {"dim": 2048, "n_layers": 16, "head_dim": 128, "n_heads": 16, "n_kv_heads": 16, "ffn_dim_multiplier": 1.5, "multiple_of": 256, "norm_eps": 1e-06, "rope_theta": 500000.0, "rope_scaling": null, "init_base_std": 0.02, "init_std_factor": "disabled", "max_seqlen": 4096, "seed": 42, "vocab_size": 100352, "weight_tying": false, "sliding_window": null}, "distributed": {"dp_shard": 1, "dp_replicate": 32, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": false, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver", "stem_parallel_size": 8}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 5000, "keep": 1}, "eval": {"every": 100000, "keep": 1}, "path": "/home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints", "init_ckpt_path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/", "continue_training_from_init": true, "legacy_init_ckpt_lm_transformer": false, "merge_lm_optim_seed_ckpt_path": null}, "profiling": {"run": true, "trace_folder": "profiling", "mem_warmup": 100, "mem_steps": 2, "profile_warmup": 102, "profile_steps": 2}, "logging": {"freq": 10, "acc_freq": null, "wandb": {"job_type": null, "dir": null, "project": "stem", "entity": null, "tags": null, "group": null, "name": "olmo2_1B_midfine", "notes": null, "config_exclude_keys": null, "config_include_keys": null, "anonymous": null, "mode": null, "allow_val_change": null, "resume": null, "force": null, "tensorboard": null, "sync_tensorboard": null, "monitor_gym": null, "save_code": null, "id": null, "fork_from": null, "resume_from": null}}, "async_eval_gpus": null, "eval": {"generator": {"max_tokens": 16384, "dtype": "bf16", "temperature": 1.0, "top_p": 0.95}, "harness": {"tasks": [{"task": "hellaswag", "dataset_path": "/data/rsadhukh/eval_data/hellaswag"}, {"task": "boolq", "dataset_path": "/data/rsadhukh/eval_data/super_glue"}, {"task": "piqa", "dataset_path": "/data/rsadhukh/eval_data/piqa"}, {"task": "winogrande", "dataset_path": "/data/rsadhukh/eval_data/winogrande"}, {"task": "openbookqa", "dataset_path": "/data/rsadhukh/eval_data/openbookqa"}, {"task": "arc_easy", "dataset_path": "/data/rsadhukh/eval_data/ai2_arc"}, {"task": "arc_challenge", "dataset_path": "/data/rsadhukh/eval_data/ai2_arc"}], "confirm_run_unsafe_code": true, "batch_size": 64}, "validation": null}}
checkpoints/0000050000/train_state_00000.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1030, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.00.jsonl", "position": 169184122, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.00.jsonl", "position": 103555217, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.00.jsonl", "position": 121315811, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.00.jsonl", "position": 93158712, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.00.jsonl", "position": 61186565, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.00.jsonl", "position": 5174957, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.00.jsonl", "position": 1546250, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.00.jsonl", "position": 1255920958, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.00.jsonl", "position": 26279313, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.00.jsonl", "position": 49452440, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.00.jsonl", "position": 2391376, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.00.jsonl", "position": 4049828, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.00.jsonl", "position": 601143, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.00.jsonl", "position": 14984817, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.00.jsonl", "position": 7215763, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.00.jsonl", "position": 19007656, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.00.jsonl", "position": 2371004, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.00.jsonl", "position": 9629373, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.00.jsonl", "position": 57425970, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.00.jsonl", "position": 180096848, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.00.jsonl", "position": 69872688, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.00.jsonl", "position": 503556493, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 132764577257282494100248622808569340063, "inc": 199517438996687927661581397869791268041}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717362, "common_crawl-high-quality": 524179168, "cranecode": 232968050, "cranemath": 131159704, "dolmino-math": 249275298, "dolmino_1-flan": 116484402, "gemini-reasoning-traces": 5826646, "general_reasoning_mix": 43565000, "math-meta-reasoning": 8853292, "megamatt": 40304231, "nemotron-synth-qa": 116482462, "olmocr_science_pdfs": 116491847, "openthoughts2": 29121250, "program_verifiable": 3727783, "qwq-reasoning-traces": 43578408, "reddit_to_flashcards": 137439257, "stack_edu": 232970173, "stem-heavy-crawl": 116483563, "tinymath-mind": 20967277, "tinymath-pot": 5591237, "tulu-3-sft": 25625221, "wiki_to_rcqa": 69888715}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 190856181678508905526281042577254913524, "inc": 203371896531876761410193603683292290457}, "has_uint32": 0, "uinteger": 2500812634}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00001.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 985, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.01.jsonl", "position": 168720688, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.01.jsonl", "position": 110618352, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.01.jsonl", "position": 119384353, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.01.jsonl", "position": 85448056, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.01.jsonl", "position": 58608170, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.01.jsonl", "position": 6474706, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.01.jsonl", "position": 2024232, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.01.jsonl", "position": 1255733371, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.01.jsonl", "position": 25598945, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.01.jsonl", "position": 49983995, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.01.jsonl", "position": 2213392, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.01.jsonl", "position": 5370488, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.01.jsonl", "position": 1099892, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.01.jsonl", "position": 12654115, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.01.jsonl", "position": 7726124, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.01.jsonl", "position": 11163765, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.01.jsonl", "position": 2018846, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.01.jsonl", "position": 11701266, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.01.jsonl", "position": 63132979, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.01.jsonl", "position": 177519305, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.01.jsonl", "position": 62396313, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.01.jsonl", "position": 476242817, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 312829929264520745690347284204771532019, "inc": 70355339299095406607494999455309704455}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10716459, "common_crawl-high-quality": 524177711, "cranecode": 232968196, "cranemath": 131161732, "dolmino-math": 249277186, "dolmino_1-flan": 116484725, "gemini-reasoning-traces": 5828249, "general_reasoning_mix": 43564962, "math-meta-reasoning": 8853356, "megamatt": 40303546, "nemotron-synth-qa": 116483943, "olmocr_science_pdfs": 116489141, "openthoughts2": 29120744, "program_verifiable": 3727513, "qwq-reasoning-traces": 43571770, "reddit_to_flashcards": 137442567, "stack_edu": 232968216, "stem-heavy-crawl": 116484314, "tinymath-mind": 20967123, "tinymath-pot": 5591618, "tulu-3-sft": 25627737, "wiki_to_rcqa": 69889583}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 85749820953316004262550710266339839890, "inc": 170649501745871541162088638073684795575}, "has_uint32": 1, "uinteger": 2932988183}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00002.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 555, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.02.jsonl", "position": 176228084, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.02.jsonl", "position": 129632377, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.02.jsonl", "position": 122739527, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.02.jsonl", "position": 91737533, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.02.jsonl", "position": 56597994, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.02.jsonl", "position": 5028790, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.02.jsonl", "position": 1803908, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.02.jsonl", "position": 1255796112, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.02.jsonl", "position": 24967558, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.02.jsonl", "position": 50283192, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.02.jsonl", "position": 1967757, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.02.jsonl", "position": 4589237, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.02.jsonl", "position": 1002074, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.02.jsonl", "position": 16371244, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.02.jsonl", "position": 6490169, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.02.jsonl", "position": 13450598, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.02.jsonl", "position": 2831925, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.02.jsonl", "position": 8059609, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.02.jsonl", "position": 60613291, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.02.jsonl", "position": 177954094, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.02.jsonl", "position": 71330956, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.02.jsonl", "position": 521030735, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 242607930559933803328842844375603027451, "inc": 121235836284466329520550355651886291025}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10718222, "common_crawl-high-quality": 524181486, "cranecode": 232968996, "cranemath": 131164862, "dolmino-math": 249275920, "dolmino_1-flan": 116484004, "gemini-reasoning-traces": 5826345, "general_reasoning_mix": 43566403, "math-meta-reasoning": 8852843, "megamatt": 40303557, "nemotron-synth-qa": 116482321, "olmocr_science_pdfs": 116490151, "openthoughts2": 29123805, "program_verifiable": 3728848, "qwq-reasoning-traces": 43566871, "reddit_to_flashcards": 137442228, "stack_edu": 232968086, "stem-heavy-crawl": 116482679, "tinymath-mind": 20967403, "tinymath-pot": 5591193, "tulu-3-sft": 25626062, "wiki_to_rcqa": 69888536}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 250410953641482978242998027896627465990, "inc": 90616052583066224228118909460993474701}, "has_uint32": 0, "uinteger": 3635088313}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00003.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 169, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.03.jsonl", "position": 170654791, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.03.jsonl", "position": 91809052, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.03.jsonl", "position": 110455016, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.03.jsonl", "position": 97889578, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.03.jsonl", "position": 57431495, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.03.jsonl", "position": 5536452, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.03.jsonl", "position": 1466681, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.03.jsonl", "position": 1255744355, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.03.jsonl", "position": 25367057, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.03.jsonl", "position": 49940637, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.03.jsonl", "position": 2845827, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.03.jsonl", "position": 4144517, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.03.jsonl", "position": 844780, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.03.jsonl", "position": 7450160, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.03.jsonl", "position": 8479977, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.03.jsonl", "position": 13761925, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.03.jsonl", "position": 1860938, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.03.jsonl", "position": 14422945, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.03.jsonl", "position": 55080209, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.03.jsonl", "position": 174271581, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.03.jsonl", "position": 67406397, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.03.jsonl", "position": 501500696, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 1517453642602235172476618770163698185, "inc": 125151366308830146461603246033367959405}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717133, "common_crawl-high-quality": 524176962, "cranecode": 232969091, "cranemath": 131160607, "dolmino-math": 249277028, "dolmino_1-flan": 116482573, "gemini-reasoning-traces": 5825222, "general_reasoning_mix": 43565173, "math-meta-reasoning": 8853780, "megamatt": 40303862, "nemotron-synth-qa": 116484017, "olmocr_science_pdfs": 116485559, "openthoughts2": 29128678, "program_verifiable": 3728185, "qwq-reasoning-traces": 43572973, "reddit_to_flashcards": 137444086, "stack_edu": 232968559, "stem-heavy-crawl": 116483325, "tinymath-mind": 20967592, "tinymath-pot": 5591073, "tulu-3-sft": 25626621, "wiki_to_rcqa": 69889108}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 225979724602204095822997413923518164685, "inc": 50378060617174794402292948380426361097}, "has_uint32": 1, "uinteger": 4253124320}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00004.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1060, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.04.jsonl", "position": 169275751, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.04.jsonl", "position": 96547876, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.04.jsonl", "position": 109398714, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.04.jsonl", "position": 93408469, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.04.jsonl", "position": 61104233, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.04.jsonl", "position": 5458783, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.04.jsonl", "position": 1761347, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.04.jsonl", "position": 1255648101, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.04.jsonl", "position": 26651625, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.04.jsonl", "position": 50919354, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.04.jsonl", "position": 1786151, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.04.jsonl", "position": 3303958, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.04.jsonl", "position": 1017483, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.04.jsonl", "position": 13149844, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.04.jsonl", "position": 6798775, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.04.jsonl", "position": 16963025, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.04.jsonl", "position": 2481608, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.04.jsonl", "position": 13933845, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.04.jsonl", "position": 60123187, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.04.jsonl", "position": 172086600, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.04.jsonl", "position": 73441735, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.04.jsonl", "position": 474397686, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 299509270770031916446716497279612719220, "inc": 316940505624043840809154587289841925383}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10718107, "common_crawl-high-quality": 524179029, "cranecode": 232968763, "cranemath": 131160913, "dolmino-math": 249275587, "dolmino_1-flan": 116483658, "gemini-reasoning-traces": 5827106, "general_reasoning_mix": 43565076, "math-meta-reasoning": 8853501, "megamatt": 40304265, "nemotron-synth-qa": 116484560, "olmocr_science_pdfs": 116484274, "openthoughts2": 29124875, "program_verifiable": 3730007, "qwq-reasoning-traces": 43566573, "reddit_to_flashcards": 137435461, "stack_edu": 232979110, "stem-heavy-crawl": 116484556, "tinymath-mind": 20966966, "tinymath-pot": 5591964, "tulu-3-sft": 25626834, "wiki_to_rcqa": 69889131}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 236078094047468840435218590199867614684, "inc": 93583685013543266762176536914137130671}, "has_uint32": 0, "uinteger": 2213378322}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00005.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 517, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.05.jsonl", "position": 165270704, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.05.jsonl", "position": 110875001, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.05.jsonl", "position": 109407333, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.05.jsonl", "position": 93106183, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.05.jsonl", "position": 57342920, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.05.jsonl", "position": 5863139, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.05.jsonl", "position": 1397544, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.05.jsonl", "position": 1255942889, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.05.jsonl", "position": 26325478, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.05.jsonl", "position": 49875226, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.05.jsonl", "position": 2278746, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.05.jsonl", "position": 5572015, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.05.jsonl", "position": 1001385, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.05.jsonl", "position": 11253557, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.05.jsonl", "position": 9052417, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.05.jsonl", "position": 13987777, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.05.jsonl", "position": 506245, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.05.jsonl", "position": 12615069, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.05.jsonl", "position": 58437931, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.05.jsonl", "position": 177327447, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.05.jsonl", "position": 63448265, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.05.jsonl", "position": 498508745, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 304150823691374556939996871776077939637, "inc": 76256721842129589702307426791162152551}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717480, "common_crawl-high-quality": 524188091, "cranecode": 232966972, "cranemath": 131160476, "dolmino-math": 249274497, "dolmino_1-flan": 116483319, "gemini-reasoning-traces": 5830287, "general_reasoning_mix": 43565202, "math-meta-reasoning": 8853796, "megamatt": 40304571, "nemotron-synth-qa": 116482836, "olmocr_science_pdfs": 116487868, "openthoughts2": 29124066, "program_verifiable": 3727892, "qwq-reasoning-traces": 43566903, "reddit_to_flashcards": 137438488, "stack_edu": 232972672, "stem-heavy-crawl": 116483038, "tinymath-mind": 20967246, "tinymath-pot": 5591344, "tulu-3-sft": 25625385, "wiki_to_rcqa": 69888430}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 128588098932200963179261404019648935844, "inc": 22268798220195505414013091073155496089}, "has_uint32": 1, "uinteger": 4091980020}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00006.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 483, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.06.jsonl", "position": 175513919, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.06.jsonl", "position": 119913136, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.06.jsonl", "position": 113088935, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.06.jsonl", "position": 92242241, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.06.jsonl", "position": 56182271, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.06.jsonl", "position": 4485607, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.06.jsonl", "position": 1756960, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.06.jsonl", "position": 1255696572, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.06.jsonl", "position": 25259631, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.06.jsonl", "position": 50553295, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.06.jsonl", "position": 2585746, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.06.jsonl", "position": 3188909, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.06.jsonl", "position": 955594, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.06.jsonl", "position": 13384344, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.06.jsonl", "position": 7133854, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.06.jsonl", "position": 13017552, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.06.jsonl", "position": 1774383, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.06.jsonl", "position": 10681856, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.06.jsonl", "position": 59558626, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.06.jsonl", "position": 175162987, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.06.jsonl", "position": 61980645, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.06.jsonl", "position": 511837250, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 88967541265145310930449644189407446893, "inc": 88395164212990118791130299107903034087}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717747, "common_crawl-high-quality": 524179506, "cranecode": 232968095, "cranemath": 131162388, "dolmino-math": 249275950, "dolmino_1-flan": 116484296, "gemini-reasoning-traces": 5824972, "general_reasoning_mix": 43566328, "math-meta-reasoning": 8853803, "megamatt": 40303908, "nemotron-synth-qa": 116483858, "olmocr_science_pdfs": 116485538, "openthoughts2": 29122821, "program_verifiable": 3727710, "qwq-reasoning-traces": 43572466, "reddit_to_flashcards": 137444658, "stack_edu": 232967739, "stem-heavy-crawl": 116484718, "tinymath-mind": 20967234, "tinymath-pot": 5591654, "tulu-3-sft": 25626431, "wiki_to_rcqa": 69889073}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 48653964429580767134480002661016468688, "inc": 159974370505239494345331372639210830695}, "has_uint32": 1, "uinteger": 837376111}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00007.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 230, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.07.jsonl", "position": 178993908, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.07.jsonl", "position": 93661101, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.07.jsonl", "position": 113088199, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.07.jsonl", "position": 96214147, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.07.jsonl", "position": 59690564, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.07.jsonl", "position": 5700038, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.07.jsonl", "position": 1612363, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.07.jsonl", "position": 1255868845, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.07.jsonl", "position": 26986400, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.07.jsonl", "position": 51968967, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.07.jsonl", "position": 1631747, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.07.jsonl", "position": 4546589, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.07.jsonl", "position": 582704, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.07.jsonl", "position": 13174586, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.07.jsonl", "position": 9796597, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.07.jsonl", "position": 11680804, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.07.jsonl", "position": 2684246, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.07.jsonl", "position": 7643072, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.07.jsonl", "position": 61990705, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.07.jsonl", "position": 180390710, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.07.jsonl", "position": 66016263, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.07.jsonl", "position": 495406359, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 337829025199356394220275822052077951457, "inc": 322195181278880022317637923384652014617}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717886, "common_crawl-high-quality": 524176392, "cranecode": 232967839, "cranemath": 131160724, "dolmino-math": 249273811, "dolmino_1-flan": 116483260, "gemini-reasoning-traces": 5824211, "general_reasoning_mix": 43567443, "math-meta-reasoning": 8854450, "megamatt": 40304218, "nemotron-synth-qa": 116482875, "olmocr_science_pdfs": 116495386, "openthoughts2": 29128949, "program_verifiable": 3729274, "qwq-reasoning-traces": 43572187, "reddit_to_flashcards": 137437008, "stack_edu": 232969699, "stem-heavy-crawl": 116483337, "tinymath-mind": 20966971, "tinymath-pot": 5591018, "tulu-3-sft": 25625528, "wiki_to_rcqa": 69888680}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 211283518072739736885050693825571609459, "inc": 334003994887173083725390277461237639581}, "has_uint32": 0, "uinteger": 2129520558}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00008.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 155, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.08.jsonl", "position": 181847675, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.08.jsonl", "position": 118620031, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.08.jsonl", "position": 115152774, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.08.jsonl", "position": 93485013, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.08.jsonl", "position": 62089139, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.08.jsonl", "position": 5940802, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.08.jsonl", "position": 1836807, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.08.jsonl", "position": 1255656187, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.08.jsonl", "position": 24977680, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.08.jsonl", "position": 51685664, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.08.jsonl", "position": 1408077, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.08.jsonl", "position": 4149030, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.08.jsonl", "position": 721073, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.08.jsonl", "position": 9031490, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.08.jsonl", "position": 7370162, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.08.jsonl", "position": 13978000, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.08.jsonl", "position": 781819, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.08.jsonl", "position": 4089669, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.08.jsonl", "position": 60967384, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.08.jsonl", "position": 174206698, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.08.jsonl", "position": 63204528, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.08.jsonl", "position": 516359564, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 222457326697416819089385188638000029889, "inc": 232333011630326481733766413984974244825}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10716611, "common_crawl-high-quality": 524178499, "cranecode": 232966609, "cranemath": 131162093, "dolmino-math": 249275842, "dolmino_1-flan": 116483883, "gemini-reasoning-traces": 5825253, "general_reasoning_mix": 43564737, "math-meta-reasoning": 8852922, "megamatt": 40302803, "nemotron-synth-qa": 116482437, "olmocr_science_pdfs": 116496133, "openthoughts2": 29125776, "program_verifiable": 3729065, "qwq-reasoning-traces": 43573173, "reddit_to_flashcards": 137440414, "stack_edu": 232967665, "stem-heavy-crawl": 116483824, "tinymath-mind": 20966813, "tinymath-pot": 5591042, "tulu-3-sft": 25626768, "wiki_to_rcqa": 69888859}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 299032876526771153828468208281902936020, "inc": 137127179917161464848280992898870611721}, "has_uint32": 1, "uinteger": 1502052356}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00010.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 13543, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.10.jsonl", "position": 175265944, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.10.jsonl", "position": 121743979, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.10.jsonl", "position": 111160799, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.10.jsonl", "position": 93444517, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.10.jsonl", "position": 59454435, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.10.jsonl", "position": 6012630, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.10.jsonl", "position": 1465622, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.10.jsonl", "position": 1255699065, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.10.jsonl", "position": 26284095, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.10.jsonl", "position": 49220860, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.10.jsonl", "position": 2012332, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.10.jsonl", "position": 5742655, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.10.jsonl", "position": 957674, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.10.jsonl", "position": 10241213, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.10.jsonl", "position": 5957290, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.10.jsonl", "position": 11461296, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.10.jsonl", "position": 2050981, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.10.jsonl", "position": 12589568, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.10.jsonl", "position": 60080169, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.10.jsonl", "position": 181118161, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.10.jsonl", "position": 59983809, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.10.jsonl", "position": 494733451, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 28093547970186331396263766593751657933, "inc": 262455344491392898496039790594093439699}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717569, "common_crawl-high-quality": 524180383, "cranecode": 232965732, "cranemath": 131160185, "dolmino-math": 249274229, "dolmino_1-flan": 116483372, "gemini-reasoning-traces": 5825079, "general_reasoning_mix": 43567081, "math-meta-reasoning": 8854578, "megamatt": 40302644, "nemotron-synth-qa": 116482333, "olmocr_science_pdfs": 116483332, "openthoughts2": 29125195, "program_verifiable": 3729421, "qwq-reasoning-traces": 43575502, "reddit_to_flashcards": 137439506, "stack_edu": 232965170, "stem-heavy-crawl": 116483438, "tinymath-mind": 20966814, "tinymath-pot": 5591233, "tulu-3-sft": 25626416, "wiki_to_rcqa": 69888621}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 235473873037057258417520760620699177061, "inc": 320333070685036219503813667660565218295}, "has_uint32": 1, "uinteger": 405884923}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00013.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1800, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.13.jsonl", "position": 174087877, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.13.jsonl", "position": 108225052, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.13.jsonl", "position": 127021689, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.13.jsonl", "position": 95316638, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.13.jsonl", "position": 54466205, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.13.jsonl", "position": 6133281, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.13.jsonl", "position": 1654547, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.13.jsonl", "position": 1255766540, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.13.jsonl", "position": 25647381, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.13.jsonl", "position": 50238500, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.13.jsonl", "position": 1699188, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.13.jsonl", "position": 5159402, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.13.jsonl", "position": 715353, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.13.jsonl", "position": 14753381, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.13.jsonl", "position": 10014180, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.13.jsonl", "position": 12622795, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.13.jsonl", "position": 2335795, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.13.jsonl", "position": 9495528, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.13.jsonl", "position": 58413331, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.13.jsonl", "position": 183520843, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.13.jsonl", "position": 64289959, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.13.jsonl", "position": 485303490, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 132025057233364719154679098857642617910, "inc": 74474439248560930919804404709578726689}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717316, "common_crawl-high-quality": 524178631, "cranecode": 232968594, "cranemath": 131161250, "dolmino-math": 249275030, "dolmino_1-flan": 116484307, "gemini-reasoning-traces": 5826836, "general_reasoning_mix": 43568343, "math-meta-reasoning": 8852935, "megamatt": 40305038, "nemotron-synth-qa": 116483779, "olmocr_science_pdfs": 116493800, "openthoughts2": 29123973, "program_verifiable": 3730237, "qwq-reasoning-traces": 43566124, "reddit_to_flashcards": 137438937, "stack_edu": 232966670, "stem-heavy-crawl": 116483776, "tinymath-mind": 20967369, "tinymath-pot": 5591158, "tulu-3-sft": 25626283, "wiki_to_rcqa": 69889190}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 314334276976028320770252057746092726092, "inc": 116345837432125979447787307728243085409}, "has_uint32": 0, "uinteger": 2351727945}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00014.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 68, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.14.jsonl", "position": 176758368, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.14.jsonl", "position": 93132936, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.14.jsonl", "position": 112642230, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.14.jsonl", "position": 97219845, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.14.jsonl", "position": 55568604, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.14.jsonl", "position": 5491293, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.14.jsonl", "position": 1876968, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.14.jsonl", "position": 1255805570, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.14.jsonl", "position": 26267980, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.14.jsonl", "position": 51643736, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.14.jsonl", "position": 2264880, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.14.jsonl", "position": 5453053, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.14.jsonl", "position": 852646, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.14.jsonl", "position": 13208096, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.14.jsonl", "position": 11275043, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.14.jsonl", "position": 14055259, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.14.jsonl", "position": 2172150, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.14.jsonl", "position": 14771471, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.14.jsonl", "position": 58513543, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.14.jsonl", "position": 182524110, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.14.jsonl", "position": 71903506, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.14.jsonl", "position": 519920795, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 255381197467711588087437858821645240560, "inc": 116685831757775029071911556784912414583}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717638, "common_crawl-high-quality": 524178128, "cranecode": 232967970, "cranemath": 131161394, "dolmino-math": 249275748, "dolmino_1-flan": 116483297, "gemini-reasoning-traces": 5827283, "general_reasoning_mix": 43563885, "math-meta-reasoning": 8854132, "megamatt": 40302842, "nemotron-synth-qa": 116482750, "olmocr_science_pdfs": 116495217, "openthoughts2": 29123641, "program_verifiable": 3727738, "qwq-reasoning-traces": 43576648, "reddit_to_flashcards": 137442725, "stack_edu": 232964692, "stem-heavy-crawl": 116481841, "tinymath-mind": 20967997, "tinymath-pot": 5591077, "tulu-3-sft": 25625761, "wiki_to_rcqa": 69888904}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 239470938684036100765165062773953078142, "inc": 94545970901749447227174325208480182179}, "has_uint32": 1, "uinteger": 2722351717}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00015.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 179, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.15.jsonl", "position": 173713486, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.15.jsonl", "position": 110463185, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.15.jsonl", "position": 106563803, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.15.jsonl", "position": 94464152, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.15.jsonl", "position": 58075665, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.15.jsonl", "position": 6015268, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.15.jsonl", "position": 1941291, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.15.jsonl", "position": 1255938623, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.15.jsonl", "position": 25431252, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.15.jsonl", "position": 50923899, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.15.jsonl", "position": 2723627, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.15.jsonl", "position": 3919932, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.15.jsonl", "position": 1182999, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.15.jsonl", "position": 9116381, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.15.jsonl", "position": 5035635, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.15.jsonl", "position": 15851392, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.15.jsonl", "position": 915669, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.15.jsonl", "position": 15889044, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.15.jsonl", "position": 61176154, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.15.jsonl", "position": 187577205, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.15.jsonl", "position": 52985604, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.15.jsonl", "position": 491485676, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 210310075468178302712232204289748778423, "inc": 76775115875861029591577098614554519321}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10716262, "common_crawl-high-quality": 524177843, "cranecode": 232966787, "cranemath": 131161342, "dolmino-math": 249274848, "dolmino_1-flan": 116483615, "gemini-reasoning-traces": 5825673, "general_reasoning_mix": 43565036, "math-meta-reasoning": 8853315, "megamatt": 40303314, "nemotron-synth-qa": 116482593, "olmocr_science_pdfs": 116484893, "openthoughts2": 29125702, "program_verifiable": 3729674, "qwq-reasoning-traces": 43566333, "reddit_to_flashcards": 137441925, "stack_edu": 232979321, "stem-heavy-crawl": 116484351, "tinymath-mind": 20966990, "tinymath-pot": 5591522, "tulu-3-sft": 25630934, "wiki_to_rcqa": 69888924}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 188849628256972546641584293984929704290, "inc": 277855105455093796469568551045482401825}, "has_uint32": 0, "uinteger": 1607433859}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00017.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 126, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.17.jsonl", "position": 168585760, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.17.jsonl", "position": 116391016, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.17.jsonl", "position": 118429992, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.17.jsonl", "position": 96777587, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.17.jsonl", "position": 56467618, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.17.jsonl", "position": 5487486, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.17.jsonl", "position": 1544757, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.17.jsonl", "position": 1256025172, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.17.jsonl", "position": 26272420, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.17.jsonl", "position": 49064652, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.17.jsonl", "position": 2197504, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.17.jsonl", "position": 4911378, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.17.jsonl", "position": 1108850, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.17.jsonl", "position": 10501019, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.17.jsonl", "position": 8507102, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.17.jsonl", "position": 10235169, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.17.jsonl", "position": 1319148, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.17.jsonl", "position": 8509743, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.17.jsonl", "position": 58515449, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.17.jsonl", "position": 181009735, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.17.jsonl", "position": 70561045, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.17.jsonl", "position": 487618469, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 281159287891267573308938063003501825062, "inc": 219509020097628037272727839178453712549}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717920, "common_crawl-high-quality": 524180536, "cranecode": 232968796, "cranemath": 131162408, "dolmino-math": 249275706, "dolmino_1-flan": 116483614, "gemini-reasoning-traces": 5826187, "general_reasoning_mix": 43565263, "math-meta-reasoning": 8852773, "megamatt": 40303411, "nemotron-synth-qa": 116483810, "olmocr_science_pdfs": 116491197, "openthoughts2": 29122305, "program_verifiable": 3730339, "qwq-reasoning-traces": 43570599, "reddit_to_flashcards": 137441957, "stack_edu": 232967706, "stem-heavy-crawl": 116482954, "tinymath-mind": 20966899, "tinymath-pot": 5591355, "tulu-3-sft": 25625852, "wiki_to_rcqa": 69889663}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 306088215367541730463663193923924069886, "inc": 223203856372306606023862000991848715983}, "has_uint32": 1, "uinteger": 2015769118}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00018.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 169, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.18.jsonl", "position": 177593162, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.18.jsonl", "position": 112024423, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.18.jsonl", "position": 108674929, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.18.jsonl", "position": 92783847, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.18.jsonl", "position": 60339824, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.18.jsonl", "position": 5374110, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.18.jsonl", "position": 1714677, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.18.jsonl", "position": 1255742819, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.18.jsonl", "position": 27000590, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.18.jsonl", "position": 47595014, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.18.jsonl", "position": 2168729, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.18.jsonl", "position": 4173716, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.18.jsonl", "position": 1079768, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.18.jsonl", "position": 10977216, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.18.jsonl", "position": 13318672, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.18.jsonl", "position": 19588889, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.18.jsonl", "position": 2880445, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.18.jsonl", "position": 9643767, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.18.jsonl", "position": 56497729, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.18.jsonl", "position": 171682864, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.18.jsonl", "position": 69648695, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.18.jsonl", "position": 510468579, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 176598571205868443551507785009731189475, "inc": 156889124676145633533243121458300926517}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717811, "common_crawl-high-quality": 524179086, "cranecode": 232967481, "cranemath": 131160917, "dolmino-math": 249275198, "dolmino_1-flan": 116483571, "gemini-reasoning-traces": 5826880, "general_reasoning_mix": 43566901, "math-meta-reasoning": 8852813, "megamatt": 40303435, "nemotron-synth-qa": 116483180, "olmocr_science_pdfs": 116488728, "openthoughts2": 29125477, "program_verifiable": 3728122, "qwq-reasoning-traces": 43578173, "reddit_to_flashcards": 137441529, "stack_edu": 232965423, "stem-heavy-crawl": 116483337, "tinymath-mind": 20966662, "tinymath-pot": 5591014, "tulu-3-sft": 25626319, "wiki_to_rcqa": 69889150}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 232054951445483104053298414178273037645, "inc": 97364009206775596287829320185459903201}, "has_uint32": 1, "uinteger": 2029988552}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00019.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 480, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.19.jsonl", "position": 174993083, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.19.jsonl", "position": 112064730, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.19.jsonl", "position": 102536595, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.19.jsonl", "position": 92591090, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.19.jsonl", "position": 56681747, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.19.jsonl", "position": 5018474, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.19.jsonl", "position": 1468262, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.19.jsonl", "position": 1255849812, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.19.jsonl", "position": 26997909, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.19.jsonl", "position": 49796011, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.19.jsonl", "position": 2174527, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.19.jsonl", "position": 3828069, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.19.jsonl", "position": 1103250, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.19.jsonl", "position": 12133736, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.19.jsonl", "position": 6510047, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.19.jsonl", "position": 16007350, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.19.jsonl", "position": 1033850, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.19.jsonl", "position": 7648502, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.19.jsonl", "position": 58685525, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.19.jsonl", "position": 171979147, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.19.jsonl", "position": 63462364, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.19.jsonl", "position": 471848585, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 286677816895253057066229023720408382481, "inc": 43481686913609198584180156906717904807}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10717007, "common_crawl-high-quality": 524178701, "cranecode": 232968255, "cranemath": 131159952, "dolmino-math": 249274890, "dolmino_1-flan": 116483358, "gemini-reasoning-traces": 5829935, "general_reasoning_mix": 43569350, "math-meta-reasoning": 8854296, "megamatt": 40304210, "nemotron-synth-qa": 116483911, "olmocr_science_pdfs": 116484675, "openthoughts2": 29122154, "program_verifiable": 3727795, "qwq-reasoning-traces": 43574624, "reddit_to_flashcards": 137440587, "stack_edu": 232968680, "stem-heavy-crawl": 116483944, "tinymath-mind": 20967742, "tinymath-pot": 5591349, "tulu-3-sft": 25626269, "wiki_to_rcqa": 69889212}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 123854162493351245575415109320812074887, "inc": 87592583178118397041772301245082262155}, "has_uint32": 1, "uinteger": 1529574907}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
checkpoints/0000050000/train_state_00020.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 50000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 433, "it_state": {"it_state": {"root_dir": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/", "sources": {"cranecode": 10.0, "stack_edu": 10.0, "cranemath": 5.63, "dolmino-math": 10.7, "megamatt": 1.73, "tinymath-mind": 0.9, "tinymath-pot": 0.24, "reddit_to_flashcards": 5.9, "wiki_to_rcqa": 3.0, "nemotron-synth-qa": 5.0, "math-meta-reasoning": 0.38, "code-meta-reasoning": 0.46, "program_verifiable": 0.16, "qwq-reasoning-traces": 1.87, "openthoughts2": 1.25, "general_reasoning_mix": 1.87, "gemini-reasoning-traces": 0.25, "tulu-3-sft": 1.1, "dolmino_1-flan": 5.0, "olmocr_science_pdfs": 5.0, "stem-heavy-crawl": 5.0, "common_crawl-high-quality": 22.5}, "source_to_state": {"cranecode": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranecode/cranecode.chunk.20.jsonl", "position": 167915599, "block_size": 1, "offset": 0, "current_iter": 1}, "stack_edu": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stack_edu/stack_edu.chunk.20.jsonl", "position": 122851356, "block_size": 1, "offset": 0, "current_iter": 1}, "cranemath": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/cranemath/cranemath.chunk.20.jsonl", "position": 111224126, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino-math": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino-math/dolmino-math.chunk.20.jsonl", "position": 100992607, "block_size": 1, "offset": 0, "current_iter": 1}, "megamatt": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/megamatt/megamatt.chunk.20.jsonl", "position": 61350136, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-mind": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-mind/tinymath-mind.chunk.20.jsonl", "position": 4966952, "block_size": 1, "offset": 0, "current_iter": 1}, "tinymath-pot": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tinymath-pot/tinymath-pot.chunk.20.jsonl", "position": 1796014, "block_size": 1, "offset": 0, "current_iter": 1}, "reddit_to_flashcards": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/reddit_to_flashcards/reddit_to_flashcards.chunk.20.jsonl", "position": 1255686843, "block_size": 1, "offset": 0, "current_iter": 0}, "wiki_to_rcqa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/wiki_to_rcqa/wiki_to_rcqa.chunk.20.jsonl", "position": 25418292, "block_size": 1, "offset": 0, "current_iter": 1}, "nemotron-synth-qa": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/nemotron-synth-qa/nemotron-synth-qa.chunk.20.jsonl", "position": 50324020, "block_size": 1, "offset": 0, "current_iter": 1}, "math-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/math-meta-reasoning/math-meta-reasoning.chunk.20.jsonl", "position": 1803998, "block_size": 1, "offset": 0, "current_iter": 1}, "code-meta-reasoning": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/code-meta-reasoning/code-meta-reasoning.chunk.20.jsonl", "position": 5460263, "block_size": 1, "offset": 0, "current_iter": 1}, "program_verifiable": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/program_verifiable/program_verifiable.chunk.20.jsonl", "position": 778003, "block_size": 1, "offset": 0, "current_iter": 1}, "qwq-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/qwq-reasoning-traces/qwq-reasoning-traces.chunk.20.jsonl", "position": 6434548, "block_size": 1, "offset": 0, "current_iter": 1}, "openthoughts2": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/openthoughts2/openthoughts2.chunk.20.jsonl", "position": 6012861, "block_size": 1, "offset": 0, "current_iter": 1}, "general_reasoning_mix": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/general_reasoning_mix/general_reasoning_mix.chunk.20.jsonl", "position": 19097313, "block_size": 1, "offset": 0, "current_iter": 1}, "gemini-reasoning-traces": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/gemini-reasoning-traces/gemini-reasoning-traces.chunk.20.jsonl", "position": 2046188, "block_size": 1, "offset": 0, "current_iter": 1}, "tulu-3-sft": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/tulu-3-sft/tulu-3-sft.chunk.20.jsonl", "position": 7941533, "block_size": 1, "offset": 0, "current_iter": 1}, "dolmino_1-flan": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/dolmino_1-flan/dolmino_1-flan.chunk.20.jsonl", "position": 59795006, "block_size": 1, "offset": 0, "current_iter": 1}, "olmocr_science_pdfs": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/olmocr_science_pdfs/olmocr_science_pdfs.chunk.20.jsonl", "position": 178358604, "block_size": 1, "offset": 0, "current_iter": 1}, "stem-heavy-crawl": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/stem-heavy-crawl/stem-heavy-crawl.chunk.20.jsonl", "position": 63159426, "block_size": 1, "offset": 0, "current_iter": 1}, "common_crawl-high-quality": {"file_path": "/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/common_crawl-high-quality/common_crawl-high-quality.chunk.20.jsonl", "position": 478691348, "block_size": 1, "offset": 0, "current_iter": 1}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 27459530917289031682824315422237314089, "inc": 295199436789328193413091494604713432465}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "huggingface", "path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/"}, "output_seq_len": 4096, "n_views": 2, "source_counts": {"code-meta-reasoning": 10716765, "common_crawl-high-quality": 524176807, "cranecode": 232969472, "cranemath": 131161012, "dolmino-math": 249272989, "dolmino_1-flan": 116483318, "gemini-reasoning-traces": 5827400, "general_reasoning_mix": 43567618, "math-meta-reasoning": 8854817, "megamatt": 40303390, "nemotron-synth-qa": 116482269, "olmocr_science_pdfs": 116491557, "openthoughts2": 29121964, "program_verifiable": 3727666, "qwq-reasoning-traces": 43580768, "reddit_to_flashcards": 137440787, "stack_edu": 232966003, "stem-heavy-crawl": 116483881, "tinymath-mind": 20966829, "tinymath-pot": 5591205, "tulu-3-sft": 25625678, "wiki_to_rcqa": 69888748}}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 231369944016304886428559638266269115931, "inc": 107548519163465652363222144173017049179}, "has_uint32": 1, "uinteger": 1707974445}, "batch_size": 8, "prefetch_size": 1024}, "scheduler": {"base_lrs": [7.44e-05], "last_epoch": 50000, "_step_count": 50001, "_is_initial": false, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
config.yaml ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: olmo2_1B_midfine
2
+ dump_dir: /home/xun/rsadhukh/STEM/logs/midfine_base_final
3
+ seed: 777
4
+ model_type: olmo3
5
+ stem_up_proj_layers: []
6
+ grad_acc_steps: 2
7
+ gc_collect_freq: 1000
8
+ probe_freq: 100
9
+ steps: 50000
10
+ stage_steps: null
11
+ data:
12
+ root_dir: /home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/
13
+ sources:
14
+ cranecode: 10.0
15
+ stack_edu: 10.0
16
+ cranemath: 5.63
17
+ dolmino-math: 10.7
18
+ megamatt: 1.73
19
+ tinymath-mind: 0.9
20
+ tinymath-pot: 0.24
21
+ reddit_to_flashcards: 5.9
22
+ wiki_to_rcqa: 3.0
23
+ nemotron-synth-qa: 5.0
24
+ math-meta-reasoning: 0.38
25
+ code-meta-reasoning: 0.46
26
+ program_verifiable: 0.16
27
+ qwq-reasoning-traces: 1.87
28
+ openthoughts2: 1.25
29
+ general_reasoning_mix: 1.87
30
+ gemini-reasoning-traces: 0.25
31
+ tulu-3-sft: 1.1
32
+ dolmino_1-flan: 5.0
33
+ olmocr_science_pdfs: 5.0
34
+ stem-heavy-crawl: 5.0
35
+ common_crawl-high-quality: 22.5
36
+ node_local: false
37
+ batch_size: 8
38
+ seq_len: 4096
39
+ n_views: 2
40
+ seed: 42
41
+ add_bos: true
42
+ add_eos: true
43
+ load_async: true
44
+ prefetch_size: 1024
45
+ tokenizer:
46
+ name: huggingface
47
+ path: /data/rsadhukh/checkpoints/olmo2-1b-base-token4T/
48
+ track_packed_source_mixture: true
49
+ packed_source_counts: null
50
+ optim:
51
+ lr: 7.44e-05
52
+ weight_decay: 0.1
53
+ epsilon: 1.0e-08
54
+ beta1: 0.9
55
+ beta2: 0.95
56
+ clip: 1.0
57
+ scheduler: linear
58
+ warmup: 0
59
+ lr_min_ratio: 0.0
60
+ cycle_length: 1.0
61
+ cosine_theta: 1.0
62
+ annealing_step: 1000
63
+ decay_fraction: 0.1
64
+ exp_factor: 0.5
65
+ initial_token_offset: 0
66
+ global_final_step: null
67
+ model:
68
+ dim: 2048
69
+ n_layers: 16
70
+ head_dim: 128
71
+ n_heads: 16
72
+ n_kv_heads: 16
73
+ ffn_dim_multiplier: 1.5
74
+ multiple_of: 256
75
+ norm_eps: 1.0e-06
76
+ rope_theta: 500000.0
77
+ rope_scaling: null
78
+ init_base_std: 0.02
79
+ init_std_factor: disabled
80
+ max_seqlen: 4096
81
+ seed: 42
82
+ vocab_size: 100352
83
+ weight_tying: false
84
+ sliding_window: null
85
+ distributed:
86
+ dp_shard: 1
87
+ dp_replicate: 32
88
+ tp_size: 1
89
+ selective_activation_checkpointing: false
90
+ compile: true
91
+ fsdp_type: full_shard
92
+ model_dtype: bf16
93
+ float8_recipe: null
94
+ float8_filter: layers\.[0-9]+\.
95
+ matmul_allow_tf32: false
96
+ detect_anomaly: false
97
+ compile_cache_size_limit: 8
98
+ spawn_method: forkserver
99
+ stem_parallel_size: 8
100
+ env:
101
+ MKL_SERVICE_FORCE_INTEL: GNU
102
+ OMP_NUM_THREADS: '1'
103
+ MKL_NUM_THREADS: '1'
104
+ ENABLE_INTRA_NODE_COMM: '1'
105
+ TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
106
+ NCCL_IB_TIMEOUT: '22'
107
+ NCCL_DEBUG: INFO
108
+ TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
109
+ checkpoint:
110
+ dump:
111
+ every: 5000
112
+ keep: 1
113
+ eval:
114
+ every: 100000
115
+ keep: 1
116
+ path: /home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints
117
+ init_ckpt_path: /data/rsadhukh/checkpoints/olmo2-1b-base-token4T/
118
+ continue_training_from_init: true
119
+ legacy_init_ckpt_lm_transformer: false
120
+ merge_lm_optim_seed_ckpt_path: null
121
+ profiling:
122
+ run: true
123
+ trace_folder: profiling
124
+ mem_warmup: 100
125
+ mem_steps: 2
126
+ profile_warmup: 102
127
+ profile_steps: 2
128
+ logging:
129
+ freq: 10
130
+ acc_freq: null
131
+ wandb:
132
+ job_type: null
133
+ dir: null
134
+ project: stem
135
+ entity: null
136
+ tags: null
137
+ group: null
138
+ name: olmo2_1B_midfine
139
+ notes: null
140
+ config_exclude_keys: null
141
+ config_include_keys: null
142
+ anonymous: null
143
+ mode: null
144
+ allow_val_change: null
145
+ resume: null
146
+ force: null
147
+ tensorboard: null
148
+ sync_tensorboard: null
149
+ monitor_gym: null
150
+ save_code: null
151
+ id: null
152
+ fork_from: null
153
+ resume_from: null
154
+ async_eval_gpus: null
155
+ eval:
156
+ generator:
157
+ max_tokens: 16384
158
+ dtype: bf16
159
+ temperature: 1.0
160
+ top_p: 0.95
161
+ harness:
162
+ tasks:
163
+ - task: hellaswag
164
+ dataset_path: /data/rsadhukh/eval_data/hellaswag
165
+ - task: boolq
166
+ dataset_path: /data/rsadhukh/eval_data/super_glue
167
+ - task: piqa
168
+ dataset_path: /data/rsadhukh/eval_data/piqa
169
+ - task: winogrande
170
+ dataset_path: /data/rsadhukh/eval_data/winogrande
171
+ - task: openbookqa
172
+ dataset_path: /data/rsadhukh/eval_data/openbookqa
173
+ - task: arc_easy
174
+ dataset_path: /data/rsadhukh/eval_data/ai2_arc
175
+ - task: arc_challenge
176
+ dataset_path: /data/rsadhukh/eval_data/ai2_arc
177
+ confirm_run_unsafe_code: true
178
+ batch_size: 64
179
+ validation: null
evals/0000050000/config.yaml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: evals
2
+ dump_dir: /home/xun/rsadhukh/STEM/logs/midfine_base_final/evals/0000050000
3
+ metric_log_dir: /home/xun/rsadhukh/STEM/logs/midfine_base_final
4
+ ckpt_dir: /home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints/0000050000
5
+ tokenizer_path: null
6
+ tokenizer_name: null
7
+ model_type: olmo3
8
+ generator:
9
+ temperature: 1.0
10
+ top_p: 0.95
11
+ top_k: null
12
+ max_gen_len: 512
13
+ max_tokens: 16384
14
+ max_prompt_len: null
15
+ until: []
16
+ compile_prefilling: false
17
+ reduce_generation_overhead: false
18
+ show_progress: false
19
+ dtype: bf16
20
+ device: cuda
21
+ harness:
22
+ tasks:
23
+ - task: hellaswag
24
+ dataset_path: /data/rsadhukh/eval_data/hellaswag
25
+ - task: boolq
26
+ dataset_path: /data/rsadhukh/eval_data/super_glue
27
+ - task: piqa
28
+ dataset_path: /data/rsadhukh/eval_data/piqa
29
+ - task: winogrande
30
+ dataset_path: /data/rsadhukh/eval_data/winogrande
31
+ - task: openbookqa
32
+ dataset_path: /data/rsadhukh/eval_data/openbookqa
33
+ - task: arc_easy
34
+ dataset_path: /data/rsadhukh/eval_data/ai2_arc
35
+ - task: arc_challenge
36
+ dataset_path: /data/rsadhukh/eval_data/ai2_arc
37
+ num_fewshot: null
38
+ device: null
39
+ use_cache: null
40
+ cache_requests: false
41
+ rewrite_requests_cache: false
42
+ delete_requests_cache: false
43
+ limit: null
44
+ bootstrap_iters: 100000
45
+ check_integrity: false
46
+ write_out: false
47
+ log_samples: true
48
+ system_instruction: null
49
+ apply_chat_template: false
50
+ fewshot_as_multiturn: false
51
+ gen_kwargs: null
52
+ verbosity: INFO
53
+ predict_only: false
54
+ random_seed: 0
55
+ numpy_random_seed: 1234
56
+ torch_random_seed: 1234
57
+ fewshot_random_seed: 1234
58
+ batch_size: 64
59
+ confirm_run_unsafe_code: true
60
+ validation: null
61
+ wandb: null
62
+ global_step: 50000
evals/0000050000/results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"arc_challenge": {"alias": "arc_challenge", "acc,none": 0.42235494880546076, "acc_stderr,none": 0.014434138713379983, "acc_norm,none": 0.45051194539249145, "acc_norm_stderr,none": 0.014539646098471627}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7563131313131313, "acc_stderr,none": 0.00880917174472056, "acc_norm,none": 0.7567340067340067, "acc_norm_stderr,none": 0.00880400984686553}, "boolq": {"alias": "boolq", "acc,none": 0.6938837920489297, "acc_stderr,none": 0.008060817222724517}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4953196574387572, "acc_stderr,none": 0.004989562798280521, "acc_norm,none": 0.6694881497709619, "acc_norm_stderr,none": 0.00469436096892941}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.272, "acc_stderr,none": 0.019920483209566072, "acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665813}, "piqa": {"alias": "piqa", "acc,none": 0.7486398258977149, "acc_stderr,none": 0.010121156016819262, "acc_norm,none": 0.7464635473340587, "acc_norm_stderr,none": 0.010150090834551784}, "winogrande": {"alias": "winogrande", "acc,none": 0.665351223362273, "acc_stderr,none": 0.013261823629558363}}, "versions": {"arc_challenge": 1.0, "arc_easy": 1.0, "boolq": 2.0, "hellaswag": 1.0, "openbookqa": 1.0, "piqa": 1.0, "winogrande": 1.0}, "n-shot": {"arc_challenge": 0, "arc_easy": 0, "boolq": 0, "hellaswag": 0, "openbookqa": 0, "piqa": 0, "winogrande": 0}, "higher_is_better": {"arc_challenge": {"acc": true, "acc_norm": true}, "arc_easy": {"acc": true, "acc_norm": true}, "boolq": {"acc": true}, "hellaswag": {"acc": true, "acc_norm": true}, "openbookqa": {"acc": true, "acc_norm": true}, "piqa": {"acc": true, "acc_norm": true}, "winogrande": {"acc": true}}, "n-samples": {"hellaswag": {"original": 10042, "effective": 10042}, "boolq": {"original": 3270, "effective": 3270}, "piqa": {"original": 1838, "effective": 1838}, "winogrande": {"original": 1267, "effective": 1267}, "openbookqa": {"original": 500, "effective": 500}, "arc_easy": {"original": 2376, "effective": 2376}, "arc_challenge": {"original": 1172, "effective": 1172}}, "git_hash": "1620cbc4", "date": 1777526814.9856765, "pretty_env_info": "PyTorch version: 2.8.0+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 12.3.0-1ubuntu1~22.04.2) 12.3.0\nClang version: 14.0.0-1ubuntu1.1\nCMake version: version 3.22.1\nLibc version: glibc-2.35\n\nPython version: 3.11.9 (main, Nov 10 2025, 02:08:09) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-131-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 13.0.88\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 580.95.05\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 176\nOn-line CPU(s) list: 0-175\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8468V\nCPU family: 6\nModel: 143\nThread(s) per core: 2\nCore(s) per socket: 44\nSocket(s): 2\nStepping: 8\nBogoMIPS: 4800.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq vmx ssse3 fma cx16 pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch cpuid_fault invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 wbnoinvd arat avx512vbmi umip pku ospke waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid bus_lock_detect cldemote movdiri movdir64b fsrm md_clear serialize tsxldtrk avx512_fp16 arch_capabilities\nVirtualization: VT-x\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 4.1 MiB (88 instances)\nL1i cache: 2.8 MiB (88 instances)\nL2 cache: 176 MiB (88 instances)\nL3 cache: 195 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-87\nNUMA node1 CPU(s): 88-175\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; RSB filling; PBRSB-eIBRS SW sequence; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Mitigation; TSX disabled\n\nVersions of relevant libraries:\n[pip3] numpy==2.4.4\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.3\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.8.0\n[pip3] triton==3.4.0\n[conda] _anaconda_depends 2025.12 py313_mkl_0\n[conda] blas 1.0 mkl\n[conda] mkl 2025.0.0 hacee8c2_941\n[conda] mkl-service 2.5.2 py313hacdc0fc_0\n[conda] mkl_fft 2.1.1 py313h57662e1_0\n[conda] mkl_random 1.3.0 py313h23c847b_0\n[conda] numpy 2.3.5 py313h08c6c3d_0\n[conda] numpy-base 2.3.5 py313h00548fb_0\n[conda] numpydoc 1.9.0 py313h06a4308_0", "transformers_version": "5.1.0", "lm_eval_version": "0.4.11"}
metrics.eval.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"created_at": "2026-04-30T05:27:56.628379", "global_step": 50000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.42235494880546076, "acc_stderr,none": 0.014434138713379983, "acc_norm,none": 0.45051194539249145, "acc_norm_stderr,none": 0.014539646098471627}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7563131313131313, "acc_stderr,none": 0.00880917174472056, "acc_norm,none": 0.7567340067340067, "acc_norm_stderr,none": 0.00880400984686553}, "boolq": {"alias": "boolq", "acc,none": 0.6938837920489297, "acc_stderr,none": 0.008060817222724517}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4953196574387572, "acc_stderr,none": 0.004989562798280521, "acc_norm,none": 0.6694881497709619, "acc_norm_stderr,none": 0.00469436096892941}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.272, "acc_stderr,none": 0.019920483209566072, "acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665813}, "piqa": {"alias": "piqa", "acc,none": 0.7486398258977149, "acc_stderr,none": 0.010121156016819262, "acc_norm,none": 0.7464635473340587, "acc_norm_stderr,none": 0.010150090834551784}, "winogrande": {"alias": "winogrande", "acc,none": 0.665351223362273, "acc_stderr,none": 0.013261823629558363}}
train.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/debug.log ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_setup.py:_flush():81] Current SDK version is 0.26.0
2
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_setup.py:_flush():81] Configure stats pid to 470303
3
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:setup_run_log_directory():721] Logging user logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_153552-r20yn80u/logs/debug.log
5
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:setup_run_log_directory():722] Logging internal logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_153552-r20yn80u/logs/debug-internal.log
6
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:init():848] calling init triggers
7
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:init():853] wandb.init called with sweep_config: {}
8
+ config: {'dump_dir': '/home/xun/rsadhukh/STEM/logs/midfine_base_final', 'seed': 777, 'model_type': 'olmo3', 'stem_up_proj_layers': [], 'grad_acc_steps': 2, 'gc_collect_freq': 1000, 'probe_freq': 100, 'steps': 50000, 'stage_steps': None, 'data': {'root_dir': '/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/', 'sources': {'cranecode': 10.0, 'stack_edu': 10.0, 'cranemath': 5.63, 'dolmino-math': 10.7, 'megamatt': 1.73, 'tinymath-mind': 0.9, 'tinymath-pot': 0.24, 'reddit_to_flashcards': 5.9, 'wiki_to_rcqa': 3.0, 'nemotron-synth-qa': 5.0, 'math-meta-reasoning': 0.38, 'code-meta-reasoning': 0.46, 'program_verifiable': 0.16, 'qwq-reasoning-traces': 1.87, 'openthoughts2': 1.25, 'general_reasoning_mix': 1.87, 'gemini-reasoning-traces': 0.25, 'tulu-3-sft': 1.1, 'dolmino_1-flan': 5.0, 'olmocr_science_pdfs': 5.0, 'stem-heavy-crawl': 5.0, 'common_crawl-high-quality': 22.5}, 'node_local': False, 'batch_size': 8, 'seq_len': 4096, 'n_views': 2, 'seed': 42, 'add_bos': True, 'add_eos': True, 'load_async': True, 'prefetch_size': 1024, 'tokenizer': {'name': 'huggingface', 'path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/'}, 'track_packed_source_mixture': True, 'packed_source_counts': None}, 'optim': {'lr': 7.44e-05, 'weight_decay': 0.1, 'epsilon': 1e-08, 'beta1': 0.9, 'beta2': 0.95, 'clip': 1.0, 'scheduler': 'linear', 'warmup': 0, 'lr_min_ratio': 0.0, 'cycle_length': 1.0, 'cosine_theta': 1.0, 'annealing_step': 1000, 'decay_fraction': 0.1, 'exp_factor': 0.5, 'initial_token_offset': 0, 'global_final_step': None}, 'model': {'dim': 2048, 'n_layers': 16, 'head_dim': 128, 'n_heads': 16, 'n_kv_heads': 16, 'ffn_dim_multiplier': 1.5, 'multiple_of': 256, 'norm_eps': 1e-06, 'rope_theta': 500000.0, 'rope_scaling': None, 'init_base_std': 0.02, 'init_std_factor': 'disabled', 'max_seqlen': 4096, 'seed': 42, 'vocab_size': 100352, 'weight_tying': False, 'sliding_window': None}, 'distributed': {'dp_shard': 1, 'dp_replicate': 32, 'tp_size': 1, 'selective_activation_checkpointing': False, 'compile': True, 'fsdp_type': 'full_shard', 'model_dtype': 'bf16', 'float8_recipe': None, 'float8_filter': 'layers\\.[0-9]+\\.', 'matmul_allow_tf32': False, 'detect_anomaly': False, 'compile_cache_size_limit': 8, 'spawn_method': 'forkserver', 'stem_parallel_size': 8}, 'env': {'MKL_SERVICE_FORCE_INTEL': 'GNU', 'OMP_NUM_THREADS': '1', 'MKL_NUM_THREADS': '1', 'ENABLE_INTRA_NODE_COMM': '1', 'TORCH_NCCL_AVOID_RECORD_STREAMS': '1', 'NCCL_IB_TIMEOUT': '22', 'NCCL_DEBUG': 'INFO', 'TORCH_NCCL_ASYNC_ERROR_HANDLING': '1'}, 'checkpoint': {'dump': {'every': 5000, 'keep': 1}, 'eval': {'every': 100000, 'keep': 1}, 'path': '/home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints', 'init_ckpt_path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/', 'continue_training_from_init': True, 'legacy_init_ckpt_lm_transformer': False, 'merge_lm_optim_seed_ckpt_path': None}, 'profiling': {'run': True, 'trace_folder': 'profiling', 'mem_warmup': 100, 'mem_steps': 2, 'profile_warmup': 102, 'profile_steps': 2}, 'logging': {'freq': 10, 'acc_freq': None, 'wandb': {'job_type': None, 'dir': None, 'project': 'stem', 'entity': None, 'tags': None, 'group': None, 'name': 'olmo2_1B_midfine', 'notes': None, 'config_exclude_keys': None, 'config_include_keys': None, 'anonymous': None, 'mode': None, 'allow_val_change': None, 'resume': None, 'force': None, 'tensorboard': None, 'sync_tensorboard': None, 'monitor_gym': None, 'save_code': None, 'id': None, 'fork_from': None, 'resume_from': None}}, 'async_eval_gpus': None, 'eval': {'generator': {'max_tokens': 16384, 'dtype': 'bf16', 'temperature': 1.0, 'top_p': 0.95}, 'harness': {'tasks': [{'task': 'hellaswag', 'dataset_path': '/data/rsadhukh/eval_data/hellaswag'}, {'task': 'boolq', 'dataset_path': '/data/rsadhukh/eval_data/super_glue'}, {'task': 'piqa', 'dataset_path': '/data/rsadhukh/eval_data/piqa'}, {'task': 'winogrande', 'dataset_path': '/data/rsadhukh/eval_data/winogrande'}, {'task': 'openbookqa', 'dataset_path': '/data/rsadhukh/eval_data/openbookqa'}, {'task': 'arc_easy', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}, {'task': 'arc_challenge', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}], 'confirm_run_unsafe_code': True, 'batch_size': 64}, 'validation': None}, '_wandb': {}}
9
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:init():896] starting backend
10
+ 2026-04-29 15:35:52,400 INFO MainThread:470303 [wandb_init.py:init():911] sending inform_init request
11
+ 2026-04-29 15:35:52,408 INFO MainThread:470303 [wandb_init.py:init():919] backend started and connected
12
+ 2026-04-29 15:35:52,410 INFO MainThread:470303 [wandb_init.py:init():989] updated telemetry
13
+ 2026-04-29 15:35:52,430 INFO MainThread:470303 [wandb_init.py:init():1013] communicating run to backend with 90.0 second timeout
14
+ 2026-04-29 15:35:53,838 INFO MainThread:470303 [wandb_init.py:init():1058] starting run threads in backend
15
+ 2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_console_start():2542] atexit reg
16
+ 2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_redirect():2391] redirect: wrap_raw
17
+ 2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_redirect():2460] Wrapping output streams.
18
+ 2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_redirect():2483] Redirects installed.
19
+ 2026-04-29 15:35:54,077 INFO MainThread:470303 [wandb_init.py:init():1098] run started, returning control to user process
20
+ 2026-04-30 05:27:57,103 INFO wandb-AsyncioManager-main:470303 [service_client.py:_forward_responses():134] Reached EOF.
21
+ 2026-04-30 05:27:57,104 INFO wandb-AsyncioManager-main:470303 [mailbox.py:close():155] Closing mailbox, abandoning 1 handles.
22
+ 2026-04-30 05:27:59,641 ERROR wandb-AsyncioManager-main:470303 [asyncio_manager.py:fn_wrap_exceptions():184] Uncaught exception in run_soon callback.
23
+ Traceback (most recent call last):
24
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_manager.py", line 182, in fn_wrap_exceptions
25
+ await fn()
26
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 45, in publish
27
+ await self._send_server_request(request)
28
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 87, in _send_server_request
29
+ await self._drain_writer()
30
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 96, in _drain_writer
31
+ await self._writer.drain()
32
+ File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 392, in drain
33
+ await self._protocol._drain_helper()
34
+ File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
35
+ raise ConnectionResetError('Connection lost')
36
+ ConnectionResetError: Connection lost
37
+ 2026-04-30 05:27:59,660 ERROR wandb-AsyncioManager-main:470303 [asyncio_manager.py:fn_wrap_exceptions():184] Uncaught exception in run_soon callback.
38
+ Traceback (most recent call last):
39
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_manager.py", line 182, in fn_wrap_exceptions
40
+ await fn()
41
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 45, in publish
42
+ await self._send_server_request(request)
43
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 78, in _send_server_request
44
+ raise self._broken_exc.with_traceback(self._broken_tb)
45
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 87, in _send_server_request
46
+ await self._drain_writer()
47
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 96, in _drain_writer
48
+ await self._writer.drain()
49
+ File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 392, in drain
50
+ await self._protocol._drain_helper()
51
+ File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
52
+ raise ConnectionResetError('Connection lost')
53
+ ConnectionResetError: Connection lost
wandb/run-20260429_011802-2wmkezq3/files/media/html/memory_trace_50_79effaa90bfee7eb3207.html ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20260429_011802-2wmkezq3/files/media/html/profile_trace_51_ae282608c6eeb7f48826.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <base target="_blank"><link rel="stylesheet" type="text/css" href="https://app.wandb.ai/normalize.css" />/home/xun/rsadhukh/STEM/logs/midfine_base_final/profiling/profile_CPU_CUDA_000104/rank00_compute-node-14_1060320.1777425671449116530.pt.trace.html.gz
wandb/run-20260429_011802-2wmkezq3/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20260429_011802-2wmkezq3/files/requirements.txt ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DataProperty==1.1.0
2
+ absl-py==2.4.0
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.13.5
5
+ aiosignal==1.4.0
6
+ annotated-doc==0.0.4
7
+ annotated-types==0.7.0
8
+ antlr4-python3-runtime==4.9.3
9
+ anyio==4.13.0
10
+ asttokens==3.0.1
11
+ async-lru==2.3.0
12
+ attrs==26.1.0
13
+ babel==2.18.0
14
+ bleach==6.3.0
15
+ blessed==1.38.0
16
+ blobfile==3.2.0
17
+ certifi==2026.2.25
18
+ chardet==5.2.0
19
+ charset-normalizer==3.4.7
20
+ click==8.3.2
21
+ colorama==0.4.6
22
+ comm==0.2.3
23
+ datasets==4.8.4
24
+ datatrove==0.9.0
25
+ debugpy==1.8.20
26
+ decorator==5.2.1
27
+ defusedxml==0.7.1
28
+ dill==0.4.1
29
+ evaluate==0.4.6
30
+ executing==2.2.1
31
+ fastjsonschema==2.21.2
32
+ filelock==3.28.0
33
+ fqdn==1.5.1
34
+ frozenlist==1.8.0
35
+ fsspec==2026.2.0
36
+ gitdb==4.0.12
37
+ GitPython==3.1.46
38
+ gpustat==1.1.1
39
+ h11==0.16.0
40
+ hf-xet==1.4.3
41
+ httpcore==1.0.9
42
+ httpx==0.28.1
43
+ huggingface_hub==1.11.0
44
+ humanize==4.15.0
45
+ idna==3.11
46
+ ipython_pygments_lexers==1.1.1
47
+ Jinja2==3.1.6
48
+ joblib==1.5.3
49
+ json5==0.14.0
50
+ jsonlines==4.0.0
51
+ jsonpointer==3.1.1
52
+ jupyter_core==5.9.1
53
+ jupyterlab_pygments==0.3.0
54
+ lark==1.3.1
55
+ lm_eval==0.4.11
56
+ loguru==0.7.3
57
+ lxml==6.1.0
58
+ markdown-it-py==4.0.0
59
+ MarkupSafe==3.0.3
60
+ matplotlib-inline==0.2.1
61
+ mbstrdecoder==1.1.4
62
+ mdurl==0.1.2
63
+ mistune==3.2.0
64
+ more-itertools==11.0.2
65
+ mpmath==1.3.0
66
+ msgspec==0.21.1
67
+ multidict==6.7.1
68
+ multiprocess==0.70.19
69
+ nest-asyncio==1.6.0
70
+ networkx==3.6.1
71
+ nltk==3.9.4
72
+ numpy==2.4.4
73
+ nvidia-cublas-cu12==12.8.4.1
74
+ nvidia-cuda-cupti-cu12==12.8.90
75
+ nvidia-cuda-nvrtc-cu12==12.8.93
76
+ nvidia-cuda-runtime-cu12==12.8.90
77
+ nvidia-cudnn-cu12==9.10.2.21
78
+ nvidia-cufft-cu12==11.3.3.83
79
+ nvidia-cufile-cu12==1.13.1.3
80
+ nvidia-curand-cu12==10.3.9.90
81
+ nvidia-cusolver-cu12==11.7.3.90
82
+ nvidia-cusparse-cu12==12.5.8.93
83
+ nvidia-cusparselt-cu12==0.7.1
84
+ nvidia-ml-py==13.595.45
85
+ nvidia-nccl-cu12==2.27.3
86
+ nvidia-nvjitlink-cu12==12.8.93
87
+ nvidia-nvtx-cu12==12.8.90
88
+ objprint==0.3.0
89
+ omegaconf==2.3.0
90
+ orjson==3.11.8
91
+ overrides==7.7.0
92
+ packaging==26.1
93
+ pandas==3.0.2
94
+ pandocfilters==1.5.1
95
+ parso==0.8.6
96
+ pathvalidate==3.3.1
97
+ pexpect==4.9.0
98
+ pip==26.0.1
99
+ platformdirs==4.9.6
100
+ portalocker==3.2.0
101
+ prometheus_client==0.25.0
102
+ prompt_toolkit==3.0.52
103
+ propcache==0.4.1
104
+ protobuf==7.34.1
105
+ psutil==7.2.2
106
+ ptyprocess==0.7.0
107
+ pure_eval==0.2.3
108
+ pyarrow==23.0.1
109
+ pycparser==3.0
110
+ pycryptodomex==3.23.0
111
+ pydantic==2.13.2
112
+ pydantic_core==2.46.2
113
+ Pygments==2.20.0
114
+ pynvml==13.0.1
115
+ pytablewriter==1.2.1
116
+ python-dateutil==2.9.0.post0
117
+ python-json-logger==4.1.0
118
+ pytz==2026.1.post1
119
+ PyYAML==6.0.3
120
+ pyzmq==27.1.0
121
+ referencing==0.37.0
122
+ regex==2026.4.4
123
+ requests==2.33.1
124
+ rfc3339-validator==0.1.4
125
+ rfc3986-validator==0.1.1
126
+ rfc3987-syntax==1.1.0
127
+ rich==15.0.0
128
+ rouge_score==0.1.2
129
+ rpds-py==0.30.0
130
+ sacrebleu==2.6.0
131
+ safetensors==0.7.0
132
+ scikit-learn==1.8.0
133
+ scipy==1.17.1
134
+ Send2Trash==2.1.0
135
+ sentencepiece==0.2.1
136
+ sentry-sdk==2.58.0
137
+ setuptools==65.5.0
138
+ shellingham==1.5.4
139
+ six==1.17.0
140
+ smmap==5.0.3
141
+ soupsieve==2.8.3
142
+ sqlitedict==2.1.0
143
+ stack-data==0.6.3
144
+ sympy==1.14.0
145
+ tabledata==1.3.4
146
+ tabulate==0.10.0
147
+ tcolorpy==0.1.7
148
+ terminado==0.18.1
149
+ threadpoolctl==3.6.0
150
+ tiktoken==0.12.0
151
+ tinycss2==1.4.0
152
+ tokenizers==0.22.2
153
+ torch==2.8.0
154
+ tornado==6.5.5
155
+ tqdm==4.67.3
156
+ traitlets==5.14.3
157
+ transformers==5.1.0
158
+ triton==3.4.0
159
+ typepy==1.3.4
160
+ typer==0.24.1
161
+ typer-slim==0.24.0
162
+ typing_extensions==4.15.0
163
+ typing-inspection==0.4.2
164
+ tzdata==2026.2
165
+ uri-template==1.3.0
166
+ urllib3==2.6.3
167
+ viztracer==1.1.1
168
+ wandb==0.26.0
169
+ wcwidth==0.6.0
170
+ webcolors==25.10.0
171
+ webencodings==0.5.1
172
+ websocket-client==1.9.0
173
+ word2number==1.1
174
+ xformers==0.0.32.post1
175
+ xxhash==3.6.0
176
+ yarl==1.23.0
177
+ zstandard==0.25.0
wandb/run-20260429_011802-2wmkezq3/files/wandb-metadata.json ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-131-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.9",
4
+ "startedAt": "2026-04-29T01:18:02.772404Z",
5
+ "args": [
6
+ "config=apps/main/configs/olmo2_1B_midfine.yaml",
7
+ "dump_dir=/home/xun/rsadhukh/STEM/logs/midfine_base_final",
8
+ "checkpoint.init_ckpt_path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/",
9
+ "checkpoint.continue_training_from_init=true",
10
+ "checkpoint.dump.every=5000",
11
+ "checkpoint.eval.every=100000",
12
+ "checkpoint.dump.keep=1",
13
+ "data.root_dir=/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/",
14
+ "data.node_local=false",
15
+ "data.tokenizer.path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/",
16
+ "logging.wandb.name=midfine_base_final"
17
+ ],
18
+ "program": "-m apps.main.train",
19
+ "git": {
20
+ "remote": "https://github.com/Infini-AI-Lab/STEM.git",
21
+ "commit": "7e450007299a777d774d6e2b598001cc7552c1b4"
22
+ },
23
+ "email": "rsadhukh@andrew.cmu.edu",
24
+ "root": "/home/xun/rsadhukh/STEM/logs/midfine_base_final",
25
+ "host": "compute-node-14",
26
+ "executable": "/home/xun/rsadhukh/STEM/stem/bin/python",
27
+ "cpu_count": 88,
28
+ "cpu_count_logical": 176,
29
+ "gpu": "NVIDIA H200",
30
+ "gpu_count": 8,
31
+ "disk": {
32
+ "/": {
33
+ "total": "133003395072",
34
+ "used": "82473435136"
35
+ }
36
+ },
37
+ "memory": {
38
+ "total": "2071474651136"
39
+ },
40
+ "gpu_nvidia": [
41
+ {
42
+ "name": "NVIDIA H200",
43
+ "memoryTotal": "150754820096",
44
+ "cudaCores": 16896,
45
+ "architecture": "Hopper",
46
+ "uuid": "GPU-f9dd2b05-45b2-e3c3-43c1-419969cf660f"
47
+ },
48
+ {
49
+ "name": "NVIDIA H200",
50
+ "memoryTotal": "150754820096",
51
+ "cudaCores": 16896,
52
+ "architecture": "Hopper",
53
+ "uuid": "GPU-a79c6c73-bf1a-8760-8bed-c89a9a1ff315"
54
+ },
55
+ {
56
+ "name": "NVIDIA H200",
57
+ "memoryTotal": "150754820096",
58
+ "cudaCores": 16896,
59
+ "architecture": "Hopper",
60
+ "uuid": "GPU-84d10e29-34bf-5102-3d30-eb56af3a556d"
61
+ },
62
+ {
63
+ "name": "NVIDIA H200",
64
+ "memoryTotal": "150754820096",
65
+ "cudaCores": 16896,
66
+ "architecture": "Hopper",
67
+ "uuid": "GPU-0a7948c2-0a62-1c09-524b-179e3de36a59"
68
+ },
69
+ {
70
+ "name": "NVIDIA H200",
71
+ "memoryTotal": "150754820096",
72
+ "cudaCores": 16896,
73
+ "architecture": "Hopper",
74
+ "uuid": "GPU-d4201f0e-1b44-4327-9747-b62cca7ab4bf"
75
+ },
76
+ {
77
+ "name": "NVIDIA H200",
78
+ "memoryTotal": "150754820096",
79
+ "cudaCores": 16896,
80
+ "architecture": "Hopper",
81
+ "uuid": "GPU-8fa707f1-9cd9-9384-c9c9-3356b1ad04ec"
82
+ },
83
+ {
84
+ "name": "NVIDIA H200",
85
+ "memoryTotal": "150754820096",
86
+ "cudaCores": 16896,
87
+ "architecture": "Hopper",
88
+ "uuid": "GPU-b096838e-c00a-4819-b204-35fab82f7d94"
89
+ },
90
+ {
91
+ "name": "NVIDIA H200",
92
+ "memoryTotal": "150754820096",
93
+ "cudaCores": 16896,
94
+ "architecture": "Hopper",
95
+ "uuid": "GPU-cbcadfdd-c45c-2cf6-4408-00f7bce853af"
96
+ }
97
+ ],
98
+ "cudaVersion": "13.0",
99
+ "slurm": {
100
+ "cluster_name": "cluster",
101
+ "conf": "/var/spool/slurmd/conf-cache/slurm.conf",
102
+ "cpu_bind": "quiet,mask_cpu:0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF",
103
+ "cpu_bind_list": "0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF",
104
+ "cpu_bind_type": "mask_cpu:",
105
+ "cpu_bind_verbose": "quiet",
106
+ "cpus_on_node": "128",
107
+ "cpus_per_task": "128",
108
+ "distribution": "cyclic",
109
+ "gpus_on_node": "8",
110
+ "gtids": "0",
111
+ "job_cpus_per_node": "128(x4)",
112
+ "job_end_time": "1777598223",
113
+ "job_gid": "1005",
114
+ "job_gpus": "0,1,2,3,4,5,6,7",
115
+ "job_id": "29496",
116
+ "job_name": "stem",
117
+ "job_nodelist": "compute-node-[14,0,43-44]",
118
+ "job_num_nodes": "4",
119
+ "job_partition": "high",
120
+ "job_start_time": "1777425423",
121
+ "job_uid": "1005",
122
+ "job_user": "xun",
123
+ "jobid": "29496",
124
+ "launch_node_ipaddr": "172.27.49.7",
125
+ "localid": "0",
126
+ "nnodes": "4",
127
+ "nodeid": "0",
128
+ "nodelist": "compute-node-[14,0,43-44]",
129
+ "nprocs": "4",
130
+ "ntasks": "4",
131
+ "ntasks_per_node": "1",
132
+ "output_mode": "standard",
133
+ "prio_process": "0",
134
+ "procid": "0",
135
+ "srun_comm_host": "172.27.49.7",
136
+ "srun_comm_port": "44949",
137
+ "step_gpus": "0,1,2,3,4,5,6,7",
138
+ "step_id": "0",
139
+ "step_launcher_port": "44949",
140
+ "step_nodelist": "compute-node-[14,0,43-44]",
141
+ "step_num_nodes": "4",
142
+ "step_num_tasks": "4",
143
+ "step_tasks_per_node": "1(x4)",
144
+ "stepid": "0",
145
+ "submit_dir": "/home/xun/rsadhukh/STEM",
146
+ "submit_host": "login-node-0",
147
+ "task_pid": "1059988",
148
+ "tasks_per_node": "1(x4)",
149
+ "topology_addr": "compute-node-14",
150
+ "topology_addr_pattern": "node",
151
+ "tres_per_task": "cpu:128",
152
+ "umask": "0000"
153
+ },
154
+ "writerId": "qwoms3z2pk3wk1elrvn86tgqrzw8t43p"
155
+ }
wandb/run-20260429_011802-2wmkezq3/logs/debug-core.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-29T01:18:02.8977674Z","level":"INFO","msg":"main: starting server","port-filename":"/scratch/local/xun/tmp/tmp8tk991yk/port-1060320.txt","pid":1060320,"detached":false,"idle-timeout":600000000000,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-29T01:18:02.899120332Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1060320}
3
+ {"time":"2026-04-29T01:18:02.899097714Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/scratch/local/xun/tmp/wandb-1060320-1061249-332575222/socket","Net":"unix"}}
4
+ {"time":"2026-04-29T01:18:03.07469245Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-29T01:18:03.084970348Z","level":"INFO","msg":"handleInformInit: received","streamId":"2wmkezq3","id":"1(@)"}
6
+ {"time":"2026-04-29T01:18:03.640108524Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"2wmkezq3","id":"1(@)"}
7
+ {"time":"2026-04-29T01:18:09.282362815Z","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"3qs872as87kw"}
8
+ {"time":"2026-04-29T09:24:06.1057087Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
9
+ {"time":"2026-04-29T09:24:06.106154304Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
10
+ {"time":"2026-04-29T09:24:06.106165217Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
11
+ {"time":"2026-04-29T09:24:07.791740435Z","level":"INFO","msg":"server: parent process exited, terminating service process"}
wandb/run-20260429_011802-2wmkezq3/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20260429_011802-2wmkezq3/logs/debug.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-04-29 01:18:02,801 INFO MainThread:1060320 [wandb_setup.py:_flush():81] Current SDK version is 0.26.0
2
+ 2026-04-29 01:18:02,801 INFO MainThread:1060320 [wandb_setup.py:_flush():81] Configure stats pid to 1060320
3
+ 2026-04-29 01:18:02,801 INFO MainThread:1060320 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-04-29 01:18:02,801 INFO MainThread:1060320 [wandb_init.py:setup_run_log_directory():721] Logging user logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_011802-2wmkezq3/logs/debug.log
5
+ 2026-04-29 01:18:02,801 INFO MainThread:1060320 [wandb_init.py:setup_run_log_directory():722] Logging internal logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_011802-2wmkezq3/logs/debug-internal.log
6
+ 2026-04-29 01:18:02,801 INFO MainThread:1060320 [wandb_init.py:init():848] calling init triggers
7
+ 2026-04-29 01:18:02,802 INFO MainThread:1060320 [wandb_init.py:init():853] wandb.init called with sweep_config: {}
8
+ config: {'dump_dir': '/home/xun/rsadhukh/STEM/logs/midfine_base_final', 'seed': 777, 'model_type': 'olmo3', 'stem_up_proj_layers': [], 'grad_acc_steps': 2, 'gc_collect_freq': 1000, 'probe_freq': 100, 'steps': 50000, 'stage_steps': None, 'data': {'root_dir': '/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/', 'sources': {'cranecode': 10.0, 'stack_edu': 10.0, 'cranemath': 5.63, 'dolmino-math': 10.7, 'megamatt': 1.73, 'tinymath-mind': 0.9, 'tinymath-pot': 0.24, 'reddit_to_flashcards': 5.9, 'wiki_to_rcqa': 3.0, 'nemotron-synth-qa': 5.0, 'math-meta-reasoning': 0.38, 'code-meta-reasoning': 0.46, 'program_verifiable': 0.16, 'qwq-reasoning-traces': 1.87, 'openthoughts2': 1.25, 'general_reasoning_mix': 1.87, 'gemini-reasoning-traces': 0.25, 'tulu-3-sft': 1.1, 'dolmino_1-flan': 5.0, 'olmocr_science_pdfs': 5.0, 'stem-heavy-crawl': 5.0, 'common_crawl-high-quality': 22.5}, 'node_local': False, 'batch_size': 8, 'seq_len': 4096, 'n_views': 2, 'seed': 42, 'add_bos': True, 'add_eos': True, 'load_async': True, 'prefetch_size': 1024, 'tokenizer': {'name': 'huggingface', 'path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/'}, 'track_packed_source_mixture': True, 'packed_source_counts': None}, 'optim': {'lr': 7.44e-05, 'weight_decay': 0.1, 'epsilon': 1e-08, 'beta1': 0.9, 'beta2': 0.95, 'clip': 1.0, 'scheduler': 'linear', 'warmup': 0, 'lr_min_ratio': 0.0, 'cycle_length': 1.0, 'cosine_theta': 1.0, 'annealing_step': 1000, 'decay_fraction': 0.1, 'exp_factor': 0.5, 'initial_token_offset': 0, 'global_final_step': None}, 'model': {'dim': 2048, 'n_layers': 16, 'head_dim': 128, 'n_heads': 16, 'n_kv_heads': 16, 'ffn_dim_multiplier': 1.5, 'multiple_of': 256, 'norm_eps': 1e-06, 'rope_theta': 500000.0, 'rope_scaling': None, 'init_base_std': 0.02, 'init_std_factor': 'disabled', 'max_seqlen': 4096, 'seed': 42, 'vocab_size': 100352, 'weight_tying': False, 'sliding_window': None}, 'distributed': {'dp_shard': 1, 'dp_replicate': 32, 'tp_size': 1, 'selective_activation_checkpointing': False, 'compile': True, 'fsdp_type': 'full_shard', 'model_dtype': 'bf16', 'float8_recipe': None, 'float8_filter': 'layers\\.[0-9]+\\.', 'matmul_allow_tf32': False, 'detect_anomaly': False, 'compile_cache_size_limit': 8, 'spawn_method': 'forkserver', 'stem_parallel_size': 8}, 'env': {'MKL_SERVICE_FORCE_INTEL': 'GNU', 'OMP_NUM_THREADS': '1', 'MKL_NUM_THREADS': '1', 'ENABLE_INTRA_NODE_COMM': '1', 'TORCH_NCCL_AVOID_RECORD_STREAMS': '1', 'NCCL_IB_TIMEOUT': '22', 'NCCL_DEBUG': 'INFO', 'TORCH_NCCL_ASYNC_ERROR_HANDLING': '1'}, 'checkpoint': {'dump': {'every': 5000, 'keep': 1}, 'eval': {'every': 100000, 'keep': 1}, 'path': '/home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints', 'init_ckpt_path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/', 'continue_training_from_init': True, 'legacy_init_ckpt_lm_transformer': False, 'merge_lm_optim_seed_ckpt_path': None}, 'profiling': {'run': True, 'trace_folder': 'profiling', 'mem_warmup': 100, 'mem_steps': 2, 'profile_warmup': 102, 'profile_steps': 2}, 'logging': {'freq': 10, 'acc_freq': None, 'wandb': {'job_type': None, 'dir': None, 'project': 'stem', 'entity': None, 'tags': None, 'group': None, 'name': 'olmo2_1B_midfine', 'notes': None, 'config_exclude_keys': None, 'config_include_keys': None, 'anonymous': None, 'mode': None, 'allow_val_change': None, 'resume': None, 'force': None, 'tensorboard': None, 'sync_tensorboard': None, 'monitor_gym': None, 'save_code': None, 'id': None, 'fork_from': None, 'resume_from': None}}, 'async_eval_gpus': None, 'eval': {'generator': {'max_tokens': 16384, 'dtype': 'bf16', 'temperature': 1.0, 'top_p': 0.95}, 'harness': {'tasks': [{'task': 'hellaswag', 'dataset_path': '/data/rsadhukh/eval_data/hellaswag'}, {'task': 'boolq', 'dataset_path': '/data/rsadhukh/eval_data/super_glue'}, {'task': 'piqa', 'dataset_path': '/data/rsadhukh/eval_data/piqa'}, {'task': 'winogrande', 'dataset_path': '/data/rsadhukh/eval_data/winogrande'}, {'task': 'openbookqa', 'dataset_path': '/data/rsadhukh/eval_data/openbookqa'}, {'task': 'arc_easy', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}, {'task': 'arc_challenge', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}], 'confirm_run_unsafe_code': True, 'batch_size': 64}, 'validation': None}, '_wandb': {}}
9
+ 2026-04-29 01:18:02,802 INFO MainThread:1060320 [wandb_init.py:init():896] starting backend
10
+ 2026-04-29 01:18:03,074 INFO MainThread:1060320 [wandb_init.py:init():911] sending inform_init request
11
+ 2026-04-29 01:18:03,083 INFO MainThread:1060320 [wandb_init.py:init():919] backend started and connected
12
+ 2026-04-29 01:18:03,085 INFO MainThread:1060320 [wandb_init.py:init():989] updated telemetry
13
+ 2026-04-29 01:18:03,119 INFO MainThread:1060320 [wandb_init.py:init():1013] communicating run to backend with 90.0 second timeout
14
+ 2026-04-29 01:18:04,059 INFO MainThread:1060320 [wandb_init.py:init():1058] starting run threads in backend
15
+ 2026-04-29 01:18:04,276 INFO MainThread:1060320 [wandb_run.py:_console_start():2542] atexit reg
16
+ 2026-04-29 01:18:04,276 INFO MainThread:1060320 [wandb_run.py:_redirect():2391] redirect: wrap_raw
17
+ 2026-04-29 01:18:04,276 INFO MainThread:1060320 [wandb_run.py:_redirect():2460] Wrapping output streams.
18
+ 2026-04-29 01:18:04,276 INFO MainThread:1060320 [wandb_run.py:_redirect():2483] Redirects installed.
19
+ 2026-04-29 01:18:04,281 INFO MainThread:1060320 [wandb_init.py:init():1098] run started, returning control to user process
wandb/run-20260429_141040-a48q7rq3/files/output.log ADDED
@@ -0,0 +1 @@
 
 
1
+ 0: INFO 26-04-29 14:10:43.068959 - 0:00:40 - Loadi
wandb/run-20260429_141040-a48q7rq3/files/requirements.txt ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DataProperty==1.1.0
2
+ absl-py==2.4.0
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.13.5
5
+ aiosignal==1.4.0
6
+ annotated-doc==0.0.4
7
+ annotated-types==0.7.0
8
+ antlr4-python3-runtime==4.9.3
9
+ anyio==4.13.0
10
+ argon2-cffi==25.1.0
11
+ argon2-cffi-bindings==25.1.0
12
+ arrow==1.4.0
13
+ asttokens==3.0.1
14
+ async-lru==2.3.0
15
+ attrs==26.1.0
16
+ babel==2.18.0
17
+ beautifulsoup4==4.14.3
18
+ bleach==6.3.0
19
+ blessed==1.38.0
20
+ blobfile==3.2.0
21
+ certifi==2026.2.25
22
+ cffi==2.0.0
23
+ chardet==5.2.0
24
+ charset-normalizer==3.4.7
25
+ click==8.3.2
26
+ colorama==0.4.6
27
+ comm==0.2.3
28
+ datasets==4.8.4
29
+ datatrove==0.9.0
30
+ debugpy==1.8.20
31
+ decorator==5.2.1
32
+ defusedxml==0.7.1
33
+ dill==0.4.1
34
+ evaluate==0.4.6
35
+ executing==2.2.1
36
+ fastjsonschema==2.21.2
37
+ filelock==3.28.0
38
+ fqdn==1.5.1
39
+ frozenlist==1.8.0
40
+ fsspec==2026.2.0
41
+ gitdb==4.0.12
42
+ GitPython==3.1.46
43
+ gpustat==1.1.1
44
+ h11==0.16.0
45
+ hf-xet==1.4.3
46
+ httpcore==1.0.9
47
+ httpx==0.28.1
48
+ huggingface_hub==1.11.0
49
+ humanize==4.15.0
50
+ idna==3.11
51
+ ipykernel==7.2.0
52
+ ipython==9.13.0
53
+ ipython_pygments_lexers==1.1.1
54
+ isoduration==20.11.0
55
+ jedi==0.19.2
56
+ Jinja2==3.1.6
57
+ joblib==1.5.3
58
+ json5==0.14.0
59
+ jsonlines==4.0.0
60
+ jsonpointer==3.1.1
61
+ jsonschema==4.26.0
62
+ jsonschema-specifications==2025.9.1
63
+ jupyter_client==8.8.0
64
+ jupyter_core==5.9.1
65
+ jupyter-events==0.12.1
66
+ jupyter-lsp==2.3.1
67
+ jupyter_server==2.17.0
68
+ jupyter_server_terminals==0.5.4
69
+ jupyterlab==4.5.6
70
+ jupyterlab_pygments==0.3.0
71
+ jupyterlab_server==2.28.0
72
+ lark==1.3.1
73
+ lm_eval==0.4.11
74
+ loguru==0.7.3
75
+ lxml==6.1.0
76
+ markdown-it-py==4.0.0
77
+ MarkupSafe==3.0.3
78
+ matplotlib-inline==0.2.1
79
+ mbstrdecoder==1.1.4
80
+ mdurl==0.1.2
81
+ mistune==3.2.0
82
+ more-itertools==11.0.2
83
+ mpmath==1.3.0
84
+ msgspec==0.21.1
85
+ multidict==6.7.1
86
+ multiprocess==0.70.19
87
+ nbclient==0.10.4
88
+ nbconvert==7.17.1
89
+ nbformat==5.10.4
90
+ nest-asyncio==1.6.0
91
+ networkx==3.6.1
92
+ nltk==3.9.4
93
+ notebook_shim==0.2.4
94
+ numpy==2.4.4
95
+ nvidia-cublas-cu12==12.8.4.1
96
+ nvidia-cuda-cupti-cu12==12.8.90
97
+ nvidia-cuda-nvrtc-cu12==12.8.93
98
+ nvidia-cuda-runtime-cu12==12.8.90
99
+ nvidia-cudnn-cu12==9.10.2.21
100
+ nvidia-cufft-cu12==11.3.3.83
101
+ nvidia-cufile-cu12==1.13.1.3
102
+ nvidia-curand-cu12==10.3.9.90
103
+ nvidia-cusolver-cu12==11.7.3.90
104
+ nvidia-cusparse-cu12==12.5.8.93
105
+ nvidia-cusparselt-cu12==0.7.1
106
+ nvidia-ml-py==13.595.45
107
+ nvidia-nccl-cu12==2.27.3
108
+ nvidia-nvjitlink-cu12==12.8.93
109
+ nvidia-nvtx-cu12==12.8.90
110
+ objprint==0.3.0
111
+ omegaconf==2.3.0
112
+ orjson==3.11.8
113
+ overrides==7.7.0
114
+ packaging==26.1
115
+ pandas==3.0.2
116
+ pandocfilters==1.5.1
117
+ parso==0.8.6
118
+ pathvalidate==3.3.1
119
+ pexpect==4.9.0
120
+ pip==26.0.1
121
+ platformdirs==4.9.6
122
+ portalocker==3.2.0
123
+ prometheus_client==0.25.0
124
+ prompt_toolkit==3.0.52
125
+ propcache==0.4.1
126
+ protobuf==7.34.1
127
+ psutil==7.2.2
128
+ ptyprocess==0.7.0
129
+ pure_eval==0.2.3
130
+ pyarrow==23.0.1
131
+ pycparser==3.0
132
+ pycryptodomex==3.23.0
133
+ pydantic==2.13.2
134
+ pydantic_core==2.46.2
135
+ Pygments==2.20.0
136
+ pynvml==13.0.1
137
+ pytablewriter==1.2.1
138
+ python-dateutil==2.9.0.post0
139
+ python-json-logger==4.1.0
140
+ pytz==2026.1.post1
141
+ PyYAML==6.0.3
142
+ pyzmq==27.1.0
143
+ referencing==0.37.0
144
+ regex==2026.4.4
145
+ requests==2.33.1
146
+ rfc3339-validator==0.1.4
147
+ rfc3986-validator==0.1.1
148
+ rfc3987-syntax==1.1.0
149
+ rich==15.0.0
150
+ rouge_score==0.1.2
151
+ rpds-py==0.30.0
152
+ sacrebleu==2.6.0
153
+ safetensors==0.7.0
154
+ scikit-learn==1.8.0
155
+ scipy==1.17.1
156
+ Send2Trash==2.1.0
157
+ sentencepiece==0.2.1
158
+ sentry-sdk==2.58.0
159
+ setuptools==65.5.0
160
+ shellingham==1.5.4
161
+ six==1.17.0
162
+ smmap==5.0.3
163
+ soupsieve==2.8.3
164
+ sqlitedict==2.1.0
165
+ stack-data==0.6.3
166
+ sympy==1.14.0
167
+ tabledata==1.3.4
168
+ tabulate==0.10.0
169
+ tcolorpy==0.1.7
170
+ terminado==0.18.1
171
+ threadpoolctl==3.6.0
172
+ tiktoken==0.12.0
173
+ tinycss2==1.4.0
174
+ tokenizers==0.22.2
175
+ torch==2.8.0
176
+ tornado==6.5.5
177
+ tqdm==4.67.3
178
+ traitlets==5.14.3
179
+ transformers==5.1.0
180
+ triton==3.4.0
181
+ typepy==1.3.4
182
+ typer==0.24.1
183
+ typer-slim==0.24.0
184
+ typing_extensions==4.15.0
185
+ typing-inspection==0.4.2
186
+ tzdata==2026.2
187
+ uri-template==1.3.0
188
+ urllib3==2.6.3
189
+ viztracer==1.1.1
190
+ wandb==0.26.0
191
+ wcwidth==0.6.0
192
+ webcolors==25.10.0
193
+ webencodings==0.5.1
194
+ websocket-client==1.9.0
195
+ word2number==1.1
196
+ xformers==0.0.32.post1
197
+ xxhash==3.6.0
198
+ yarl==1.23.0
199
+ zstandard==0.25.0
wandb/run-20260429_141040-a48q7rq3/files/wandb-metadata.json ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-131-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.9",
4
+ "startedAt": "2026-04-29T14:10:40.707156Z",
5
+ "args": [
6
+ "config=apps/main/configs/olmo2_1B_midfine.yaml",
7
+ "dump_dir=/home/xun/rsadhukh/STEM/logs/midfine_base_final",
8
+ "checkpoint.init_ckpt_path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/",
9
+ "checkpoint.continue_training_from_init=true",
10
+ "checkpoint.dump.every=5000",
11
+ "checkpoint.eval.every=100000",
12
+ "checkpoint.dump.keep=1",
13
+ "data.root_dir=/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/",
14
+ "data.node_local=false",
15
+ "data.tokenizer.path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/",
16
+ "logging.wandb.name=midfine_base_final"
17
+ ],
18
+ "program": "-m apps.main.train",
19
+ "git": {
20
+ "remote": "https://github.com/Infini-AI-Lab/STEM.git",
21
+ "commit": "7e450007299a777d774d6e2b598001cc7552c1b4"
22
+ },
23
+ "email": "rsadhukh@andrew.cmu.edu",
24
+ "root": "/home/xun/rsadhukh/STEM/logs/midfine_base_final",
25
+ "host": "compute-node-14",
26
+ "executable": "/home/xun/rsadhukh/STEM/stem/bin/python",
27
+ "cpu_count": 88,
28
+ "cpu_count_logical": 176,
29
+ "gpu": "NVIDIA H200",
30
+ "gpu_count": 8,
31
+ "disk": {
32
+ "/": {
33
+ "total": "133003395072",
34
+ "used": "83511459840"
35
+ }
36
+ },
37
+ "memory": {
38
+ "total": "2071474651136"
39
+ },
40
+ "gpu_nvidia": [
41
+ {
42
+ "name": "NVIDIA H200",
43
+ "memoryTotal": "150754820096",
44
+ "cudaCores": 16896,
45
+ "architecture": "Hopper",
46
+ "uuid": "GPU-f9dd2b05-45b2-e3c3-43c1-419969cf660f"
47
+ },
48
+ {
49
+ "name": "NVIDIA H200",
50
+ "memoryTotal": "150754820096",
51
+ "cudaCores": 16896,
52
+ "architecture": "Hopper",
53
+ "uuid": "GPU-a79c6c73-bf1a-8760-8bed-c89a9a1ff315"
54
+ },
55
+ {
56
+ "name": "NVIDIA H200",
57
+ "memoryTotal": "150754820096",
58
+ "cudaCores": 16896,
59
+ "architecture": "Hopper",
60
+ "uuid": "GPU-84d10e29-34bf-5102-3d30-eb56af3a556d"
61
+ },
62
+ {
63
+ "name": "NVIDIA H200",
64
+ "memoryTotal": "150754820096",
65
+ "cudaCores": 16896,
66
+ "architecture": "Hopper",
67
+ "uuid": "GPU-0a7948c2-0a62-1c09-524b-179e3de36a59"
68
+ },
69
+ {
70
+ "name": "NVIDIA H200",
71
+ "memoryTotal": "150754820096",
72
+ "cudaCores": 16896,
73
+ "architecture": "Hopper",
74
+ "uuid": "GPU-d4201f0e-1b44-4327-9747-b62cca7ab4bf"
75
+ },
76
+ {
77
+ "name": "NVIDIA H200",
78
+ "memoryTotal": "150754820096",
79
+ "cudaCores": 16896,
80
+ "architecture": "Hopper",
81
+ "uuid": "GPU-8fa707f1-9cd9-9384-c9c9-3356b1ad04ec"
82
+ },
83
+ {
84
+ "name": "NVIDIA H200",
85
+ "memoryTotal": "150754820096",
86
+ "cudaCores": 16896,
87
+ "architecture": "Hopper",
88
+ "uuid": "GPU-b096838e-c00a-4819-b204-35fab82f7d94"
89
+ },
90
+ {
91
+ "name": "NVIDIA H200",
92
+ "memoryTotal": "150754820096",
93
+ "cudaCores": 16896,
94
+ "architecture": "Hopper",
95
+ "uuid": "GPU-cbcadfdd-c45c-2cf6-4408-00f7bce853af"
96
+ }
97
+ ],
98
+ "cudaVersion": "13.0",
99
+ "slurm": {
100
+ "cluster_name": "cluster",
101
+ "conf": "/var/spool/slurmd/conf-cache/slurm.conf",
102
+ "cpu_bind": "quiet,mask_cpu:0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF",
103
+ "cpu_bind_list": "0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF",
104
+ "cpu_bind_type": "mask_cpu:",
105
+ "cpu_bind_verbose": "quiet",
106
+ "cpus_on_node": "128",
107
+ "cpus_per_task": "128",
108
+ "distribution": "cyclic",
109
+ "gpus_on_node": "8",
110
+ "gtids": "0",
111
+ "job_cpus_per_node": "128(x4)",
112
+ "job_end_time": "1777644571",
113
+ "job_gid": "1005",
114
+ "job_gpus": "0,1,2,3,4,5,6,7",
115
+ "job_id": "29527",
116
+ "job_name": "stem",
117
+ "job_nodelist": "compute-node-[14,0,43-44]",
118
+ "job_num_nodes": "4",
119
+ "job_partition": "high",
120
+ "job_start_time": "1777471771",
121
+ "job_uid": "1005",
122
+ "job_user": "xun",
123
+ "jobid": "29527",
124
+ "launch_node_ipaddr": "172.27.49.7",
125
+ "localid": "0",
126
+ "nnodes": "4",
127
+ "nodeid": "0",
128
+ "nodelist": "compute-node-[14,0,43-44]",
129
+ "nprocs": "4",
130
+ "ntasks": "4",
131
+ "ntasks_per_node": "1",
132
+ "output_mode": "standard",
133
+ "prio_process": "0",
134
+ "procid": "0",
135
+ "srun_comm_host": "172.27.49.7",
136
+ "srun_comm_port": "46439",
137
+ "step_gpus": "0,1,2,3,4,5,6,7",
138
+ "step_id": "0",
139
+ "step_launcher_port": "46439",
140
+ "step_nodelist": "compute-node-[14,0,43-44]",
141
+ "step_num_nodes": "4",
142
+ "step_num_tasks": "4",
143
+ "step_tasks_per_node": "1(x4)",
144
+ "stepid": "0",
145
+ "submit_dir": "/home/xun/rsadhukh/STEM",
146
+ "submit_host": "login-node-0",
147
+ "task_pid": "1153992",
148
+ "tasks_per_node": "1(x4)",
149
+ "topology_addr": "compute-node-14",
150
+ "topology_addr_pattern": "node",
151
+ "tres_per_task": "cpu:128",
152
+ "umask": "0000"
153
+ },
154
+ "writerId": "9u2zm8go5a3uc5rtfm8a0pkw2jgq3aas"
155
+ }
wandb/run-20260429_141040-a48q7rq3/logs/debug-core.log ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-29T14:10:40.817255983Z","level":"INFO","msg":"main: starting server","port-filename":"/scratch/local/xun/tmp/tmpowh5z38_/port-1154327.txt","pid":1154327,"detached":false,"idle-timeout":600000000000,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-29T14:10:40.817691432Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/scratch/local/xun/tmp/wandb-1154327-1155293-1046663400/socket","Net":"unix"}}
3
+ {"time":"2026-04-29T14:10:40.817799249Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1154327}
4
+ {"time":"2026-04-29T14:10:40.993605114Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-29T14:10:41.005163272Z","level":"INFO","msg":"handleInformInit: received","streamId":"a48q7rq3","id":"1(@)"}
6
+ {"time":"2026-04-29T14:10:41.572316485Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"a48q7rq3","id":"1(@)"}
7
+ {"time":"2026-04-29T14:10:46.336160157Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
8
+ {"time":"2026-04-29T14:10:46.336722922Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
9
+ {"time":"2026-04-29T14:10:46.336728521Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
10
+ {"time":"2026-04-29T14:10:47.338902098Z","level":"INFO","msg":"server: parent process exited, terminating service process"}
wandb/run-20260429_141040-a48q7rq3/logs/debug-internal.log ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-29T14:10:41.006690649Z","level":"INFO","msg":"wandb-core"}
2
+ {"time":"2026-04-29T14:10:41.007471115Z","level":"INFO","msg":"stream: starting","core version":"0.26.0"}
3
+ {"time":"2026-04-29T14:10:41.569129518Z","level":"INFO","msg":"stream: created new stream","id":"a48q7rq3"}
4
+ {"time":"2026-04-29T14:10:41.569186597Z","level":"INFO","msg":"handler: started"}
5
+ {"time":"2026-04-29T14:10:41.572303621Z","level":"INFO","msg":"stream: started"}
6
+ {"time":"2026-04-29T14:10:41.572316534Z","level":"INFO","msg":"writer: started","stream_id":"a48q7rq3"}
7
+ {"time":"2026-04-29T14:10:41.572338006Z","level":"INFO","msg":"sender: started"}
8
+ {"time":"2026-04-29T14:10:43.069674591Z","level":"INFO","msg":"filestream: sending request","total_files":1,"console_offset":0,"console_lines":1}
9
+ {"time":"2026-04-29T14:10:46.584197837Z","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
wandb/run-20260429_141040-a48q7rq3/logs/debug.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-04-29 14:10:40,735 INFO MainThread:1154327 [wandb_setup.py:_flush():81] Current SDK version is 0.26.0
2
+ 2026-04-29 14:10:40,735 INFO MainThread:1154327 [wandb_setup.py:_flush():81] Configure stats pid to 1154327
3
+ 2026-04-29 14:10:40,735 INFO MainThread:1154327 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-04-29 14:10:40,735 INFO MainThread:1154327 [wandb_init.py:setup_run_log_directory():721] Logging user logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_141040-a48q7rq3/logs/debug.log
5
+ 2026-04-29 14:10:40,736 INFO MainThread:1154327 [wandb_init.py:setup_run_log_directory():722] Logging internal logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_141040-a48q7rq3/logs/debug-internal.log
6
+ 2026-04-29 14:10:40,736 INFO MainThread:1154327 [wandb_init.py:init():848] calling init triggers
7
+ 2026-04-29 14:10:40,736 INFO MainThread:1154327 [wandb_init.py:init():853] wandb.init called with sweep_config: {}
8
+ config: {'dump_dir': '/home/xun/rsadhukh/STEM/logs/midfine_base_final', 'seed': 777, 'model_type': 'olmo3', 'stem_up_proj_layers': [], 'grad_acc_steps': 2, 'gc_collect_freq': 1000, 'probe_freq': 100, 'steps': 50000, 'stage_steps': None, 'data': {'root_dir': '/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/', 'sources': {'cranecode': 10.0, 'stack_edu': 10.0, 'cranemath': 5.63, 'dolmino-math': 10.7, 'megamatt': 1.73, 'tinymath-mind': 0.9, 'tinymath-pot': 0.24, 'reddit_to_flashcards': 5.9, 'wiki_to_rcqa': 3.0, 'nemotron-synth-qa': 5.0, 'math-meta-reasoning': 0.38, 'code-meta-reasoning': 0.46, 'program_verifiable': 0.16, 'qwq-reasoning-traces': 1.87, 'openthoughts2': 1.25, 'general_reasoning_mix': 1.87, 'gemini-reasoning-traces': 0.25, 'tulu-3-sft': 1.1, 'dolmino_1-flan': 5.0, 'olmocr_science_pdfs': 5.0, 'stem-heavy-crawl': 5.0, 'common_crawl-high-quality': 22.5}, 'node_local': False, 'batch_size': 8, 'seq_len': 4096, 'n_views': 2, 'seed': 42, 'add_bos': True, 'add_eos': True, 'load_async': True, 'prefetch_size': 1024, 'tokenizer': {'name': 'huggingface', 'path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/'}, 'track_packed_source_mixture': True, 'packed_source_counts': None}, 'optim': {'lr': 7.44e-05, 'weight_decay': 0.1, 'epsilon': 1e-08, 'beta1': 0.9, 'beta2': 0.95, 'clip': 1.0, 'scheduler': 'linear', 'warmup': 0, 'lr_min_ratio': 0.0, 'cycle_length': 1.0, 'cosine_theta': 1.0, 'annealing_step': 1000, 'decay_fraction': 0.1, 'exp_factor': 0.5, 'initial_token_offset': 0, 'global_final_step': None}, 'model': {'dim': 2048, 'n_layers': 16, 'head_dim': 128, 'n_heads': 16, 'n_kv_heads': 16, 'ffn_dim_multiplier': 1.5, 'multiple_of': 256, 'norm_eps': 1e-06, 'rope_theta': 500000.0, 'rope_scaling': None, 'init_base_std': 0.02, 'init_std_factor': 'disabled', 'max_seqlen': 4096, 'seed': 42, 'vocab_size': 100352, 'weight_tying': False, 'sliding_window': None}, 'distributed': {'dp_shard': 1, 'dp_replicate': 32, 'tp_size': 1, 'selective_activation_checkpointing': False, 'compile': True, 'fsdp_type': 'full_shard', 'model_dtype': 'bf16', 'float8_recipe': None, 'float8_filter': 'layers\\.[0-9]+\\.', 'matmul_allow_tf32': False, 'detect_anomaly': False, 'compile_cache_size_limit': 8, 'spawn_method': 'forkserver', 'stem_parallel_size': 8}, 'env': {'MKL_SERVICE_FORCE_INTEL': 'GNU', 'OMP_NUM_THREADS': '1', 'MKL_NUM_THREADS': '1', 'ENABLE_INTRA_NODE_COMM': '1', 'TORCH_NCCL_AVOID_RECORD_STREAMS': '1', 'NCCL_IB_TIMEOUT': '22', 'NCCL_DEBUG': 'INFO', 'TORCH_NCCL_ASYNC_ERROR_HANDLING': '1'}, 'checkpoint': {'dump': {'every': 5000, 'keep': 1}, 'eval': {'every': 100000, 'keep': 1}, 'path': '/home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints', 'init_ckpt_path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/', 'continue_training_from_init': True, 'legacy_init_ckpt_lm_transformer': False, 'merge_lm_optim_seed_ckpt_path': None}, 'profiling': {'run': True, 'trace_folder': 'profiling', 'mem_warmup': 100, 'mem_steps': 2, 'profile_warmup': 102, 'profile_steps': 2}, 'logging': {'freq': 10, 'acc_freq': None, 'wandb': {'job_type': None, 'dir': None, 'project': 'stem', 'entity': None, 'tags': None, 'group': None, 'name': 'olmo2_1B_midfine', 'notes': None, 'config_exclude_keys': None, 'config_include_keys': None, 'anonymous': None, 'mode': None, 'allow_val_change': None, 'resume': None, 'force': None, 'tensorboard': None, 'sync_tensorboard': None, 'monitor_gym': None, 'save_code': None, 'id': None, 'fork_from': None, 'resume_from': None}}, 'async_eval_gpus': None, 'eval': {'generator': {'max_tokens': 16384, 'dtype': 'bf16', 'temperature': 1.0, 'top_p': 0.95}, 'harness': {'tasks': [{'task': 'hellaswag', 'dataset_path': '/data/rsadhukh/eval_data/hellaswag'}, {'task': 'boolq', 'dataset_path': '/data/rsadhukh/eval_data/super_glue'}, {'task': 'piqa', 'dataset_path': '/data/rsadhukh/eval_data/piqa'}, {'task': 'winogrande', 'dataset_path': '/data/rsadhukh/eval_data/winogrande'}, {'task': 'openbookqa', 'dataset_path': '/data/rsadhukh/eval_data/openbookqa'}, {'task': 'arc_easy', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}, {'task': 'arc_challenge', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}], 'confirm_run_unsafe_code': True, 'batch_size': 64}, 'validation': None}, '_wandb': {}}
9
+ 2026-04-29 14:10:40,736 INFO MainThread:1154327 [wandb_init.py:init():896] starting backend
10
+ 2026-04-29 14:10:40,993 INFO MainThread:1154327 [wandb_init.py:init():911] sending inform_init request
11
+ 2026-04-29 14:10:41,002 INFO MainThread:1154327 [wandb_init.py:init():919] backend started and connected
12
+ 2026-04-29 14:10:41,003 INFO MainThread:1154327 [wandb_init.py:init():989] updated telemetry
13
+ 2026-04-29 14:10:41,025 INFO MainThread:1154327 [wandb_init.py:init():1013] communicating run to backend with 90.0 second timeout
14
+ 2026-04-29 14:10:42,750 INFO MainThread:1154327 [wandb_init.py:init():1058] starting run threads in backend
15
+ 2026-04-29 14:10:43,064 INFO MainThread:1154327 [wandb_run.py:_console_start():2542] atexit reg
16
+ 2026-04-29 14:10:43,064 INFO MainThread:1154327 [wandb_run.py:_redirect():2391] redirect: wrap_raw
17
+ 2026-04-29 14:10:43,064 INFO MainThread:1154327 [wandb_run.py:_redirect():2460] Wrapping output streams.
18
+ 2026-04-29 14:10:43,064 INFO MainThread:1154327 [wandb_run.py:_redirect():2483] Redirects installed.
19
+ 2026-04-29 14:10:43,068 INFO MainThread:1154327 [wandb_init.py:init():1098] run started, returning control to user process
wandb/run-20260429_141040-a48q7rq3/run-a48q7rq3.wandb ADDED
Binary file (7 Bytes). View file
 
wandb/run-20260429_153552-r20yn80u/files/config.yaml ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.26.0
4
+ e:
5
+ i4ocjyr9csg8kju0tej1pg06av2k8k96:
6
+ args:
7
+ - config=apps/main/configs/olmo2_1B_midfine.yaml
8
+ - dump_dir=/home/xun/rsadhukh/STEM/logs/midfine_base_final
9
+ - checkpoint.init_ckpt_path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/
10
+ - checkpoint.continue_training_from_init=true
11
+ - checkpoint.dump.every=5000
12
+ - checkpoint.eval.every=100000
13
+ - checkpoint.dump.keep=1
14
+ - data.root_dir=/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/
15
+ - data.node_local=false
16
+ - data.tokenizer.path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/
17
+ - logging.wandb.name=midfine_base_final
18
+ cpu_count: 88
19
+ cpu_count_logical: 176
20
+ cudaVersion: "13.0"
21
+ disk:
22
+ /:
23
+ total: "133003395072"
24
+ used: "103744323584"
25
+ email: rsadhukh@andrew.cmu.edu
26
+ executable: /home/xun/rsadhukh/STEM/stem/bin/python
27
+ git:
28
+ commit: 7e450007299a777d774d6e2b598001cc7552c1b4
29
+ remote: https://github.com/Infini-AI-Lab/STEM.git
30
+ gpu: NVIDIA H200
31
+ gpu_count: 8
32
+ gpu_nvidia:
33
+ - architecture: Hopper
34
+ cudaCores: 16896
35
+ memoryTotal: "150754820096"
36
+ name: NVIDIA H200
37
+ uuid: GPU-dbeb9076-fd61-4013-987f-938d1db8b786
38
+ - architecture: Hopper
39
+ cudaCores: 16896
40
+ memoryTotal: "150754820096"
41
+ name: NVIDIA H200
42
+ uuid: GPU-5b9b54c7-efcf-6e85-08ed-f6e6f61cfa7a
43
+ - architecture: Hopper
44
+ cudaCores: 16896
45
+ memoryTotal: "150754820096"
46
+ name: NVIDIA H200
47
+ uuid: GPU-df8b695a-d295-cf7c-ab3b-b6d764b11fdf
48
+ - architecture: Hopper
49
+ cudaCores: 16896
50
+ memoryTotal: "150754820096"
51
+ name: NVIDIA H200
52
+ uuid: GPU-c7480abd-eae7-8916-7803-4e033e94aaa0
53
+ - architecture: Hopper
54
+ cudaCores: 16896
55
+ memoryTotal: "150754820096"
56
+ name: NVIDIA H200
57
+ uuid: GPU-91d17507-e0ee-813d-211c-6dbbe87e7f52
58
+ - architecture: Hopper
59
+ cudaCores: 16896
60
+ memoryTotal: "150754820096"
61
+ name: NVIDIA H200
62
+ uuid: GPU-fcbf89aa-71d3-6603-a80d-9bfbd3f063a2
63
+ - architecture: Hopper
64
+ cudaCores: 16896
65
+ memoryTotal: "150754820096"
66
+ name: NVIDIA H200
67
+ uuid: GPU-10b440b6-fedb-33fe-cad7-b1b1dfd65816
68
+ - architecture: Hopper
69
+ cudaCores: 16896
70
+ memoryTotal: "150754820096"
71
+ name: NVIDIA H200
72
+ uuid: GPU-7da58531-cbc7-dedf-3a29-6540eaf04fe7
73
+ host: compute-node-3
74
+ memory:
75
+ total: "2071474647040"
76
+ os: Linux-5.15.0-131-generic-x86_64-with-glibc2.35
77
+ program: -m apps.main.train
78
+ python: CPython 3.11.9
79
+ root: /home/xun/rsadhukh/STEM/logs/midfine_base_final
80
+ slurm:
81
+ cluster_name: cluster
82
+ conf: /var/spool/slurmd/conf-cache/slurm.conf
83
+ cpu_bind: quiet,mask_cpu:0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF
84
+ cpu_bind_list: 0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF
85
+ cpu_bind_type: 'mask_cpu:'
86
+ cpu_bind_verbose: quiet
87
+ cpus_on_node: "128"
88
+ cpus_per_task: "128"
89
+ distribution: cyclic
90
+ gpus_on_node: "8"
91
+ gtids: "0"
92
+ job_cpus_per_node: 128(x4)
93
+ job_end_time: "1777649690"
94
+ job_gid: "1005"
95
+ job_gpus: 0,1,2,3,4,5,6,7
96
+ job_id: "29546"
97
+ job_name: stem
98
+ job_nodelist: compute-node-[3,7,46-47]
99
+ job_num_nodes: "4"
100
+ job_partition: high
101
+ job_start_time: "1777476890"
102
+ job_uid: "1005"
103
+ job_user: xun
104
+ jobid: "29546"
105
+ launch_node_ipaddr: 172.27.61.166
106
+ localid: "0"
107
+ nnodes: "4"
108
+ nodeid: "0"
109
+ nodelist: compute-node-[3,7,46-47]
110
+ nprocs: "4"
111
+ ntasks: "4"
112
+ ntasks_per_node: "1"
113
+ output_mode: standard
114
+ prio_process: "0"
115
+ procid: "0"
116
+ srun_comm_host: 172.27.61.166
117
+ srun_comm_port: "33673"
118
+ step_gpus: 0,1,2,3,4,5,6,7
119
+ step_id: "0"
120
+ step_launcher_port: "33673"
121
+ step_nodelist: compute-node-[3,7,46-47]
122
+ step_num_nodes: "4"
123
+ step_num_tasks: "4"
124
+ step_tasks_per_node: 1(x4)
125
+ stepid: "0"
126
+ submit_dir: /home/xun/rsadhukh/STEM
127
+ submit_host: login-node-0
128
+ task_pid: "469971"
129
+ tasks_per_node: 1(x4)
130
+ topology_addr: compute-node-3
131
+ topology_addr_pattern: node
132
+ tres_per_task: cpu:128
133
+ umask: "0000"
134
+ startedAt: "2026-04-29T15:35:52.106818Z"
135
+ writerId: i4ocjyr9csg8kju0tej1pg06av2k8k96
136
+ m: []
137
+ python_version: 3.11.9
138
+ t:
139
+ "1":
140
+ - 1
141
+ - 5
142
+ - 11
143
+ - 49
144
+ - 53
145
+ "2":
146
+ - 1
147
+ - 5
148
+ - 11
149
+ - 49
150
+ - 51
151
+ - 53
152
+ - 100
153
+ - 105
154
+ "3":
155
+ - 13
156
+ - 16
157
+ - 61
158
+ "4": 3.11.9
159
+ "5": 0.26.0
160
+ "6": 5.1.0
161
+ "12": 0.26.0
162
+ "13": linux-x86_64
163
+ async_eval_gpus:
164
+ value: null
165
+ checkpoint:
166
+ value:
167
+ continue_training_from_init: true
168
+ dump:
169
+ every: 5000
170
+ keep: 1
171
+ eval:
172
+ every: 100000
173
+ keep: 1
174
+ init_ckpt_path: /data/rsadhukh/checkpoints/olmo2-1b-base-token4T/
175
+ legacy_init_ckpt_lm_transformer: false
176
+ merge_lm_optim_seed_ckpt_path: null
177
+ path: /home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints
178
+ data:
179
+ value:
180
+ add_bos: true
181
+ add_eos: true
182
+ batch_size: 8
183
+ load_async: true
184
+ n_views: 2
185
+ node_local: false
186
+ packed_source_counts: null
187
+ prefetch_size: 1024
188
+ root_dir: /home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/
189
+ seed: 42
190
+ seq_len: 4096
191
+ sources:
192
+ code-meta-reasoning: 0.46
193
+ common_crawl-high-quality: 22.5
194
+ cranecode: 10
195
+ cranemath: 5.63
196
+ dolmino-math: 10.7
197
+ dolmino_1-flan: 5
198
+ gemini-reasoning-traces: 0.25
199
+ general_reasoning_mix: 1.87
200
+ math-meta-reasoning: 0.38
201
+ megamatt: 1.73
202
+ nemotron-synth-qa: 5
203
+ olmocr_science_pdfs: 5
204
+ openthoughts2: 1.25
205
+ program_verifiable: 0.16
206
+ qwq-reasoning-traces: 1.87
207
+ reddit_to_flashcards: 5.9
208
+ stack_edu: 10
209
+ stem-heavy-crawl: 5
210
+ tinymath-mind: 0.9
211
+ tinymath-pot: 0.24
212
+ tulu-3-sft: 1.1
213
+ wiki_to_rcqa: 3
214
+ tokenizer:
215
+ name: huggingface
216
+ path: /data/rsadhukh/checkpoints/olmo2-1b-base-token4T/
217
+ track_packed_source_mixture: true
218
+ distributed:
219
+ value:
220
+ compile: true
221
+ compile_cache_size_limit: 8
222
+ detect_anomaly: false
223
+ dp_replicate: 32
224
+ dp_shard: 1
225
+ float8_filter: layers\.[0-9]+\.
226
+ float8_recipe: null
227
+ fsdp_type: full_shard
228
+ matmul_allow_tf32: false
229
+ model_dtype: bf16
230
+ selective_activation_checkpointing: false
231
+ spawn_method: forkserver
232
+ stem_parallel_size: 8
233
+ tp_size: 1
234
+ dump_dir:
235
+ value: /home/xun/rsadhukh/STEM/logs/midfine_base_final
236
+ env:
237
+ value:
238
+ ENABLE_INTRA_NODE_COMM: "1"
239
+ MKL_NUM_THREADS: "1"
240
+ MKL_SERVICE_FORCE_INTEL: GNU
241
+ NCCL_DEBUG: INFO
242
+ NCCL_IB_TIMEOUT: "22"
243
+ OMP_NUM_THREADS: "1"
244
+ TORCH_NCCL_ASYNC_ERROR_HANDLING: "1"
245
+ TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
246
+ eval:
247
+ value:
248
+ generator:
249
+ dtype: bf16
250
+ max_tokens: 16384
251
+ temperature: 1
252
+ top_p: 0.95
253
+ harness:
254
+ batch_size: 64
255
+ confirm_run_unsafe_code: true
256
+ tasks:
257
+ - dataset_path: /data/rsadhukh/eval_data/hellaswag
258
+ task: hellaswag
259
+ - dataset_path: /data/rsadhukh/eval_data/super_glue
260
+ task: boolq
261
+ - dataset_path: /data/rsadhukh/eval_data/piqa
262
+ task: piqa
263
+ - dataset_path: /data/rsadhukh/eval_data/winogrande
264
+ task: winogrande
265
+ - dataset_path: /data/rsadhukh/eval_data/openbookqa
266
+ task: openbookqa
267
+ - dataset_path: /data/rsadhukh/eval_data/ai2_arc
268
+ task: arc_easy
269
+ - dataset_path: /data/rsadhukh/eval_data/ai2_arc
270
+ task: arc_challenge
271
+ validation: null
272
+ gc_collect_freq:
273
+ value: 1000
274
+ grad_acc_steps:
275
+ value: 2
276
+ logging:
277
+ value:
278
+ acc_freq: null
279
+ freq: 10
280
+ wandb:
281
+ allow_val_change: null
282
+ anonymous: null
283
+ config_exclude_keys: null
284
+ config_include_keys: null
285
+ dir: null
286
+ entity: null
287
+ force: null
288
+ fork_from: null
289
+ group: null
290
+ id: null
291
+ job_type: null
292
+ mode: null
293
+ monitor_gym: null
294
+ name: olmo2_1B_midfine
295
+ notes: null
296
+ project: stem
297
+ resume: null
298
+ resume_from: null
299
+ save_code: null
300
+ sync_tensorboard: null
301
+ tags: null
302
+ tensorboard: null
303
+ model:
304
+ value:
305
+ dim: 2048
306
+ ffn_dim_multiplier: 1.5
307
+ head_dim: 128
308
+ init_base_std: 0.02
309
+ init_std_factor: disabled
310
+ max_seqlen: 4096
311
+ multiple_of: 256
312
+ n_heads: 16
313
+ n_kv_heads: 16
314
+ n_layers: 16
315
+ norm_eps: 1e-06
316
+ rope_scaling: null
317
+ rope_theta: 500000
318
+ seed: 42
319
+ sliding_window: null
320
+ vocab_size: 100352
321
+ weight_tying: false
322
+ model_type:
323
+ value: olmo3
324
+ optim:
325
+ value:
326
+ annealing_step: 1000
327
+ beta1: 0.9
328
+ beta2: 0.95
329
+ clip: 1
330
+ cosine_theta: 1
331
+ cycle_length: 1
332
+ decay_fraction: 0.1
333
+ epsilon: 1e-08
334
+ exp_factor: 0.5
335
+ global_final_step: null
336
+ initial_token_offset: 0
337
+ lr: 7.44e-05
338
+ lr_min_ratio: 0
339
+ scheduler: linear
340
+ warmup: 0
341
+ weight_decay: 0.1
342
+ probe_freq:
343
+ value: 100
344
+ profiling:
345
+ value:
346
+ mem_steps: 2
347
+ mem_warmup: 100
348
+ profile_steps: 2
349
+ profile_warmup: 102
350
+ run: true
351
+ trace_folder: profiling
352
+ seed:
353
+ value: 777
354
+ stage_steps:
355
+ value: null
356
+ stem_up_proj_layers:
357
+ value: []
358
+ steps:
359
+ value: 50000
wandb/run-20260429_153552-r20yn80u/files/media/html/memory_trace_15050_79effaa90bfee7eb3207.html ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20260429_153552-r20yn80u/files/media/html/profile_trace_15051_ae282608c6eeb7f48826.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <base target="_blank"><link rel="stylesheet" type="text/css" href="https://app.wandb.ai/normalize.css" />/home/xun/rsadhukh/STEM/logs/midfine_base_final/profiling/profile_CPU_CUDA_000104/rank00_compute-node-14_1060320.1777425671449116530.pt.trace.html.gz
wandb/run-20260429_153552-r20yn80u/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20260429_153552-r20yn80u/files/requirements.txt ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DataProperty==1.1.0
2
+ absl-py==2.4.0
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.13.5
5
+ aiosignal==1.4.0
6
+ annotated-doc==0.0.4
7
+ annotated-types==0.7.0
8
+ antlr4-python3-runtime==4.9.3
9
+ anyio==4.13.0
10
+ argon2-cffi==25.1.0
11
+ argon2-cffi-bindings==25.1.0
12
+ arrow==1.4.0
13
+ asttokens==3.0.1
14
+ async-lru==2.3.0
15
+ attrs==26.1.0
16
+ babel==2.18.0
17
+ beautifulsoup4==4.14.3
18
+ bleach==6.3.0
19
+ blessed==1.38.0
20
+ blobfile==3.2.0
21
+ certifi==2026.2.25
22
+ cffi==2.0.0
23
+ chardet==5.2.0
24
+ charset-normalizer==3.4.7
25
+ click==8.3.2
26
+ colorama==0.4.6
27
+ comm==0.2.3
28
+ datasets==4.8.4
29
+ datatrove==0.9.0
30
+ debugpy==1.8.20
31
+ decorator==5.2.1
32
+ defusedxml==0.7.1
33
+ dill==0.4.1
34
+ evaluate==0.4.6
35
+ executing==2.2.1
36
+ fastjsonschema==2.21.2
37
+ filelock==3.28.0
38
+ fqdn==1.5.1
39
+ frozenlist==1.8.0
40
+ fsspec==2026.2.0
41
+ gitdb==4.0.12
42
+ GitPython==3.1.46
43
+ gpustat==1.1.1
44
+ h11==0.16.0
45
+ hf-xet==1.4.3
46
+ httpcore==1.0.9
47
+ httpx==0.28.1
48
+ huggingface_hub==1.11.0
49
+ humanize==4.15.0
50
+ idna==3.11
51
+ ipykernel==7.2.0
52
+ ipython==9.13.0
53
+ ipython_pygments_lexers==1.1.1
54
+ isoduration==20.11.0
55
+ jedi==0.19.2
56
+ Jinja2==3.1.6
57
+ joblib==1.5.3
58
+ json5==0.14.0
59
+ jsonlines==4.0.0
60
+ jsonpointer==3.1.1
61
+ jsonschema==4.26.0
62
+ jsonschema-specifications==2025.9.1
63
+ jupyter_client==8.8.0
64
+ jupyter_core==5.9.1
65
+ jupyter-events==0.12.1
66
+ jupyter-lsp==2.3.1
67
+ jupyter_server==2.17.0
68
+ jupyter_server_terminals==0.5.4
69
+ jupyterlab==4.5.6
70
+ jupyterlab_pygments==0.3.0
71
+ jupyterlab_server==2.28.0
72
+ lark==1.3.1
73
+ lm_eval==0.4.11
74
+ loguru==0.7.3
75
+ lxml==6.1.0
76
+ markdown-it-py==4.0.0
77
+ MarkupSafe==3.0.3
78
+ matplotlib-inline==0.2.1
79
+ mbstrdecoder==1.1.4
80
+ mdurl==0.1.2
81
+ mistune==3.2.0
82
+ more-itertools==11.0.2
83
+ mpmath==1.3.0
84
+ msgspec==0.21.1
85
+ multidict==6.7.1
86
+ multiprocess==0.70.19
87
+ nbclient==0.10.4
88
+ nbconvert==7.17.1
89
+ nbformat==5.10.4
90
+ nest-asyncio==1.6.0
91
+ networkx==3.6.1
92
+ nltk==3.9.4
93
+ notebook_shim==0.2.4
94
+ numpy==2.4.4
95
+ nvidia-cublas-cu12==12.8.4.1
96
+ nvidia-cuda-cupti-cu12==12.8.90
97
+ nvidia-cuda-nvrtc-cu12==12.8.93
98
+ nvidia-cuda-runtime-cu12==12.8.90
99
+ nvidia-cudnn-cu12==9.10.2.21
100
+ nvidia-cufft-cu12==11.3.3.83
101
+ nvidia-cufile-cu12==1.13.1.3
102
+ nvidia-curand-cu12==10.3.9.90
103
+ nvidia-cusolver-cu12==11.7.3.90
104
+ nvidia-cusparse-cu12==12.5.8.93
105
+ nvidia-cusparselt-cu12==0.7.1
106
+ nvidia-ml-py==13.595.45
107
+ nvidia-nccl-cu12==2.27.3
108
+ nvidia-nvjitlink-cu12==12.8.93
109
+ nvidia-nvtx-cu12==12.8.90
110
+ objprint==0.3.0
111
+ omegaconf==2.3.0
112
+ orjson==3.11.8
113
+ overrides==7.7.0
114
+ packaging==26.1
115
+ pandas==3.0.2
116
+ pandocfilters==1.5.1
117
+ parso==0.8.6
118
+ pathvalidate==3.3.1
119
+ pexpect==4.9.0
120
+ pip==26.0.1
121
+ platformdirs==4.9.6
122
+ portalocker==3.2.0
123
+ prometheus_client==0.25.0
124
+ prompt_toolkit==3.0.52
125
+ propcache==0.4.1
126
+ protobuf==7.34.1
127
+ psutil==7.2.2
128
+ ptyprocess==0.7.0
129
+ pure_eval==0.2.3
130
+ pyarrow==23.0.1
131
+ pycparser==3.0
132
+ pycryptodomex==3.23.0
133
+ pydantic==2.13.2
134
+ pydantic_core==2.46.2
135
+ Pygments==2.20.0
136
+ pynvml==13.0.1
137
+ pytablewriter==1.2.1
138
+ python-dateutil==2.9.0.post0
139
+ python-json-logger==4.1.0
140
+ pytz==2026.1.post1
141
+ PyYAML==6.0.3
142
+ pyzmq==27.1.0
143
+ referencing==0.37.0
144
+ regex==2026.4.4
145
+ requests==2.33.1
146
+ rfc3339-validator==0.1.4
147
+ rfc3986-validator==0.1.1
148
+ rfc3987-syntax==1.1.0
149
+ rich==15.0.0
150
+ rouge_score==0.1.2
151
+ rpds-py==0.30.0
152
+ sacrebleu==2.6.0
153
+ safetensors==0.7.0
154
+ scikit-learn==1.8.0
155
+ scipy==1.17.1
156
+ Send2Trash==2.1.0
157
+ sentencepiece==0.2.1
158
+ sentry-sdk==2.58.0
159
+ setuptools==65.5.0
160
+ shellingham==1.5.4
161
+ six==1.17.0
162
+ smmap==5.0.3
163
+ soupsieve==2.8.3
164
+ sqlitedict==2.1.0
165
+ stack-data==0.6.3
166
+ sympy==1.14.0
167
+ tabledata==1.3.4
168
+ tabulate==0.10.0
169
+ tcolorpy==0.1.7
170
+ terminado==0.18.1
171
+ threadpoolctl==3.6.0
172
+ tiktoken==0.12.0
173
+ tinycss2==1.4.0
174
+ tokenizers==0.22.2
175
+ torch==2.8.0
176
+ tornado==6.5.5
177
+ tqdm==4.67.3
178
+ traitlets==5.14.3
179
+ transformers==5.1.0
180
+ triton==3.4.0
181
+ typepy==1.3.4
182
+ typer==0.24.1
183
+ typer-slim==0.24.0
184
+ typing_extensions==4.15.0
185
+ typing-inspection==0.4.2
186
+ tzdata==2026.2
187
+ uri-template==1.3.0
188
+ urllib3==2.6.3
189
+ viztracer==1.1.1
190
+ wandb==0.26.0
191
+ wcwidth==0.6.0
192
+ webcolors==25.10.0
193
+ webencodings==0.5.1
194
+ websocket-client==1.9.0
195
+ word2number==1.1
196
+ xformers==0.0.32.post1
197
+ xxhash==3.6.0
198
+ yarl==1.23.0
199
+ zstandard==0.25.0
wandb/run-20260429_153552-r20yn80u/files/wandb-metadata.json ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-131-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.9",
4
+ "startedAt": "2026-04-29T15:35:52.106818Z",
5
+ "args": [
6
+ "config=apps/main/configs/olmo2_1B_midfine.yaml",
7
+ "dump_dir=/home/xun/rsadhukh/STEM/logs/midfine_base_final",
8
+ "checkpoint.init_ckpt_path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/",
9
+ "checkpoint.continue_training_from_init=true",
10
+ "checkpoint.dump.every=5000",
11
+ "checkpoint.eval.every=100000",
12
+ "checkpoint.dump.keep=1",
13
+ "data.root_dir=/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/",
14
+ "data.node_local=false",
15
+ "data.tokenizer.path=/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/",
16
+ "logging.wandb.name=midfine_base_final"
17
+ ],
18
+ "program": "-m apps.main.train",
19
+ "git": {
20
+ "remote": "https://github.com/Infini-AI-Lab/STEM.git",
21
+ "commit": "7e450007299a777d774d6e2b598001cc7552c1b4"
22
+ },
23
+ "email": "rsadhukh@andrew.cmu.edu",
24
+ "root": "/home/xun/rsadhukh/STEM/logs/midfine_base_final",
25
+ "host": "compute-node-3",
26
+ "executable": "/home/xun/rsadhukh/STEM/stem/bin/python",
27
+ "cpu_count": 88,
28
+ "cpu_count_logical": 176,
29
+ "gpu": "NVIDIA H200",
30
+ "gpu_count": 8,
31
+ "disk": {
32
+ "/": {
33
+ "total": "133003395072",
34
+ "used": "103744323584"
35
+ }
36
+ },
37
+ "memory": {
38
+ "total": "2071474647040"
39
+ },
40
+ "gpu_nvidia": [
41
+ {
42
+ "name": "NVIDIA H200",
43
+ "memoryTotal": "150754820096",
44
+ "cudaCores": 16896,
45
+ "architecture": "Hopper",
46
+ "uuid": "GPU-dbeb9076-fd61-4013-987f-938d1db8b786"
47
+ },
48
+ {
49
+ "name": "NVIDIA H200",
50
+ "memoryTotal": "150754820096",
51
+ "cudaCores": 16896,
52
+ "architecture": "Hopper",
53
+ "uuid": "GPU-5b9b54c7-efcf-6e85-08ed-f6e6f61cfa7a"
54
+ },
55
+ {
56
+ "name": "NVIDIA H200",
57
+ "memoryTotal": "150754820096",
58
+ "cudaCores": 16896,
59
+ "architecture": "Hopper",
60
+ "uuid": "GPU-df8b695a-d295-cf7c-ab3b-b6d764b11fdf"
61
+ },
62
+ {
63
+ "name": "NVIDIA H200",
64
+ "memoryTotal": "150754820096",
65
+ "cudaCores": 16896,
66
+ "architecture": "Hopper",
67
+ "uuid": "GPU-c7480abd-eae7-8916-7803-4e033e94aaa0"
68
+ },
69
+ {
70
+ "name": "NVIDIA H200",
71
+ "memoryTotal": "150754820096",
72
+ "cudaCores": 16896,
73
+ "architecture": "Hopper",
74
+ "uuid": "GPU-91d17507-e0ee-813d-211c-6dbbe87e7f52"
75
+ },
76
+ {
77
+ "name": "NVIDIA H200",
78
+ "memoryTotal": "150754820096",
79
+ "cudaCores": 16896,
80
+ "architecture": "Hopper",
81
+ "uuid": "GPU-fcbf89aa-71d3-6603-a80d-9bfbd3f063a2"
82
+ },
83
+ {
84
+ "name": "NVIDIA H200",
85
+ "memoryTotal": "150754820096",
86
+ "cudaCores": 16896,
87
+ "architecture": "Hopper",
88
+ "uuid": "GPU-10b440b6-fedb-33fe-cad7-b1b1dfd65816"
89
+ },
90
+ {
91
+ "name": "NVIDIA H200",
92
+ "memoryTotal": "150754820096",
93
+ "cudaCores": 16896,
94
+ "architecture": "Hopper",
95
+ "uuid": "GPU-7da58531-cbc7-dedf-3a29-6540eaf04fe7"
96
+ }
97
+ ],
98
+ "cudaVersion": "13.0",
99
+ "slurm": {
100
+ "cluster_name": "cluster",
101
+ "conf": "/var/spool/slurmd/conf-cache/slurm.conf",
102
+ "cpu_bind": "quiet,mask_cpu:0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF",
103
+ "cpu_bind_list": "0x000000FFFFFFFFFFFFFFFF000000FFFFFFFFFFFFFFFF",
104
+ "cpu_bind_type": "mask_cpu:",
105
+ "cpu_bind_verbose": "quiet",
106
+ "cpus_on_node": "128",
107
+ "cpus_per_task": "128",
108
+ "distribution": "cyclic",
109
+ "gpus_on_node": "8",
110
+ "gtids": "0",
111
+ "job_cpus_per_node": "128(x4)",
112
+ "job_end_time": "1777649690",
113
+ "job_gid": "1005",
114
+ "job_gpus": "0,1,2,3,4,5,6,7",
115
+ "job_id": "29546",
116
+ "job_name": "stem",
117
+ "job_nodelist": "compute-node-[3,7,46-47]",
118
+ "job_num_nodes": "4",
119
+ "job_partition": "high",
120
+ "job_start_time": "1777476890",
121
+ "job_uid": "1005",
122
+ "job_user": "xun",
123
+ "jobid": "29546",
124
+ "launch_node_ipaddr": "172.27.61.166",
125
+ "localid": "0",
126
+ "nnodes": "4",
127
+ "nodeid": "0",
128
+ "nodelist": "compute-node-[3,7,46-47]",
129
+ "nprocs": "4",
130
+ "ntasks": "4",
131
+ "ntasks_per_node": "1",
132
+ "output_mode": "standard",
133
+ "prio_process": "0",
134
+ "procid": "0",
135
+ "srun_comm_host": "172.27.61.166",
136
+ "srun_comm_port": "33673",
137
+ "step_gpus": "0,1,2,3,4,5,6,7",
138
+ "step_id": "0",
139
+ "step_launcher_port": "33673",
140
+ "step_nodelist": "compute-node-[3,7,46-47]",
141
+ "step_num_nodes": "4",
142
+ "step_num_tasks": "4",
143
+ "step_tasks_per_node": "1(x4)",
144
+ "stepid": "0",
145
+ "submit_dir": "/home/xun/rsadhukh/STEM",
146
+ "submit_host": "login-node-0",
147
+ "task_pid": "469971",
148
+ "tasks_per_node": "1(x4)",
149
+ "topology_addr": "compute-node-3",
150
+ "topology_addr_pattern": "node",
151
+ "tres_per_task": "cpu:128",
152
+ "umask": "0000"
153
+ },
154
+ "writerId": "i4ocjyr9csg8kju0tej1pg06av2k8k96"
155
+ }
wandb/run-20260429_153552-r20yn80u/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"evals/piqa/acc,none":0.7486398258977149,"evals/hellaswag/acc_norm,none":0.6694881497709619,"data/source_fraction_observed/wiki_to_rcqa":0.030630072639507337,"memory/max_active_pct":46.8194972395729,"speed/wps":47602.85371169578,"evals/arc_challenge/acc_stderr,none":0.014434138713379983,"evals/boolq/acc_stderr,none":0.008060817222724517,"evals/boolq/acc,none":0.6938837920489297,"data/source_fraction_observed/megamatt":0.017663693539813355,"data/source_fraction_observed/math-meta-reasoning":0.0038804780352853807,"evals/winogrande/acc,none":0.665351223362273,"_runtime":49923.264815984,"data/source_fraction_observed/tinymath-mind":0.009189315824162799,"evals/piqa/acc_norm_stderr,none":0.010150090834551784,"evals/hellaswag/acc_stderr,none":0.004989562798280521,"memory/max_active_gib":65.45405149459839,"data/source_fraction_observed/common_crawl-high-quality":0.2297319930984784,"evals/arc_easy/acc,none":0.7563131313131313,"evals/openbookqa/acc_norm_stderr,none":0.021893529941665813,"evals/arc_challenge/acc_norm_stderr,none":0.014539646098471627,"data/source_fraction_observed/olmocr_science_pdfs":0.05105317905785341,"data/source_fraction_observed/tinymath-pot":0.0024504434987244546,"memory/power_draw":609672,"data/source_fraction_observed/stack_edu":0.10210445159604291,"evals/arc_easy/acc_norm_stderr,none":0.00880400984686553,"memory_trace":{"_type":"html-file","sha256":"79effaa90bfee7eb3207116787e0d32fc6e6609131fb886a51dbf18b98dc37e0","size":1160648,"path":"media/html/memory_trace_15050_79effaa90bfee7eb3207.html"},"data/source_fraction_observed/general_reasoning_mix":0.019094107596688604,"speed/FLOPS":4.1014138616573406e+14,"memory/max_reserved_gib":76.771484375,"evals/openbookqa/acc_stderr,none":0.019920483209566072,"speed/curr_iter_time":0.6719,"optim/total_tokens":104857600000,"speed/data_load_time":0.0866,"evals/openbookqa/acc_norm,none":0.396,"evals/winogrande/acc_stderr,none":0.013261823629558363,"data/source_fraction_observed/dolmino_1-flan":0.05105102658870274,"_wandb":{"runtime":49923},"acc_step":0,"loss/out":1.4885873794555664,"evals/hellaswag/acc,none":0.4953196574387572,"data/source_fraction_observed/cranecode":0.10210264673193477,"memory/max_reserved_pct":54.91489401645145,"profile_trace":{"_type":"html-file","sha256":"ae282608c6eeb7f488268f66f032340df98e913f5ce9638a3c2c7094ba3e8cff","size":254,"path":"media/html/profile_trace_15051_ae282608c6eeb7f48826.html"},"memory/num_alloc_retries":0,"data/source_fraction_observed/code-meta-reasoning":0.0046968633745099325,"global_step":50000,"data/source_fraction_observed/tulu-3-sft":0.011231371758863785,"evals/openbookqa/acc,none":0.272,"data/source_fraction_observed/openthoughts2":0.012764728611865424,"_step":50000,"evals/arc_easy/acc_stderr,none":0.00880917174472056,"evals/hellaswag/acc_norm_stderr,none":0.00469436096892941,"memory/num_ooms":0,"evals/piqa/acc_norm,none":0.7464635473340587,"data/source_fraction_observed/program_verifiable":0.0016341602295396964,"evals/arc_challenge/acc,none":0.42235494880546076,"data/source_fraction_observed/gemini-reasoning-traces":0.0025537766184913608,"data/source_fraction_observed/dolmino-math":0.10924986505265755,"data/source_fraction_observed/stem-heavy-crawl":0.051052169668858886,"evals/piqa/acc_stderr,none":0.010121156016819262,"optim/lr":1.4880000000014883e-09,"optim/grad_norm":0.14628654718399048,"evals/arc_challenge/acc_norm,none":0.45051194539249145,"_timestamp":1.7775268766324546e+09,"data/source_fraction_observed/qwq-reasoning-traces":0.01909534818756,"data/source_fraction_observed/reddit_to_flashcards":0.060235367293967584,"data/source_fraction_observed/nemotron-synth-qa":0.051050973802153475,"data/source_fraction_observed/cranemath":0.057483967194338154,"evals/arc_easy/acc_norm,none":0.7567340067340067}
wandb/run-20260429_153552-r20yn80u/logs/debug-core.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-29T15:35:52.224389621Z","level":"INFO","msg":"main: starting server","port-filename":"/scratch/local/xun/tmp/tmp4zakewjj/port-470303.txt","pid":470303,"detached":false,"idle-timeout":600000000000,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-29T15:35:52.224902018Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":470303}
3
+ {"time":"2026-04-29T15:35:52.224835671Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/scratch/local/xun/tmp/wandb-470303-471249-3119761670/socket","Net":"unix"}}
4
+ {"time":"2026-04-29T15:35:52.400060766Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-29T15:35:52.410260917Z","level":"INFO","msg":"handleInformInit: received","streamId":"r20yn80u","id":"1(@)"}
6
+ {"time":"2026-04-29T15:35:52.923314075Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"r20yn80u","id":"1(@)"}
7
+ {"time":"2026-04-29T15:35:59.077871941Z","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"2f4w6m7xtrwf"}
8
+ {"time":"2026-04-30T05:27:57.103687988Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
9
+ {"time":"2026-04-30T05:27:57.103779861Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
10
+ {"time":"2026-04-30T05:27:57.104393393Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
11
+ {"time":"2026-04-30T05:27:57.104400088Z","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2026-04-30T05:27:57.104470461Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/scratch/local/xun/tmp/wandb-470303-471249-3119761670/socket","Net":"unix"}}
13
+ {"time":"2026-04-30T05:27:58.995066197Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
14
+ {"time":"2026-04-30T05:27:58.995094706Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
15
+ {"time":"2026-04-30T05:27:58.99510986Z","level":"INFO","msg":"server is closed"}
wandb/run-20260429_153552-r20yn80u/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20260429_153552-r20yn80u/logs/debug.log ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_setup.py:_flush():81] Current SDK version is 0.26.0
2
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_setup.py:_flush():81] Configure stats pid to 470303
3
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:setup_run_log_directory():721] Logging user logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_153552-r20yn80u/logs/debug.log
5
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:setup_run_log_directory():722] Logging internal logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_153552-r20yn80u/logs/debug-internal.log
6
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:init():848] calling init triggers
7
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:init():853] wandb.init called with sweep_config: {}
8
+ config: {'dump_dir': '/home/xun/rsadhukh/STEM/logs/midfine_base_final', 'seed': 777, 'model_type': 'olmo3', 'stem_up_proj_layers': [], 'grad_acc_steps': 2, 'gc_collect_freq': 1000, 'probe_freq': 100, 'steps': 50000, 'stage_steps': None, 'data': {'root_dir': '/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/', 'sources': {'cranecode': 10.0, 'stack_edu': 10.0, 'cranemath': 5.63, 'dolmino-math': 10.7, 'megamatt': 1.73, 'tinymath-mind': 0.9, 'tinymath-pot': 0.24, 'reddit_to_flashcards': 5.9, 'wiki_to_rcqa': 3.0, 'nemotron-synth-qa': 5.0, 'math-meta-reasoning': 0.38, 'code-meta-reasoning': 0.46, 'program_verifiable': 0.16, 'qwq-reasoning-traces': 1.87, 'openthoughts2': 1.25, 'general_reasoning_mix': 1.87, 'gemini-reasoning-traces': 0.25, 'tulu-3-sft': 1.1, 'dolmino_1-flan': 5.0, 'olmocr_science_pdfs': 5.0, 'stem-heavy-crawl': 5.0, 'common_crawl-high-quality': 22.5}, 'node_local': False, 'batch_size': 8, 'seq_len': 4096, 'n_views': 2, 'seed': 42, 'add_bos': True, 'add_eos': True, 'load_async': True, 'prefetch_size': 1024, 'tokenizer': {'name': 'huggingface', 'path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/'}, 'track_packed_source_mixture': True, 'packed_source_counts': None}, 'optim': {'lr': 7.44e-05, 'weight_decay': 0.1, 'epsilon': 1e-08, 'beta1': 0.9, 'beta2': 0.95, 'clip': 1.0, 'scheduler': 'linear', 'warmup': 0, 'lr_min_ratio': 0.0, 'cycle_length': 1.0, 'cosine_theta': 1.0, 'annealing_step': 1000, 'decay_fraction': 0.1, 'exp_factor': 0.5, 'initial_token_offset': 0, 'global_final_step': None}, 'model': {'dim': 2048, 'n_layers': 16, 'head_dim': 128, 'n_heads': 16, 'n_kv_heads': 16, 'ffn_dim_multiplier': 1.5, 'multiple_of': 256, 'norm_eps': 1e-06, 'rope_theta': 500000.0, 'rope_scaling': None, 'init_base_std': 0.02, 'init_std_factor': 'disabled', 'max_seqlen': 4096, 'seed': 42, 'vocab_size': 100352, 'weight_tying': False, 'sliding_window': None}, 'distributed': {'dp_shard': 1, 'dp_replicate': 32, 'tp_size': 1, 'selective_activation_checkpointing': False, 'compile': True, 'fsdp_type': 'full_shard', 'model_dtype': 'bf16', 'float8_recipe': None, 'float8_filter': 'layers\\.[0-9]+\\.', 'matmul_allow_tf32': False, 'detect_anomaly': False, 'compile_cache_size_limit': 8, 'spawn_method': 'forkserver', 'stem_parallel_size': 8}, 'env': {'MKL_SERVICE_FORCE_INTEL': 'GNU', 'OMP_NUM_THREADS': '1', 'MKL_NUM_THREADS': '1', 'ENABLE_INTRA_NODE_COMM': '1', 'TORCH_NCCL_AVOID_RECORD_STREAMS': '1', 'NCCL_IB_TIMEOUT': '22', 'NCCL_DEBUG': 'INFO', 'TORCH_NCCL_ASYNC_ERROR_HANDLING': '1'}, 'checkpoint': {'dump': {'every': 5000, 'keep': 1}, 'eval': {'every': 100000, 'keep': 1}, 'path': '/home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints', 'init_ckpt_path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/', 'continue_training_from_init': True, 'legacy_init_ckpt_lm_transformer': False, 'merge_lm_optim_seed_ckpt_path': None}, 'profiling': {'run': True, 'trace_folder': 'profiling', 'mem_warmup': 100, 'mem_steps': 2, 'profile_warmup': 102, 'profile_steps': 2}, 'logging': {'freq': 10, 'acc_freq': None, 'wandb': {'job_type': None, 'dir': None, 'project': 'stem', 'entity': None, 'tags': None, 'group': None, 'name': 'olmo2_1B_midfine', 'notes': None, 'config_exclude_keys': None, 'config_include_keys': None, 'anonymous': None, 'mode': None, 'allow_val_change': None, 'resume': None, 'force': None, 'tensorboard': None, 'sync_tensorboard': None, 'monitor_gym': None, 'save_code': None, 'id': None, 'fork_from': None, 'resume_from': None}}, 'async_eval_gpus': None, 'eval': {'generator': {'max_tokens': 16384, 'dtype': 'bf16', 'temperature': 1.0, 'top_p': 0.95}, 'harness': {'tasks': [{'task': 'hellaswag', 'dataset_path': '/data/rsadhukh/eval_data/hellaswag'}, {'task': 'boolq', 'dataset_path': '/data/rsadhukh/eval_data/super_glue'}, {'task': 'piqa', 'dataset_path': '/data/rsadhukh/eval_data/piqa'}, {'task': 'winogrande', 'dataset_path': '/data/rsadhukh/eval_data/winogrande'}, {'task': 'openbookqa', 'dataset_path': '/data/rsadhukh/eval_data/openbookqa'}, {'task': 'arc_easy', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}, {'task': 'arc_challenge', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}], 'confirm_run_unsafe_code': True, 'batch_size': 64}, 'validation': None}, '_wandb': {}}
9
+ 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:init():896] starting backend
10
+ 2026-04-29 15:35:52,400 INFO MainThread:470303 [wandb_init.py:init():911] sending inform_init request
11
+ 2026-04-29 15:35:52,408 INFO MainThread:470303 [wandb_init.py:init():919] backend started and connected
12
+ 2026-04-29 15:35:52,410 INFO MainThread:470303 [wandb_init.py:init():989] updated telemetry
13
+ 2026-04-29 15:35:52,430 INFO MainThread:470303 [wandb_init.py:init():1013] communicating run to backend with 90.0 second timeout
14
+ 2026-04-29 15:35:53,838 INFO MainThread:470303 [wandb_init.py:init():1058] starting run threads in backend
15
+ 2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_console_start():2542] atexit reg
16
+ 2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_redirect():2391] redirect: wrap_raw
17
+ 2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_redirect():2460] Wrapping output streams.
18
+ 2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_redirect():2483] Redirects installed.
19
+ 2026-04-29 15:35:54,077 INFO MainThread:470303 [wandb_init.py:init():1098] run started, returning control to user process
20
+ 2026-04-30 05:27:57,103 INFO wandb-AsyncioManager-main:470303 [service_client.py:_forward_responses():134] Reached EOF.
21
+ 2026-04-30 05:27:57,104 INFO wandb-AsyncioManager-main:470303 [mailbox.py:close():155] Closing mailbox, abandoning 1 handles.
22
+ 2026-04-30 05:27:59,641 ERROR wandb-AsyncioManager-main:470303 [asyncio_manager.py:fn_wrap_exceptions():184] Uncaught exception in run_soon callback.
23
+ Traceback (most recent call last):
24
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_manager.py", line 182, in fn_wrap_exceptions
25
+ await fn()
26
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 45, in publish
27
+ await self._send_server_request(request)
28
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 87, in _send_server_request
29
+ await self._drain_writer()
30
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 96, in _drain_writer
31
+ await self._writer.drain()
32
+ File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 392, in drain
33
+ await self._protocol._drain_helper()
34
+ File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
35
+ raise ConnectionResetError('Connection lost')
36
+ ConnectionResetError: Connection lost
37
+ 2026-04-30 05:27:59,660 ERROR wandb-AsyncioManager-main:470303 [asyncio_manager.py:fn_wrap_exceptions():184] Uncaught exception in run_soon callback.
38
+ Traceback (most recent call last):
39
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_manager.py", line 182, in fn_wrap_exceptions
40
+ await fn()
41
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 45, in publish
42
+ await self._send_server_request(request)
43
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 78, in _send_server_request
44
+ raise self._broken_exc.with_traceback(self._broken_tb)
45
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 87, in _send_server_request
46
+ await self._drain_writer()
47
+ File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 96, in _drain_writer
48
+ await self._writer.drain()
49
+ File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 392, in drain
50
+ await self._protocol._drain_helper()
51
+ File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
52
+ raise ConnectionResetError('Connection lost')
53
+ ConnectionResetError: Connection lost