File size: 9,190 Bytes
30f3e88 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | 2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_setup.py:_flush():81] Current SDK version is 0.26.0
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_setup.py:_flush():81] Configure stats pid to 470303
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_setup.py:_flush():81] Loading settings from environment variables
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:setup_run_log_directory():721] Logging user logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_153552-r20yn80u/logs/debug.log
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:setup_run_log_directory():722] Logging internal logs to /home/xun/rsadhukh/STEM/logs/midfine_base_final/wandb/run-20260429_153552-r20yn80u/logs/debug-internal.log
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:init():848] calling init triggers
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:init():853] wandb.init called with sweep_config: {}
config: {'dump_dir': '/home/xun/rsadhukh/STEM/logs/midfine_base_final', 'seed': 777, 'model_type': 'olmo3', 'stem_up_proj_layers': [], 'grad_acc_steps': 2, 'gc_collect_freq': 1000, 'probe_freq': 100, 'steps': 50000, 'stage_steps': None, 'data': {'root_dir': '/home/xun/rsadhukh/STEM/data/dolmino-mix_shuffled/', 'sources': {'cranecode': 10.0, 'stack_edu': 10.0, 'cranemath': 5.63, 'dolmino-math': 10.7, 'megamatt': 1.73, 'tinymath-mind': 0.9, 'tinymath-pot': 0.24, 'reddit_to_flashcards': 5.9, 'wiki_to_rcqa': 3.0, 'nemotron-synth-qa': 5.0, 'math-meta-reasoning': 0.38, 'code-meta-reasoning': 0.46, 'program_verifiable': 0.16, 'qwq-reasoning-traces': 1.87, 'openthoughts2': 1.25, 'general_reasoning_mix': 1.87, 'gemini-reasoning-traces': 0.25, 'tulu-3-sft': 1.1, 'dolmino_1-flan': 5.0, 'olmocr_science_pdfs': 5.0, 'stem-heavy-crawl': 5.0, 'common_crawl-high-quality': 22.5}, 'node_local': False, 'batch_size': 8, 'seq_len': 4096, 'n_views': 2, 'seed': 42, 'add_bos': True, 'add_eos': True, 'load_async': True, 'prefetch_size': 1024, 'tokenizer': {'name': 'huggingface', 'path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/'}, 'track_packed_source_mixture': True, 'packed_source_counts': None}, 'optim': {'lr': 7.44e-05, 'weight_decay': 0.1, 'epsilon': 1e-08, 'beta1': 0.9, 'beta2': 0.95, 'clip': 1.0, 'scheduler': 'linear', 'warmup': 0, 'lr_min_ratio': 0.0, 'cycle_length': 1.0, 'cosine_theta': 1.0, 'annealing_step': 1000, 'decay_fraction': 0.1, 'exp_factor': 0.5, 'initial_token_offset': 0, 'global_final_step': None}, 'model': {'dim': 2048, 'n_layers': 16, 'head_dim': 128, 'n_heads': 16, 'n_kv_heads': 16, 'ffn_dim_multiplier': 1.5, 'multiple_of': 256, 'norm_eps': 1e-06, 'rope_theta': 500000.0, 'rope_scaling': None, 'init_base_std': 0.02, 'init_std_factor': 'disabled', 'max_seqlen': 4096, 'seed': 42, 'vocab_size': 100352, 'weight_tying': False, 'sliding_window': None}, 'distributed': {'dp_shard': 1, 'dp_replicate': 32, 'tp_size': 1, 'selective_activation_checkpointing': False, 'compile': True, 'fsdp_type': 'full_shard', 'model_dtype': 'bf16', 'float8_recipe': None, 'float8_filter': 'layers\\.[0-9]+\\.', 'matmul_allow_tf32': False, 'detect_anomaly': False, 'compile_cache_size_limit': 8, 'spawn_method': 'forkserver', 'stem_parallel_size': 8}, 'env': {'MKL_SERVICE_FORCE_INTEL': 'GNU', 'OMP_NUM_THREADS': '1', 'MKL_NUM_THREADS': '1', 'ENABLE_INTRA_NODE_COMM': '1', 'TORCH_NCCL_AVOID_RECORD_STREAMS': '1', 'NCCL_IB_TIMEOUT': '22', 'NCCL_DEBUG': 'INFO', 'TORCH_NCCL_ASYNC_ERROR_HANDLING': '1'}, 'checkpoint': {'dump': {'every': 5000, 'keep': 1}, 'eval': {'every': 100000, 'keep': 1}, 'path': '/home/xun/rsadhukh/STEM/logs/midfine_base_final/checkpoints', 'init_ckpt_path': '/data/rsadhukh/checkpoints/olmo2-1b-base-token4T/', 'continue_training_from_init': True, 'legacy_init_ckpt_lm_transformer': False, 'merge_lm_optim_seed_ckpt_path': None}, 'profiling': {'run': True, 'trace_folder': 'profiling', 'mem_warmup': 100, 'mem_steps': 2, 'profile_warmup': 102, 'profile_steps': 2}, 'logging': {'freq': 10, 'acc_freq': None, 'wandb': {'job_type': None, 'dir': None, 'project': 'stem', 'entity': None, 'tags': None, 'group': None, 'name': 'olmo2_1B_midfine', 'notes': None, 'config_exclude_keys': None, 'config_include_keys': None, 'anonymous': None, 'mode': None, 'allow_val_change': None, 'resume': None, 'force': None, 'tensorboard': None, 'sync_tensorboard': None, 'monitor_gym': None, 'save_code': None, 'id': None, 'fork_from': None, 'resume_from': None}}, 'async_eval_gpus': None, 'eval': {'generator': {'max_tokens': 16384, 'dtype': 'bf16', 'temperature': 1.0, 'top_p': 0.95}, 'harness': {'tasks': [{'task': 'hellaswag', 'dataset_path': '/data/rsadhukh/eval_data/hellaswag'}, {'task': 'boolq', 'dataset_path': '/data/rsadhukh/eval_data/super_glue'}, {'task': 'piqa', 'dataset_path': '/data/rsadhukh/eval_data/piqa'}, {'task': 'winogrande', 'dataset_path': '/data/rsadhukh/eval_data/winogrande'}, {'task': 'openbookqa', 'dataset_path': '/data/rsadhukh/eval_data/openbookqa'}, {'task': 'arc_easy', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}, {'task': 'arc_challenge', 'dataset_path': '/data/rsadhukh/eval_data/ai2_arc'}], 'confirm_run_unsafe_code': True, 'batch_size': 64}, 'validation': None}, '_wandb': {}}
2026-04-29 15:35:52,136 INFO MainThread:470303 [wandb_init.py:init():896] starting backend
2026-04-29 15:35:52,400 INFO MainThread:470303 [wandb_init.py:init():911] sending inform_init request
2026-04-29 15:35:52,408 INFO MainThread:470303 [wandb_init.py:init():919] backend started and connected
2026-04-29 15:35:52,410 INFO MainThread:470303 [wandb_init.py:init():989] updated telemetry
2026-04-29 15:35:52,430 INFO MainThread:470303 [wandb_init.py:init():1013] communicating run to backend with 90.0 second timeout
2026-04-29 15:35:53,838 INFO MainThread:470303 [wandb_init.py:init():1058] starting run threads in backend
2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_console_start():2542] atexit reg
2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_redirect():2391] redirect: wrap_raw
2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_redirect():2460] Wrapping output streams.
2026-04-29 15:35:54,071 INFO MainThread:470303 [wandb_run.py:_redirect():2483] Redirects installed.
2026-04-29 15:35:54,077 INFO MainThread:470303 [wandb_init.py:init():1098] run started, returning control to user process
2026-04-30 05:27:57,103 INFO wandb-AsyncioManager-main:470303 [service_client.py:_forward_responses():134] Reached EOF.
2026-04-30 05:27:57,104 INFO wandb-AsyncioManager-main:470303 [mailbox.py:close():155] Closing mailbox, abandoning 1 handles.
2026-04-30 05:27:59,641 ERROR wandb-AsyncioManager-main:470303 [asyncio_manager.py:fn_wrap_exceptions():184] Uncaught exception in run_soon callback.
Traceback (most recent call last):
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_manager.py", line 182, in fn_wrap_exceptions
await fn()
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 45, in publish
await self._send_server_request(request)
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 87, in _send_server_request
await self._drain_writer()
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 96, in _drain_writer
await self._writer.drain()
File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 392, in drain
await self._protocol._drain_helper()
File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
raise ConnectionResetError('Connection lost')
ConnectionResetError: Connection lost
2026-04-30 05:27:59,660 ERROR wandb-AsyncioManager-main:470303 [asyncio_manager.py:fn_wrap_exceptions():184] Uncaught exception in run_soon callback.
Traceback (most recent call last):
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_manager.py", line 182, in fn_wrap_exceptions
await fn()
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 45, in publish
await self._send_server_request(request)
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 78, in _send_server_request
raise self._broken_exc.with_traceback(self._broken_tb)
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 87, in _send_server_request
await self._drain_writer()
File "/home/xun/rsadhukh/STEM/stem/lib/python3.11/site-packages/wandb/sdk/lib/service/service_client.py", line 96, in _drain_writer
await self._writer.drain()
File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 392, in drain
await self._protocol._drain_helper()
File "/opt/pyenv/versions/3.11.9/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
raise ConnectionResetError('Connection lost')
ConnectionResetError: Connection lost
|