kajuma committed on Dec 26, 2025

Commit

c5a630d

verified ·

1 Parent(s): 37125c5

Upload folder using huggingface_hub

Browse files

Files changed (32) hide show

.gitattributes +3 -0
args.json +654 -0
images/batch-size vs samples.png +0 -0
images/batch-size.png +0 -0
images/grad-norm vs samples.png +0 -0
images/grad-norm.png +0 -0
images/iteration-time.png +0 -0
images/learning-rate vs samples.png +0 -0
images/learning-rate.png +0 -0
images/lm loss vs samples.png +0 -0
images/lm loss.png +0 -0
images/loss-scale vs samples.png +0 -0
images/loss-scale.png +0 -0
images/mem-allocated-bytes.png +0 -0
images/mem-allocated-count.png +0 -0
images/mem-max-allocated-bytes.png +0 -0
images/mem-reserved-bytes.png +0 -0
latest_checkpointed_iteration.txt +1 -0
latest_wandb_artifact_path.txt +1 -0
logging.jsonl +3 -0
runs/events.out.tfevents.1766547916.36fd00e7b21c.611253.0 +3 -0
wandb/wandb/debug-internal.log +0 -0
wandb/wandb/debug.log +26 -0
wandb/wandb/run-20251224_034518-gd3q7mjv/files/config.yaml +1779 -0
wandb/wandb/run-20251224_034518-gd3q7mjv/files/output.log +3 -0
wandb/wandb/run-20251224_034518-gd3q7mjv/files/requirements.txt +219 -0
wandb/wandb/run-20251224_034518-gd3q7mjv/files/wandb-metadata.json +275 -0
wandb/wandb/run-20251224_034518-gd3q7mjv/files/wandb-summary.json +1 -0
wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug-core.log +16 -0
wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug-internal.log +0 -0
wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug.log +26 -0
wandb/wandb/run-20251224_034518-gd3q7mjv/run-gd3q7mjv.wandb +3 -0

.gitattributes CHANGED Viewed

@@ -6510,3 +6510,6 @@ checkpoints/iter_0038100/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
 checkpoints/iter_0038100/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
 checkpoints/iter_0038100/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
 checkpoints/iter_0038100/__7_1.distcp filter=lfs diff=lfs merge=lfs -text

 checkpoints/iter_0038100/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
 checkpoints/iter_0038100/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
 checkpoints/iter_0038100/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
+logging.jsonl filter=lfs diff=lfs merge=lfs -text
+wandb/wandb/run-20251224_034518-gd3q7mjv/files/output.log filter=lfs diff=lfs merge=lfs -text
+wandb/wandb/run-20251224_034518-gd3q7mjv/run-gd3q7mjv.wandb filter=lfs diff=lfs merge=lfs -text

args.json ADDED Viewed

	@@ -0,0 +1,654 @@

+{
+  "use_ray": false,
+  "ray_exp_name": null,
+  "device_groups": null,
+  "model": "Qwen/Qwen3-0.6B-Base",
+  "model_type": "qwen3",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "new_special_tokens": [],
+  "num_labels": null,
+  "problem_type": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "max_memory": {},
+  "max_model_len": null,
+  "local_repo_path": null,
+  "init_strategy": null,
+  "template": "qwen3",
+  "system": null,
+  "max_length": 4096,
+  "truncation_strategy": "right",
+  "max_pixels": null,
+  "agent_template": null,
+  "norm_bbox": null,
+  "use_chat_template": false,
+  "padding_free": true,
+  "padding_side": "right",
+  "loss_scale": "all",
+  "sequence_parallel_size": 1,
+  "response_prefix": null,
+  "template_backend": "swift",
+  "dataset": [],
+  "val_dataset": [],
+  "cached_dataset": [
+    "/workspace/full"
+  ],
+  "cached_val_dataset": [],
+  "split_dataset_ratio": 0.0,
+  "data_seed": 42,
+  "dataset_num_proc": 32,
+  "load_from_cache_file": false,
+  "dataset_shuffle": true,
+  "val_dataset_shuffle": false,
+  "streaming": false,
+  "interleave_prob": null,
+  "stopping_strategy": "first_exhausted",
+  "shuffle_buffer_size": 1000,
+  "download_mode": "reuse_dataset_if_exists",
+  "columns": {},
+  "strict": false,
+  "remove_unused_columns": true,
+  "model_name": null,
+  "model_author": null,
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": null,
+  "temperature": null,
+  "top_k": 50,
+  "top_p": 0.9,
+  "repetition_penalty": 1.0,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": "/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore",
+  "lora_modules": [],
+  "tuner_backend": "peft",
+  "train_type": "full",
+  "adapters": [],
+  "external_plugins": [],
+  "seed": 42,
+  "model_kwargs": {},
+  "load_args": false,
+  "load_data_args": false,
+  "packing": true,
+  "packing_length": 4096,
+  "packing_num_proc": 1,
+  "lazy_tokenize": false,
+  "custom_register_path": [],
+  "use_hf": true,
+  "hub_token": null,
+  "ddp_timeout": 18000000,
+  "ddp_backend": null,
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": true,
+  "freeze_parameters": [],
+  "freeze_parameters_regex": null,
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "trainable_parameters_regex": null,
+  "adapter_load": null,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "use_rslora": false,
+  "rlhf_type": null,
+  "ref_load": null,
+  "ref_adapter_load": null,
+  "beta": 0.1,
+  "rpo_alpha": null,
+  "reference_free": false,
+  "label_smoothing": 0.0,
+  "f_divergence_type": "reverse_kl",
+  "loss_type": null,
+  "desirable_weight": 1.0,
+  "undesirable_weight": 1.0,
+  "calculate_KL": null,
+  "center_rewards_coefficient": null,
+  "generation_batch_size": null,
+  "steps_per_generation": null,
+  "num_generations": 8,
+  "max_completion_length": 512,
+  "importance_sampling_level": "token",
+  "tau_pos": 1.0,
+  "tau_neg": 1.05,
+  "epsilon": 0.2,
+  "epsilon_high": null,
+  "delta": null,
+  "use_vllm": true,
+  "vllm_mode": null,
+  "vllm_enable_prefix_caching": true,
+  "vllm_gpu_memory_utilization": 0.9,
+  "vllm_tensor_parallel_size": 1,
+  "vllm_max_model_len": null,
+  "vllm_enforce_eager": false,
+  "vllm_limit_mm_per_prompt": null,
+  "vllm_disable_cascade_attn": false,
+  "vllm_max_num_seqs": null,
+  "vllm_mm_processor_cache_gb": null,
+  "vllm_engine_kwargs": null,
+  "sleep_level": 0,
+  "offload_optimizer": false,
+  "offload_model": false,
+  "offload_bridge": false,
+  "vllm_server_base_url": null,
+  "vllm_server_host": null,
+  "vllm_server_port": [
+    8000
+  ],
+  "vllm_server_timeout": 240.0,
+  "vllm_server_group_port": null,
+  "reward_funcs": [],
+  "reward_weights": null,
+  "cosine_min_len_value_wrong": -0.5,
+  "cosine_max_len_value_wrong": 0.0,
+  "cosine_min_len_value_correct": 1.0,
+  "cosine_max_len_value_correct": 0.5,
+  "cosine_max_len": null,
+  "repetition_n_grams": 3,
+  "repetition_max_penalty": -1.0,
+  "soft_max_length": null,
+  "soft_cache_length": null,
+  "dynamic_sample": false,
+  "max_resample_times": 3,
+  "overlong_filter": false,
+  "scale_rewards": "group",
+  "advantage_estimator": "grpo",
+  "kl_in_reward": false,
+  "wandb_log_unique_prompts": null,
+  "log_completions": false,
+  "rollout_importance_sampling_mode": null,
+  "rollout_importance_sampling_threshold": 2.0,
+  "log_rollout_offpolicy_metrics": false,
+  "off_policy_sequence_mask_delta": null,
+  "reward_model": null,
+  "reward_model_plugin": null,
+  "sync_ref_model": false,
+  "ref_model_sync_steps": 512,
+  "ref_model_mixup_alpha": 0.6,
+  "async_generate": false,
+  "move_model_batches": null,
+  "multi_turn_scheduler": null,
+  "max_turns": null,
+  "completion_length_limit_scope": "per_round",
+  "vllm_server_pass_dataset": false,
+  "log_entropy": false,
+  "top_entropy_quantile": 1.0,
+  "num_iterations": 1,
+  "check_model": true,
+  "padded_vocab_size": 151936,
+  "initialize_embedding": false,
+  "mlp_padding_free": false,
+  "load_safetensors": false,
+  "save_safetensors": false,
+  "ref_model": null,
+  "ref_adapters": [],
+  "merge_lora": false,
+  "max_shard_size": "5GB",
+  "train_dataloader_shuffle": true,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": true,
+  "dataloader_prefetch_factor": 10,
+  "architectures": "Qwen3ForCausalLM",
+  "llm_architectures": "Qwen3ForCausalLM",
+  "max_epochs": null,
+  "enable_dft_loss": false,
+  "enable_channel_loss": false,
+  "patch_size": 1,
+  "save_strategy": "steps",
+  "original_max_position_embeddings": null,
+  "partial_rotary_factor": null,
+  "use_shared_expert_gate": false,
+  "vit_gradient_checkpointing": true,
+  "vit_lr": null,
+  "aligner_lr": null,
+  "gradient_checkpointing_kwargs": null,
+  "linear_num_value_heads": null,
+  "linear_num_key_heads": null,
+  "linear_key_head_dim": null,
+  "linear_value_head_dim": null,
+  "linear_conv_kernel_dim": null,
+  "layer_types": null,
+  "mrope_interleaved": false,
+  "micro_batch_size": 4,
+  "global_batch_size": 256,
+  "recompute_granularity": "full",
+  "recompute_method": "uniform",
+  "recompute_num_layers": 1,
+  "recompute_modules": [
+    "core_attn"
+  ],
+  "use_cpu_initialization": false,
+  "deterministic_mode": false,
+  "train_iters": 38100,
+  "log_interval": 1,
+  "tensorboard_dir": "/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/runs",
+  "no_masked_softmax_fusion": false,
+  "no_bias_dropout_fusion": false,
+  "no_bias_swiglu_fusion": false,
+  "no_rope_fusion": false,
+  "no_gradient_accumulation_fusion": false,
+  "cross_entropy_loss_fusion": true,
+  "cross_entropy_fusion_impl": "native",
+  "calculate_per_token_loss": true,
+  "use_flash_attn": false,
+  "attention_backend": "flash",
+  "optimizer": "adam",
+  "optimizer_cpu_offload": false,
+  "optimizer_offload_fraction": 1.0,
+  "use_precision_aware_optimizer": true,
+  "main_grads_dtype": "fp32",
+  "main_params_dtype": "fp32",
+  "exp_avg_dtype": "fp32",
+  "exp_avg_sq_dtype": "fp32",
+  "dataloader_type": "cyclic",
+  "manual_gc": false,
+  "manual_gc_interval": 0,
+  "lr": 0.0001,
+  "lr_decay_style": "cosine",
+  "lr_decay_iters": null,
+  "lr_warmup_iters": 0,
+  "lr_warmup_fraction": 0.05,
+  "min_lr": 3e-06,
+  "weight_decay": 0.1,
+  "clip_grad": 1.0,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.95,
+  "adam_eps": 1e-08,
+  "sgd_momentum": 0.9,
+  "save": "/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709",
+  "save_interval": 100,
+  "save_retain_interval": null,
+  "no_save_optim": false,
+  "no_save_rng": false,
+  "load": "/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore",
+  "no_load_optim": false,
+  "no_load_rng": false,
+  "finetune": true,
+  "ckpt_format": "torch_dist",
+  "no_initialization": true,
+  "auto_detect_ckpt_format": true,
+  "exit_on_missing_checkpoint": true,
+  "async_save": false,
+  "use_persistent_ckpt_worker": false,
+  "ckpt_fully_parallel_load": false,
+  "ckpt_assume_constant_structure": false,
+  "distributed_backend": "nccl",
+  "local_rank": 0,
+  "use_distributed_optimizer": true,
+  "tensor_model_parallel_size": 1,
+  "pipeline_model_parallel_size": 1,
+  "decoder_first_pipeline_num_layers": null,
+  "decoder_last_pipeline_num_layers": null,
+  "account_for_embedding_in_pipeline_split": false,
+  "account_for_loss_in_pipeline_split": false,
+  "sequence_parallel": false,
+  "context_parallel_size": 1,
+  "tp_comm_overlap": false,
+  "overlap_grad_reduce": true,
+  "overlap_param_gather": true,
+  "distributed_timeout_minutes": 300000,
+  "num_layers_per_virtual_pipeline_stage": null,
+  "num_virtual_stages_per_pipeline_rank": null,
+  "microbatch_group_size_per_virtual_pipeline_stage": null,
+  "pipeline_model_parallel_layout": null,
+  "num_layers": 28,
+  "hidden_size": 1024,
+  "ffn_hidden_size": 3072,
+  "num_attention_heads": 16,
+  "group_query_attention": true,
+  "num_query_groups": 8,
+  "softmax_type": null,
+  "window_size": null,
+  "window_attn_skip_freq": null,
+  "max_position_embeddings": 32768,
+  "position_embedding_type": "rope",
+  "mrope_section": null,
+  "rotary_base": 1000000,
+  "rotary_percent": 1.0,
+  "rotary_interleaved": false,
+  "normalization": "RMSNorm",
+  "norm_epsilon": 1e-06,
+  "swiglu": true,
+  "quick_geglu": false,
+  "activation_func_clamp_value": null,
+  "glu_linear_offset": null,
+  "untie_embeddings_and_output_weights": false,
+  "disable_bias_linear": true,
+  "add_qkv_bias": false,
+  "attention_dropout": 0.0,
+  "hidden_dropout": 0.0,
+  "kv_channels": 128,
+  "qk_layernorm": true,
+  "qk_l2_norm": null,
+  "no_rope_freq": null,
+  "moe_apply_probs_on_input": null,
+  "transformer_impl": "transformer_engine",
+  "num_experts": null,
+  "moe_layer_freq": "1",
+  "moe_ffn_hidden_size": null,
+  "moe_shared_expert_intermediate_size": null,
+  "moe_router_topk": 2,
+  "moe_router_num_groups": null,
+  "moe_router_group_topk": null,
+  "moe_router_pre_softmax": false,
+  "moe_router_dtype": "fp32",
+  "moe_router_score_function": "softmax",
+  "moe_router_bias_update_rate": null,
+  "moe_router_enable_expert_bias": false,
+  "moe_router_topk_scaling_factor": null,
+  "moe_router_load_balancing_type": "aux_loss",
+  "expert_model_parallel_size": 1,
+  "expert_tensor_parallel_size": 1,
+  "moe_token_dispatcher_type": null,
+  "moe_enable_deepep": false,
+  "moe_grouped_gemm": true,
+  "moe_permute_fusion": false,
+  "moe_aux_loss_coeff": 0.0,
+  "moe_z_loss_coeff": null,
+  "moe_shared_expert_overlap": false,
+  "moe_layer_recompute": false,
+  "moe_expert_capacity_factor": null,
+  "moe_pad_expert_input_to_capacity": false,
+  "moe_token_drop_policy": null,
+  "multi_latent_attention": false,
+  "q_lora_rank": null,
+  "kv_lora_rank": 32,
+  "qk_head_dim": 128,
+  "qk_pos_emb_head_dim": 64,
+  "mtp_num_layers": null,
+  "mtp_loss_scaling_factor": 0.1,
+  "fp8_format": null,
+  "fp8_recipe": "delayed",
+  "fp8_amax_history_len": 1024,
+  "fp8_amax_compute_algo": "max",
+  "fp8_param_gather": false,
+  "fp16": false,
+  "bf16": true,
+  "apply_query_key_layer_scaling": false,
+  "attention_softmax_in_fp32": true,
+  "log_params_norm": false,
+  "log_throughput": false,
+  "tensorboard_log_interval": 1,
+  "tensorboard_queue_size": 50,
+  "log_timers_to_tensorboard": true,
+  "no_log_learning_rate_to_tensorboard": false,
+  "log_validation_ppl_to_tensorboard": true,
+  "log_memory_to_tensorboard": true,
+  "logging_level": "20",
+  "wandb_project": "plt",
+  "wandb_exp_name": "baseline",
+  "wandb_save_dir": null,
+  "eval_iters": -1,
+  "eval_interval": 100,
+  "seq_length": 4096,
+  "num_workers": 32,
+  "no_data_sharding": false,
+  "megatron_extra_kwargs": {},
+  "add_version": true,
+  "rank": 0,
+  "global_world_size": 8,
+  "local_world_size": 8,
+  "model_suffix": "Qwen3-0.6B-Base",
+  "model_info": "ModelInfo(model_type='qwen3', model_dir='/workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, is_multimodal=False, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='qwen3', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-0.6B-Base', hf_model_id='Qwen/Qwen3-0.6B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B-Base', hf_model_id='Qwen/Qwen3-1.7B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-Base', hf_model_id='Qwen/Qwen3-4B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B-Base', hf_model_id='Qwen/Qwen3-8B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B-Base', hf_model_id='Qwen/Qwen3-14B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-0.6B', hf_model_id='Qwen/Qwen3-0.6B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B', hf_model_id='Qwen/Qwen3-1.7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B', hf_model_id='Qwen/Qwen3-4B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B', hf_model_id='Qwen/Qwen3-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B', hf_model_id='Qwen/Qwen3-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B', hf_model_id='Qwen/Qwen3-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-0.6B-FP8', hf_model_id='Qwen/Qwen3-0.6B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B-FP8', hf_model_id='Qwen/Qwen3-1.7B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-FP8', hf_model_id='Qwen/Qwen3-4B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B-FP8', hf_model_id='Qwen/Qwen3-8B-FP8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen3-14B-FP8', hf_model_id='Qwen/Qwen3-14B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B-FP8', hf_model_id='Qwen/Qwen3-32B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-AWQ', hf_model_id='Qwen/Qwen3-4B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B-AWQ', hf_model_id='Qwen/Qwen3-8B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B-AWQ', hf_model_id='Qwen/Qwen3-14B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B-AWQ', hf_model_id='Qwen/Qwen3-32B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='swift/Qwen3-32B-AWQ', hf_model_id=None, model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen3', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f555d60fe20>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen3ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.51'], tags=[])",
+  "model_dir": "/workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd",
+  "_val_dataset_exists": [],
+  "hub": "<class 'swift.hub.hub.HFHub'>",
+  "megatron_model_meta": "MegatronModelMeta(megatron_model_type='gpt', model_types=['qwen2', 'qwen2_5', 'qwq', 'qwq_preview', 'qwen2_5_math', 'llama', 'llama3', 'llama3_1', 'llama3_2', 'longwriter_llama3_1', 'codefuse_codellama', 'marco_o1', 'deepseek', 'deepseek_r1_distill', 'yi', 'yi_coder', 'sus', 'skywork_o1', 'openbuddy_llama', 'openbuddy_llama3', 'megrez', 'reflection', 'numina', 'ziya', 'mengzi3', 'qwen3', 'qwen3_thinking', 'qwen3_nothinking', 'qwen2_moe', 'qwen3_moe', 'qwen3_moe_thinking', 'qwen3_coder', 'internlm3', 'mimo', 'mimo_rl', 'moonlight', 'kimi_k2', 'deepseek_moe', 'deepseek_v2', 'deepseek_v2_5', 'deepseek_r1', 'dots1', 'ernie', 'glm4_5', 'deepseek_v3_1', 'ernie_thinking', 'gpt_oss'], is_multimodal=False, bridge_cls=<class 'swift.megatron.model.gpt_bridge.GPTBridge'>, model_cls=<class 'swift.megatron.model.gpt_model.GPTModel'>, get_transformer_layer_spec=None, model_provider=<function model_provider at 0x7f54e0d9dc60>, visual_cls=None, extra_args_provider=None)",
+  "extra_args": {
+    "model_dir": "/workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd",
+    "is_multimodal": false,
+    "hf_model_type": "qwen3",
+    "use_ray": false,
+    "ray_exp_name": null,
+    "device_groups": null,
+    "model": "Qwen/Qwen3-0.6B-Base",
+    "model_type": "qwen3",
+    "model_revision": null,
+    "task_type": "causal_lm",
+    "torch_dtype": "bfloat16",
+    "attn_impl": null,
+    "new_special_tokens": [],
+    "num_labels": null,
+    "problem_type": null,
+    "rope_scaling": null,
+    "device_map": null,
+    "max_memory": {},
+    "max_model_len": null,
+    "local_repo_path": null,
+    "init_strategy": null,
+    "template": "qwen3",
+    "system": null,
+    "max_length": 4096,
+    "truncation_strategy": "right",
+    "max_pixels": null,
+    "agent_template": null,
+    "norm_bbox": null,
+    "use_chat_template": false,
+    "padding_free": true,
+    "padding_side": "right",
+    "sequence_parallel_size": 1,
+    "response_prefix": null,
+    "template_backend": "swift",
+    "dataset": [],
+    "val_dataset": [],
+    "cached_dataset": [
+      "/workspace/full"
+    ],
+    "cached_val_dataset": [],
+    "split_dataset_ratio": 0.0,
+    "data_seed": 42,
+    "dataset_num_proc": 32,
+    "load_from_cache_file": false,
+    "dataset_shuffle": true,
+    "val_dataset_shuffle": false,
+    "streaming": false,
+    "interleave_prob": null,
+    "stopping_strategy": "first_exhausted",
+    "shuffle_buffer_size": 1000,
+    "download_mode": "reuse_dataset_if_exists",
+    "columns": {},
+    "strict": false,
+    "remove_unused_columns": true,
+    "model_name": null,
+    "model_author": null,
+    "custom_dataset_info": [],
+    "quant_method": null,
+    "quant_bits": null,
+    "hqq_axis": null,
+    "bnb_4bit_compute_dtype": "bfloat16",
+    "bnb_4bit_quant_type": "nf4",
+    "bnb_4bit_use_double_quant": true,
+    "bnb_4bit_quant_storage": null,
+    "max_new_tokens": null,
+    "temperature": null,
+    "top_k": 50,
+    "top_p": 0.9,
+    "repetition_penalty": 1.0,
+    "num_beams": 1,
+    "stream": false,
+    "stop_words": [],
+    "logprobs": false,
+    "top_logprobs": null,
+    "ckpt_dir": "/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore",
+    "lora_modules": [],
+    "tuner_backend": "peft",
+    "train_type": "full",
+    "adapters": [],
+    "external_plugins": [],
+    "model_kwargs": {},
+    "load_args": false,
+    "load_data_args": false,
+    "packing": true,
+    "packing_length": 4096,
+    "packing_num_proc": 1,
+    "lazy_tokenize": false,
+    "custom_register_path": [],
+    "use_hf": true,
+    "hub_token": null,
+    "ddp_timeout": 18000000,
+    "ddp_backend": null,
+    "ignore_args_error": false,
+    "use_swift_lora": false,
+    "freeze_llm": false,
+    "freeze_vit": true,
+    "freeze_aligner": true,
+    "freeze_parameters": [],
+    "freeze_parameters_regex": null,
+    "freeze_parameters_ratio": 0.0,
+    "trainable_parameters": [],
+    "trainable_parameters_regex": null,
+    "adapter_load": null,
+    "target_modules": [
+      "all-linear"
+    ],
+    "target_regex": null,
+    "modules_to_save": [],
+    "lora_rank": 8,
+    "lora_alpha": 32,
+    "lora_dropout": 0.05,
+    "lora_bias": "none",
+    "lora_dtype": null,
+    "use_rslora": false,
+    "rlhf_type": null,
+    "ref_load": null,
+    "ref_adapter_load": null,
+    "beta": 0.1,
+    "rpo_alpha": null,
+    "reference_free": false,
+    "label_smoothing": 0.0,
+    "f_divergence_type": "reverse_kl",
+    "loss_type": null,
+    "desirable_weight": 1.0,
+    "undesirable_weight": 1.0,
+    "calculate_KL": null,
+    "center_rewards_coefficient": null,
+    "generation_batch_size": null,
+    "steps_per_generation": null,
+    "num_generations": 8,
+    "max_completion_length": 512,
+    "importance_sampling_level": "token",
+    "tau_pos": 1.0,
+    "tau_neg": 1.05,
+    "epsilon": 0.2,
+    "epsilon_high": null,
+    "delta": null,
+    "use_vllm": true,
+    "vllm_mode": null,
+    "vllm_enable_prefix_caching": true,
+    "vllm_gpu_memory_utilization": 0.9,
+    "vllm_tensor_parallel_size": 1,
+    "vllm_max_model_len": null,
+    "vllm_enforce_eager": false,
+    "vllm_limit_mm_per_prompt": null,
+    "vllm_disable_cascade_attn": false,
+    "vllm_max_num_seqs": null,
+    "vllm_mm_processor_cache_gb": null,
+    "vllm_engine_kwargs": null,
+    "sleep_level": 0,
+    "offload_optimizer": false,
+    "offload_model": false,
+    "offload_bridge": false,
+    "vllm_server_base_url": null,
+    "vllm_server_host": null,
+    "vllm_server_port": [
+      8000
+    ],
+    "vllm_server_timeout": 240.0,
+    "vllm_server_group_port": null,
+    "reward_funcs": [],
+    "reward_weights": null,
+    "cosine_min_len_value_wrong": -0.5,
+    "cosine_max_len_value_wrong": 0.0,
+    "cosine_min_len_value_correct": 1.0,
+    "cosine_max_len_value_correct": 0.5,
+    "cosine_max_len": null,
+    "repetition_n_grams": 3,
+    "repetition_max_penalty": -1.0,
+    "soft_max_length": null,
+    "soft_cache_length": null,
+    "dynamic_sample": false,
+    "max_resample_times": 3,
+    "overlong_filter": false,
+    "scale_rewards": "group",
+    "advantage_estimator": "grpo",
+    "kl_in_reward": false,
+    "wandb_log_unique_prompts": null,
+    "log_completions": false,
+    "rollout_importance_sampling_mode": null,
+    "rollout_importance_sampling_threshold": 2.0,
+    "log_rollout_offpolicy_metrics": false,
+    "off_policy_sequence_mask_delta": null,
+    "reward_model": null,
+    "reward_model_plugin": null,
+    "sync_ref_model": false,
+    "ref_model_sync_steps": 512,
+    "ref_model_mixup_alpha": 0.6,
+    "async_generate": false,
+    "move_model_batches": null,
+    "multi_turn_scheduler": null,
+    "max_turns": null,
+    "completion_length_limit_scope": "per_round",
+    "vllm_server_pass_dataset": false,
+    "log_entropy": false,
+    "top_entropy_quantile": 1.0,
+    "num_iterations": 1,
+    "check_model": true,
+    "padded_vocab_size": 151936,
+    "initialize_embedding": false,
+    "mlp_padding_free": false,
+    "load_safetensors": false,
+    "save_safetensors": false,
+    "ref_model": null,
+    "ref_adapters": [],
+    "merge_lora": false,
+    "max_shard_size": "5GB",
+    "train_dataloader_shuffle": true,
+    "dataloader_pin_memory": true,
+    "dataloader_persistent_workers": true,
+    "dataloader_prefetch_factor": 10,
+    "architectures": "Qwen3ForCausalLM",
+    "llm_architectures": "Qwen3ForCausalLM",
+    "max_epochs": null,
+    "enable_dft_loss": false,
+    "enable_channel_loss": false,
+    "patch_size": 1,
+    "save_strategy": "steps",
+    "original_max_position_embeddings": null,
+    "partial_rotary_factor": null,
+    "use_shared_expert_gate": false,
+    "vit_gradient_checkpointing": true,
+    "vit_lr": null,
+    "aligner_lr": null,
+    "gradient_checkpointing_kwargs": null,
+    "linear_num_value_heads": null,
+    "linear_num_key_heads": null,
+    "linear_key_head_dim": null,
+    "linear_value_head_dim": null,
+    "linear_conv_kernel_dim": null,
+    "layer_types": null,
+    "mrope_interleaved": false,
+    "add_version": true
+  }
+}

images/batch-size vs samples.png ADDED Viewed

images/batch-size.png ADDED Viewed

images/grad-norm vs samples.png ADDED Viewed

images/grad-norm.png ADDED Viewed

images/iteration-time.png ADDED Viewed

images/learning-rate vs samples.png ADDED Viewed

images/learning-rate.png ADDED Viewed

images/lm loss vs samples.png ADDED Viewed

images/lm loss.png ADDED Viewed

images/loss-scale vs samples.png ADDED Viewed

images/loss-scale.png ADDED Viewed

images/mem-allocated-bytes.png ADDED Viewed

images/mem-allocated-count.png ADDED Viewed

images/mem-max-allocated-bytes.png ADDED Viewed

images/mem-reserved-bytes.png ADDED Viewed

latest_checkpointed_iteration.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ 38100

latest_wandb_artifact_path.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ tepic/plt

logging.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f6ef2e8418b81fb20b5b7952eef83e998df430d4468f8555615d55079c3d2b56
+size 11152685

runs/events.out.tfevents.1766547916.36fd00e7b21c.611253.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a936c171e3fadb62874e1a23844e5a673a393745d4a7d60006a4f4703e0a95a5
+size 32283810

wandb/wandb/debug-internal.log ADDED Viewed

The diff for this file is too large to render. See raw diff

wandb/wandb/debug.log ADDED Viewed

	@@ -0,0 +1,26 @@

+2025-12-24 03:45:18,797 INFO    MainThread:611253 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
+2025-12-24 03:45:18,797 INFO    MainThread:611253 [wandb_setup.py:_flush():80] Configure stats pid to 611253
+2025-12-24 03:45:18,797 INFO    MainThread:611253 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
+2025-12-24 03:45:18,797 INFO    MainThread:611253 [wandb_setup.py:_flush():80] Loading settings from /workspace/halcyon-recipe2/wandb/settings
+2025-12-24 03:45:18,797 INFO    MainThread:611253 [wandb_setup.py:_flush():80] Loading settings from environment variables
+2025-12-24 03:45:18,798 INFO    MainThread:611253 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug.log
+2025-12-24 03:45:18,798 INFO    MainThread:611253 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug-internal.log
+2025-12-24 03:45:18,798 INFO    MainThread:611253 [wandb_init.py:init():841] calling init triggers
+2025-12-24 03:45:18,798 INFO    MainThread:611253 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
+config: {'num_layers': 28, 'encoder_num_layers': 28, 'decoder_num_layers': None, 'hidden_size': 1024, 'ffn_hidden_size': 3072, 'num_attention_heads': 16, 'attention_backend': <AttnBackend.flash: 1>, 'kv_channels': 128, 'group_query_attention': True, 'num_query_groups': 8, 'max_position_embeddings': 32768, 'position_embedding_type': 'rope', 'relative_attention_num_buckets': 32, 'relative_attention_max_distance': 128, 'use_rotary_position_embeddings': False, 'rotary_base': 1000000, 'rotary_percent': 1.0, 'rotary_interleaved': False, 'rotary_seq_len_interpolation_factor': None, 'use_rope_scaling': False, 'rope_scaling_factor': 8.0, 'no_rope_freq': None, 'add_position_embedding': True, 'mrope_section': None, 'make_vocab_size_divisible_by': 128, 'normalization': 'RMSNorm', 'norm_epsilon': 1e-06, 'apply_layernorm_1p': False, 'apply_residual_connection_post_layernorm': False, 'openai_gelu': False, 'squared_relu': False, 'swiglu': True, 'onnx_safe': None, 'bert_binary_head': True, 'untie_embeddings_and_output_weights': False, 'multi_latent_attention': False, 'mtp_num_layers': None, 'mtp_loss_scaling_factor': 0.1, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'weight_decay': 0.1, 'start_weight_decay': 0.1, 'end_weight_decay': 0.1, 'weight_decay_incr_style': 'constant', 'clip_grad': 1.0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-08, 'sgd_momentum': 0.9, 'micro_batch_size': 4, 'global_batch_size': 256, 'rampup_batch_size': None, 'decrease_batch_size_if_needed': False, 'recompute_granularity': 'full', 'check_for_nan_in_loss_and_grad': True, 'check_for_spiky_loss': False, 'check_for_large_grads': False, 'distribute_saved_activations': False, 'recompute_method': 'uniform', 'recompute_num_layers': 1, 'recompute_modules': ['core_attn'], 'clone_scatter_output_in_embedding': True, 'profile': False, 'profile_step_start': 10, 'profile_step_end': 12, 'iterations_to_skip': [], 'result_rejected_tracker_filename': None, 'enable_gloo_process_groups': True, 
'use_pytorch_profiler': False, 'profile_ranks': [0], 'record_memory_history': False, 'memory_snapshot_path': 'snapshot.pickle', 'tp_comm_overlap': False, 'tp_comm_overlap_cfg': None, 'tp_comm_overlap_ag': True, 'tp_comm_overlap_rs': True, 'tp_comm_overlap_rs_dgrad': False, 'tp_comm_bulk_dgrad': True, 'tp_comm_bulk_wgrad': True, 'tp_comm_bootstrap_backend': 'nccl', 'use_cpu_initialization': None, 'empty_unused_memory_level': 0, 'deterministic_mode': False, 'check_weight_hash_across_dp_replicas_interval': None, 'calculate_per_token_loss': True, 'train_sync_interval': None, 'train_iters': 38100, 'train_samples': None, 'log_interval': 1, 'exit_interval': None, 'exit_duration_in_mins': None, 'exit_signal_handler': False, 'tensorboard_dir': '/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/runs', 'masked_softmax_fusion': True, 'bias_gelu_fusion': False, 'bias_swiglu_fusion': True, 'use_fused_weighted_squared_relu': False, 'bias_dropout_fusion': True, 'apply_rope_fusion': True, 'rope_type': None, 'cross_entropy_loss_fusion': True, 'cross_entropy_fusion_impl': 'native', 'use_flash_attn': False, 'add_bias_linear': False, 'add_qkv_bias': False, 'optimizer': 'adam', 'optimizer_cpu_offload': False, 'optimizer_offload_fraction': 1.0, 'use_torch_optimizer_for_cpu_offload': False, 'overlap_cpu_optimizer_d2h_h2d': False, 'pin_cpu_grads': True, 'pin_cpu_params': True, 'dataloader_type': 'cyclic', 'async_tensor_model_parallel_allreduce': True, 'no_persist_layer_norm': False, 'sequence_parallel': False, 'gradient_accumulation_fusion': True, 'deprecated_use_mcore_models': False, 'use_legacy_models': False, 'manual_gc': False, 'manual_gc_interval': 0, 'manual_gc_eval': True, 'tp_comm_split_ag': True, 'tp_comm_split_rs': True, 'pipeline_model_parallel_comm_backend': None, 'high_priority_stream_groups': [], 'seed': 42, 'data_parallel_random_init': False, 'init_method_std': 0.02, 'embedding_init_method_std': None, 'init_method_xavier_uniform': False, 'lr': 
0.0001, 'lr_decay_style': 'cosine', 'lr_wsd_decay_style': 'exponential', 'lr_decay_iters': None, 'lr_decay_samples': None, 'lr_wsd_decay_samples': None, 'lr_wsd_decay_iters': None, 'lr_warmup_fraction': 0.05, 'lr_warmup_iters': 0, 'lr_warmup_samples': 0, 'lr_warmup_init': 0.0, 'min_lr': 3e-06, 'override_opt_param_scheduler': False, 'use_checkpoint_opt_param_scheduler': False, 'decoupled_lr': None, 'decoupled_min_lr': None, 'save': '/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709', 'save_interval': 100, 'save_retain_interval': None, 'no_save_optim': None, 'no_save_rng': None, 'load': '/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore', 'no_load_optim': None, 'load_main_params_from_ckpt': None, 'no_load_rng': None, 'strict_fsdp_dtensor_load': True, 'non_persistent_save_interval': None, 'non_persistent_ckpt_type': None, 'non_persistent_global_ckpt_dir': None, 'non_persistent_local_ckpt_dir': None, 'non_persistent_local_ckpt_algo': 'fully_parallel', 'finetune': True, 'pretrained_checkpoint': None, 'ckpt_step': None, 'perform_initialization': False, 'use_checkpoint_args': False, 'use_mp_args_from_checkpoint_args': False, 'use_tokenizer_model_from_checkpoint_args': True, 'exit_on_missing_checkpoint': True, 'use_dist_ckpt_deprecated': False, 'use_persistent_ckpt_worker': False, 'auto_detect_ckpt_format': True, 'dist_ckpt_format_deprecated': None, 'ckpt_format': 'torch_dist', 'ckpt_convert_format': None, 'ckpt_convert_save': None, 'ckpt_convert_update_legacy_dist_opt_format': False, 'ckpt_fully_parallel_save_deprecated': False, 'ckpt_fully_parallel_save': True, 'async_save': None, 'ckpt_fully_parallel_load': False, 'ckpt_assume_constant_structure': False, 'dist_ckpt_strictness': 'assume_ok_unexpected', 'load_model_opt_format': False, 'fp16': False, 'bf16': True, 'grad_reduce_in_bf16': False, 'loss_scale': None, 'initial_loss_scale': 4294967296, 'min_loss_scale': 1.0, 'loss_scale_window': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 
'apply_query_key_layer_scaling': False, 'attention_softmax_in_fp32': True, 'accumulate_allreduce_grads_in_fp32': True, 'fp16_lm_cross_entropy': False, 'disable_bf16_reduced_precision_matmul': False, 'reuse_grad_buf_for_mxfp8_param_ag': False, 'tensor_model_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'decoder_first_pipeline_num_layers': None, 'decoder_last_pipeline_num_layers': None, 'pipeline_model_parallel_layout': None, 'num_layers_per_virtual_pipeline_stage': None, 'num_virtual_stages_per_pipeline_rank': None, 'microbatch_group_size_per_vp_stage': None, 'overlap_p2p_comm': False, 'overlap_p2p_comm_warmup_flush': False, 'distributed_backend': 'nccl', 'distributed_timeout_minutes': 300000, 'overlap_grad_reduce': True, 'defer_embedding_wgrad_compute': False, 'wgrad_deferral_limit': 0, 'align_grad_reduce': True, 'ddp_num_buckets': None, 'ddp_bucket_size': None, 'ddp_pad_buckets_for_high_nccl_busbw': False, 'ddp_average_in_collective': False, 'overlap_param_gather': True, 'overlap_param_gather_with_optimizer_step': False, 'align_param_gather': False, 'scatter_gather_tensors_in_pipeline': True, 'use_ring_exchange_p2p': False, 'local_rank': 7, 'lazy_mpu_init': None, 'account_for_embedding_in_pipeline_split': False, 'account_for_loss_in_pipeline_split': False, 'use_distributed_optimizer': True, 'nccl_ub': False, 'use_sharp': False, 'sharp_enabled_group': None, 'use_megatron_fsdp': False, 'init_model_with_meta_device': False, 'data_parallel_sharding_strategy': 'no_shard', 'gradient_reduce_div_fusion': True, 'fsdp_double_buffer': False, 'suggested_communication_unit_size': None, 'keep_fp8_transpose_cache': False, 'enable_full_sharding_in_hsdp': False, 'num_distributed_optimizer_instances': 1, 'use_torch_fsdp2': False, 'torch_fsdp2_reshard_after_forward': True, 'context_parallel_size': 1, 'cp_comm_type': ['p2p'], 'hierarchical_context_parallel_sizes': None, 'nccl_communicator_config_path': None, 'use_tp_pp_dp_mapping': False, 'replication': False, 
'replication_jump': None, 'replication_factor': 2, 'full_validation': False, 'multiple_validation_sets': False, 'eval_iters': -1, 'eval_interval': 100, 'test_mode': False, 'skip_train': False, 'data_path': None, 'split': None, 'train_data_path': None, 'valid_data_path': None, 'test_data_path': None, 'data_args_path': None, 'per_split_data_args_path': None, 'data_cache_path': None, 'mmap_bin_files': True, 'mock_data': False, 'seq_length': 4096, 'encoder_seq_length': 4096, 'decoder_seq_length': None, 'retriever_seq_length': 256, 'sample_rate': 1.0, 'mask_prob': 0.15, 'short_seq_prob': 0.1, 'num_workers': 32, 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'create_attention_mask_in_dataloader': True, 'num_dataset_builder_threads': 1, 'object_storage_cache_path': None, 'mid_level_dataset_surplus': 0.005, 'vocab_size': None, 'padded_vocab_size': 151936, 'vocab_file': None, 'merge_file': None, 'vocab_extra_ids': 0, 'tokenizer_type': None, 'tokenizer_model': None, 'tiktoken_pattern': None, 'tiktoken_num_special_tokens': 1000, 'tiktoken_special_tokens': None, 'adlr_autoresume': False, 'adlr_autoresume_interval': 1000, 'ict_head_size': None, 'biencoder_projection_dim': 0, 'biencoder_shared_query_context_model': False, 'ict_load': None, 'bert_load': None, 'titles_data_path': None, 'query_in_block_prob': 0.1, 'use_one_sent_docs': False, 'evidence_data_path': None, 'retriever_report_topk_accuracies': [], 'retriever_score_scaling': False, 'block_data_path': None, 'embedding_path': None, 'indexer_batch_size': 128, 'indexer_log_interval': 1000, 'num_classes': 1000, 'img_h': 224, 'img_w': 224, 'num_channels': 3, 'patch_dim': 16, 'classes_fraction': 1.0, 'data_per_class_fraction': 1.0, 'data_sharding': True, 'head_lr_mult': 1.0, 'vision_pretraining': False, 'vision_pretraining_type': 'classify', 'vision_backbone_type': 'vit', 'swin_backbone_type': 'tiny', 'mask_type': 'random', 'mask_factor': 1.0, 'iter_per_epoch': 1250, 'dino_local_img_size': 
96, 'dino_local_crops_number': 10, 'dino_head_hidden_size': 2048, 'dino_bottleneck_size': 256, 'dino_freeze_last_layer': 1, 'dino_norm_last_layer': False, 'dino_warmup_teacher_temp': 0.04, 'dino_teacher_temp': 0.07, 'dino_warmup_teacher_temp_epochs': 30, 'qk_layernorm': True, 'qk_l2_norm': False, 'expert_model_parallel_size': 1, 'expert_tensor_parallel_size': 1, 'num_experts': None, 'moe_layer_freq': 1, 'moe_ffn_hidden_size': None, 'moe_shared_expert_intermediate_size': None, 'moe_shared_expert_overlap': False, 'moe_grouped_gemm': True, 'moe_use_legacy_grouped_gemm': False, 'moe_layer_recompute': False, 'moe_extended_tp': False, 'moe_use_upcycling': False, 'moe_router_load_balancing_type': 'aux_loss', 'moe_router_dtype': 'fp32', 'moe_router_fusion': False, 'moe_router_score_function': 'softmax', 'moe_router_topk': 2, 'moe_router_pre_softmax': False, 'moe_router_num_groups': None, 'moe_router_group_topk': None, 'moe_router_topk_scaling_factor': None, 'moe_router_enable_expert_bias': False, 'moe_router_bias_update_rate': 0.001, 'moe_router_force_load_balancing': False, 'moe_router_padding_for_fp8': False, 'moe_aux_loss_coeff': 0.0, 'moe_z_loss_coeff': None, 'moe_input_jitter_eps': None, 'moe_per_layer_logging': False, 'moe_token_dispatcher_type': 'alltoall', 'moe_enable_deepep': False, 'moe_deepep_num_sms': 20, 'moe_permute_fusion': False, 'moe_expert_capacity_factor': None, 'moe_pad_expert_input_to_capacity': False, 'moe_token_drop_policy': 'probs', 'moe_apply_probs_on_input': False, 'overlap_moe_expert_parallel_comm': False, 'delay_wgrad_compute': False, 'moe_upcycling_granularity': 1, 'q_lora_rank': None, 'kv_lora_rank': 32, 'qk_head_dim': 128, 'qk_pos_emb_head_dim': 64, 'v_head_dim': 128, 'rotary_scaling_factor': 1.0, 'mscale': 1.0, 'mscale_all_dim': 0.0, 'cache_mla_latents': False, 'heterogeneous_layers_config_path': None, 'heterogeneous_layers_config_encoded_json': None, 'log_params_norm': False, 'log_num_zeros_in_grad': False, 'log_throughput': False, 
'log_progress': False, 'timing_log_level': 0, 'log_energy': False, 'barrier_with_L1_time': True, 'timing_log_option': 'minmax', 'tensorboard_log_interval': 1, 'tensorboard_queue_size': 50, 'log_timers_to_tensorboard': True, 'log_loss_scale_to_tensorboard': True, 'log_validation_ppl_to_tensorboard': True, 'log_memory_to_tensorboard': True, 'log_world_size_to_tensorboard': False, 'wandb_project': 'plt', 'wandb_exp_name': 'baseline', 'wandb_save_dir': '', 'logging_level': 20, 'log_straggler': False, 'disable_straggler_on_startup': False, 'straggler_ctrlr_port': 65535, 'straggler_minmax_count': 1, 'run_workload_inspector_server': False, 'inference_batch_times_seqlen_threshold': -1, 'max_tokens_to_oom': 12000, 'output_bert_embeddings': False, 'bert_embedder_type': 'megatron', 'flash_decode': False, 'enable_cuda_graph': False, 'cuda_graph_warmup_steps': 3, 'external_cuda_graph': False, 'cuda_graph_scope': 'full', 'inference_max_batch_size': 8, 'inference_max_seq_length': 2560, 'inference_dynamic_batching': False, 'inference_dynamic_batching_buffer_size_gb': 40.0, 'inference_dynamic_batching_chunk_size': 256, 'inference_dynamic_batching_buffer_guaranteed_fraction': 0.2, 'inference_dynamic_batching_buffer_overflow_factor': None, 'inference_dynamic_batching_max_requests_override': None, 'inference_dynamic_batching_max_tokens_override': None, 'inference_dynamic_batching_num_cuda_graphs': 16, 'symmetric_ar_type': None, 'nccl_all_reduce_for_prefill': False, 'mlp_chunks_for_prefill': 1, 'fp8': None, 'fp8_recipe': 'delayed', 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'fp8_wgrad': True, 'transformer_impl': 'transformer_engine', 'fp8_param_gather': False, 'first_last_layers_bf16': False, 'num_layers_at_start_in_bf16': 1, 'num_layers_at_end_in_bf16': 1, 'te_rng_tracker': False, 'inference_rng_tracker': False, 'retro_project_dir': None, 'retro_add_retriever': False, 'retro_cyclic_train_iters': None, 'retro_encoder_layers': 2, 
'retro_encoder_hidden_dropout': 0.1, 'retro_encoder_attention_dropout': 0.1, 'retro_num_neighbors': 2, 'retro_num_retrieved_chunks': 2, 'retro_attention_gate': 1, 'retro_verify_neighbor_count': True, 'enable_experimental': False, 'spec': None, 'hybrid_attention_ratio': 0.0, 'hybrid_mlp_ratio': 0.0, 'hybrid_override_pattern': None, 'mamba_state_dim': 128, 'mamba_head_dim': 64, 'mamba_num_groups': 8, 'mamba_num_heads': None, 'is_hybrid_model': False, 'disable_mamba_mem_eff_path': False, 'yaml_cfg': None, 'use_precision_aware_optimizer': True, 'main_grads_dtype': torch.float32, 'main_params_dtype': torch.float32, 'exp_avg_dtype': torch.float32, 'exp_avg_sq_dtype': torch.float32, 'enable_one_logger': True, 'one_logger_project': 'megatron-lm', 'one_logger_run_name': None, 'one_logger_async': False, 'app_tag_run_name': None, 'app_tag_run_version': '0.0.0', 'inprocess_restart': False, 'inprocess_max_iterations': None, 'inprocess_monitor_thread_interval': 1.0, 'inprocess_monitor_process_interval': 1.0, 'inprocess_progress_watchdog_interval': 1.0, 'inprocess_heartbeat_interval': 30, 'inprocess_soft_timeout': 60, 'inprocess_hard_timeout': 90, 'inprocess_heartbeat_timeout': 60, 'inprocess_barrier_timeout': 120, 'inprocess_completion_timeout': 120, 'inprocess_last_call_wait': 1, 'inprocess_termination_grace_time': 1, 'inprocess_granularity': 'node', 'inprocess_active_world_size': 8, 'inprocess_empty_cuda_cache': False, 'enable_ft_package': False, 'calc_ft_timeouts': False, 'config_logger_dir': '', 'error_injection_rate': 0, 'error_injection_type': 'transient_error', 'rerun_mode': 'validate_results', 'enable_msc': True, 'kitchen_config_file': None, 'kitchen_recipe_number': None, 'sft': False, 'sft_tokenizer_prompt_format': 'nemotron-h-aligned', 'rank': 7, 'world_size': 8, 'use_dist_ckpt': True, 'transformer_pipeline_model_parallel_size': 1, 'data_parallel_size': 8, 'model_dir': 
'/workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd', 'is_multimodal': False, 'hf_model_type': 'qwen3', 'use_ray': False, 'ray_exp_name': None, 'device_groups': None, 'model': 'Qwen/Qwen3-0.6B-Base', 'model_type': 'qwen3', 'model_revision': None, 'task_type': 'causal_lm', 'torch_dtype': torch.bfloat16, 'attn_impl': None, 'new_special_tokens': [], 'num_labels': None, 'problem_type': None, 'rope_scaling': None, 'device_map': None, 'max_memory': {}, 'max_model_len': None, 'local_repo_path': None, 'init_strategy': None, 'template': 'qwen3', 'system': None, 'max_length': 4096, 'truncation_strategy': 'right', 'max_pixels': None, 'agent_template': None, 'norm_bbox': None, 'use_chat_template': False, 'padding_free': True, 'padding_side': 'right', 'sequence_parallel_size': 1, 'response_prefix': None, 'template_backend': 'swift', 'dataset': [], 'val_dataset': [], 'cached_dataset': ['/workspace/full'], 'cached_val_dataset': [], 'split_dataset_ratio': 0.0, 'data_seed': 42, 'dataset_num_proc': 32, 'load_from_cache_file': False, 'dataset_shuffle': True, 'val_dataset_shuffle': False, 'streaming': False, 'interleave_prob': None, 'stopping_strategy': 'first_exhausted', 'shuffle_buffer_size': 1000, 'download_mode': 'reuse_dataset_if_exists', 'columns': {}, 'strict': False, 'remove_unused_columns': True, 'model_name': None, 'model_author': None, 'custom_dataset_info': [], 'quant_method': None, 'quant_bits': None, 'hqq_axis': None, 'bnb_4bit_compute_dtype': torch.bfloat16, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_quant_storage': None, 'max_new_tokens': None, 'temperature': None, 'top_k': 50, 'top_p': 0.9, 'repetition_penalty': 1.0, 'num_beams': 1, 'stream': False, 'stop_words': [], 'logprobs': False, 'top_logprobs': None, 'ckpt_dir': '/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore', 'lora_modules': [], 'tuner_backend': 'peft', 'train_type': 'full', 'adapters': [], 'external_plugins': [], 
'model_kwargs': {}, 'load_args': False, 'load_data_args': False, 'packing': True, 'packing_length': 4096, 'packing_num_proc': 1, 'lazy_tokenize': False, 'custom_register_path': [], 'use_hf': True, 'hub_token': None, 'ddp_timeout': 18000000, 'ddp_backend': None, 'ignore_args_error': False, 'use_swift_lora': False, 'freeze_llm': False, 'freeze_vit': True, 'freeze_aligner': True, 'freeze_parameters': [], 'freeze_parameters_regex': None, 'freeze_parameters_ratio': 0.0, 'trainable_parameters': [], 'trainable_parameters_regex': None, 'adapter_load': None, 'target_modules': ['all-linear'], 'target_regex': None, 'modules_to_save': [], 'lora_rank': 8, 'lora_alpha': 32, 'lora_dropout': 0.05, 'lora_bias': 'none', 'lora_dtype': None, 'use_rslora': False, 'rlhf_type': None, 'ref_load': None, 'ref_adapter_load': None, 'beta': 0.1, 'rpo_alpha': None, 'reference_free': False, 'label_smoothing': 0.0, 'f_divergence_type': 'reverse_kl', 'loss_type': None, 'desirable_weight': 1.0, 'undesirable_weight': 1.0, 'calculate_KL': None, 'center_rewards_coefficient': None, 'generation_batch_size': None, 'steps_per_generation': None, 'num_generations': 8, 'max_completion_length': 512, 'importance_sampling_level': 'token', 'tau_pos': 1.0, 'tau_neg': 1.05, 'epsilon': 0.2, 'epsilon_high': None, 'delta': None, 'use_vllm': True, 'vllm_mode': None, 'vllm_enable_prefix_caching': True, 'vllm_gpu_memory_utilization': 0.9, 'vllm_tensor_parallel_size': 1, 'vllm_max_model_len': None, 'vllm_enforce_eager': False, 'vllm_limit_mm_per_prompt': None, 'vllm_disable_cascade_attn': False, 'vllm_max_num_seqs': None, 'vllm_mm_processor_cache_gb': None, 'vllm_engine_kwargs': None, 'sleep_level': 0, 'offload_optimizer': False, 'offload_model': False, 'offload_bridge': False, 'vllm_server_base_url': None, 'vllm_server_host': None, 'vllm_server_port': [8000], 'vllm_server_timeout': 240.0, 'vllm_server_group_port': None, 'reward_funcs': [], 'reward_weights': None, 'cosine_min_len_value_wrong': -0.5, 
'cosine_max_len_value_wrong': 0.0, 'cosine_min_len_value_correct': 1.0, 'cosine_max_len_value_correct': 0.5, 'cosine_max_len': None, 'repetition_n_grams': 3, 'repetition_max_penalty': -1.0, 'soft_max_length': None, 'soft_cache_length': None, 'dynamic_sample': False, 'max_resample_times': 3, 'overlong_filter': False, 'scale_rewards': 'group', 'advantage_estimator': 'grpo', 'kl_in_reward': False, 'wandb_log_unique_prompts': None, 'log_completions': False, 'rollout_importance_sampling_mode': None, 'rollout_importance_sampling_threshold': 2.0, 'log_rollout_offpolicy_metrics': False, 'off_policy_sequence_mask_delta': None, 'reward_model': None, 'reward_model_plugin': None, 'sync_ref_model': False, 'ref_model_sync_steps': 512, 'ref_model_mixup_alpha': 0.6, 'async_generate': False, 'move_model_batches': None, 'multi_turn_scheduler': None, 'max_turns': None, 'completion_length_limit_scope': 'per_round', 'vllm_server_pass_dataset': False, 'log_entropy': False, 'top_entropy_quantile': 1.0, 'num_iterations': 1, 'check_model': True, 'initialize_embedding': False, 'mlp_padding_free': False, 'load_safetensors': False, 'save_safetensors': False, 'ref_model': None, 'ref_adapters': [], 'merge_lora': False, 'max_shard_size': '5GB', 'train_dataloader_shuffle': True, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': True, 'dataloader_prefetch_factor': 10, 'architectures': 'Qwen3ForCausalLM', 'llm_architectures': 'Qwen3ForCausalLM', 'max_epochs': None, 'enable_dft_loss': False, 'enable_channel_loss': False, 'patch_size': 1, 'save_strategy': 'steps', 'original_max_position_embeddings': None, 'partial_rotary_factor': None, 'use_shared_expert_gate': False, 'vit_gradient_checkpointing': True, 'vit_lr': None, 'aligner_lr': None, 'gradient_checkpointing_kwargs': None, 'linear_num_value_heads': None, 'linear_num_key_heads': None, 'linear_key_head_dim': None, 'linear_value_head_dim': None, 'linear_conv_kernel_dim': None, 'layer_types': None, 'mrope_interleaved': False, 
'add_version': True, 'virtual_pipeline_model_parallel_size': None, 'params_dtype': torch.bfloat16, 'consumed_train_samples': 0, 'skipped_train_samples': 0, 'consumed_valid_samples': 0, 'variable_seq_lengths': False, '_wandb': {}}
+2025-12-24 03:45:18,798 INFO    MainThread:611253 [wandb_init.py:init():889] starting backend
+2025-12-24 03:45:19,067 INFO    MainThread:611253 [wandb_init.py:init():892] sending inform_init request
+2025-12-24 03:45:19,070 INFO    MainThread:611253 [wandb_init.py:init():900] backend started and connected
+2025-12-24 03:45:19,074 INFO    MainThread:611253 [wandb_init.py:init():970] updated telemetry
+2025-12-24 03:45:19,081 INFO    MainThread:611253 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
+2025-12-24 03:45:19,580 INFO    MainThread:611253 [wandb_init.py:init():1041] starting run threads in backend
+2025-12-24 03:45:19,738 INFO    MainThread:611253 [wandb_run.py:_console_start():2521] atexit reg
+2025-12-24 03:45:19,738 INFO    MainThread:611253 [wandb_run.py:_redirect():2369] redirect: wrap_raw
+2025-12-24 03:45:19,738 INFO    MainThread:611253 [wandb_run.py:_redirect():2438] Wrapping output streams.
+2025-12-24 03:45:19,738 INFO    MainThread:611253 [wandb_run.py:_redirect():2461] Redirects installed.
+2025-12-24 03:45:19,742 INFO    MainThread:611253 [wandb_init.py:init():1081] run started, returning control to user process
+2025-12-26 07:36:28,290 INFO    MainThread:611253 [wandb_run.py:_finish():2287] finishing run tepic/plt/gd3q7mjv
+2025-12-26 07:36:28,292 INFO    MainThread:611253 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0
+2025-12-26 07:36:28,293 INFO    MainThread:611253 [wandb_run.py:_restore():2468] restore
+2025-12-26 07:36:28,293 INFO    MainThread:611253 [wandb_run.py:_restore():2474] restore done
+2025-12-26 07:36:29,517 INFO    MainThread:611253 [wandb_run.py:_footer_sync_info():3862] logging synced files

wandb/wandb/run-20251224_034518-gd3q7mjv/files/config.yaml ADDED Viewed

	@@ -0,0 +1,1779 @@

+_wandb:
+    value:
+        cli_version: 0.23.1
+        e:
+            5bh5hk313ky3l0v9f9cesb7o1x31upc6:
+                args:
+                    - --seed
+                    - "42"
+                    - --micro-batch-size
+                    - "4"
+                    - --global-batch-size
+                    - "256"
+                    - --recompute-granularity
+                    - full
+                    - --recompute-method
+                    - uniform
+                    - --recompute-num-layers
+                    - "1"
+                    - --recompute-modules
+                    - core_attn
+                    - --train-iters
+                    - "38100"
+                    - --log-interval
+                    - "1"
+                    - --tensorboard-dir
+                    - /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/runs
+                    - --cross-entropy-loss-fusion
+                    - --cross-entropy-fusion-impl
+                    - native
+                    - --calculate-per-token-loss
+                    - --attention-backend
+                    - flash
+                    - --optimizer
+                    - adam
+                    - --optimizer-offload-fraction
+                    - "1.0"
+                    - --use-precision-aware-optimizer
+                    - --main-grads-dtype
+                    - fp32
+                    - --main-params-dtype
+                    - fp32
+                    - --exp-avg-dtype
+                    - fp32
+                    - --exp-avg-sq-dtype
+                    - fp32
+                    - --dataloader-type
+                    - cyclic
+                    - --manual-gc-interval
+                    - "0"
+                    - --lr
+                    - "0.0001"
+                    - --lr-decay-style
+                    - cosine
+                    - --lr-warmup-iters
+                    - "0"
+                    - --lr-warmup-fraction
+                    - "0.05"
+                    - --min-lr
+                    - "3e-06"
+                    - --weight-decay
+                    - "0.1"
+                    - --clip-grad
+                    - "1.0"
+                    - --adam-beta1
+                    - "0.9"
+                    - --adam-beta2
+                    - "0.95"
+                    - --adam-eps
+                    - "1e-08"
+                    - --sgd-momentum
+                    - "0.9"
+                    - --save
+                    - /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709
+                    - --save-interval
+                    - "100"
+                    - --load
+                    - /workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore
+                    - --finetune
+                    - --ckpt-format
+                    - torch_dist
+                    - --no-initialization
+                    - --auto-detect-ckpt-format
+                    - --exit-on-missing-checkpoint
+                    - --distributed-backend
+                    - nccl
+                    - --local-rank
+                    - "7"
+                    - --use-distributed-optimizer
+                    - --tensor-model-parallel-size
+                    - "1"
+                    - --pipeline-model-parallel-size
+                    - "1"
+                    - --context-parallel-size
+                    - "1"
+                    - --overlap-grad-reduce
+                    - --overlap-param-gather
+                    - --distributed-timeout-minutes
+                    - "300000"
+                    - --num-layers
+                    - "28"
+                    - --hidden-size
+                    - "1024"
+                    - --ffn-hidden-size
+                    - "3072"
+                    - --num-attention-heads
+                    - "16"
+                    - --group-query-attention
+                    - --num-query-groups
+                    - "8"
+                    - --max-position-embeddings
+                    - "32768"
+                    - --position-embedding-type
+                    - rope
+                    - --rotary-base
+                    - "1000000"
+                    - --rotary-percent
+                    - "1.0"
+                    - --normalization
+                    - RMSNorm
+                    - --norm-epsilon
+                    - "1e-06"
+                    - --swiglu
+                    - --disable-bias-linear
+                    - --attention-dropout
+                    - "0.0"
+                    - --hidden-dropout
+                    - "0.0"
+                    - --kv-channels
+                    - "128"
+                    - --qk-layernorm
+                    - --transformer-impl
+                    - transformer_engine
+                    - --moe-layer-freq
+                    - "1"
+                    - --moe-router-topk
+                    - "2"
+                    - --moe-router-dtype
+                    - fp32
+                    - --moe-router-score-function
+                    - softmax
+                    - --moe-router-load-balancing-type
+                    - aux_loss
+                    - --expert-model-parallel-size
+                    - "1"
+                    - --expert-tensor-parallel-size
+                    - "1"
+                    - --moe-token-dispatcher-type
+                    - alltoall
+                    - --moe-grouped-gemm
+                    - --moe-aux-loss-coeff
+                    - "0.0"
+                    - --moe-token-drop-policy
+                    - probs
+                    - --kv-lora-rank
+                    - "32"
+                    - --qk-head-dim
+                    - "128"
+                    - --qk-pos-emb-head-dim
+                    - "64"
+                    - --mtp-loss-scaling-factor
+                    - "0.1"
+                    - --fp8-recipe
+                    - delayed
+                    - --fp8-amax-history-len
+                    - "1024"
+                    - --fp8-amax-compute-algo
+                    - max
+                    - --bf16
+                    - --attention-softmax-in-fp32
+                    - --tensorboard-log-interval
+                    - "1"
+                    - --tensorboard-queue-size
+                    - "50"
+                    - --log-timers-to-tensorboard
+                    - --log-validation-ppl-to-tensorboard
+                    - --log-memory-to-tensorboard
+                    - --logging-level
+                    - "20"
+                    - --wandb-project
+                    - plt
+                    - --wandb-exp-name
+                    - baseline
+                    - --eval-iters
+                    - "-1"
+                    - --eval-interval
+                    - "100"
+                    - --seq-length
+                    - "4096"
+                    - --num-workers
+                    - "32"
+                codePath: swift/cli/_megatron/pt.py
+                codePathLocal: swift/cli/_megatron/pt.py
+                cpu_count: 72
+                cpu_count_logical: 144
+                cudaVersion: "13.0"
+                disk:
+                    /:
+                        total: "7669363507200"
+                        used: "983051857920"
+                email: kazuma826826@gmail.com
+                executable: /venv/main/bin/python3.12
+                git:
+                    commit: ea7cc214b68fb511dd83bff83a504b7f43053577
+                    remote: https://github.com/weak-kajuma/halcyon-recipe2.git
+                gpu: NVIDIA GeForce RTX 5090
+                gpu_count: 8
+                gpu_nvidia:
+                    - architecture: Blackwell
+                      cudaCores: 21760
+                      memoryTotal: "34190917632"
+                      name: NVIDIA GeForce RTX 5090
+                      uuid: GPU-5d40e56e-9cf1-0a97-080a-30624a8f6da3
+                    - architecture: Blackwell
+                      cudaCores: 21760
+                      memoryTotal: "34190917632"
+                      name: NVIDIA GeForce RTX 5090
+                      uuid: GPU-23ca8669-46fc-19eb-348b-e51e591c150d
+                    - architecture: Blackwell
+                      cudaCores: 21760
+                      memoryTotal: "34190917632"
+                      name: NVIDIA GeForce RTX 5090
+                      uuid: GPU-c4c1ca99-b237-b12b-43fd-7c0b428ed152
+                    - architecture: Blackwell
+                      cudaCores: 21760
+                      memoryTotal: "34190917632"
+                      name: NVIDIA GeForce RTX 5090
+                      uuid: GPU-d48e64fd-956c-1ce4-4e95-b9d198ba26e9
+                    - architecture: Blackwell
+                      cudaCores: 21760
+                      memoryTotal: "34190917632"
+                      name: NVIDIA GeForce RTX 5090
+                      uuid: GPU-29d31f97-dff9-6078-7bf6-d8fc65ada1b7
+                    - architecture: Blackwell
+                      cudaCores: 21760
+                      memoryTotal: "34190917632"
+                      name: NVIDIA GeForce RTX 5090
+                      uuid: GPU-ed004a01-be7c-9fc0-6742-ac7f7a0bea49
+                    - architecture: Blackwell
+                      cudaCores: 21760
+                      memoryTotal: "34190917632"
+                      name: NVIDIA GeForce RTX 5090
+                      uuid: GPU-56cdc53f-360e-a64f-2cd5-2ba3daaf5a7b
+                    - architecture: Blackwell
+                      cudaCores: 21760
+                      memoryTotal: "34190917632"
+                      name: NVIDIA GeForce RTX 5090
+                      uuid: GPU-aa4a1a25-49c1-62ec-3a38-070d6c7912ef
+                host: 36fd00e7b21c
+                memory:
+                    total: "540643262464"
+                os: Linux-6.8.0-58-generic-x86_64-with-glibc2.39
+                program: /workspace/halcyon-recipe2/swift/cli/_megatron/pt.py
+                python: CPython 3.12.12
+                root: /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/wandb
+                startedAt: "2025-12-24T03:45:18.795219Z"
+                writerId: 5bh5hk313ky3l0v9f9cesb7o1x31upc6
+        m: []
+        python_version: 3.12.12
+        t:
+            "1":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "2":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "3":
+                - 2
+                - 13
+                - 16
+                - 61
+            "4": 3.12.12
+            "5": 0.23.1
+            "6": 4.57.3
+            "12": 0.23.1
+            "13": linux-x86_64
+account_for_embedding_in_pipeline_split:
+    value: false
+account_for_loss_in_pipeline_split:
+    value: false
+accumulate_allreduce_grads_in_fp32:
+    value: true
+adam_beta1:
+    value: 0.9
+adam_beta2:
+    value: 0.95
+adam_eps:
+    value: 1e-08
+adapter_load:
+    value: null
+adapters:
+    value: []
+add_bias_linear:
+    value: false
+add_position_embedding:
+    value: true
+add_qkv_bias:
+    value: false
+add_version:
+    value: true
+adlr_autoresume:
+    value: false
+adlr_autoresume_interval:
+    value: 1000
+advantage_estimator:
+    value: grpo
+agent_template:
+    value: null
+align_grad_reduce:
+    value: true
+align_param_gather:
+    value: false
+aligner_lr:
+    value: null
+app_tag_run_name:
+    value: null
+app_tag_run_version:
+    value: 0.0.0
+apply_layernorm_1p:
+    value: false
+apply_query_key_layer_scaling:
+    value: false
+apply_residual_connection_post_layernorm:
+    value: false
+apply_rope_fusion:
+    value: true
+architectures:
+    value: Qwen3ForCausalLM
+async_generate:
+    value: false
+async_save:
+    value: null
+async_tensor_model_parallel_allreduce:
+    value: true
+attention_backend:
+    value: flash
+attention_dropout:
+    value: 0
+attention_softmax_in_fp32:
+    value: true
+attn_impl:
+    value: null
+auto_detect_ckpt_format:
+    value: true
+barrier_with_L1_time:
+    value: true
+bert_binary_head:
+    value: true
+bert_embedder_type:
+    value: megatron
+bert_load:
+    value: null
+beta:
+    value: 0.1
+bf16:
+    value: true
+bias_dropout_fusion:
+    value: true
+bias_gelu_fusion:
+    value: false
+bias_swiglu_fusion:
+    value: true
+biencoder_projection_dim:
+    value: 0
+biencoder_shared_query_context_model:
+    value: false
+block_data_path:
+    value: null
+bnb_4bit_compute_dtype:
+    value: torch.bfloat16
+bnb_4bit_quant_storage:
+    value: null
+bnb_4bit_quant_type:
+    value: nf4
+bnb_4bit_use_double_quant:
+    value: true
+cache_mla_latents:
+    value: false
+cached_dataset:
+    value:
+        - /workspace/full
+cached_val_dataset:
+    value: []
+calc_ft_timeouts:
+    value: false
+calculate_KL:
+    value: null
+calculate_per_token_loss:
+    value: true
+center_rewards_coefficient:
+    value: null
+check_for_large_grads:
+    value: false
+check_for_nan_in_loss_and_grad:
+    value: true
+check_for_spiky_loss:
+    value: false
+check_model:
+    value: true
+check_weight_hash_across_dp_replicas_interval:
+    value: null
+ckpt_assume_constant_structure:
+    value: false
+ckpt_convert_format:
+    value: null
+ckpt_convert_save:
+    value: null
+ckpt_convert_update_legacy_dist_opt_format:
+    value: false
+ckpt_dir:
+    value: /workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore
+ckpt_format:
+    value: torch_dist
+ckpt_fully_parallel_load:
+    value: false
+ckpt_fully_parallel_save:
+    value: true
+ckpt_fully_parallel_save_deprecated:
+    value: false
+ckpt_step:
+    value: null
+classes_fraction:
+    value: 1
+clip_grad:
+    value: 1
+clone_scatter_output_in_embedding:
+    value: true
+completion_length_limit_scope:
+    value: per_round
+config_logger_dir:
+    value: ""
+consumed_train_samples:
+    value: 0
+consumed_valid_samples:
+    value: 0
+context_parallel_size:
+    value: 1
+cosine_max_len:
+    value: null
+cosine_max_len_value_correct:
+    value: 0.5
+cosine_max_len_value_wrong:
+    value: 0
+cosine_min_len_value_correct:
+    value: 1
+cosine_min_len_value_wrong:
+    value: -0.5
+cp_comm_type:
+    value:
+        - p2p
+create_attention_mask_in_dataloader:
+    value: true
+cross_entropy_fusion_impl:
+    value: native
+cross_entropy_loss_fusion:
+    value: true
+cuda_graph_scope:
+    value: full
+cuda_graph_warmup_steps:
+    value: 3
+custom_dataset_info:
+    value: []
+custom_register_path:
+    value: []
+data_args_path:
+    value: null
+data_cache_path:
+    value: null
+data_parallel_random_init:
+    value: false
+data_parallel_sharding_strategy:
+    value: no_shard
+data_parallel_size:
+    value: 8
+data_path:
+    value: null
+data_per_class_fraction:
+    value: 1
+data_seed:
+    value: 42
+data_sharding:
+    value: true
+dataloader_persistent_workers:
+    value: true
+dataloader_pin_memory:
+    value: true
+dataloader_prefetch_factor:
+    value: 10
+dataloader_type:
+    value: cyclic
+dataset:
+    value: []
+dataset_num_proc:
+    value: 32
+dataset_shuffle:
+    value: true
+ddp_average_in_collective:
+    value: false
+ddp_backend:
+    value: null
+ddp_bucket_size:
+    value: null
+ddp_num_buckets:
+    value: null
+ddp_pad_buckets_for_high_nccl_busbw:
+    value: false
+ddp_timeout:
+    value: 18000000
+decoder_first_pipeline_num_layers:
+    value: null
+decoder_last_pipeline_num_layers:
+    value: null
+decoder_num_layers:
+    value: null
+decoder_seq_length:
+    value: null
+decoupled_lr:
+    value: null
+decoupled_min_lr:
+    value: null
+decrease_batch_size_if_needed:
+    value: false
+defer_embedding_wgrad_compute:
+    value: false
+delay_wgrad_compute:
+    value: false
+delta:
+    value: null
+deprecated_use_mcore_models:
+    value: false
+desirable_weight:
+    value: 1
+deterministic_mode:
+    value: false
+device_groups:
+    value: null
+device_map:
+    value: null
+dino_bottleneck_size:
+    value: 256
+dino_freeze_last_layer:
+    value: 1
+dino_head_hidden_size:
+    value: 2048
+dino_local_crops_number:
+    value: 10
+dino_local_img_size:
+    value: 96
+dino_norm_last_layer:
+    value: false
+dino_teacher_temp:
+    value: 0.07
+dino_warmup_teacher_temp:
+    value: 0.04
+dino_warmup_teacher_temp_epochs:
+    value: 30
+disable_bf16_reduced_precision_matmul:
+    value: false
+disable_mamba_mem_eff_path:
+    value: false
+disable_straggler_on_startup:
+    value: false
+dist_ckpt_format_deprecated:
+    value: null
+dist_ckpt_strictness:
+    value: assume_ok_unexpected
+distribute_saved_activations:
+    value: false
+distributed_backend:
+    value: nccl
+distributed_timeout_minutes:
+    value: 300000
+download_mode:
+    value: reuse_dataset_if_exists
+dynamic_sample:
+    value: false
+embedding_init_method_std:
+    value: null
+embedding_path:
+    value: null
+empty_unused_memory_level:
+    value: 0
+enable_channel_loss:
+    value: false
+enable_cuda_graph:
+    value: false
+enable_dft_loss:
+    value: false
+enable_experimental:
+    value: false
+enable_ft_package:
+    value: false
+enable_full_sharding_in_hsdp:
+    value: false
+enable_gloo_process_groups:
+    value: true
+enable_msc:
+    value: true
+enable_one_logger:
+    value: true
+encoder_num_layers:
+    value: 28
+encoder_seq_length:
+    value: 4096
+end_weight_decay:
+    value: 0.1
+eod_mask_loss:
+    value: false
+epsilon:
+    value: 0.2
+epsilon_high:
+    value: null
+error_injection_rate:
+    value: 0
+error_injection_type:
+    value: transient_error
+eval_interval:
+    value: 100
+eval_iters:
+    value: -1
+evidence_data_path:
+    value: null
+exit_duration_in_mins:
+    value: null
+exit_interval:
+    value: null
+exit_on_missing_checkpoint:
+    value: true
+exit_signal_handler:
+    value: false
+exp_avg_dtype:
+    value: torch.float32
+exp_avg_sq_dtype:
+    value: torch.float32
+expert_model_parallel_size:
+    value: 1
+expert_tensor_parallel_size:
+    value: 1
+external_cuda_graph:
+    value: false
+external_plugins:
+    value: []
+f_divergence_type:
+    value: reverse_kl
+ffn_hidden_size:
+    value: 3072
+finetune:
+    value: true
+first_last_layers_bf16:
+    value: false
+flash_decode:
+    value: false
+fp8:
+    value: null
+fp8_amax_compute_algo:
+    value: max
+fp8_amax_history_len:
+    value: 1024
+fp8_interval:
+    value: 1
+fp8_margin:
+    value: 0
+fp8_param_gather:
+    value: false
+fp8_recipe:
+    value: delayed
+fp8_wgrad:
+    value: true
+fp16:
+    value: false
+fp16_lm_cross_entropy:
+    value: false
+fp32_residual_connection:
+    value: false
+freeze_aligner:
+    value: true
+freeze_llm:
+    value: false
+freeze_parameters:
+    value: []
+freeze_parameters_ratio:
+    value: 0
+freeze_parameters_regex:
+    value: null
+freeze_vit:
+    value: true
+fsdp_double_buffer:
+    value: false
+full_validation:
+    value: false
+generation_batch_size:
+    value: null
+global_batch_size:
+    value: 256
+grad_reduce_in_bf16:
+    value: false
+gradient_accumulation_fusion:
+    value: true
+gradient_checkpointing_kwargs:
+    value: null
+gradient_reduce_div_fusion:
+    value: true
+group_query_attention:
+    value: true
+head_lr_mult:
+    value: 1
+heterogeneous_layers_config_encoded_json:
+    value: null
+heterogeneous_layers_config_path:
+    value: null
+hf_model_type:
+    value: qwen3
+hidden_dropout:
+    value: 0
+hidden_size:
+    value: 1024
+hierarchical_context_parallel_sizes:
+    value: null
+high_priority_stream_groups:
+    value: []
+hqq_axis:
+    value: null
+hub_token:
+    value: null
+hybrid_attention_ratio:
+    value: 0
+hybrid_mlp_ratio:
+    value: 0
+hybrid_override_pattern:
+    value: null
+hysteresis:
+    value: 2
+ict_head_size:
+    value: null
+ict_load:
+    value: null
+ignore_args_error:
+    value: false
+img_h:
+    value: 224
+img_w:
+    value: 224
+importance_sampling_level:
+    value: token
+indexer_batch_size:
+    value: 128
+indexer_log_interval:
+    value: 1000
+inference_batch_times_seqlen_threshold:
+    value: -1
+inference_dynamic_batching:
+    value: false
+inference_dynamic_batching_buffer_guaranteed_fraction:
+    value: 0.2
+inference_dynamic_batching_buffer_overflow_factor:
+    value: null
+inference_dynamic_batching_buffer_size_gb:
+    value: 40
+inference_dynamic_batching_chunk_size:
+    value: 256
+inference_dynamic_batching_max_requests_override:
+    value: null
+inference_dynamic_batching_max_tokens_override:
+    value: null
+inference_dynamic_batching_num_cuda_graphs:
+    value: 16
+inference_max_batch_size:
+    value: 8
+inference_max_seq_length:
+    value: 2560
+inference_rng_tracker:
+    value: false
+init_method_std:
+    value: 0.02
+init_method_xavier_uniform:
+    value: false
+init_model_with_meta_device:
+    value: false
+init_strategy:
+    value: null
+initial_loss_scale:
+    value: 4294967296
+initialize_embedding:
+    value: false
+inprocess_active_world_size:
+    value: 8
+inprocess_barrier_timeout:
+    value: 120
+inprocess_completion_timeout:
+    value: 120
+inprocess_empty_cuda_cache:
+    value: false
+inprocess_granularity:
+    value: node
+inprocess_hard_timeout:
+    value: 90
+inprocess_heartbeat_interval:
+    value: 30
+inprocess_heartbeat_timeout:
+    value: 60
+inprocess_last_call_wait:
+    value: 1
+inprocess_max_iterations:
+    value: null
+inprocess_monitor_process_interval:
+    value: 1
+inprocess_monitor_thread_interval:
+    value: 1
+inprocess_progress_watchdog_interval:
+    value: 1
+inprocess_restart:
+    value: false
+inprocess_soft_timeout:
+    value: 60
+inprocess_termination_grace_time:
+    value: 1
+interleave_prob:
+    value: null
+is_hybrid_model:
+    value: false
+is_multimodal:
+    value: false
+iter_per_epoch:
+    value: 1250
+iterations_to_skip:
+    value: []
+keep_fp8_transpose_cache:
+    value: false
+kitchen_config_file:
+    value: null
+kitchen_recipe_number:
+    value: null
+kl_in_reward:
+    value: false
+kv_channels:
+    value: 128
+kv_lora_rank:
+    value: 32
+label_smoothing:
+    value: 0
+layer_types:
+    value: null
+lazy_mpu_init:
+    value: null
+lazy_tokenize:
+    value: false
+linear_conv_kernel_dim:
+    value: null
+linear_key_head_dim:
+    value: null
+linear_num_key_heads:
+    value: null
+linear_num_value_heads:
+    value: null
+linear_value_head_dim:
+    value: null
+llm_architectures:
+    value: Qwen3ForCausalLM
+load:
+    value: /workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore
+load_args:
+    value: false
+load_data_args:
+    value: false
+load_from_cache_file:
+    value: false
+load_main_params_from_ckpt:
+    value: null
+load_model_opt_format:
+    value: false
+load_safetensors:
+    value: false
+local_rank:
+    value: 7
+local_repo_path:
+    value: null
+log_completions:
+    value: false
+log_energy:
+    value: false
+log_entropy:
+    value: false
+log_interval:
+    value: 1
+log_loss_scale_to_tensorboard:
+    value: true
+log_memory_to_tensorboard:
+    value: true
+log_num_zeros_in_grad:
+    value: false
+log_params_norm:
+    value: false
+log_progress:
+    value: false
+log_rollout_offpolicy_metrics:
+    value: false
+log_straggler:
+    value: false
+log_throughput:
+    value: false
+log_timers_to_tensorboard:
+    value: true
+log_validation_ppl_to_tensorboard:
+    value: true
+log_world_size_to_tensorboard:
+    value: false
+logging_level:
+    value: 20
+logprobs:
+    value: false
+lora_alpha:
+    value: 32
+lora_bias:
+    value: none
+lora_dropout:
+    value: 0.05
+lora_dtype:
+    value: null
+lora_modules:
+    value: []
+lora_rank:
+    value: 8
+loss_scale:
+    value: null
+loss_scale_window:
+    value: 1000
+loss_type:
+    value: null
+lr:
+    value: 0.0001
+lr_decay_iters:
+    value: null
+lr_decay_samples:
+    value: null
+lr_decay_style:
+    value: cosine
+lr_warmup_fraction:
+    value: 0.05
+lr_warmup_init:
+    value: 0
+lr_warmup_iters:
+    value: 0
+lr_warmup_samples:
+    value: 0
+lr_wsd_decay_iters:
+    value: null
+lr_wsd_decay_samples:
+    value: null
+lr_wsd_decay_style:
+    value: exponential
+main_grads_dtype:
+    value: torch.float32
+main_params_dtype:
+    value: torch.float32
+make_vocab_size_divisible_by:
+    value: 128
+mamba_head_dim:
+    value: 64
+mamba_num_groups:
+    value: 8
+mamba_num_heads:
+    value: null
+mamba_state_dim:
+    value: 128
+manual_gc:
+    value: false
+manual_gc_eval:
+    value: true
+manual_gc_interval:
+    value: 0
+mask_factor:
+    value: 1
+mask_prob:
+    value: 0.15
+mask_type:
+    value: random
+masked_softmax_fusion:
+    value: true
+max_completion_length:
+    value: 512
+max_epochs:
+    value: null
+max_length:
+    value: 4096
+max_model_len:
+    value: null
+max_new_tokens:
+    value: null
+max_pixels:
+    value: null
+max_position_embeddings:
+    value: 32768
+max_resample_times:
+    value: 3
+max_shard_size:
+    value: 5GB
+max_tokens_to_oom:
+    value: 12000
+max_turns:
+    value: null
+memory_snapshot_path:
+    value: snapshot.pickle
+merge_file:
+    value: null
+merge_lora:
+    value: false
+micro_batch_size:
+    value: 4
+microbatch_group_size_per_vp_stage:
+    value: null
+mid_level_dataset_surplus:
+    value: 0.005
+min_loss_scale:
+    value: 1
+min_lr:
+    value: 3e-06
+mlp_chunks_for_prefill:
+    value: 1
+mlp_padding_free:
+    value: false
+mmap_bin_files:
+    value: true
+mock_data:
+    value: false
+model:
+    value: Qwen/Qwen3-0.6B-Base
+model_author:
+    value: null
+model_dir:
+    value: /workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd
+model_name:
+    value: null
+model_revision:
+    value: null
+model_type:
+    value: qwen3
+modules_to_save:
+    value: []
+moe_apply_probs_on_input:
+    value: false
+moe_aux_loss_coeff:
+    value: 0
+moe_deepep_num_sms:
+    value: 20
+moe_enable_deepep:
+    value: false
+moe_expert_capacity_factor:
+    value: null
+moe_extended_tp:
+    value: false
+moe_ffn_hidden_size:
+    value: null
+moe_grouped_gemm:
+    value: true
+moe_input_jitter_eps:
+    value: null
+moe_layer_freq:
+    value: 1
+moe_layer_recompute:
+    value: false
+moe_pad_expert_input_to_capacity:
+    value: false
+moe_per_layer_logging:
+    value: false
+moe_permute_fusion:
+    value: false
+moe_router_bias_update_rate:
+    value: 0.001
+moe_router_dtype:
+    value: fp32
+moe_router_enable_expert_bias:
+    value: false
+moe_router_force_load_balancing:
+    value: false
+moe_router_fusion:
+    value: false
+moe_router_group_topk:
+    value: null
+moe_router_load_balancing_type:
+    value: aux_loss
+moe_router_num_groups:
+    value: null
+moe_router_padding_for_fp8:
+    value: false
+moe_router_pre_softmax:
+    value: false
+moe_router_score_function:
+    value: softmax
+moe_router_topk:
+    value: 2
+moe_router_topk_scaling_factor:
+    value: null
+moe_shared_expert_intermediate_size:
+    value: null
+moe_shared_expert_overlap:
+    value: false
+moe_token_dispatcher_type:
+    value: alltoall
+moe_token_drop_policy:
+    value: probs
+moe_upcycling_granularity:
+    value: 1
+moe_use_legacy_grouped_gemm:
+    value: false
+moe_use_upcycling:
+    value: false
+moe_z_loss_coeff:
+    value: null
+move_model_batches:
+    value: null
+mrope_interleaved:
+    value: false
+mrope_section:
+    value: null
+mscale:
+    value: 1
+mscale_all_dim:
+    value: 0
+mtp_loss_scaling_factor:
+    value: 0.1
+mtp_num_layers:
+    value: null
+multi_latent_attention:
+    value: false
+multi_turn_scheduler:
+    value: null
+multiple_validation_sets:
+    value: false
+nccl_all_reduce_for_prefill:
+    value: false
+nccl_communicator_config_path:
+    value: null
+nccl_ub:
+    value: false
+new_special_tokens:
+    value: []
+no_load_optim:
+    value: null
+no_load_rng:
+    value: null
+no_persist_layer_norm:
+    value: false
+no_rope_freq:
+    value: null
+no_save_optim:
+    value: null
+no_save_rng:
+    value: null
+non_persistent_ckpt_type:
+    value: null
+non_persistent_global_ckpt_dir:
+    value: null
+non_persistent_local_ckpt_algo:
+    value: fully_parallel
+non_persistent_local_ckpt_dir:
+    value: null
+non_persistent_save_interval:
+    value: null
+norm_bbox:
+    value: null
+norm_epsilon:
+    value: 1e-06
+normalization:
+    value: RMSNorm
+num_attention_heads:
+    value: 16
+num_beams:
+    value: 1
+num_channels:
+    value: 3
+num_classes:
+    value: 1000
+num_dataset_builder_threads:
+    value: 1
+num_distributed_optimizer_instances:
+    value: 1
+num_experts:
+    value: null
+num_generations:
+    value: 8
+num_iterations:
+    value: 1
+num_labels:
+    value: null
+num_layers:
+    value: 28
+num_layers_at_end_in_bf16:
+    value: 1
+num_layers_at_start_in_bf16:
+    value: 1
+num_layers_per_virtual_pipeline_stage:
+    value: null
+num_query_groups:
+    value: 8
+num_virtual_stages_per_pipeline_rank:
+    value: null
+num_workers:
+    value: 32
+object_storage_cache_path:
+    value: null
+off_policy_sequence_mask_delta:
+    value: null
+offload_bridge:
+    value: false
+offload_model:
+    value: false
+offload_optimizer:
+    value: false
+one_logger_async:
+    value: false
+one_logger_project:
+    value: megatron-lm
+one_logger_run_name:
+    value: null
+onnx_safe:
+    value: null
+openai_gelu:
+    value: false
+optimizer:
+    value: adam
+optimizer_cpu_offload:
+    value: false
+optimizer_offload_fraction:
+    value: 1
+original_max_position_embeddings:
+    value: null
+output_bert_embeddings:
+    value: false
+overlap_cpu_optimizer_d2h_h2d:
+    value: false
+overlap_grad_reduce:
+    value: true
+overlap_moe_expert_parallel_comm:
+    value: false
+overlap_p2p_comm:
+    value: false
+overlap_p2p_comm_warmup_flush:
+    value: false
+overlap_param_gather:
+    value: true
+overlap_param_gather_with_optimizer_step:
+    value: false
+overlong_filter:
+    value: false
+override_opt_param_scheduler:
+    value: false
+packing:
+    value: true
+packing_length:
+    value: 4096
+packing_num_proc:
+    value: 1
+padded_vocab_size:
+    value: 151936
+padding_free:
+    value: true
+padding_side:
+    value: right
+params_dtype:
+    value: torch.bfloat16
+partial_rotary_factor:
+    value: null
+patch_dim:
+    value: 16
+patch_size:
+    value: 1
+per_split_data_args_path:
+    value: null
+perform_initialization:
+    value: false
+pin_cpu_grads:
+    value: true
+pin_cpu_params:
+    value: true
+pipeline_model_parallel_comm_backend:
+    value: null
+pipeline_model_parallel_layout:
+    value: null
+pipeline_model_parallel_size:
+    value: 1
+position_embedding_type:
+    value: rope
+pretrained_checkpoint:
+    value: null
+problem_type:
+    value: null
+profile:
+    value: false
+profile_ranks:
+    value:
+        - 0
+profile_step_end:
+    value: 12
+profile_step_start:
+    value: 10
+q_lora_rank:
+    value: null
+qk_head_dim:
+    value: 128
+qk_l2_norm:
+    value: false
+qk_layernorm:
+    value: true
+qk_pos_emb_head_dim:
+    value: 64
+quant_bits:
+    value: null
+quant_method:
+    value: null
+query_in_block_prob:
+    value: 0.1
+rampup_batch_size:
+    value: null
+rank:
+    value: 7
+ray_exp_name:
+    value: null
+recompute_granularity:
+    value: full
+recompute_method:
+    value: uniform
+recompute_modules:
+    value:
+        - core_attn
+recompute_num_layers:
+    value: 1
+record_memory_history:
+    value: false
+ref_adapter_load:
+    value: null
+ref_adapters:
+    value: []
+ref_load:
+    value: null
+ref_model:
+    value: null
+ref_model_mixup_alpha:
+    value: 0.6
+ref_model_sync_steps:
+    value: 512
+reference_free:
+    value: false
+relative_attention_max_distance:
+    value: 128
+relative_attention_num_buckets:
+    value: 32
+remove_unused_columns:
+    value: true
+repetition_max_penalty:
+    value: -1
+repetition_n_grams:
+    value: 3
+repetition_penalty:
+    value: 1
+replication:
+    value: false
+replication_factor:
+    value: 2
+replication_jump:
+    value: null
+rerun_mode:
+    value: validate_results
+reset_attention_mask:
+    value: false
+reset_position_ids:
+    value: false
+response_prefix:
+    value: null
+result_rejected_tracker_filename:
+    value: null
+retriever_report_topk_accuracies:
+    value: []
+retriever_score_scaling:
+    value: false
+retriever_seq_length:
+    value: 256
+retro_add_retriever:
+    value: false
+retro_attention_gate:
+    value: 1
+retro_cyclic_train_iters:
+    value: null
+retro_encoder_attention_dropout:
+    value: 0.1
+retro_encoder_hidden_dropout:
+    value: 0.1
+retro_encoder_layers:
+    value: 2
+retro_num_neighbors:
+    value: 2
+retro_num_retrieved_chunks:
+    value: 2
+retro_project_dir:
+    value: null
+retro_verify_neighbor_count:
+    value: true
+reuse_grad_buf_for_mxfp8_param_ag:
+    value: false
+reward_funcs:
+    value: []
+reward_model:
+    value: null
+reward_model_plugin:
+    value: null
+reward_weights:
+    value: null
+rlhf_type:
+    value: null
+rollout_importance_sampling_mode:
+    value: null
+rollout_importance_sampling_threshold:
+    value: 2
+rope_scaling:
+    value: null
+rope_scaling_factor:
+    value: 8
+rope_type:
+    value: null
+rotary_base:
+    value: 1000000
+rotary_interleaved:
+    value: false
+rotary_percent:
+    value: 1
+rotary_scaling_factor:
+    value: 1
+rotary_seq_len_interpolation_factor:
+    value: null
+rpo_alpha:
+    value: null
+run_workload_inspector_server:
+    value: false
+sample_rate:
+    value: 1
+save:
+    value: /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709
+save_interval:
+    value: 100
+save_retain_interval:
+    value: null
+save_safetensors:
+    value: false
+save_strategy:
+    value: steps
+scale_rewards:
+    value: group
+scatter_gather_tensors_in_pipeline:
+    value: true
+seed:
+    value: 42
+seq_length:
+    value: 4096
+sequence_parallel:
+    value: false
+sequence_parallel_size:
+    value: 1
+sft:
+    value: false
+sft_tokenizer_prompt_format:
+    value: nemotron-h-aligned
+sgd_momentum:
+    value: 0.9
+sharp_enabled_group:
+    value: null
+short_seq_prob:
+    value: 0.1
+shuffle_buffer_size:
+    value: 1000
+skip_train:
+    value: false
+skipped_train_samples:
+    value: 0
+sleep_level:
+    value: 0
+soft_cache_length:
+    value: null
+soft_max_length:
+    value: null
+spec:
+    value: null
+split:
+    value: null
+split_dataset_ratio:
+    value: 0
+squared_relu:
+    value: false
+start_weight_decay:
+    value: 0.1
+steps_per_generation:
+    value: null
+stop_words:
+    value: []
+stopping_strategy:
+    value: first_exhausted
+straggler_ctrlr_port:
+    value: 65535
+straggler_minmax_count:
+    value: 1
+stream:
+    value: false
+streaming:
+    value: false
+strict:
+    value: false
+strict_fsdp_dtensor_load:
+    value: true
+suggested_communication_unit_size:
+    value: null
+swiglu:
+    value: true
+swin_backbone_type:
+    value: tiny
+symmetric_ar_type:
+    value: null
+sync_ref_model:
+    value: false
+system:
+    value: null
+target_modules:
+    value:
+        - all-linear
+target_regex:
+    value: null
+task_type:
+    value: causal_lm
+tau_neg:
+    value: 1.05
+tau_pos:
+    value: 1
+te_rng_tracker:
+    value: false
+temperature:
+    value: null
+template:
+    value: qwen3
+template_backend:
+    value: swift
+tensor_model_parallel_size:
+    value: 1
+tensorboard_dir:
+    value: /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/runs
+tensorboard_log_interval:
+    value: 1
+tensorboard_queue_size:
+    value: 50
+test_data_path:
+    value: null
+test_mode:
+    value: false
+tiktoken_num_special_tokens:
+    value: 1000
+tiktoken_pattern:
+    value: null
+tiktoken_special_tokens:
+    value: null
+timing_log_level:
+    value: 0
+timing_log_option:
+    value: minmax
+titles_data_path:
+    value: null
+tokenizer_model:
+    value: null
+tokenizer_type:
+    value: null
+top_entropy_quantile:
+    value: 1
+top_k:
+    value: 50
+top_logprobs:
+    value: null
+top_p:
+    value: 0.9
+torch_dtype:
+    value: torch.bfloat16
+torch_fsdp2_reshard_after_forward:
+    value: true
+tp_comm_bootstrap_backend:
+    value: nccl
+tp_comm_bulk_dgrad:
+    value: true
+tp_comm_bulk_wgrad:
+    value: true
+tp_comm_overlap:
+    value: false
+tp_comm_overlap_ag:
+    value: true
+tp_comm_overlap_cfg:
+    value: null
+tp_comm_overlap_rs:
+    value: true
+tp_comm_overlap_rs_dgrad:
+    value: false
+tp_comm_split_ag:
+    value: true
+tp_comm_split_rs:
+    value: true
+train_data_path:
+    value: null
+train_dataloader_shuffle:
+    value: true
+train_iters:
+    value: 38100
+train_samples:
+    value: null
+train_sync_interval:
+    value: null
+train_type:
+    value: full
+trainable_parameters:
+    value: []
+trainable_parameters_regex:
+    value: null
+transformer_impl:
+    value: transformer_engine
+transformer_pipeline_model_parallel_size:
+    value: 1
+truncation_strategy:
+    value: right
+tuner_backend:
+    value: peft
+undesirable_weight:
+    value: 1
+untie_embeddings_and_output_weights:
+    value: false
+use_chat_template:
+    value: false
+use_checkpoint_args:
+    value: false
+use_checkpoint_opt_param_scheduler:
+    value: false
+use_cpu_initialization:
+    value: null
+use_dist_ckpt:
+    value: true
+use_dist_ckpt_deprecated:
+    value: false
+use_distributed_optimizer:
+    value: true
+use_flash_attn:
+    value: false
+use_fused_weighted_squared_relu:
+    value: false
+use_hf:
+    value: true
+use_legacy_models:
+    value: false
+use_megatron_fsdp:
+    value: false
+use_mp_args_from_checkpoint_args:
+    value: false
+use_one_sent_docs:
+    value: false
+use_persistent_ckpt_worker:
+    value: false
+use_precision_aware_optimizer:
+    value: true
+use_pytorch_profiler:
+    value: false
+use_ray:
+    value: false
+use_ring_exchange_p2p:
+    value: false
+use_rope_scaling:
+    value: false
+use_rotary_position_embeddings:
+    value: false
+use_rslora:
+    value: false
+use_shared_expert_gate:
+    value: false
+use_sharp:
+    value: false
+use_swift_lora:
+    value: false
+use_tokenizer_model_from_checkpoint_args:
+    value: true
+use_torch_fsdp2:
+    value: false
+use_torch_optimizer_for_cpu_offload:
+    value: false
+use_tp_pp_dp_mapping:
+    value: false
+use_vllm:
+    value: true
+v_head_dim:
+    value: 128
+val_dataset:
+    value: []
+val_dataset_shuffle:
+    value: false
+valid_data_path:
+    value: null
+variable_seq_lengths:
+    value: false
+virtual_pipeline_model_parallel_size:
+    value: null
+vision_backbone_type:
+    value: vit
+vision_pretraining:
+    value: false
+vision_pretraining_type:
+    value: classify
+vit_gradient_checkpointing:
+    value: true
+vit_lr:
+    value: null
+vllm_disable_cascade_attn:
+    value: false
+vllm_enable_prefix_caching:
+    value: true
+vllm_enforce_eager:
+    value: false
+vllm_engine_kwargs:
+    value: null
+vllm_gpu_memory_utilization:
+    value: 0.9
+vllm_limit_mm_per_prompt:
+    value: null
+vllm_max_model_len:
+    value: null
+vllm_max_num_seqs:
+    value: null
+vllm_mm_processor_cache_gb:
+    value: null
+vllm_mode:
+    value: null
+vllm_server_base_url:
+    value: null
+vllm_server_group_port:
+    value: null
+vllm_server_host:
+    value: null
+vllm_server_pass_dataset:
+    value: false
+vllm_server_port:
+    value:
+        - 8000
+vllm_server_timeout:
+    value: 240
+vllm_tensor_parallel_size:
+    value: 1
+vocab_extra_ids:
+    value: 0
+vocab_file:
+    value: null
+vocab_size:
+    value: null
+wandb_exp_name:
+    value: baseline
+wandb_log_unique_prompts:
+    value: null
+wandb_project:
+    value: plt
+wandb_save_dir:
+    value: ""
+weight_decay:
+    value: 0.1
+weight_decay_incr_style:
+    value: constant
+wgrad_deferral_limit:
+    value: 0
+world_size:
+    value: 8
+yaml_cfg:
+    value: null

wandb/wandb/run-20251224_034518-gd3q7mjv/files/output.log ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:50b97e62e75f4a01467f0076f2e7e3f1f3b3812717d6040c9597cf6049f6a3b4
+size 15045585

wandb/wandb/run-20251224_034518-gd3q7mjv/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,219 @@

+pip==25.3
+setuptools==80.9.0
+wheel==0.45.1
+multidict==6.7.0
+cffi==2.0.0
+typing-inspection==0.4.2
+smmap==5.0.2
+sentry-sdk==2.48.0
+pydantic_core==2.41.5
+pydantic_core==2.41.4
+protobuf==6.33.2
+annotated-types==0.7.0
+pydantic==2.12.5
+pydantic==2.12.3
+gitdb==4.0.12
+GitPython==3.1.45
+wandb==0.23.1
+sortedcontainers==2.4.0
+pytz==2025.2
+pydub==0.25.1
+jieba==0.42.1
+crcmod==1.7
+cpm-kernels==1.0.11
+brotli==1.2.0
+antlr4-python3-runtime==4.9.3
+addict==2.4.0
+zstandard==0.25.0
+zipp==3.23.0
+xxhash==3.6.0
+Werkzeug==3.1.4
+websockets==15.0.1
+uvicorn==0.40.0
+tzdata==2025.3
+tomlkit==0.13.3
+tensorboard-data-server==0.7.2
+sniffio==1.3.1
+simplejson==3.20.2
+semantic-version==2.10.0
+scipy==1.16.3
+safetensors==0.7.0
+ruff==0.14.10
+rouge==1.0.1
+regex==2025.11.3
+python-multipart==0.0.21
+pyparsing==3.3.1
+pycryptodome==3.23.0
+pycparser==2.23
+pyarrow==22.0.0
+propcache==0.4.1
+mdurl==0.1.2
+pillow==11.3.0
+orjson==3.11.5
+omegaconf==2.3.0
+Markdown==3.10
+kiwisolver==1.4.9
+json_repair==0.54.3
+joblib==1.5.3
+jmespath==0.10.0
+jiter==0.12.0
+grpcio==1.76.0
+groovy==0.1.2
+future==1.0.0
+trl==0.24.0
+fsspec==2025.3.0
+frozenlist==1.8.0
+fonttools==4.61.1
+ffmpy==1.0.0
+einops==0.8.1
+distro==1.9.0
+dill==0.3.8
+dacite==1.9.2
+cycler==0.12.1
+contourpy==1.3.3
+attrs==25.4.0
+attrdict==2.0.1
+annotated-doc==0.0.4
+aiohappyeyeballs==2.6.1
+aiofiles==24.1.0
+absl-py==2.3.1
+yarl==1.22.0
+tiktoken==0.12.0
+tensorboard==2.20.0
+starlette==0.50.0
+pandas==2.3.3
+nltk==3.9.2
+multiprocess==0.70.16
+modelscope==1.33.0
+matplotlib==3.10.8
+markdown-it-py==4.0.0
+importlib_metadata==8.7.1
+huggingface-hub==0.36.0
+binpacking==1.5.2
+aiosignal==1.4.0
+tokenizers==0.22.1
+safehttpx==0.1.7
+rich==14.2.0
+openai==2.14.0
+gradio_client==1.14.0
+fastapi==0.127.0
+cryptography==46.0.3
+aiohttp==3.13.2
+typer==0.20.1
+transformers==4.57.3
+aliyun-python-sdk-core==2.16.0
+accelerate==1.12.0
+transformers-stream-generator==0.0.5
+peft==0.18.0
+gradio==5.50.0
+datasets==3.6.0
+aliyun-python-sdk-kms==2.16.5
+oss2==2.19.1
+ms_swift==3.12.0.dev0
+liger_kernel==0.6.4
+hf_transfer==0.1.9
+pybind11==3.0.1
+transformer_engine==2.10.0
+ml_dtypes==0.5.4
+onnx==1.20.0
+transformer_engine_cu12==2.10.0
+onnx-ir==0.1.13
+onnxscript==0.5.7
+transformer_engine_torch==2.10.0
+apex==0.1
+numpy==1.26.4
+megatron-core==0.15.0
+flash_attn==2.8.3
+charset-normalizer==3.4.4
+Jinja2==3.1.6
+MarkupSafe==3.0.3
+mpmath==1.3.0
+networkx==3.6.1
+nvidia-cublas-cu12==12.8.4.1
+nvidia-cuda-cupti-cu12==12.8.90
+nvidia-cuda-nvrtc-cu12==12.8.93
+nvidia-cuda-runtime-cu12==12.8.90
+nvidia-cudnn-cu12==9.10.2.21
+nvidia-cufft-cu12==11.3.3.83
+nvidia-cufile-cu12==1.13.1.3
+nvidia-curand-cu12==10.3.9.90
+nvidia-cusolver-cu12==11.7.3.90
+nvidia-cusparse-cu12==12.5.8.93
+nvidia-cusparselt-cu12==0.7.1
+nvidia-nccl-cu12==2.27.5
+nvidia-nvjitlink-cu12==12.8.93
+nvidia-nvshmem-cu12==3.3.20
+nvidia-nvtx-cu12==12.8.90
+requests==2.32.5
+sentencepiece==0.2.1
+sympy==1.14.0
+torch==2.9.1+cu128
+torchaudio==2.9.1+cu128
+torchcodec==0.9.1
+torchdata==0.10.0
+torchtext==0.6.0
+torchvision==0.24.1+cu128
+triton==3.5.1
+urllib3==2.6.2
+anyio==4.12.0
+asttokens==3.0.1
+certifi==2025.11.12
+click==8.3.1
+comm==0.2.3
+debugpy==1.8.18
+decorator==5.2.1
+executing==2.2.1
+filelock==3.20.0
+h11==0.16.0
+hf-xet==1.2.0
+httpcore==1.0.9
+httpx==0.28.1
+idna==3.11
+ipykernel==7.1.0
+ipython==9.8.0
+ipython_pygments_lexers==1.1.1
+ipywidgets==8.1.8
+jedi==0.19.2
+jupyter_client==8.7.0
+jupyter_core==5.9.1
+jupyterlab_widgets==3.0.16
+matplotlib-inline==0.2.1
+nest-asyncio==1.6.0
+packaging==25.0
+parso==0.8.5
+pexpect==4.9.0
+platformdirs==4.5.1
+prompt_toolkit==3.0.52
+psutil==7.1.3
+ptyprocess==0.7.0
+pure_eval==0.2.3
+Pygments==2.19.2
+python-dateutil==2.9.0.post0
+PyYAML==6.0.3
+pyzmq==27.1.0
+shellingham==1.5.4
+six==1.17.0
+stack-data==0.6.3
+tornado==6.5.3
+tqdm==4.67.1
+traitlets==5.14.3
+typer-slim==0.20.0
+typing_extensions==4.15.0
+wcwidth==0.2.14
+widgetsnbextension==4.0.15
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.2
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.45.1
+zipp==3.19.2

wandb/wandb/run-20251224_034518-gd3q7mjv/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,275 @@

+{
+  "os":  "Linux-6.8.0-58-generic-x86_64-with-glibc2.39",
+  "python":  "CPython 3.12.12",
+  "startedAt":  "2025-12-24T03:45:18.795219Z",
+  "args":  [
+    "--seed",
+    "42",
+    "--micro-batch-size",
+    "4",
+    "--global-batch-size",
+    "256",
+    "--recompute-granularity",
+    "full",
+    "--recompute-method",
+    "uniform",
+    "--recompute-num-layers",
+    "1",
+    "--recompute-modules",
+    "core_attn",
+    "--train-iters",
+    "38100",
+    "--log-interval",
+    "1",
+    "--tensorboard-dir",
+    "/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/runs",
+    "--cross-entropy-loss-fusion",
+    "--cross-entropy-fusion-impl",
+    "native",
+    "--calculate-per-token-loss",
+    "--attention-backend",
+    "flash",
+    "--optimizer",
+    "adam",
+    "--optimizer-offload-fraction",
+    "1.0",
+    "--use-precision-aware-optimizer",
+    "--main-grads-dtype",
+    "fp32",
+    "--main-params-dtype",
+    "fp32",
+    "--exp-avg-dtype",
+    "fp32",
+    "--exp-avg-sq-dtype",
+    "fp32",
+    "--dataloader-type",
+    "cyclic",
+    "--manual-gc-interval",
+    "0",
+    "--lr",
+    "0.0001",
+    "--lr-decay-style",
+    "cosine",
+    "--lr-warmup-iters",
+    "0",
+    "--lr-warmup-fraction",
+    "0.05",
+    "--min-lr",
+    "3e-06",
+    "--weight-decay",
+    "0.1",
+    "--clip-grad",
+    "1.0",
+    "--adam-beta1",
+    "0.9",
+    "--adam-beta2",
+    "0.95",
+    "--adam-eps",
+    "1e-08",
+    "--sgd-momentum",
+    "0.9",
+    "--save",
+    "/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709",
+    "--save-interval",
+    "100",
+    "--load",
+    "/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore",
+    "--finetune",
+    "--ckpt-format",
+    "torch_dist",
+    "--no-initialization",
+    "--auto-detect-ckpt-format",
+    "--exit-on-missing-checkpoint",
+    "--distributed-backend",
+    "nccl",
+    "--local-rank",
+    "7",
+    "--use-distributed-optimizer",
+    "--tensor-model-parallel-size",
+    "1",
+    "--pipeline-model-parallel-size",
+    "1",
+    "--context-parallel-size",
+    "1",
+    "--overlap-grad-reduce",
+    "--overlap-param-gather",
+    "--distributed-timeout-minutes",
+    "300000",
+    "--num-layers",
+    "28",
+    "--hidden-size",
+    "1024",
+    "--ffn-hidden-size",
+    "3072",
+    "--num-attention-heads",
+    "16",
+    "--group-query-attention",
+    "--num-query-groups",
+    "8",
+    "--max-position-embeddings",
+    "32768",
+    "--position-embedding-type",
+    "rope",
+    "--rotary-base",
+    "1000000",
+    "--rotary-percent",
+    "1.0",
+    "--normalization",
+    "RMSNorm",
+    "--norm-epsilon",
+    "1e-06",
+    "--swiglu",
+    "--disable-bias-linear",
+    "--attention-dropout",
+    "0.0",
+    "--hidden-dropout",
+    "0.0",
+    "--kv-channels",
+    "128",
+    "--qk-layernorm",
+    "--transformer-impl",
+    "transformer_engine",
+    "--moe-layer-freq",
+    "1",
+    "--moe-router-topk",
+    "2",
+    "--moe-router-dtype",
+    "fp32",
+    "--moe-router-score-function",
+    "softmax",
+    "--moe-router-load-balancing-type",
+    "aux_loss",
+    "--expert-model-parallel-size",
+    "1",
+    "--expert-tensor-parallel-size",
+    "1",
+    "--moe-token-dispatcher-type",
+    "alltoall",
+    "--moe-grouped-gemm",
+    "--moe-aux-loss-coeff",
+    "0.0",
+    "--moe-token-drop-policy",
+    "probs",
+    "--kv-lora-rank",
+    "32",
+    "--qk-head-dim",
+    "128",
+    "--qk-pos-emb-head-dim",
+    "64",
+    "--mtp-loss-scaling-factor",
+    "0.1",
+    "--fp8-recipe",
+    "delayed",
+    "--fp8-amax-history-len",
+    "1024",
+    "--fp8-amax-compute-algo",
+    "max",
+    "--bf16",
+    "--attention-softmax-in-fp32",
+    "--tensorboard-log-interval",
+    "1",
+    "--tensorboard-queue-size",
+    "50",
+    "--log-timers-to-tensorboard",
+    "--log-validation-ppl-to-tensorboard",
+    "--log-memory-to-tensorboard",
+    "--logging-level",
+    "20",
+    "--wandb-project",
+    "plt",
+    "--wandb-exp-name",
+    "baseline",
+    "--eval-iters",
+    "-1",
+    "--eval-interval",
+    "100",
+    "--seq-length",
+    "4096",
+    "--num-workers",
+    "32"
+  ],
+  "program":  "/workspace/halcyon-recipe2/swift/cli/_megatron/pt.py",
+  "codePath":  "swift/cli/_megatron/pt.py",
+  "codePathLocal":  "swift/cli/_megatron/pt.py",
+  "git":  {
+    "remote":  "https://github.com/weak-kajuma/halcyon-recipe2.git",
+    "commit":  "ea7cc214b68fb511dd83bff83a504b7f43053577"
+  },
+  "email":  "kazuma826826@gmail.com",
+  "root":  "/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/wandb",
+  "host":  "36fd00e7b21c",
+  "executable":  "/venv/main/bin/python3.12",
+  "cpu_count":  72,
+  "cpu_count_logical":  144,
+  "gpu":  "NVIDIA GeForce RTX 5090",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "7669363507200",
+      "used":  "983051857920"
+    }
+  },
+  "memory":  {
+    "total":  "540643262464"
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA GeForce RTX 5090",
+      "memoryTotal":  "34190917632",
+      "cudaCores":  21760,
+      "architecture":  "Blackwell",
+      "uuid":  "GPU-5d40e56e-9cf1-0a97-080a-30624a8f6da3"
+    },
+    {
+      "name":  "NVIDIA GeForce RTX 5090",
+      "memoryTotal":  "34190917632",
+      "cudaCores":  21760,
+      "architecture":  "Blackwell",
+      "uuid":  "GPU-23ca8669-46fc-19eb-348b-e51e591c150d"
+    },
+    {
+      "name":  "NVIDIA GeForce RTX 5090",
+      "memoryTotal":  "34190917632",
+      "cudaCores":  21760,
+      "architecture":  "Blackwell",
+      "uuid":  "GPU-c4c1ca99-b237-b12b-43fd-7c0b428ed152"
+    },
+    {
+      "name":  "NVIDIA GeForce RTX 5090",
+      "memoryTotal":  "34190917632",
+      "cudaCores":  21760,
+      "architecture":  "Blackwell",
+      "uuid":  "GPU-d48e64fd-956c-1ce4-4e95-b9d198ba26e9"
+    },
+    {
+      "name":  "NVIDIA GeForce RTX 5090",
+      "memoryTotal":  "34190917632",
+      "cudaCores":  21760,
+      "architecture":  "Blackwell",
+      "uuid":  "GPU-29d31f97-dff9-6078-7bf6-d8fc65ada1b7"
+    },
+    {
+      "name":  "NVIDIA GeForce RTX 5090",
+      "memoryTotal":  "34190917632",
+      "cudaCores":  21760,
+      "architecture":  "Blackwell",
+      "uuid":  "GPU-ed004a01-be7c-9fc0-6742-ac7f7a0bea49"
+    },
+    {
+      "name":  "NVIDIA GeForce RTX 5090",
+      "memoryTotal":  "34190917632",
+      "cudaCores":  21760,
+      "architecture":  "Blackwell",
+      "uuid":  "GPU-56cdc53f-360e-a64f-2cd5-2ba3daaf5a7b"
+    },
+    {
+      "name":  "NVIDIA GeForce RTX 5090",
+      "memoryTotal":  "34190917632",
+      "cudaCores":  21760,
+      "architecture":  "Blackwell",
+      "uuid":  "GPU-aa4a1a25-49c1-62ec-3a38-070d6c7912ef"
+    }
+  ],
+  "cudaVersion":  "13.0",
+  "writerId":  "5bh5hk313ky3l0v9f9cesb7o1x31upc6"
+}

wandb/wandb/run-20251224_034518-gd3q7mjv/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"iteration-time":4.885399580001831,"grad-norm":0.36394086480140686,"_wandb":{"runtime":186668},"_timestamp":1.766734585640789e+09,"samples vs steps":9753600,"_runtime":186668.708556184,"_step":38100,"lm loss":2.0087497234344482,"learning-rate":3.000000106112566e-06,"batch-size":256,"loss-scale":1}

wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug-core.log ADDED Viewed

	@@ -0,0 +1,16 @@

+{"time":"2025-12-24T03:45:18.883988557Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp_kirugit/port-611253.txt","pid":611253,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+{"time":"2025-12-24T03:45:18.884899822Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":611253}
+{"time":"2025-12-24T03:45:18.884913121Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-611253-617899-2898902707/socket","Net":"unix"}}
+{"time":"2025-12-24T03:45:19.067142394Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
+{"time":"2025-12-24T03:45:19.071653716Z","level":"INFO","msg":"handleInformInit: received","streamId":"gd3q7mjv","id":"1(@)"}
+{"time":"2025-12-24T03:45:19.342900184Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"gd3q7mjv","id":"1(@)"}
+{"time":"2025-12-26T07:36:29.518345904Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"gd3q7mjv","id":"1(@)"}
+{"time":"2025-12-26T07:36:29.520024321Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"gd3q7mjv","id":"1(@)"}
+{"time":"2025-12-26T07:36:43.382214788Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
+{"time":"2025-12-26T07:36:43.382296341Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
+{"time":"2025-12-26T07:36:43.382312754Z","level":"INFO","msg":"server is shutting down"}
+{"time":"2025-12-26T07:36:43.382392286Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
+{"time":"2025-12-26T07:36:43.382512221Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
+{"time":"2025-12-26T07:36:43.382522699Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
+{"time":"2025-12-26T07:36:43.38298197Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-611253-617899-2898902707/socket","Net":"unix"}}
+{"time":"2025-12-26T07:36:43.383080926Z","level":"INFO","msg":"server is closed"}

wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug-internal.log ADDED Viewed

The diff for this file is too large to render. See raw diff

wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug.log ADDED Viewed

	@@ -0,0 +1,26 @@

+2025-12-24 03:45:18,797 INFO    MainThread:611253 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
+2025-12-24 03:45:18,797 INFO    MainThread:611253 [wandb_setup.py:_flush():80] Configure stats pid to 611253
+2025-12-24 03:45:18,797 INFO    MainThread:611253 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
+2025-12-24 03:45:18,797 INFO    MainThread:611253 [wandb_setup.py:_flush():80] Loading settings from /workspace/halcyon-recipe2/wandb/settings
+2025-12-24 03:45:18,797 INFO    MainThread:611253 [wandb_setup.py:_flush():80] Loading settings from environment variables
+2025-12-24 03:45:18,798 INFO    MainThread:611253 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug.log
+2025-12-24 03:45:18,798 INFO    MainThread:611253 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/wandb/wandb/run-20251224_034518-gd3q7mjv/logs/debug-internal.log
+2025-12-24 03:45:18,798 INFO    MainThread:611253 [wandb_init.py:init():841] calling init triggers
+2025-12-24 03:45:18,798 INFO    MainThread:611253 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
+config: {'num_layers': 28, 'encoder_num_layers': 28, 'decoder_num_layers': None, 'hidden_size': 1024, 'ffn_hidden_size': 3072, 'num_attention_heads': 16, 'attention_backend': <AttnBackend.flash: 1>, 'kv_channels': 128, 'group_query_attention': True, 'num_query_groups': 8, 'max_position_embeddings': 32768, 'position_embedding_type': 'rope', 'relative_attention_num_buckets': 32, 'relative_attention_max_distance': 128, 'use_rotary_position_embeddings': False, 'rotary_base': 1000000, 'rotary_percent': 1.0, 'rotary_interleaved': False, 'rotary_seq_len_interpolation_factor': None, 'use_rope_scaling': False, 'rope_scaling_factor': 8.0, 'no_rope_freq': None, 'add_position_embedding': True, 'mrope_section': None, 'make_vocab_size_divisible_by': 128, 'normalization': 'RMSNorm', 'norm_epsilon': 1e-06, 'apply_layernorm_1p': False, 'apply_residual_connection_post_layernorm': False, 'openai_gelu': False, 'squared_relu': False, 'swiglu': True, 'onnx_safe': None, 'bert_binary_head': True, 'untie_embeddings_and_output_weights': False, 'multi_latent_attention': False, 'mtp_num_layers': None, 'mtp_loss_scaling_factor': 0.1, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'weight_decay': 0.1, 'start_weight_decay': 0.1, 'end_weight_decay': 0.1, 'weight_decay_incr_style': 'constant', 'clip_grad': 1.0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-08, 'sgd_momentum': 0.9, 'micro_batch_size': 4, 'global_batch_size': 256, 'rampup_batch_size': None, 'decrease_batch_size_if_needed': False, 'recompute_granularity': 'full', 'check_for_nan_in_loss_and_grad': True, 'check_for_spiky_loss': False, 'check_for_large_grads': False, 'distribute_saved_activations': False, 'recompute_method': 'uniform', 'recompute_num_layers': 1, 'recompute_modules': ['core_attn'], 'clone_scatter_output_in_embedding': True, 'profile': False, 'profile_step_start': 10, 'profile_step_end': 12, 'iterations_to_skip': [], 'result_rejected_tracker_filename': None, 'enable_gloo_process_groups': True, 
'use_pytorch_profiler': False, 'profile_ranks': [0], 'record_memory_history': False, 'memory_snapshot_path': 'snapshot.pickle', 'tp_comm_overlap': False, 'tp_comm_overlap_cfg': None, 'tp_comm_overlap_ag': True, 'tp_comm_overlap_rs': True, 'tp_comm_overlap_rs_dgrad': False, 'tp_comm_bulk_dgrad': True, 'tp_comm_bulk_wgrad': True, 'tp_comm_bootstrap_backend': 'nccl', 'use_cpu_initialization': None, 'empty_unused_memory_level': 0, 'deterministic_mode': False, 'check_weight_hash_across_dp_replicas_interval': None, 'calculate_per_token_loss': True, 'train_sync_interval': None, 'train_iters': 38100, 'train_samples': None, 'log_interval': 1, 'exit_interval': None, 'exit_duration_in_mins': None, 'exit_signal_handler': False, 'tensorboard_dir': '/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709/runs', 'masked_softmax_fusion': True, 'bias_gelu_fusion': False, 'bias_swiglu_fusion': True, 'use_fused_weighted_squared_relu': False, 'bias_dropout_fusion': True, 'apply_rope_fusion': True, 'rope_type': None, 'cross_entropy_loss_fusion': True, 'cross_entropy_fusion_impl': 'native', 'use_flash_attn': False, 'add_bias_linear': False, 'add_qkv_bias': False, 'optimizer': 'adam', 'optimizer_cpu_offload': False, 'optimizer_offload_fraction': 1.0, 'use_torch_optimizer_for_cpu_offload': False, 'overlap_cpu_optimizer_d2h_h2d': False, 'pin_cpu_grads': True, 'pin_cpu_params': True, 'dataloader_type': 'cyclic', 'async_tensor_model_parallel_allreduce': True, 'no_persist_layer_norm': False, 'sequence_parallel': False, 'gradient_accumulation_fusion': True, 'deprecated_use_mcore_models': False, 'use_legacy_models': False, 'manual_gc': False, 'manual_gc_interval': 0, 'manual_gc_eval': True, 'tp_comm_split_ag': True, 'tp_comm_split_rs': True, 'pipeline_model_parallel_comm_backend': None, 'high_priority_stream_groups': [], 'seed': 42, 'data_parallel_random_init': False, 'init_method_std': 0.02, 'embedding_init_method_std': None, 'init_method_xavier_uniform': False, 'lr': 
0.0001, 'lr_decay_style': 'cosine', 'lr_wsd_decay_style': 'exponential', 'lr_decay_iters': None, 'lr_decay_samples': None, 'lr_wsd_decay_samples': None, 'lr_wsd_decay_iters': None, 'lr_warmup_fraction': 0.05, 'lr_warmup_iters': 0, 'lr_warmup_samples': 0, 'lr_warmup_init': 0.0, 'min_lr': 3e-06, 'override_opt_param_scheduler': False, 'use_checkpoint_opt_param_scheduler': False, 'decoupled_lr': None, 'decoupled_min_lr': None, 'save': '/workspace/halcyon-recipe2/megatron_output/qwen3-baseline/v0-20251224-033709', 'save_interval': 100, 'save_retain_interval': None, 'no_save_optim': None, 'no_save_rng': None, 'load': '/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore', 'no_load_optim': None, 'load_main_params_from_ckpt': None, 'no_load_rng': None, 'strict_fsdp_dtensor_load': True, 'non_persistent_save_interval': None, 'non_persistent_ckpt_type': None, 'non_persistent_global_ckpt_dir': None, 'non_persistent_local_ckpt_dir': None, 'non_persistent_local_ckpt_algo': 'fully_parallel', 'finetune': True, 'pretrained_checkpoint': None, 'ckpt_step': None, 'perform_initialization': False, 'use_checkpoint_args': False, 'use_mp_args_from_checkpoint_args': False, 'use_tokenizer_model_from_checkpoint_args': True, 'exit_on_missing_checkpoint': True, 'use_dist_ckpt_deprecated': False, 'use_persistent_ckpt_worker': False, 'auto_detect_ckpt_format': True, 'dist_ckpt_format_deprecated': None, 'ckpt_format': 'torch_dist', 'ckpt_convert_format': None, 'ckpt_convert_save': None, 'ckpt_convert_update_legacy_dist_opt_format': False, 'ckpt_fully_parallel_save_deprecated': False, 'ckpt_fully_parallel_save': True, 'async_save': None, 'ckpt_fully_parallel_load': False, 'ckpt_assume_constant_structure': False, 'dist_ckpt_strictness': 'assume_ok_unexpected', 'load_model_opt_format': False, 'fp16': False, 'bf16': True, 'grad_reduce_in_bf16': False, 'loss_scale': None, 'initial_loss_scale': 4294967296, 'min_loss_scale': 1.0, 'loss_scale_window': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 
'apply_query_key_layer_scaling': False, 'attention_softmax_in_fp32': True, 'accumulate_allreduce_grads_in_fp32': True, 'fp16_lm_cross_entropy': False, 'disable_bf16_reduced_precision_matmul': False, 'reuse_grad_buf_for_mxfp8_param_ag': False, 'tensor_model_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'decoder_first_pipeline_num_layers': None, 'decoder_last_pipeline_num_layers': None, 'pipeline_model_parallel_layout': None, 'num_layers_per_virtual_pipeline_stage': None, 'num_virtual_stages_per_pipeline_rank': None, 'microbatch_group_size_per_vp_stage': None, 'overlap_p2p_comm': False, 'overlap_p2p_comm_warmup_flush': False, 'distributed_backend': 'nccl', 'distributed_timeout_minutes': 300000, 'overlap_grad_reduce': True, 'defer_embedding_wgrad_compute': False, 'wgrad_deferral_limit': 0, 'align_grad_reduce': True, 'ddp_num_buckets': None, 'ddp_bucket_size': None, 'ddp_pad_buckets_for_high_nccl_busbw': False, 'ddp_average_in_collective': False, 'overlap_param_gather': True, 'overlap_param_gather_with_optimizer_step': False, 'align_param_gather': False, 'scatter_gather_tensors_in_pipeline': True, 'use_ring_exchange_p2p': False, 'local_rank': 7, 'lazy_mpu_init': None, 'account_for_embedding_in_pipeline_split': False, 'account_for_loss_in_pipeline_split': False, 'use_distributed_optimizer': True, 'nccl_ub': False, 'use_sharp': False, 'sharp_enabled_group': None, 'use_megatron_fsdp': False, 'init_model_with_meta_device': False, 'data_parallel_sharding_strategy': 'no_shard', 'gradient_reduce_div_fusion': True, 'fsdp_double_buffer': False, 'suggested_communication_unit_size': None, 'keep_fp8_transpose_cache': False, 'enable_full_sharding_in_hsdp': False, 'num_distributed_optimizer_instances': 1, 'use_torch_fsdp2': False, 'torch_fsdp2_reshard_after_forward': True, 'context_parallel_size': 1, 'cp_comm_type': ['p2p'], 'hierarchical_context_parallel_sizes': None, 'nccl_communicator_config_path': None, 'use_tp_pp_dp_mapping': False, 'replication': False, 
'replication_jump': None, 'replication_factor': 2, 'full_validation': False, 'multiple_validation_sets': False, 'eval_iters': -1, 'eval_interval': 100, 'test_mode': False, 'skip_train': False, 'data_path': None, 'split': None, 'train_data_path': None, 'valid_data_path': None, 'test_data_path': None, 'data_args_path': None, 'per_split_data_args_path': None, 'data_cache_path': None, 'mmap_bin_files': True, 'mock_data': False, 'seq_length': 4096, 'encoder_seq_length': 4096, 'decoder_seq_length': None, 'retriever_seq_length': 256, 'sample_rate': 1.0, 'mask_prob': 0.15, 'short_seq_prob': 0.1, 'num_workers': 32, 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'create_attention_mask_in_dataloader': True, 'num_dataset_builder_threads': 1, 'object_storage_cache_path': None, 'mid_level_dataset_surplus': 0.005, 'vocab_size': None, 'padded_vocab_size': 151936, 'vocab_file': None, 'merge_file': None, 'vocab_extra_ids': 0, 'tokenizer_type': None, 'tokenizer_model': None, 'tiktoken_pattern': None, 'tiktoken_num_special_tokens': 1000, 'tiktoken_special_tokens': None, 'adlr_autoresume': False, 'adlr_autoresume_interval': 1000, 'ict_head_size': None, 'biencoder_projection_dim': 0, 'biencoder_shared_query_context_model': False, 'ict_load': None, 'bert_load': None, 'titles_data_path': None, 'query_in_block_prob': 0.1, 'use_one_sent_docs': False, 'evidence_data_path': None, 'retriever_report_topk_accuracies': [], 'retriever_score_scaling': False, 'block_data_path': None, 'embedding_path': None, 'indexer_batch_size': 128, 'indexer_log_interval': 1000, 'num_classes': 1000, 'img_h': 224, 'img_w': 224, 'num_channels': 3, 'patch_dim': 16, 'classes_fraction': 1.0, 'data_per_class_fraction': 1.0, 'data_sharding': True, 'head_lr_mult': 1.0, 'vision_pretraining': False, 'vision_pretraining_type': 'classify', 'vision_backbone_type': 'vit', 'swin_backbone_type': 'tiny', 'mask_type': 'random', 'mask_factor': 1.0, 'iter_per_epoch': 1250, 'dino_local_img_size': 
96, 'dino_local_crops_number': 10, 'dino_head_hidden_size': 2048, 'dino_bottleneck_size': 256, 'dino_freeze_last_layer': 1, 'dino_norm_last_layer': False, 'dino_warmup_teacher_temp': 0.04, 'dino_teacher_temp': 0.07, 'dino_warmup_teacher_temp_epochs': 30, 'qk_layernorm': True, 'qk_l2_norm': False, 'expert_model_parallel_size': 1, 'expert_tensor_parallel_size': 1, 'num_experts': None, 'moe_layer_freq': 1, 'moe_ffn_hidden_size': None, 'moe_shared_expert_intermediate_size': None, 'moe_shared_expert_overlap': False, 'moe_grouped_gemm': True, 'moe_use_legacy_grouped_gemm': False, 'moe_layer_recompute': False, 'moe_extended_tp': False, 'moe_use_upcycling': False, 'moe_router_load_balancing_type': 'aux_loss', 'moe_router_dtype': 'fp32', 'moe_router_fusion': False, 'moe_router_score_function': 'softmax', 'moe_router_topk': 2, 'moe_router_pre_softmax': False, 'moe_router_num_groups': None, 'moe_router_group_topk': None, 'moe_router_topk_scaling_factor': None, 'moe_router_enable_expert_bias': False, 'moe_router_bias_update_rate': 0.001, 'moe_router_force_load_balancing': False, 'moe_router_padding_for_fp8': False, 'moe_aux_loss_coeff': 0.0, 'moe_z_loss_coeff': None, 'moe_input_jitter_eps': None, 'moe_per_layer_logging': False, 'moe_token_dispatcher_type': 'alltoall', 'moe_enable_deepep': False, 'moe_deepep_num_sms': 20, 'moe_permute_fusion': False, 'moe_expert_capacity_factor': None, 'moe_pad_expert_input_to_capacity': False, 'moe_token_drop_policy': 'probs', 'moe_apply_probs_on_input': False, 'overlap_moe_expert_parallel_comm': False, 'delay_wgrad_compute': False, 'moe_upcycling_granularity': 1, 'q_lora_rank': None, 'kv_lora_rank': 32, 'qk_head_dim': 128, 'qk_pos_emb_head_dim': 64, 'v_head_dim': 128, 'rotary_scaling_factor': 1.0, 'mscale': 1.0, 'mscale_all_dim': 0.0, 'cache_mla_latents': False, 'heterogeneous_layers_config_path': None, 'heterogeneous_layers_config_encoded_json': None, 'log_params_norm': False, 'log_num_zeros_in_grad': False, 'log_throughput': False, 
'log_progress': False, 'timing_log_level': 0, 'log_energy': False, 'barrier_with_L1_time': True, 'timing_log_option': 'minmax', 'tensorboard_log_interval': 1, 'tensorboard_queue_size': 50, 'log_timers_to_tensorboard': True, 'log_loss_scale_to_tensorboard': True, 'log_validation_ppl_to_tensorboard': True, 'log_memory_to_tensorboard': True, 'log_world_size_to_tensorboard': False, 'wandb_project': 'plt', 'wandb_exp_name': 'baseline', 'wandb_save_dir': '', 'logging_level': 20, 'log_straggler': False, 'disable_straggler_on_startup': False, 'straggler_ctrlr_port': 65535, 'straggler_minmax_count': 1, 'run_workload_inspector_server': False, 'inference_batch_times_seqlen_threshold': -1, 'max_tokens_to_oom': 12000, 'output_bert_embeddings': False, 'bert_embedder_type': 'megatron', 'flash_decode': False, 'enable_cuda_graph': False, 'cuda_graph_warmup_steps': 3, 'external_cuda_graph': False, 'cuda_graph_scope': 'full', 'inference_max_batch_size': 8, 'inference_max_seq_length': 2560, 'inference_dynamic_batching': False, 'inference_dynamic_batching_buffer_size_gb': 40.0, 'inference_dynamic_batching_chunk_size': 256, 'inference_dynamic_batching_buffer_guaranteed_fraction': 0.2, 'inference_dynamic_batching_buffer_overflow_factor': None, 'inference_dynamic_batching_max_requests_override': None, 'inference_dynamic_batching_max_tokens_override': None, 'inference_dynamic_batching_num_cuda_graphs': 16, 'symmetric_ar_type': None, 'nccl_all_reduce_for_prefill': False, 'mlp_chunks_for_prefill': 1, 'fp8': None, 'fp8_recipe': 'delayed', 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'fp8_wgrad': True, 'transformer_impl': 'transformer_engine', 'fp8_param_gather': False, 'first_last_layers_bf16': False, 'num_layers_at_start_in_bf16': 1, 'num_layers_at_end_in_bf16': 1, 'te_rng_tracker': False, 'inference_rng_tracker': False, 'retro_project_dir': None, 'retro_add_retriever': False, 'retro_cyclic_train_iters': None, 'retro_encoder_layers': 2, 
'retro_encoder_hidden_dropout': 0.1, 'retro_encoder_attention_dropout': 0.1, 'retro_num_neighbors': 2, 'retro_num_retrieved_chunks': 2, 'retro_attention_gate': 1, 'retro_verify_neighbor_count': True, 'enable_experimental': False, 'spec': None, 'hybrid_attention_ratio': 0.0, 'hybrid_mlp_ratio': 0.0, 'hybrid_override_pattern': None, 'mamba_state_dim': 128, 'mamba_head_dim': 64, 'mamba_num_groups': 8, 'mamba_num_heads': None, 'is_hybrid_model': False, 'disable_mamba_mem_eff_path': False, 'yaml_cfg': None, 'use_precision_aware_optimizer': True, 'main_grads_dtype': torch.float32, 'main_params_dtype': torch.float32, 'exp_avg_dtype': torch.float32, 'exp_avg_sq_dtype': torch.float32, 'enable_one_logger': True, 'one_logger_project': 'megatron-lm', 'one_logger_run_name': None, 'one_logger_async': False, 'app_tag_run_name': None, 'app_tag_run_version': '0.0.0', 'inprocess_restart': False, 'inprocess_max_iterations': None, 'inprocess_monitor_thread_interval': 1.0, 'inprocess_monitor_process_interval': 1.0, 'inprocess_progress_watchdog_interval': 1.0, 'inprocess_heartbeat_interval': 30, 'inprocess_soft_timeout': 60, 'inprocess_hard_timeout': 90, 'inprocess_heartbeat_timeout': 60, 'inprocess_barrier_timeout': 120, 'inprocess_completion_timeout': 120, 'inprocess_last_call_wait': 1, 'inprocess_termination_grace_time': 1, 'inprocess_granularity': 'node', 'inprocess_active_world_size': 8, 'inprocess_empty_cuda_cache': False, 'enable_ft_package': False, 'calc_ft_timeouts': False, 'config_logger_dir': '', 'error_injection_rate': 0, 'error_injection_type': 'transient_error', 'rerun_mode': 'validate_results', 'enable_msc': True, 'kitchen_config_file': None, 'kitchen_recipe_number': None, 'sft': False, 'sft_tokenizer_prompt_format': 'nemotron-h-aligned', 'rank': 7, 'world_size': 8, 'use_dist_ckpt': True, 'transformer_pipeline_model_parallel_size': 1, 'data_parallel_size': 8, 'model_dir': 
'/workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd', 'is_multimodal': False, 'hf_model_type': 'qwen3', 'use_ray': False, 'ray_exp_name': None, 'device_groups': None, 'model': 'Qwen/Qwen3-0.6B-Base', 'model_type': 'qwen3', 'model_revision': None, 'task_type': 'causal_lm', 'torch_dtype': torch.bfloat16, 'attn_impl': None, 'new_special_tokens': [], 'num_labels': None, 'problem_type': None, 'rope_scaling': None, 'device_map': None, 'max_memory': {}, 'max_model_len': None, 'local_repo_path': None, 'init_strategy': None, 'template': 'qwen3', 'system': None, 'max_length': 4096, 'truncation_strategy': 'right', 'max_pixels': None, 'agent_template': None, 'norm_bbox': None, 'use_chat_template': False, 'padding_free': True, 'padding_side': 'right', 'sequence_parallel_size': 1, 'response_prefix': None, 'template_backend': 'swift', 'dataset': [], 'val_dataset': [], 'cached_dataset': ['/workspace/full'], 'cached_val_dataset': [], 'split_dataset_ratio': 0.0, 'data_seed': 42, 'dataset_num_proc': 32, 'load_from_cache_file': False, 'dataset_shuffle': True, 'val_dataset_shuffle': False, 'streaming': False, 'interleave_prob': None, 'stopping_strategy': 'first_exhausted', 'shuffle_buffer_size': 1000, 'download_mode': 'reuse_dataset_if_exists', 'columns': {}, 'strict': False, 'remove_unused_columns': True, 'model_name': None, 'model_author': None, 'custom_dataset_info': [], 'quant_method': None, 'quant_bits': None, 'hqq_axis': None, 'bnb_4bit_compute_dtype': torch.bfloat16, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_quant_storage': None, 'max_new_tokens': None, 'temperature': None, 'top_k': 50, 'top_p': 0.9, 'repetition_penalty': 1.0, 'num_beams': 1, 'stream': False, 'stop_words': [], 'logprobs': False, 'top_logprobs': None, 'ckpt_dir': '/workspace/halcyon-recipe2/Qwen3-0.6B-Base-mcore', 'lora_modules': [], 'tuner_backend': 'peft', 'train_type': 'full', 'adapters': [], 'external_plugins': [], 
'model_kwargs': {}, 'load_args': False, 'load_data_args': False, 'packing': True, 'packing_length': 4096, 'packing_num_proc': 1, 'lazy_tokenize': False, 'custom_register_path': [], 'use_hf': True, 'hub_token': None, 'ddp_timeout': 18000000, 'ddp_backend': None, 'ignore_args_error': False, 'use_swift_lora': False, 'freeze_llm': False, 'freeze_vit': True, 'freeze_aligner': True, 'freeze_parameters': [], 'freeze_parameters_regex': None, 'freeze_parameters_ratio': 0.0, 'trainable_parameters': [], 'trainable_parameters_regex': None, 'adapter_load': None, 'target_modules': ['all-linear'], 'target_regex': None, 'modules_to_save': [], 'lora_rank': 8, 'lora_alpha': 32, 'lora_dropout': 0.05, 'lora_bias': 'none', 'lora_dtype': None, 'use_rslora': False, 'rlhf_type': None, 'ref_load': None, 'ref_adapter_load': None, 'beta': 0.1, 'rpo_alpha': None, 'reference_free': False, 'label_smoothing': 0.0, 'f_divergence_type': 'reverse_kl', 'loss_type': None, 'desirable_weight': 1.0, 'undesirable_weight': 1.0, 'calculate_KL': None, 'center_rewards_coefficient': None, 'generation_batch_size': None, 'steps_per_generation': None, 'num_generations': 8, 'max_completion_length': 512, 'importance_sampling_level': 'token', 'tau_pos': 1.0, 'tau_neg': 1.05, 'epsilon': 0.2, 'epsilon_high': None, 'delta': None, 'use_vllm': True, 'vllm_mode': None, 'vllm_enable_prefix_caching': True, 'vllm_gpu_memory_utilization': 0.9, 'vllm_tensor_parallel_size': 1, 'vllm_max_model_len': None, 'vllm_enforce_eager': False, 'vllm_limit_mm_per_prompt': None, 'vllm_disable_cascade_attn': False, 'vllm_max_num_seqs': None, 'vllm_mm_processor_cache_gb': None, 'vllm_engine_kwargs': None, 'sleep_level': 0, 'offload_optimizer': False, 'offload_model': False, 'offload_bridge': False, 'vllm_server_base_url': None, 'vllm_server_host': None, 'vllm_server_port': [8000], 'vllm_server_timeout': 240.0, 'vllm_server_group_port': None, 'reward_funcs': [], 'reward_weights': None, 'cosine_min_len_value_wrong': -0.5, 
'cosine_max_len_value_wrong': 0.0, 'cosine_min_len_value_correct': 1.0, 'cosine_max_len_value_correct': 0.5, 'cosine_max_len': None, 'repetition_n_grams': 3, 'repetition_max_penalty': -1.0, 'soft_max_length': None, 'soft_cache_length': None, 'dynamic_sample': False, 'max_resample_times': 3, 'overlong_filter': False, 'scale_rewards': 'group', 'advantage_estimator': 'grpo', 'kl_in_reward': False, 'wandb_log_unique_prompts': None, 'log_completions': False, 'rollout_importance_sampling_mode': None, 'rollout_importance_sampling_threshold': 2.0, 'log_rollout_offpolicy_metrics': False, 'off_policy_sequence_mask_delta': None, 'reward_model': None, 'reward_model_plugin': None, 'sync_ref_model': False, 'ref_model_sync_steps': 512, 'ref_model_mixup_alpha': 0.6, 'async_generate': False, 'move_model_batches': None, 'multi_turn_scheduler': None, 'max_turns': None, 'completion_length_limit_scope': 'per_round', 'vllm_server_pass_dataset': False, 'log_entropy': False, 'top_entropy_quantile': 1.0, 'num_iterations': 1, 'check_model': True, 'initialize_embedding': False, 'mlp_padding_free': False, 'load_safetensors': False, 'save_safetensors': False, 'ref_model': None, 'ref_adapters': [], 'merge_lora': False, 'max_shard_size': '5GB', 'train_dataloader_shuffle': True, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': True, 'dataloader_prefetch_factor': 10, 'architectures': 'Qwen3ForCausalLM', 'llm_architectures': 'Qwen3ForCausalLM', 'max_epochs': None, 'enable_dft_loss': False, 'enable_channel_loss': False, 'patch_size': 1, 'save_strategy': 'steps', 'original_max_position_embeddings': None, 'partial_rotary_factor': None, 'use_shared_expert_gate': False, 'vit_gradient_checkpointing': True, 'vit_lr': None, 'aligner_lr': None, 'gradient_checkpointing_kwargs': None, 'linear_num_value_heads': None, 'linear_num_key_heads': None, 'linear_key_head_dim': None, 'linear_value_head_dim': None, 'linear_conv_kernel_dim': None, 'layer_types': None, 'mrope_interleaved': False, 
'add_version': True, 'virtual_pipeline_model_parallel_size': None, 'params_dtype': torch.bfloat16, 'consumed_train_samples': 0, 'skipped_train_samples': 0, 'consumed_valid_samples': 0, 'variable_seq_lengths': False, '_wandb': {}}
+2025-12-24 03:45:18,798 INFO    MainThread:611253 [wandb_init.py:init():889] starting backend
+2025-12-24 03:45:19,067 INFO    MainThread:611253 [wandb_init.py:init():892] sending inform_init request
+2025-12-24 03:45:19,070 INFO    MainThread:611253 [wandb_init.py:init():900] backend started and connected
+2025-12-24 03:45:19,074 INFO    MainThread:611253 [wandb_init.py:init():970] updated telemetry
+2025-12-24 03:45:19,081 INFO    MainThread:611253 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
+2025-12-24 03:45:19,580 INFO    MainThread:611253 [wandb_init.py:init():1041] starting run threads in backend
+2025-12-24 03:45:19,738 INFO    MainThread:611253 [wandb_run.py:_console_start():2521] atexit reg
+2025-12-24 03:45:19,738 INFO    MainThread:611253 [wandb_run.py:_redirect():2369] redirect: wrap_raw
+2025-12-24 03:45:19,738 INFO    MainThread:611253 [wandb_run.py:_redirect():2438] Wrapping output streams.
+2025-12-24 03:45:19,738 INFO    MainThread:611253 [wandb_run.py:_redirect():2461] Redirects installed.
+2025-12-24 03:45:19,742 INFO    MainThread:611253 [wandb_init.py:init():1081] run started, returning control to user process
+2025-12-26 07:36:28,290 INFO    MainThread:611253 [wandb_run.py:_finish():2287] finishing run tepic/plt/gd3q7mjv
+2025-12-26 07:36:28,292 INFO    MainThread:611253 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0
+2025-12-26 07:36:28,293 INFO    MainThread:611253 [wandb_run.py:_restore():2468] restore
+2025-12-26 07:36:28,293 INFO    MainThread:611253 [wandb_run.py:_restore():2474] restore done
+2025-12-26 07:36:29,517 INFO    MainThread:611253 [wandb_run.py:_footer_sync_info():3862] logging synced files

wandb/wandb/run-20251224_034518-gd3q7mjv/run-gd3q7mjv.wandb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ee9f0fe4fc8a60cfce51f5a6d3a2c72ba17eaf99be8dfd51ebe7114742980546
+size 86368426