diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/args.json new file mode 100644 index 0000000000000000000000000000000000000000..74b472bbbc8748201a9f8fe1dbb9fc5e9bb7d0a7 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/Marco-o1", + "model_type": "marco_o1", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "marco_o1", + "system": "You are a helpful assistant.", + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_random20_system.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": 
[], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 200, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": 
false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + 
"resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + 
"boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + 
"cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 4, + "local_world_size": 4, + "model_suffix": "Marco-o1", + "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, 
logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 
'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, 
precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/README.md b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6be0c96a909959c265a0ab602cd067e0d3fbbde6 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/Marco-o1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + 
+ +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/adapter_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..19b533627149e49064da1e0499ae385be3ba91cf --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "up_proj", + "q_proj", + "down_proj", + "v_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..769434819ffdba0e62edfbea133a5f3f81419b36 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9e52255857ec7ffa31d4b7a55befca1557263cad59e8426b72706443dc8bf45 +size 40422208 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/additional_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/args.json new file mode 100644 index 0000000000000000000000000000000000000000..74b472bbbc8748201a9f8fe1dbb9fc5e9bb7d0a7 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/Marco-o1", + "model_type": "marco_o1", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "marco_o1", + "system": "You are a helpful assistant.", + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_random20_system.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 200, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 4, + "local_world_size": 4, + "model_suffix": "Marco-o1", + "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', 
num_labels=None)", + "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, 
use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, 
optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, 
metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f451a96f9827b26925ff5d2aea804cd5b88d76d8 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d8a85458b2b2064332e1f0a7f65d87cdcd60c0c659d728f67111e110f934661 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ae9a2dad6329a572ce9c358363ba3641f862802 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:899c32d86e329a048bc56fc532041672db03a64112324ce74ede0991b822733d +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..946c94728c60c93a444d4187bc77833cb51b8214 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33638aa13e6f9200e30ea1ee43d5a987332a7b81042c25d8f358c427b6f6ca28 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68009d6a413cc382144abe0ded9c4129e8439b47 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06d72f5b84e6381c6e489dbf6625fcf8bb91d408b25438d55ad7f96ec3fed34b +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f00124e5a0ea627b8addbbf3644b01ed2e2e835 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3052be4c3806c2550bace460681d1af2b3beeb7973b270c598ff4095198a4fc +size 388374 diff --git 
a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0099b653969fd7512c604969ec5d2e07300ef118 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89a650af753c3f9853d157a148cbf7292ea7973999c01035dc62eaa2ffd1a958 +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7b0f50bc9d960908ba49f55ba4c088080152779 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffdb58c9f32cb1cc2fec92b915d06c00271f6c23b3b162644a43d8bc322fad23 +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f694c5feaaca0ce72187371fc6ee20956a7d7ecb --- /dev/null +++ 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55b9d5a7067f65dce1a3a7f5943fce25d44ab9d56b2097dfea4aafe4bf85b681 +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..37ac50652a3badbfb1bdeaccb8b1934575b584eb --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbe0d720c4c75a6a04213fa3b64bacbe794718a53e2b56ebb67a1a795014dfad +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0bc3650851dae439677613c9e23a5528de47b679 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72452d3138d0ca2ff89429e3294a834ae7a68e8596fc757735ca56ae52509d57 +size 15024 diff 
--git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0e00a6e8b4b743026f68d749a8cb3bdd4b746838 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f36e306fb8ebcf53a167bfd6c9af74db410a269ada1e619e3e816f5269543b9d +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..5354141d42e077c356f9ca8c6b12bd7e5e41f2af --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb47ce0c6f815a6f8302b0e3819b4c2315ca71dae3138d97fdceb765cdd0a039 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/scheduler.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9db8a98ca69cd5bfebe102039231d58d7ea374e --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c42b1ef948ce2918e44502f72db30bc09f3c40f0dfa68050c22f884d1aac4ff5 +size 1064 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/trainer_state.json 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9fed2b18b2d15786dd38faf631940c18e0068ffa --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.43920898, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/checkpoint-20", + "epoch": 0.8080808080808081, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 2.741686768010279, + "learning_rate": 2.5e-05, + "logits/chosen": -0.3984375, + "logits/rejected": 0.2080078125, + "logps/chosen": -282.0, + "logps/rejected": -272.0, + "loss": 1.7568359375, + "memory(GiB)": 14.02, + "nll_loss": 1.5859375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.062883 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 3.179213138025169, + "learning_rate": 9.994664874011863e-05, + "logits/chosen": -0.734375, + "logits/rejected": -0.185546875, + "logps/chosen": -358.0, + "logps/rejected": -516.0, + "loss": 1.7740478515625, + "memory(GiB)": 31.0, + "nll_loss": 1.0234375, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.287109375, + "rewards/margins": 0.1455078125, + "rewards/rejected": 0.1416015625, + "step": 5, + "train_speed(iter/s)": 0.084048 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.1865745356779005, + "learning_rate": 9.809128215864097e-05, + "logits/chosen": -0.5078125, + "logits/rejected": -0.036376953125, + "logps/chosen": -348.0, + "logps/rejected": -552.0, + "loss": 1.3421142578125, + "memory(GiB)": 45.84, + "nll_loss": 0.8203125, + 
"rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.6015625, + "rewards/margins": 0.91796875, + "rewards/rejected": 0.6875, + "step": 10, + "train_speed(iter/s)": 0.086989 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.2081256636056697, + "learning_rate": 9.368111953231848e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.703125, + "logps/chosen": -368.0, + "logps/rejected": -284.0, + "loss": 0.77510986328125, + "memory(GiB)": 45.84, + "nll_loss": 0.67578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.984375, + "rewards/margins": 3.3125, + "rewards/rejected": -0.33203125, + "step": 15, + "train_speed(iter/s)": 0.09061 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.12928414880774924, + "learning_rate": 8.695044586103296e-05, + "logits/chosen": -0.2060546875, + "logits/rejected": -0.61328125, + "logps/chosen": -458.0, + "logps/rejected": -310.0, + "loss": 0.53414306640625, + "memory(GiB)": 45.84, + "nll_loss": 0.578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.34375, + "rewards/margins": 5.1875, + "rewards/rejected": -1.859375, + "step": 20, + "train_speed(iter/s)": 0.091844 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.4375, + "eval_logits/rejected": 0.306640625, + "eval_logps/chosen": -4.78125, + "eval_logps/rejected": -160.0, + "eval_loss": 0.439208984375, + "eval_nll_loss": 0.2080078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.03125, + "eval_rewards/margins": 6.21875, + "eval_rewards/rejected": -0.2001953125, + "eval_runtime": 1.2472, + "eval_samples_per_second": 3.207, + "eval_steps_per_second": 0.802, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 72, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + 
}, + "total_flos": 8734807261184.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9e28ad510574d5753b3789c724d404c13a49b6c5 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61f6654b83a8539228bcbbfceff4aad6393d55b5d0ea9f547726948c7dfea8ab +size 8888 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
 training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient. + Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint.
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/README.md b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6be0c96a909959c265a0ab602cd067e0d3fbbde6 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/Marco-o1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/adapter_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..19b533627149e49064da1e0499ae385be3ba91cf --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "up_proj", + "q_proj", + "down_proj", + "v_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dc019ba4c600200e26b0892741a1594e15feaa03 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96f11fe94391435c7d5420e38387d3c3608d75f3f9be24e441266861cccdc8a8 +size 40422208 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/additional_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/args.json new file mode 100644 index 0000000000000000000000000000000000000000..74b472bbbc8748201a9f8fe1dbb9fc5e9bb7d0a7 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/Marco-o1", + "model_type": "marco_o1", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "marco_o1", + "system": "You are a helpful assistant.", + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_random20_system.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 200, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 4, + "local_world_size": 4, + "model_suffix": "Marco-o1", + "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', 
num_labels=None)", + "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, 
use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, 
optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, 
metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e7c80fa1f00a97c8c17c13eef7ccab4a5ffc6a0 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:429e5d67e0755a91d26b3af6e20eebf35d48c9712e983c4912b6b2a32c6cdfba +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5b9e110bc8c35a2630a287afcce7fb3707c9419 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f9f3b35d4369219be01924f6bdfbcdf49a8f0c6e18fe467e925f35aa5a00e47 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..b2feccbcf03e3553114091c9f1f3dd1fb932171b --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64ed82bf699d994cbaae135846c7652886d6334cf796ddbc13b454e7e23f354c +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e25756d3539b7ab936a0195dd897b9a76ee80085 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5806f50ed505b0718d02f32d92d343f4ac2ca7e286eacbe1d3692a5768ae223e +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..521cff52824a521f9600db9459948a11f52abf09 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2caf3158f1f6f446801f4d26836267d7f8eace7459ddd0add1a9abd1b83631aa +size 388374 diff --git 
a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_1_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a726173f4adb98fe2e2b46df9a3a7a5386be326 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4dde91841e38e32dbb367a21a6ab2d644ee359442a82e929982af7d81f1339b +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ded5c487b0af4c917cf833cdafa2261290cfc53a --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42bebe291bfd2d5039060b95d8080361a7981aa9abfcb39bce2d72a9c2ebef8e +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6cc4d81a0b906cb49404ddc23dce222d0007551 --- /dev/null +++ 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d27803d4e916264476fdba88d87a2e03b998cd73b3ea77e32f59ac3bde61a55 +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..67f1c55b2b0a3119f2287d39e40e22b4f158741b --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/latest @@ -0,0 +1 @@ +global_step39 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..f8799407442db08820f995bcf1b9158f696af19f --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70cc56408014c410353d4dd58ae9b03f4be043f5f800324f66fd8e20e99b840e +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..aa0c3c6aeaabc038c714a3fcc9b78d186a4cab59 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d1438e98cc9c53a6852464635ce62e9788e61eb3646b73e33813f487c4b6ae +size 15024 diff 
--git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0f39416636e7990907141a415603582d33812fc9 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4388add9cec90932f8ff0100d27a0574d98e1bad52ff89d44e31967d2b4fbfde +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d3775bcd497f8ad74ece6675e0bbda89fb7ee6f4 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a705d6dfaae4f2c1b4b2be6b25a6eb521ffae6fcba21cc1531e97b60037ed079 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/scheduler.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f8e5c420bc296502c335bcadd512d01972f28a0 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2ab72c0a7472f98efb1865889d6039f3ae7d12fc3c8e7bfeea52279fc333219 +size 1064 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/trainer_state.json 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..079b90493176b0f049e8711e531298376f026695 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.42163086, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/checkpoint-40", + "epoch": 1.606060606060606, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 2.741686768010279, + "learning_rate": 2.5e-05, + "logits/chosen": -0.3984375, + "logits/rejected": 0.2080078125, + "logps/chosen": -282.0, + "logps/rejected": -272.0, + "loss": 1.7568359375, + "memory(GiB)": 14.02, + "nll_loss": 1.5859375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.062883 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 3.179213138025169, + "learning_rate": 9.994664874011863e-05, + "logits/chosen": -0.734375, + "logits/rejected": -0.185546875, + "logps/chosen": -358.0, + "logps/rejected": -516.0, + "loss": 1.7740478515625, + "memory(GiB)": 31.0, + "nll_loss": 1.0234375, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.287109375, + "rewards/margins": 0.1455078125, + "rewards/rejected": 0.1416015625, + "step": 5, + "train_speed(iter/s)": 0.084048 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.1865745356779005, + "learning_rate": 9.809128215864097e-05, + "logits/chosen": -0.5078125, + "logits/rejected": -0.036376953125, + "logps/chosen": -348.0, + "logps/rejected": -552.0, + "loss": 1.3421142578125, + "memory(GiB)": 45.84, + "nll_loss": 0.8203125, + 
"rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.6015625, + "rewards/margins": 0.91796875, + "rewards/rejected": 0.6875, + "step": 10, + "train_speed(iter/s)": 0.086989 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.2081256636056697, + "learning_rate": 9.368111953231848e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.703125, + "logps/chosen": -368.0, + "logps/rejected": -284.0, + "loss": 0.77510986328125, + "memory(GiB)": 45.84, + "nll_loss": 0.67578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.984375, + "rewards/margins": 3.3125, + "rewards/rejected": -0.33203125, + "step": 15, + "train_speed(iter/s)": 0.09061 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.12928414880774924, + "learning_rate": 8.695044586103296e-05, + "logits/chosen": -0.2060546875, + "logits/rejected": -0.61328125, + "logps/chosen": -458.0, + "logps/rejected": -310.0, + "loss": 0.53414306640625, + "memory(GiB)": 45.84, + "nll_loss": 0.578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.34375, + "rewards/margins": 5.1875, + "rewards/rejected": -1.859375, + "step": 20, + "train_speed(iter/s)": 0.091844 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.4375, + "eval_logits/rejected": 0.306640625, + "eval_logps/chosen": -4.78125, + "eval_logps/rejected": -160.0, + "eval_loss": 0.439208984375, + "eval_nll_loss": 0.2080078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.03125, + "eval_rewards/margins": 6.21875, + "eval_rewards/rejected": -0.2001953125, + "eval_runtime": 1.2472, + "eval_samples_per_second": 3.207, + "eval_steps_per_second": 0.802, + "step": 20 + }, + { + "epoch": 1.0, + "grad_norm": 0.19570778554820287, + "learning_rate": 7.82568207211296e-05, + "logits/chosen": 0.046630859375, + "logits/rejected": -0.140625, + "logps/chosen": -430.0, + "logps/rejected": -516.0, + "loss": 0.468353271484375, + "memory(GiB)": 45.84, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 
5.0, + "rewards/margins": 7.0625, + "rewards/rejected": -2.0625, + "step": 25, + "train_speed(iter/s)": 0.092514 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.09105661940994438, + "learning_rate": 6.806208330935766e-05, + "logits/chosen": -0.158203125, + "logits/rejected": -0.052490234375, + "logps/chosen": -280.0, + "logps/rejected": -502.0, + "loss": 0.4760528564453125, + "memory(GiB)": 45.84, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.53125, + "rewards/margins": 10.625, + "rewards/rejected": -4.125, + "step": 30, + "train_speed(iter/s)": 0.09323 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.08621388537462939, + "learning_rate": 5.6907817747594116e-05, + "logits/chosen": -0.45703125, + "logits/rejected": 0.1806640625, + "logps/chosen": -212.0, + "logps/rejected": -528.0, + "loss": 0.4629302978515625, + "memory(GiB)": 45.84, + "nll_loss": 0.462890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.0, + "rewards/margins": 10.625, + "rewards/rejected": -3.625, + "step": 35, + "train_speed(iter/s)": 0.093963 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.11934377803721716, + "learning_rate": 4.5386582026834906e-05, + "logits/chosen": 0.18359375, + "logits/rejected": -0.35546875, + "logps/chosen": -344.0, + "logps/rejected": -282.0, + "loss": 0.4460845947265625, + "memory(GiB)": 45.84, + "nll_loss": 0.404296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.75, + "rewards/margins": 10.75, + "rewards/rejected": -3.984375, + "step": 40, + "train_speed(iter/s)": 0.094552 + }, + { + "epoch": 1.606060606060606, + "eval_logits/chosen": -1.4375, + "eval_logits/rejected": 0.953125, + "eval_logps/chosen": -5.09375, + "eval_logps/rejected": -178.0, + "eval_loss": 0.421630859375, + "eval_nll_loss": 0.2216796875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.0, + "eval_rewards/margins": 8.0, + "eval_rewards/rejected": -2.0, + "eval_runtime": 1.4128, + "eval_samples_per_second": 2.831, + 
"eval_steps_per_second": 0.708, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 72, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 17165975126016.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9e28ad510574d5753b3789c724d404c13a49b6c5 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61f6654b83a8539228bcbbfceff4aad6393d55b5d0ea9f547726948c7dfea8ab +size 8888 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. 
Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def 
get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + 
frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient. + Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/README.md b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6be0c96a909959c265a0ab602cd067e0d3fbbde6 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/Marco-o1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/adapter_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..19b533627149e49064da1e0499ae385be3ba91cf --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "up_proj", + "q_proj", + "down_proj", + "v_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7fbf70f5e9dbab491ad68ef42cab5d47b81bd6f9 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b833ad688223299d12ae89a33634d417b275c016962768f74f6c1b3349400502 +size 40422208 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/additional_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/args.json new file mode 100644 index 0000000000000000000000000000000000000000..74b472bbbc8748201a9f8fe1dbb9fc5e9bb7d0a7 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/Marco-o1", + "model_type": "marco_o1", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "marco_o1", + "system": "You are a helpful assistant.", + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_random20_system.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 200, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 4, + "local_world_size": 4, + "model_suffix": "Marco-o1", + "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', 
num_labels=None)", + "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, 
use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, 
optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, 
metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e783cf95a2c851bb2b9b3af4fa47faf30d81e309 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e2b6e9968138490df7cb73722ffab10b88171cee622305bef82d535a3151dfa +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1f06cb9814b1bc14102116170c9c05fbcc6c075 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:880944dff0f0c20d4b6eec7c3fcc2270fdccf63d03e13b03c26d61c9d61cb540 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..9ca6988f7dbcbadd28909f525682b11a668311bb --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61abab95371bd0b099f908aee125a63e3daddf8bcf7954b4e566c6845bf0713f +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6bd94df4047c4934c3677281e85edf1960a57649 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40fe6c96dfd64b22723661fe3c7d7acd78ff6f8a1ca900eb12d1c9e39a313aa1 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5011349fbf7bc231576172746d34f19d8be3a03 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ae87355a9241c0ea4edf21abd0fb8d38012f4aa9723febcef39bbc7901c36db +size 388374 diff --git 
a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_1_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f27a9e89ff30e1b7e5fc2b97fce8cf811b4d1cb --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84abe8df1ad89f1801c346cbe87bf7fead94c9cb08b42f184d0dfb8f1c359ccc +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6a5432b84d6286d0b574866aaab3d0e6352ed4f --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:860eafdd10ecf7c0eaad3c6aca57b888ad1a5f906134d493e84eec89940369c2 +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db55b37d6a437c84db4a22626123c43786e59401 --- /dev/null +++ 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:124f015af78bbf627b57d97070f950f843b2b5ef2f1886ddbeae34a9ce2a2d10 +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..099fa08342218cca7c00fb7043635561ebda9695 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/latest @@ -0,0 +1 @@ +global_step59 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c54ea122b283c04f6b60c1eedefeb301763a8f9f --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:418a5f105ae834c3075024076916b2a9475918fe034c12d0dd5b6d91f1aba467 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea57ead2533e587fe50f62107d7cb32945fe1354 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e07ace389d24bc1307b74f42a1e7b8f0117b0db853e2df64ff3f15cb92916a2 +size 15024 diff 
--git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..4689a9445d07528dc4fd91011a7f034c11773a68 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da6a990f346d7014dffb28fa2bc7d3b890bd3c53712503fce3656da48d3d6e50 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..919b5e43a96a9afdeb196f402142bc3aab67f247 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e95f356ca38179b05993f55daece0223e96fa10b9a1b9ea2102a739211333f63 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/scheduler.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..95126866042baa544d6bc4555d944440b37fdb21 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3e1521c1c8dfc88bc6566a95cc91f42709693a765076997f6318af86035c445 +size 1064 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/trainer_state.json 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1535f6cf60a3e9b40b7e1196105c3c1e2954f335 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.41357422, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/checkpoint-60", + "epoch": 2.404040404040404, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 2.741686768010279, + "learning_rate": 2.5e-05, + "logits/chosen": -0.3984375, + "logits/rejected": 0.2080078125, + "logps/chosen": -282.0, + "logps/rejected": -272.0, + "loss": 1.7568359375, + "memory(GiB)": 14.02, + "nll_loss": 1.5859375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.062883 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 3.179213138025169, + "learning_rate": 9.994664874011863e-05, + "logits/chosen": -0.734375, + "logits/rejected": -0.185546875, + "logps/chosen": -358.0, + "logps/rejected": -516.0, + "loss": 1.7740478515625, + "memory(GiB)": 31.0, + "nll_loss": 1.0234375, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.287109375, + "rewards/margins": 0.1455078125, + "rewards/rejected": 0.1416015625, + "step": 5, + "train_speed(iter/s)": 0.084048 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.1865745356779005, + "learning_rate": 9.809128215864097e-05, + "logits/chosen": -0.5078125, + "logits/rejected": -0.036376953125, + "logps/chosen": -348.0, + "logps/rejected": -552.0, + "loss": 1.3421142578125, + "memory(GiB)": 45.84, + "nll_loss": 0.8203125, + 
"rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.6015625, + "rewards/margins": 0.91796875, + "rewards/rejected": 0.6875, + "step": 10, + "train_speed(iter/s)": 0.086989 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.2081256636056697, + "learning_rate": 9.368111953231848e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.703125, + "logps/chosen": -368.0, + "logps/rejected": -284.0, + "loss": 0.77510986328125, + "memory(GiB)": 45.84, + "nll_loss": 0.67578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.984375, + "rewards/margins": 3.3125, + "rewards/rejected": -0.33203125, + "step": 15, + "train_speed(iter/s)": 0.09061 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.12928414880774924, + "learning_rate": 8.695044586103296e-05, + "logits/chosen": -0.2060546875, + "logits/rejected": -0.61328125, + "logps/chosen": -458.0, + "logps/rejected": -310.0, + "loss": 0.53414306640625, + "memory(GiB)": 45.84, + "nll_loss": 0.578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.34375, + "rewards/margins": 5.1875, + "rewards/rejected": -1.859375, + "step": 20, + "train_speed(iter/s)": 0.091844 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.4375, + "eval_logits/rejected": 0.306640625, + "eval_logps/chosen": -4.78125, + "eval_logps/rejected": -160.0, + "eval_loss": 0.439208984375, + "eval_nll_loss": 0.2080078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.03125, + "eval_rewards/margins": 6.21875, + "eval_rewards/rejected": -0.2001953125, + "eval_runtime": 1.2472, + "eval_samples_per_second": 3.207, + "eval_steps_per_second": 0.802, + "step": 20 + }, + { + "epoch": 1.0, + "grad_norm": 0.19570778554820287, + "learning_rate": 7.82568207211296e-05, + "logits/chosen": 0.046630859375, + "logits/rejected": -0.140625, + "logps/chosen": -430.0, + "logps/rejected": -516.0, + "loss": 0.468353271484375, + "memory(GiB)": 45.84, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 
5.0, + "rewards/margins": 7.0625, + "rewards/rejected": -2.0625, + "step": 25, + "train_speed(iter/s)": 0.092514 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.09105661940994438, + "learning_rate": 6.806208330935766e-05, + "logits/chosen": -0.158203125, + "logits/rejected": -0.052490234375, + "logps/chosen": -280.0, + "logps/rejected": -502.0, + "loss": 0.4760528564453125, + "memory(GiB)": 45.84, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.53125, + "rewards/margins": 10.625, + "rewards/rejected": -4.125, + "step": 30, + "train_speed(iter/s)": 0.09323 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.08621388537462939, + "learning_rate": 5.6907817747594116e-05, + "logits/chosen": -0.45703125, + "logits/rejected": 0.1806640625, + "logps/chosen": -212.0, + "logps/rejected": -528.0, + "loss": 0.4629302978515625, + "memory(GiB)": 45.84, + "nll_loss": 0.462890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.0, + "rewards/margins": 10.625, + "rewards/rejected": -3.625, + "step": 35, + "train_speed(iter/s)": 0.093963 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.11934377803721716, + "learning_rate": 4.5386582026834906e-05, + "logits/chosen": 0.18359375, + "logits/rejected": -0.35546875, + "logps/chosen": -344.0, + "logps/rejected": -282.0, + "loss": 0.4460845947265625, + "memory(GiB)": 45.84, + "nll_loss": 0.404296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.75, + "rewards/margins": 10.75, + "rewards/rejected": -3.984375, + "step": 40, + "train_speed(iter/s)": 0.094552 + }, + { + "epoch": 1.606060606060606, + "eval_logits/chosen": -1.4375, + "eval_logits/rejected": 0.953125, + "eval_logps/chosen": -5.09375, + "eval_logps/rejected": -178.0, + "eval_loss": 0.421630859375, + "eval_nll_loss": 0.2216796875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.0, + "eval_rewards/margins": 8.0, + "eval_rewards/rejected": -2.0, + "eval_runtime": 1.4128, + "eval_samples_per_second": 2.831, + 
"eval_steps_per_second": 0.708, + "step": 40 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 0.1278829740469663, + "learning_rate": 3.411042902090492e-05, + "logits/chosen": -0.0233154296875, + "logits/rejected": 0.328125, + "logps/chosen": -310.0, + "logps/rejected": -494.0, + "loss": 0.539617919921875, + "memory(GiB)": 45.84, + "nll_loss": 0.51953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.75, + "rewards/margins": 10.625, + "rewards/rejected": -2.875, + "step": 45, + "train_speed(iter/s)": 0.094018 + }, + { + "epoch": 2.0, + "grad_norm": 0.20021892626227725, + "learning_rate": 2.3678391856132204e-05, + "logits/chosen": 0.181640625, + "logits/rejected": 0.0159912109375, + "logps/chosen": -300.0, + "logps/rejected": -280.0, + "loss": 0.437335205078125, + "memory(GiB)": 45.85, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.8125, + "rewards/margins": 11.3125, + "rewards/rejected": -3.5, + "step": 50, + "train_speed(iter/s)": 0.094129 + }, + { + "epoch": 2.202020202020202, + "grad_norm": 0.17561192586448465, + "learning_rate": 1.4644660940672627e-05, + "logits/chosen": 0.0888671875, + "logits/rejected": 0.3046875, + "logps/chosen": -298.0, + "logps/rejected": -620.0, + "loss": 0.48487548828125, + "memory(GiB)": 45.85, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.96875, + "rewards/margins": 9.375, + "rewards/rejected": -2.40625, + "step": 55, + "train_speed(iter/s)": 0.094012 + }, + { + "epoch": 2.404040404040404, + "grad_norm": 0.06159661984448856, + "learning_rate": 7.489143213519301e-06, + "logits/chosen": 0.0595703125, + "logits/rejected": -0.3125, + "logps/chosen": -422.0, + "logps/rejected": -468.0, + "loss": 0.45642852783203125, + "memory(GiB)": 45.85, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.125, + "rewards/margins": 11.1875, + "rewards/rejected": -3.0625, + "step": 60, + "train_speed(iter/s)": 0.094412 + }, + { + "epoch": 2.404040404040404, + 
"eval_logits/chosen": -1.4140625, + "eval_logits/rejected": 1.171875, + "eval_logps/chosen": -4.8125, + "eval_logps/rejected": -194.0, + "eval_loss": 0.41357421875, + "eval_nll_loss": 0.208984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.03125, + "eval_rewards/margins": 9.625, + "eval_rewards/rejected": -3.59375, + "eval_runtime": 1.3539, + "eval_samples_per_second": 2.954, + "eval_steps_per_second": 0.739, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 72, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 26206478368768.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9e28ad510574d5753b3789c724d404c13a49b6c5 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61f6654b83a8539228bcbbfceff4aad6393d55b5d0ea9f547726948c7dfea8ab +size 8888 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ 
+#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should 
be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in 
state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/README.md b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6be0c96a909959c265a0ab602cd067e0d3fbbde6 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/Marco-o1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/adapter_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..19b533627149e49064da1e0499ae385be3ba91cf --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "up_proj", + "q_proj", + "down_proj", + "v_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..07356f87962f58b0a54e8b164cfac213aee29101 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff451d5dacda09cd8d5e772db9d49595fba9aaf2a8d4142ee5134d33549539e6 +size 40422208 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/additional_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/args.json new file mode 100644 index 0000000000000000000000000000000000000000..74b472bbbc8748201a9f8fe1dbb9fc5e9bb7d0a7 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/args.json @@ -0,0 +1,371 @@ +{ + "model": 
"/home/wangruotong/LLM_test/Models/Marco-o1", + "model_type": "marco_o1", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "marco_o1", + "system": "You are a helpful assistant.", + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_random20_system.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + 
"do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 200, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + 
"fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + 
"push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + 
"galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 4, + "local_world_size": 4, + "model_suffix": "Marco-o1", + "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', 
num_labels=None)", + "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, 
use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, 
optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, 
metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f2ca773af6eb647432387f94fd0d38244fa9880 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06d5b6b9eb6990c77f4187385e1ecd43690a709b63536d9d4aac87b389862dcc +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0361ad7512916df91e9d9d5b6776f9b3a2f8aeb --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61c12938e258e9f032221091191c41232e0f66e15b2e2a5b6fb707ad0af4701f +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..3bf980a291414bf2ca5063855c4ef33bdf2ee2d2 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b320ed365a63b3109f4755f5bc13522b7b5a881053de0cb9353d4597c4d691d +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b089aa8a9cf884d93b7e2644bd3f6512ff86a7a --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98dddd7880cafd60833a0e6809618ef6b0d7d22e342d49d7609688cf91a3a09b +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a524f27b92645e5645dfcc9b75110e322a57a6b2 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89f7e868315deb2e0bb0064d1d57817ce3429b318a03d3a3dd62a9dfce05bbef +size 388374 diff --git 
a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_1_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e47b1e355f370a9f30c7c77ba856b6efddbb168f --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b64cb77e899835cb8a51cfb333ad499ca2901e784ea087a7c21b702d596df647 +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7726800cef0fc2d62c65ee7dd3f9e0cb17c01440 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3afc72b3e2aaabd252874436aacbe2ce7bff38be7845ea0f1ffb58987003f91e +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b02e5338327c0d40effa89d5444bb462c2d1d89c --- /dev/null +++ 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c3089aac8b20a3c6b4d889ebbe8a52e9c331dd4d49e66893559428bee931945 +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/latest new file mode 100644 index 0000000000000000000000000000000000000000..bbeadc7466d2728e3046120a012ebc37c29267cb --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/latest @@ -0,0 +1 @@ +global_step71 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..be2e24cc9d9ef8857272cec1451c810e205ec4e9 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef002048764051a71fb00f8f978e9ec32b780dc850bdb059af362cc56494234b +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..efcf4dd2e74596ac28af81f9f8bd0be9a807deb3 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37194a6d48612e1a46a2d5d317ead97c70d9fc4569b0118fcd5f84c3dc9daa5a +size 15024 diff 
--git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..4c9222e37d4e9d1745c0e126e0fe0c4a348e298d --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17c179483659a784aa1ace2427daff48c556a6bcc3c330e6f3274e4dc95e4b49 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..7821bf0f5f0621fd0159152432f0a7bc66aa6823 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b56857c9b117629f35af2c3d64f522d33a9d8aa94faa81ec6956380a895118c4 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/scheduler.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d0d6f336655bdacf5eb53294b71e20f2d0edb17 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2ba16a2cd6668009497101c7aa1ee348685f1df2d9a2a20c23be3737c813063 +size 1064 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/trainer_state.json 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8b0a81da7c7b5decbdc85f2c036c193db226e332 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/trainer_state.json @@ -0,0 +1,371 @@ +{ + "best_metric": 0.41357422, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/checkpoint-60", + "epoch": 2.888888888888889, + "eval_steps": 20, + "global_step": 72, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 2.741686768010279, + "learning_rate": 2.5e-05, + "logits/chosen": -0.3984375, + "logits/rejected": 0.2080078125, + "logps/chosen": -282.0, + "logps/rejected": -272.0, + "loss": 1.7568359375, + "memory(GiB)": 14.02, + "nll_loss": 1.5859375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.062883 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 3.179213138025169, + "learning_rate": 9.994664874011863e-05, + "logits/chosen": -0.734375, + "logits/rejected": -0.185546875, + "logps/chosen": -358.0, + "logps/rejected": -516.0, + "loss": 1.7740478515625, + "memory(GiB)": 31.0, + "nll_loss": 1.0234375, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.287109375, + "rewards/margins": 0.1455078125, + "rewards/rejected": 0.1416015625, + "step": 5, + "train_speed(iter/s)": 0.084048 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.1865745356779005, + "learning_rate": 9.809128215864097e-05, + "logits/chosen": -0.5078125, + "logits/rejected": -0.036376953125, + "logps/chosen": -348.0, + "logps/rejected": -552.0, + "loss": 1.3421142578125, + "memory(GiB)": 45.84, + "nll_loss": 0.8203125, + 
"rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.6015625, + "rewards/margins": 0.91796875, + "rewards/rejected": 0.6875, + "step": 10, + "train_speed(iter/s)": 0.086989 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.2081256636056697, + "learning_rate": 9.368111953231848e-05, + "logits/chosen": -0.23828125, + "logits/rejected": -0.703125, + "logps/chosen": -368.0, + "logps/rejected": -284.0, + "loss": 0.77510986328125, + "memory(GiB)": 45.84, + "nll_loss": 0.67578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.984375, + "rewards/margins": 3.3125, + "rewards/rejected": -0.33203125, + "step": 15, + "train_speed(iter/s)": 0.09061 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.12928414880774924, + "learning_rate": 8.695044586103296e-05, + "logits/chosen": -0.2060546875, + "logits/rejected": -0.61328125, + "logps/chosen": -458.0, + "logps/rejected": -310.0, + "loss": 0.53414306640625, + "memory(GiB)": 45.84, + "nll_loss": 0.578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.34375, + "rewards/margins": 5.1875, + "rewards/rejected": -1.859375, + "step": 20, + "train_speed(iter/s)": 0.091844 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.4375, + "eval_logits/rejected": 0.306640625, + "eval_logps/chosen": -4.78125, + "eval_logps/rejected": -160.0, + "eval_loss": 0.439208984375, + "eval_nll_loss": 0.2080078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.03125, + "eval_rewards/margins": 6.21875, + "eval_rewards/rejected": -0.2001953125, + "eval_runtime": 1.2472, + "eval_samples_per_second": 3.207, + "eval_steps_per_second": 0.802, + "step": 20 + }, + { + "epoch": 1.0, + "grad_norm": 0.19570778554820287, + "learning_rate": 7.82568207211296e-05, + "logits/chosen": 0.046630859375, + "logits/rejected": -0.140625, + "logps/chosen": -430.0, + "logps/rejected": -516.0, + "loss": 0.468353271484375, + "memory(GiB)": 45.84, + "nll_loss": 0.5078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 
5.0, + "rewards/margins": 7.0625, + "rewards/rejected": -2.0625, + "step": 25, + "train_speed(iter/s)": 0.092514 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.09105661940994438, + "learning_rate": 6.806208330935766e-05, + "logits/chosen": -0.158203125, + "logits/rejected": -0.052490234375, + "logps/chosen": -280.0, + "logps/rejected": -502.0, + "loss": 0.4760528564453125, + "memory(GiB)": 45.84, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.53125, + "rewards/margins": 10.625, + "rewards/rejected": -4.125, + "step": 30, + "train_speed(iter/s)": 0.09323 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.08621388537462939, + "learning_rate": 5.6907817747594116e-05, + "logits/chosen": -0.45703125, + "logits/rejected": 0.1806640625, + "logps/chosen": -212.0, + "logps/rejected": -528.0, + "loss": 0.4629302978515625, + "memory(GiB)": 45.84, + "nll_loss": 0.462890625, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.0, + "rewards/margins": 10.625, + "rewards/rejected": -3.625, + "step": 35, + "train_speed(iter/s)": 0.093963 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.11934377803721716, + "learning_rate": 4.5386582026834906e-05, + "logits/chosen": 0.18359375, + "logits/rejected": -0.35546875, + "logps/chosen": -344.0, + "logps/rejected": -282.0, + "loss": 0.4460845947265625, + "memory(GiB)": 45.84, + "nll_loss": 0.404296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.75, + "rewards/margins": 10.75, + "rewards/rejected": -3.984375, + "step": 40, + "train_speed(iter/s)": 0.094552 + }, + { + "epoch": 1.606060606060606, + "eval_logits/chosen": -1.4375, + "eval_logits/rejected": 0.953125, + "eval_logps/chosen": -5.09375, + "eval_logps/rejected": -178.0, + "eval_loss": 0.421630859375, + "eval_nll_loss": 0.2216796875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.0, + "eval_rewards/margins": 8.0, + "eval_rewards/rejected": -2.0, + "eval_runtime": 1.4128, + "eval_samples_per_second": 2.831, + 
"eval_steps_per_second": 0.708, + "step": 40 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 0.1278829740469663, + "learning_rate": 3.411042902090492e-05, + "logits/chosen": -0.0233154296875, + "logits/rejected": 0.328125, + "logps/chosen": -310.0, + "logps/rejected": -494.0, + "loss": 0.539617919921875, + "memory(GiB)": 45.84, + "nll_loss": 0.51953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.75, + "rewards/margins": 10.625, + "rewards/rejected": -2.875, + "step": 45, + "train_speed(iter/s)": 0.094018 + }, + { + "epoch": 2.0, + "grad_norm": 0.20021892626227725, + "learning_rate": 2.3678391856132204e-05, + "logits/chosen": 0.181640625, + "logits/rejected": 0.0159912109375, + "logps/chosen": -300.0, + "logps/rejected": -280.0, + "loss": 0.437335205078125, + "memory(GiB)": 45.85, + "nll_loss": 0.40234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.8125, + "rewards/margins": 11.3125, + "rewards/rejected": -3.5, + "step": 50, + "train_speed(iter/s)": 0.094129 + }, + { + "epoch": 2.202020202020202, + "grad_norm": 0.17561192586448465, + "learning_rate": 1.4644660940672627e-05, + "logits/chosen": 0.0888671875, + "logits/rejected": 0.3046875, + "logps/chosen": -298.0, + "logps/rejected": -620.0, + "loss": 0.48487548828125, + "memory(GiB)": 45.85, + "nll_loss": 0.45703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.96875, + "rewards/margins": 9.375, + "rewards/rejected": -2.40625, + "step": 55, + "train_speed(iter/s)": 0.094012 + }, + { + "epoch": 2.404040404040404, + "grad_norm": 0.06159661984448856, + "learning_rate": 7.489143213519301e-06, + "logits/chosen": 0.0595703125, + "logits/rejected": -0.3125, + "logps/chosen": -422.0, + "logps/rejected": -468.0, + "loss": 0.45642852783203125, + "memory(GiB)": 45.85, + "nll_loss": 0.515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.125, + "rewards/margins": 11.1875, + "rewards/rejected": -3.0625, + "step": 60, + "train_speed(iter/s)": 0.094412 + }, + { + "epoch": 2.404040404040404, + 
"eval_logits/chosen": -1.4140625, + "eval_logits/rejected": 1.171875, + "eval_logps/chosen": -4.8125, + "eval_logps/rejected": -194.0, + "eval_loss": 0.41357421875, + "eval_nll_loss": 0.208984375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.03125, + "eval_rewards/margins": 9.625, + "eval_rewards/rejected": -3.59375, + "eval_runtime": 1.3539, + "eval_samples_per_second": 2.954, + "eval_steps_per_second": 0.739, + "step": 60 + }, + { + "epoch": 2.606060606060606, + "grad_norm": 0.12507359172791535, + "learning_rate": 2.591967620451707e-06, + "logits/chosen": -0.41796875, + "logits/rejected": 0.447265625, + "logps/chosen": -206.0, + "logps/rejected": -488.0, + "loss": 0.47344970703125, + "memory(GiB)": 45.85, + "nll_loss": 0.55078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.5, + "rewards/margins": 11.75, + "rewards/rejected": -4.25, + "step": 65, + "train_speed(iter/s)": 0.094381 + }, + { + "epoch": 2.808080808080808, + "grad_norm": 0.13343167302632133, + "learning_rate": 2.1329118524827662e-07, + "logits/chosen": 0.0218505859375, + "logits/rejected": 0.055908203125, + "logps/chosen": -274.0, + "logps/rejected": -446.0, + "loss": 0.43180007934570314, + "memory(GiB)": 45.85, + "nll_loss": 0.458984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.5, + "rewards/margins": 11.25, + "rewards/rejected": -3.734375, + "step": 70, + "train_speed(iter/s)": 0.094656 + }, + { + "epoch": 2.888888888888889, + "eval_logits/chosen": -1.421875, + "eval_logits/rejected": 1.1875, + "eval_logps/chosen": -4.84375, + "eval_logps/rejected": -196.0, + "eval_loss": 0.414306640625, + "eval_nll_loss": 0.2099609375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.03125, + "eval_rewards/margins": 9.8125, + "eval_rewards/rejected": -3.796875, + "eval_runtime": 1.3572, + "eval_samples_per_second": 2.947, + "eval_steps_per_second": 0.737, + "step": 72 + } + ], + "logging_steps": 5, + "max_steps": 72, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, 
+ "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 31365457739776.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9e28ad510574d5753b3789c724d404c13a49b6c5 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61f6654b83a8539228bcbbfceff4aad6393d55b5d0ea9f547726948c7dfea8ab +size 8888 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
# Usage example (continuation of the header comment that precedes this block):
#   python zero_to_fp32.py . output_dir/ --safe_serialization

import argparse
import torch
import glob
import math
import os
import re
import gc
import json
import numpy as np
from tqdm import tqdm
from collections import OrderedDict
from dataclasses import dataclass

# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
# DeepSpeed data structures it has to be available in the current python environment.
from deepspeed.utils import logger
from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)


@dataclass
class zero_model_state:
    """Fields extracted from one ``*_model_states.pt`` file by ``parse_model_states``."""
    buffers: dict
    # consumed below as a sequence of per-param-group ``{name: shape}`` mappings
    param_shapes: dict
    shared_params: list
    ds_version: int
    frozen_param_shapes: dict
    frozen_param_fragments: dict


# module-level verbosity switch; the command-line entry point sets it from ``--debug``
debug = 0

# load to cpu
device = torch.device('cpu')


def atoi(text):
    """Return ``int(text)`` when ``text`` consists only of digits, otherwise ``text`` unchanged."""
    return int(text) if text.isdigit() else text


def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    return [atoi(c) for c in re.split(r'(\d+)', text)]


def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the rank-0 model states file inside ``checkpoint_dir``.

    Raises:
        - ``FileNotFoundError``: the directory or the expected file does not exist
        - ``ValueError``: ``zero_stage`` is neither <= 2 nor 3
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # without this branch ``file`` would be unbound and the failure would be an
        # UnboundLocalError further down instead of a clear message
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file


def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """
    Return the files in ``checkpoint_dir`` matching ``glob_pattern``, in natural (human) order.

    Raises:
        - ``FileNotFoundError``: nothing matched
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)

    if len(ckpt_files) == 0:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return ckpt_files


def get_optim_files(checkpoint_dir):
    """Return all ``*_optim_states.pt`` files of the checkpoint, naturally sorted."""
    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")


def get_model_state_files(checkpoint_dir):
    """Return all ``*_model_states.pt`` files of the checkpoint, naturally sorted."""
    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")


def parse_model_states(files):
    """
    Load every model states file and return a list of ``zero_model_state``, one per file.

    Raises:
        - ``ValueError``: a file does not contain the buffer-names entry
    """
    zero_model_states = []
    for file in files:
        # NOTE(review): weights_only=False unpickles arbitrary objects; only run this on
        # checkpoints you trust.
        state_dict = torch.load(file, map_location=device, weights_only=False)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if frozen_param_shapes is not None and debug:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states


def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load the optimizer state files and extract the fp32 flat groups of every rank.

    Returns:
        - ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups[i]`` is
          the fp32 groups entry read from the i-th file

    Raises:
        - ``ValueError``: not a zero checkpoint, unknown stage, or the number of files
          differs from the recorded partition count
    """
    total_files = len(files)
    state_dicts = []
    for f in tqdm(files, desc='Loading checkpoint shards'):
        # NOTE(review): weights_only=False unpickles arbitrary objects; only run this on
        # checkpoints you trust.
        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.

    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
    return zero_stage, world_size, fp32_flat_groups


def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: when true, frozen parameters are not merged into the result

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_files = get_model_state_files(ds_checkpoint_dir)

    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # parse_optim_states already rejected any stage other than <= 2 or 3
    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
    elif zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)


def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy the frozen parameters recorded in the first model state into ``state_dict`` (in place)."""
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    frozen_param_fragments = zero_model_states[0].frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

        wanted_params = len(frozen_param_shapes)
        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
        print(f'Frozen params: Have {avail_numel} numels to process.')
        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


def _has_callable(obj, fn):
    """Return True when ``obj`` has an attribute named ``fn`` that is callable."""
    attr = getattr(obj, fn, None)
    return callable(attr)


def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild every trainable parameter from the per-rank flat fp32 groups (ZeRO stage <= 2)
    and store it in ``state_dict`` (in place).

    Raises:
        - ``ValueError``: the aligned number of consumed elements differs from the aligned
          size of a merged group
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        # concatenate the i-th group of every rank into one flat vector
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(
        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            # a shape may be an object exposing numel() or a plain sequence of ints
            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")


def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble buffers, (optionally) frozen params, trainable params and shared params for ZeRO stage <= 2."""
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in zero_model_states[0].shared_params:
        if pair[1] in state_dict:
            state_dict[pair[0]] = state_dict[pair[1]]

    return state_dict


def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return ``(partitioned_numel, padding_numel)`` for a param of ``unpartitioned_numel`` split over ``world_size``."""
    remainder = unpartitioned_numel % world_size
    padding_numel = (world_size - remainder) if remainder else 0
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    return partitioned_numel, padding_numel


def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Concatenate the per-rank fragments of each frozen parameter and store it in ``state_dict`` (in place)."""
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in zero_model_states[0].frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        # narrow() drops whatever trails the real elements after concatenation
        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


class GatheredTensor:
    """
    A pseudo tensor that collects partitioned weights.
    It is more memory efficient when there are multiple groups.
    """

    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
        self.flat_groups = flat_groups
        self.flat_groups_offset = flat_groups_offset
        self.offset = offset
        self.partitioned_numel = partitioned_numel
        self.shape = shape
        self.dtype = self.flat_groups[0][0].dtype

    def contiguous(self):
        """
        Merge partitioned weights from flat_groups into a single tensor.
        """
        end_idx = self.offset + self.partitioned_numel
        world_size = len(self.flat_groups)
        pad_flat_param_chunks = []

        for rank_i in range(world_size):
            # for each rank, we need to collect weights from related group/groups
            flat_groups_at_rank_i = self.flat_groups[rank_i]
            start_group_id = None
            end_group_id = None
            for group_id in range(len(self.flat_groups_offset)):
                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
                    start_group_id = group_id
                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
                    end_group_id = group_id
                    break
            # collect weights from related group/groups
            # NOTE(review): start_offset would be negative for any group after the first one
            # if a parameter ever spanned several groups - presumably that never happens; confirm.
            for group_id in range(start_group_id, end_group_id + 1):
                flat_tensor = flat_groups_at_rank_i[group_id]
                start_offset = self.offset - self.flat_groups_offset[group_id]
                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])

        # collect weights from all ranks
        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
        return param


def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Register a lazy ``GatheredTensor`` in ``state_dict`` for every trainable parameter (ZeRO stage 3).

    Raises:
        - ``ValueError``: the number of consumed elements differs from the available ones
    """
    param_shapes = zero_model_states[0].param_shapes
    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size

    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        # Each rank holds a sequence of flat tensors (one per group), so the shapes are
        # reported per group. Calling .shape/.numel() on the per-rank container itself
        # fails, and recomputing avail_numel here would clobber the value computed above.
        for i in range(world_size):
            for j, flat_group in enumerate(fp32_flat_groups[i]):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={flat_group.shape}")

        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Trainable params: Have {avail_numel} numels to process.")
        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1
        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # memory efficient tensor
        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
        state_dict[name] = tensor
        offset += partitioned_numel

    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")


def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble buffers, (optionally) frozen params, trainable params and shared params for ZeRO stage 3."""
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in zero_model_states[0].shared_params:
        if pair[1] in state_dict:
            state_dict[pair[0]] = state_dict[pair[1]]

    return state_dict


def to_torch_tensor(state_dict, return_empty_tensor=False):
    """
    Convert state_dict of GatheredTensor to torch tensor

    With ``return_empty_tensor=True`` only uninitialized tensors of the right shape and
    dtype are allocated. Entries that are the same object share one converted tensor.
    """
    torch_state_dict = {}
    converted_tensors = {}
    for name, tensor in state_dict.items():
        tensor_id = id(tensor)
        if tensor_id in converted_tensors:  # shared tensors
            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
            torch_state_dict[name] = shared_tensor
        else:
            converted_tensors[tensor_id] = name
            if return_empty_tensor:
                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
            else:
                torch_state_dict[name] = tensor.contiguous()
    return torch_state_dict


def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                             tag=None,
                                             exclude_frozen_parameters=False,
                                             lazy_mode=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient.
          Convert the pseudo tensor to torch tensor by ``.contiguous()``

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: ``tag`` is None and there is no 'latest' file in ``checkpoint_dir``
        - ``FileNotFoundError``: the tag folder does not exist

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint. Or you can load state_dict in lazy mode ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
        for name, lazy_tensor in state_dict.items():
            tensor = lazy_tensor.contiguous()  # to cpu
            print(name, tensor)
            # del tensor to release memory if it no longer in use
    """
    if tag is None:
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if os.path.isfile(latest_path):
            with open(latest_path, 'r') as fd:
                tag = fd.read().strip()
        else:
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
    if lazy_mode:
        return state_dict
    else:
        return to_torch_tensor(state_dict)


def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
                                               output_dir,
                                               max_shard_size="5GB",
                                               safe_serialization=False,
                                               tag=None,
                                               exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_dir``: directory to the pytorch fp32 state_dict output files
        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
    """

    # Dependency pre-check
    if safe_serialization:
        try:
            from safetensors.torch import save_file
        except ImportError:
            print('If you want to use `safe_serialization`, please `pip install safetensors`')
            raise
    if max_shard_size is not None:
        try:
            from huggingface_hub import split_torch_state_dict_into_shards
        except ImportError:
            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
            raise

    # Convert zero checkpoint to state_dict
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                                          tag,
                                                          exclude_frozen_parameters,
                                                          lazy_mode=True)

    # Shard the model if it is too big.
    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
    if max_shard_size is not None:
        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
        # an memory-efficient approach for sharding
        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
                                                              filename_pattern=filename_pattern,
                                                              max_shard_size=max_shard_size)
    else:
        from collections import namedtuple
        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
        state_dict_split = StateDictSplit(is_sharded=False,
                                          filename_to_tensors={weights_name: list(state_dict.keys())})

    # Save the model by shard
    os.makedirs(output_dir, exist_ok=True)
    filename_to_tensors = state_dict_split.filename_to_tensors.items()
    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
        # only the tensors of the current shard are materialized
        shard_state_dict = to_torch_tensor(shard_state_dict)
        output_path = os.path.join(output_dir, shard_file)
        if safe_serialization:
            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
        else:
            torch.save(shard_state_dict, output_path)
        # release the memory of current shard
        for tensor_name in list(shard_state_dict.keys()):
            del state_dict[tensor_name]
            del shard_state_dict[tensor_name]
        del shard_state_dict
        gc.collect()

    # Save index if sharded
    if state_dict_split.is_sharded:
        index = {
            "metadata": state_dict_split.metadata,
            "weight_map": state_dict_split.tensor_to_filename,
        }
        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
        save_index_file = os.path.join(output_dir, save_index_file)
        with open(save_index_file, "w", encoding="utf-8") as f:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            f.write(content)


def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    # strict=False: keys missing from either side are tolerated rather than raising
    model.load_state_dict(state_dict, strict=False)

    return model


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    parser.add_argument("output_dir",
                        type=str,
                        help="directory to the pytorch fp32 state_dict output files "
                        "(e.g. path/checkpoint-12-output/)")
    parser.add_argument(
        "--max_shard_size",
        type=str,
        default="5GB",
        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
        "without CPU OOM issues.")
    parser.add_argument(
        "--safe_serialization",
        default=False,
        action='store_true',
        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_dir,
                                               max_shard_size=args.max_shard_size,
                                               safe_serialization=args.safe_serialization,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

# NOTE(review): in the original dump the script is followed on the same physical line by git
# diff records for binary image files. They are kept verbatim below so nothing is lost.
# diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logits_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..4c708d3a5785f9e917401606bb742dc2b3ee8a45 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logits_chosen.png differ
# diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logits_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..50eb007ae00ed25bfee2ae00af74f7fdacecf1bf Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logits_rejected.png differ
# diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logps_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..36cb6379c583bc0a50d2283464f25d5cc623b0c2 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logps_chosen.png differ
# diff --git
a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logps_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..3a445b37d179a145bc175a21b9b8268965ee752c Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logps_rejected.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..15368799be0c751785409a72cd4ec74180191bf0 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_loss.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_nll_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..c73aef62fd8c77e7da83b81ba38ccd696972c382 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_nll_loss.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_accuracies.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..2b88ed37a3c651cbf54899f712bcd1515e2be8fc Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_accuracies.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_chosen.png 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..bff48443ab4ba9b4dd6c8578efb9dfee8786c9b6 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_chosen.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_margins.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..f0a039de67bfae0805a1115a43451703297be821 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_margins.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..ae27e9ee434a4df106c4ee15fe15663ffca4378f Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_rejected.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_runtime.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..aba91c89d2521852f673b1924c044d779882d40b Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_runtime.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_samples_per_second.png 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..e7220036d6e5983d8e5812f0ac9dbd73f92913b3 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_samples_per_second.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_steps_per_second.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..a7d287bc2108ba05a9f3f594ea8abe7dd3120dde Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_steps_per_second.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_epoch.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..bf6c6bacafca8d256f07d43a8851a9463ac6e566 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_epoch.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_grad_norm.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..8378287c7533dfa720b80460c365cffd20f03bab Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_grad_norm.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_learning_rate.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_learning_rate.png new file 
mode 100644 index 0000000000000000000000000000000000000000..9d80e686cd9522b9efdfd0b12d052a155efc0d3b Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_learning_rate.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logits_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..bfcce7ff956d379e25b60149b98415666d43ec0e Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logits_chosen.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logits_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..242322449f519ae3288a6bf6b0a1487ff673f4d9 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logits_rejected.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logps_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..2f120ad7c37770ac9c2c78990a80575eb4c890f2 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logps_chosen.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logps_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logps_rejected.png new file mode 100644 index 
0000000000000000000000000000000000000000..2bc7e0b79f310f3149946bef94dd22981e802a88 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logps_rejected.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..6b3419f3056edd9a42e798ed8a5e5894464f86a2 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_loss.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_memory(GiB).png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_memory(GiB).png new file mode 100644 index 0000000000000000000000000000000000000000..6281dc99d60d4bb0d0bf0f1103caf4fc5b875e56 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_memory(GiB).png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_nll_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..422b81beb4839d188316e2373bd68f55db735fbb Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_nll_loss.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_accuracies.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..1a4f76c5f8435fb589b78732175fc70cc6ef802a Binary files /dev/null and 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_accuracies.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..33cce2fdd4b67464b2da8ecd01063ce06a43e716 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_chosen.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_margins.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..cf7f037662da793ec4504cfd52fabef4808cd1da Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_margins.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..3c9d91519cd50c651e00a0d9d1df9359313c6be5 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_rejected.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_total_flos.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..06cb39f0566cc3c686d80bc540ff624c8e4ba232 Binary files /dev/null and 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_total_flos.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..0a5bd1d3170d65f876cbeebb92fe21e10853b208 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_loss.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_runtime.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..7216ef100f53ed5e665aaa9528c4ef127b3b1e50 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_runtime.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_samples_per_second.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..3546d7732fcd75094eee28a57cf94529760d8d33 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_samples_per_second.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_speed(iter_s).png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..8c3c7491fbc21428e2362a18d07661f52331b465 Binary files /dev/null and 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_speed(iter_s).png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_steps_per_second.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..6fbaf64d83cd46fd748c06dd7282aa47efa0dc2d Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_steps_per_second.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/logging.jsonl b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d81c19aacbb83eb86dda03c584ac6518b2b8b59 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/logging.jsonl @@ -0,0 +1,21 @@ +{"loss": 1.75683594, "grad_norm": 2.74168677, "learning_rate": 2.5e-05, "memory(GiB)": 14.02, "train_speed(iter/s)": 0.062883, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -272.0, "logps/chosen": -282.0, "logits/rejected": 0.20800781, "logits/chosen": -0.3984375, "nll_loss": 1.5859375, "epoch": 0.04040404, "global_step/max_steps": "1/72", "percentage": "1.39%", "elapsed_time": "12s", "remaining_time": "15m 12s"} +{"loss": 1.77404785, "grad_norm": 3.17921314, "learning_rate": 9.995e-05, "memory(GiB)": 31.0, "train_speed(iter/s)": 0.084048, "rewards/chosen": 0.28710938, "rewards/rejected": 0.14160156, "rewards/accuracies": 0.5, "rewards/margins": 0.14550781, "logps/rejected": -516.0, "logps/chosen": -358.0, "logits/rejected": -0.18554688, "logits/chosen": -0.734375, "nll_loss": 1.0234375, "epoch": 0.2020202, "global_step/max_steps": "5/72", "percentage": "6.94%", 
"elapsed_time": "56s", "remaining_time": "12m 35s"} +{"loss": 1.34211426, "grad_norm": 1.18657454, "learning_rate": 9.809e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.086989, "rewards/chosen": 1.6015625, "rewards/rejected": 0.6875, "rewards/accuracies": 0.85000002, "rewards/margins": 0.91796875, "logps/rejected": -552.0, "logps/chosen": -348.0, "logits/rejected": -0.03637695, "logits/chosen": -0.5078125, "nll_loss": 0.8203125, "epoch": 0.4040404, "global_step/max_steps": "10/72", "percentage": "13.89%", "elapsed_time": "1m 51s", "remaining_time": "11m 33s"} +{"loss": 0.77510986, "grad_norm": 0.20812566, "learning_rate": 9.368e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.09061, "rewards/chosen": 2.984375, "rewards/rejected": -0.33203125, "rewards/accuracies": 1.0, "rewards/margins": 3.3125, "logps/rejected": -284.0, "logps/chosen": -368.0, "logits/rejected": -0.703125, "logits/chosen": -0.23828125, "nll_loss": 0.67578125, "epoch": 0.60606061, "global_step/max_steps": "15/72", "percentage": "20.83%", "elapsed_time": "2m 42s", "remaining_time": "10m 17s"} +{"loss": 0.53414307, "grad_norm": 0.12928415, "learning_rate": 8.695e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.091844, "rewards/chosen": 3.34375, "rewards/rejected": -1.859375, "rewards/accuracies": 1.0, "rewards/margins": 5.1875, "logps/rejected": -310.0, "logps/chosen": -458.0, "logits/rejected": -0.61328125, "logits/chosen": -0.20605469, "nll_loss": 0.578125, "epoch": 0.80808081, "global_step/max_steps": "20/72", "percentage": "27.78%", "elapsed_time": "3m 34s", "remaining_time": "9m 18s"} +{"eval_loss": 0.43920898, "eval_runtime": 1.2472, "eval_samples_per_second": 3.207, "eval_steps_per_second": 0.802, "eval_rewards/chosen": 6.03125, "eval_rewards/rejected": -0.20019531, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.21875, "eval_logps/rejected": -160.0, "eval_logps/chosen": -4.78125, "eval_logits/rejected": 0.30664062, "eval_logits/chosen": -1.4375, "eval_nll_loss": 
0.20800781, "epoch": 0.80808081, "global_step/max_steps": "20/72", "percentage": "27.78%", "elapsed_time": "3m 35s", "remaining_time": "9m 21s"} +{"loss": 0.46835327, "grad_norm": 0.19570779, "learning_rate": 7.826e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.092514, "rewards/chosen": 5.0, "rewards/rejected": -2.0625, "rewards/accuracies": 1.0, "rewards/margins": 7.0625, "logps/rejected": -516.0, "logps/chosen": -430.0, "logits/rejected": -0.140625, "logits/chosen": 0.04663086, "nll_loss": 0.5078125, "epoch": 1.0, "global_step/max_steps": "25/72", "percentage": "34.72%", "elapsed_time": "4m 27s", "remaining_time": "8m 22s"} +{"loss": 0.47605286, "grad_norm": 0.09105662, "learning_rate": 6.806e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.09323, "rewards/chosen": 6.53125, "rewards/rejected": -4.125, "rewards/accuracies": 1.0, "rewards/margins": 10.625, "logps/rejected": -502.0, "logps/chosen": -280.0, "logits/rejected": -0.05249023, "logits/chosen": -0.15820312, "nll_loss": 0.40234375, "epoch": 1.2020202, "global_step/max_steps": "30/72", "percentage": "41.67%", "elapsed_time": "5m 18s", "remaining_time": "7m 26s"} +{"loss": 0.4629303, "grad_norm": 0.08621389, "learning_rate": 5.691e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.093963, "rewards/chosen": 7.0, "rewards/rejected": -3.625, "rewards/accuracies": 1.0, "rewards/margins": 10.625, "logps/rejected": -528.0, "logps/chosen": -212.0, "logits/rejected": 0.18066406, "logits/chosen": -0.45703125, "nll_loss": 0.46289062, "epoch": 1.4040404, "global_step/max_steps": "35/72", "percentage": "48.61%", "elapsed_time": "6m 9s", "remaining_time": "6m 30s"} +{"loss": 0.44608459, "grad_norm": 0.11934378, "learning_rate": 4.539e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.094552, "rewards/chosen": 6.75, "rewards/rejected": -3.984375, "rewards/accuracies": 1.0, "rewards/margins": 10.75, "logps/rejected": -282.0, "logps/chosen": -344.0, "logits/rejected": -0.35546875, "logits/chosen": 0.18359375, 
"nll_loss": 0.40429688, "epoch": 1.60606061, "global_step/max_steps": "40/72", "percentage": "55.56%", "elapsed_time": "6m 59s", "remaining_time": "5m 35s"} +{"eval_loss": 0.42163086, "eval_runtime": 1.4128, "eval_samples_per_second": 2.831, "eval_steps_per_second": 0.708, "eval_rewards/chosen": 6.0, "eval_rewards/rejected": -2.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.0, "eval_logps/rejected": -178.0, "eval_logps/chosen": -5.09375, "eval_logits/rejected": 0.953125, "eval_logits/chosen": -1.4375, "eval_nll_loss": 0.22167969, "epoch": 1.60606061, "global_step/max_steps": "40/72", "percentage": "55.56%", "elapsed_time": "7m 1s", "remaining_time": "5m 37s"} +{"loss": 0.53961792, "grad_norm": 0.12788297, "learning_rate": 3.411e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.094018, "rewards/chosen": 7.75, "rewards/rejected": -2.875, "rewards/accuracies": 1.0, "rewards/margins": 10.625, "logps/rejected": -494.0, "logps/chosen": -310.0, "logits/rejected": 0.328125, "logits/chosen": -0.02331543, "nll_loss": 0.51953125, "epoch": 1.80808081, "global_step/max_steps": "45/72", "percentage": "62.50%", "elapsed_time": "7m 55s", "remaining_time": "4m 45s"} +{"loss": 0.43733521, "grad_norm": 0.20021893, "learning_rate": 2.368e-05, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094129, "rewards/chosen": 7.8125, "rewards/rejected": -3.5, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -280.0, "logps/chosen": -300.0, "logits/rejected": 0.01599121, "logits/chosen": 0.18164062, "nll_loss": 0.40234375, "epoch": 2.0, "global_step/max_steps": "50/72", "percentage": "69.44%", "elapsed_time": "8m 48s", "remaining_time": "3m 52s"} +{"loss": 0.48487549, "grad_norm": 0.17561193, "learning_rate": 1.464e-05, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094012, "rewards/chosen": 6.96875, "rewards/rejected": -2.40625, "rewards/accuracies": 1.0, "rewards/margins": 9.375, "logps/rejected": -620.0, "logps/chosen": -298.0, "logits/rejected": 
0.3046875, "logits/chosen": 0.08886719, "nll_loss": 0.45703125, "epoch": 2.2020202, "global_step/max_steps": "55/72", "percentage": "76.39%", "elapsed_time": "9m 41s", "remaining_time": "2m 59s"} +{"loss": 0.45642853, "grad_norm": 0.06159662, "learning_rate": 7.49e-06, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094412, "rewards/chosen": 8.125, "rewards/rejected": -3.0625, "rewards/accuracies": 1.0, "rewards/margins": 11.1875, "logps/rejected": -468.0, "logps/chosen": -422.0, "logits/rejected": -0.3125, "logits/chosen": 0.05957031, "nll_loss": 0.515625, "epoch": 2.4040404, "global_step/max_steps": "60/72", "percentage": "83.33%", "elapsed_time": "10m 32s", "remaining_time": "2m 6s"} +{"eval_loss": 0.41357422, "eval_runtime": 1.3539, "eval_samples_per_second": 2.954, "eval_steps_per_second": 0.739, "eval_rewards/chosen": 6.03125, "eval_rewards/rejected": -3.59375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.625, "eval_logps/rejected": -194.0, "eval_logps/chosen": -4.8125, "eval_logits/rejected": 1.171875, "eval_logits/chosen": -1.4140625, "eval_nll_loss": 0.20898438, "epoch": 2.4040404, "global_step/max_steps": "60/72", "percentage": "83.33%", "elapsed_time": "10m 33s", "remaining_time": "2m 6s"} +{"loss": 0.47344971, "grad_norm": 0.12507359, "learning_rate": 2.59e-06, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094381, "rewards/chosen": 7.5, "rewards/rejected": -4.25, "rewards/accuracies": 1.0, "rewards/margins": 11.75, "logps/rejected": -488.0, "logps/chosen": -206.0, "logits/rejected": 0.44726562, "logits/chosen": -0.41796875, "nll_loss": 0.55078125, "epoch": 2.60606061, "global_step/max_steps": "65/72", "percentage": "90.28%", "elapsed_time": "11m 25s", "remaining_time": "1m 13s"} +{"loss": 0.43180008, "grad_norm": 0.13343167, "learning_rate": 2.1e-07, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094656, "rewards/chosen": 7.5, "rewards/rejected": -3.734375, "rewards/accuracies": 1.0, "rewards/margins": 11.25, "logps/rejected": -446.0, 
"logps/chosen": -274.0, "logits/rejected": 0.0559082, "logits/chosen": 0.02185059, "nll_loss": 0.45898438, "epoch": 2.80808081, "global_step/max_steps": "70/72", "percentage": "97.22%", "elapsed_time": "12m 16s", "remaining_time": "21s"} +{"eval_loss": 0.41430664, "eval_runtime": 1.3572, "eval_samples_per_second": 2.947, "eval_steps_per_second": 0.737, "eval_rewards/chosen": 6.03125, "eval_rewards/rejected": -3.796875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.8125, "eval_logps/rejected": -196.0, "eval_logps/chosen": -4.84375, "eval_logits/rejected": 1.1875, "eval_logits/chosen": -1.421875, "eval_nll_loss": 0.20996094, "epoch": 2.88888889, "global_step/max_steps": "72/72", "percentage": "100.00%", "elapsed_time": "12m 39s", "remaining_time": "0s"} +{"train_runtime": 760.0145, "train_samples_per_second": 1.559, "train_steps_per_second": 0.095, "total_flos": 31365457739776.0, "train_loss": 0.64717012, "epoch": 2.88888889, "global_step/max_steps": "72/72", "percentage": "100.00%", "elapsed_time": "12m 39s", "remaining_time": "0s"} +{"train_dataset": "1189.215190±496.010190, min=317.000000, max=4190.000000, size=395", "val_dataset": "1200.750000±508.140421, min=734.000000, max=2041.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 7635.8016M Params (20.1851M Trainable [0.2643%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/checkpoint-72", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/checkpoint-60", "best_metric": 0.41357422, "global_step": 72, "log_history": [{"loss": 1.7568359375, "grad_norm": 2.741686768010279, "learning_rate": 2.5e-05, "memory(GiB)": 14.02, "train_speed(iter/s)": 0.062883, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -272.0, "logps/chosen": 
-282.0, "logits/rejected": 0.2080078125, "logits/chosen": -0.3984375, "nll_loss": 1.5859375, "epoch": 0.04040404040404041, "step": 1}, {"loss": 1.7740478515625, "grad_norm": 3.179213138025169, "learning_rate": 9.994664874011863e-05, "memory(GiB)": 31.0, "train_speed(iter/s)": 0.084048, "rewards/chosen": 0.287109375, "rewards/rejected": 0.1416015625, "rewards/accuracies": 0.5, "rewards/margins": 0.1455078125, "logps/rejected": -516.0, "logps/chosen": -358.0, "logits/rejected": -0.185546875, "logits/chosen": -0.734375, "nll_loss": 1.0234375, "epoch": 0.20202020202020202, "step": 5}, {"loss": 1.3421142578125, "grad_norm": 1.1865745356779005, "learning_rate": 9.809128215864097e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.086989, "rewards/chosen": 1.6015625, "rewards/rejected": 0.6875, "rewards/accuracies": 0.8500000238418579, "rewards/margins": 0.91796875, "logps/rejected": -552.0, "logps/chosen": -348.0, "logits/rejected": -0.036376953125, "logits/chosen": -0.5078125, "nll_loss": 0.8203125, "epoch": 0.40404040404040403, "step": 10}, {"loss": 0.77510986328125, "grad_norm": 0.2081256636056697, "learning_rate": 9.368111953231848e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.09061, "rewards/chosen": 2.984375, "rewards/rejected": -0.33203125, "rewards/accuracies": 1.0, "rewards/margins": 3.3125, "logps/rejected": -284.0, "logps/chosen": -368.0, "logits/rejected": -0.703125, "logits/chosen": -0.23828125, "nll_loss": 0.67578125, "epoch": 0.6060606060606061, "step": 15}, {"loss": 0.53414306640625, "grad_norm": 0.12928414880774924, "learning_rate": 8.695044586103296e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.091844, "rewards/chosen": 3.34375, "rewards/rejected": -1.859375, "rewards/accuracies": 1.0, "rewards/margins": 5.1875, "logps/rejected": -310.0, "logps/chosen": -458.0, "logits/rejected": -0.61328125, "logits/chosen": -0.2060546875, "nll_loss": 0.578125, "epoch": 0.8080808080808081, "step": 20}, {"eval_loss": 0.439208984375, "eval_runtime": 1.2472, 
"eval_samples_per_second": 3.207, "eval_steps_per_second": 0.802, "eval_rewards/chosen": 6.03125, "eval_rewards/rejected": -0.2001953125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.21875, "eval_logps/rejected": -160.0, "eval_logps/chosen": -4.78125, "eval_logits/rejected": 0.306640625, "eval_logits/chosen": -1.4375, "eval_nll_loss": 0.2080078125, "epoch": 0.8080808080808081, "step": 20}, {"loss": 0.468353271484375, "grad_norm": 0.19570778554820287, "learning_rate": 7.82568207211296e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.092514, "rewards/chosen": 5.0, "rewards/rejected": -2.0625, "rewards/accuracies": 1.0, "rewards/margins": 7.0625, "logps/rejected": -516.0, "logps/chosen": -430.0, "logits/rejected": -0.140625, "logits/chosen": 0.046630859375, "nll_loss": 0.5078125, "epoch": 1.0, "step": 25}, {"loss": 0.4760528564453125, "grad_norm": 0.09105661940994438, "learning_rate": 6.806208330935766e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.09323, "rewards/chosen": 6.53125, "rewards/rejected": -4.125, "rewards/accuracies": 1.0, "rewards/margins": 10.625, "logps/rejected": -502.0, "logps/chosen": -280.0, "logits/rejected": -0.052490234375, "logits/chosen": -0.158203125, "nll_loss": 0.40234375, "epoch": 1.202020202020202, "step": 30}, {"loss": 0.4629302978515625, "grad_norm": 0.08621388537462939, "learning_rate": 5.6907817747594116e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.093963, "rewards/chosen": 7.0, "rewards/rejected": -3.625, "rewards/accuracies": 1.0, "rewards/margins": 10.625, "logps/rejected": -528.0, "logps/chosen": -212.0, "logits/rejected": 0.1806640625, "logits/chosen": -0.45703125, "nll_loss": 0.462890625, "epoch": 1.404040404040404, "step": 35}, {"loss": 0.4460845947265625, "grad_norm": 0.11934377803721716, "learning_rate": 4.5386582026834906e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.094552, "rewards/chosen": 6.75, "rewards/rejected": -3.984375, "rewards/accuracies": 1.0, "rewards/margins": 10.75, 
"logps/rejected": -282.0, "logps/chosen": -344.0, "logits/rejected": -0.35546875, "logits/chosen": 0.18359375, "nll_loss": 0.404296875, "epoch": 1.606060606060606, "step": 40}, {"eval_loss": 0.421630859375, "eval_runtime": 1.4128, "eval_samples_per_second": 2.831, "eval_steps_per_second": 0.708, "eval_rewards/chosen": 6.0, "eval_rewards/rejected": -2.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.0, "eval_logps/rejected": -178.0, "eval_logps/chosen": -5.09375, "eval_logits/rejected": 0.953125, "eval_logits/chosen": -1.4375, "eval_nll_loss": 0.2216796875, "epoch": 1.606060606060606, "step": 40}, {"loss": 0.539617919921875, "grad_norm": 0.1278829740469663, "learning_rate": 3.411042902090492e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.094018, "rewards/chosen": 7.75, "rewards/rejected": -2.875, "rewards/accuracies": 1.0, "rewards/margins": 10.625, "logps/rejected": -494.0, "logps/chosen": -310.0, "logits/rejected": 0.328125, "logits/chosen": -0.0233154296875, "nll_loss": 0.51953125, "epoch": 1.808080808080808, "step": 45}, {"loss": 0.437335205078125, "grad_norm": 0.20021892626227725, "learning_rate": 2.3678391856132204e-05, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094129, "rewards/chosen": 7.8125, "rewards/rejected": -3.5, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -280.0, "logps/chosen": -300.0, "logits/rejected": 0.0159912109375, "logits/chosen": 0.181640625, "nll_loss": 0.40234375, "epoch": 2.0, "step": 50}, {"loss": 0.48487548828125, "grad_norm": 0.17561192586448465, "learning_rate": 1.4644660940672627e-05, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094012, "rewards/chosen": 6.96875, "rewards/rejected": -2.40625, "rewards/accuracies": 1.0, "rewards/margins": 9.375, "logps/rejected": -620.0, "logps/chosen": -298.0, "logits/rejected": 0.3046875, "logits/chosen": 0.0888671875, "nll_loss": 0.45703125, "epoch": 2.202020202020202, "step": 55}, {"loss": 0.45642852783203125, "grad_norm": 0.06159661984448856, 
"learning_rate": 7.489143213519301e-06, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094412, "rewards/chosen": 8.125, "rewards/rejected": -3.0625, "rewards/accuracies": 1.0, "rewards/margins": 11.1875, "logps/rejected": -468.0, "logps/chosen": -422.0, "logits/rejected": -0.3125, "logits/chosen": 0.0595703125, "nll_loss": 0.515625, "epoch": 2.404040404040404, "step": 60}, {"eval_loss": 0.41357421875, "eval_runtime": 1.3539, "eval_samples_per_second": 2.954, "eval_steps_per_second": 0.739, "eval_rewards/chosen": 6.03125, "eval_rewards/rejected": -3.59375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.625, "eval_logps/rejected": -194.0, "eval_logps/chosen": -4.8125, "eval_logits/rejected": 1.171875, "eval_logits/chosen": -1.4140625, "eval_nll_loss": 0.208984375, "epoch": 2.404040404040404, "step": 60}, {"loss": 0.47344970703125, "grad_norm": 0.12507359172791535, "learning_rate": 2.591967620451707e-06, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094381, "rewards/chosen": 7.5, "rewards/rejected": -4.25, "rewards/accuracies": 1.0, "rewards/margins": 11.75, "logps/rejected": -488.0, "logps/chosen": -206.0, "logits/rejected": 0.447265625, "logits/chosen": -0.41796875, "nll_loss": 0.55078125, "epoch": 2.606060606060606, "step": 65}, {"loss": 0.43180007934570314, "grad_norm": 0.13343167302632133, "learning_rate": 2.1329118524827662e-07, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094656, "rewards/chosen": 7.5, "rewards/rejected": -3.734375, "rewards/accuracies": 1.0, "rewards/margins": 11.25, "logps/rejected": -446.0, "logps/chosen": -274.0, "logits/rejected": 0.055908203125, "logits/chosen": 0.0218505859375, "nll_loss": 0.458984375, "epoch": 2.808080808080808, "step": 70}, {"eval_loss": 0.414306640625, "eval_runtime": 1.3572, "eval_samples_per_second": 2.947, "eval_steps_per_second": 0.737, "eval_rewards/chosen": 6.03125, "eval_rewards/rejected": -3.796875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.8125, "eval_logps/rejected": -196.0, 
"eval_logps/chosen": -4.84375, "eval_logits/rejected": 1.1875, "eval_logits/chosen": -1.421875, "eval_nll_loss": 0.2099609375, "epoch": 2.888888888888889, "step": 72}, {"train_runtime": 760.0145, "train_samples_per_second": 1.559, "train_steps_per_second": 0.095, "total_flos": 31365457739776.0, "train_loss": 0.6471701198154025, "epoch": 2.888888888888889, "step": 72}], "memory": 45.845703125} diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/runs/events.out.tfevents.1737995978.kml-task-540432-record-9981983-prod-worker-0.21280.0 b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/runs/events.out.tfevents.1737995978.kml-task-540432-record-9981983-prod-worker-0.21280.0 new file mode 100644 index 0000000000000000000000000000000000000000..a9ec9284c429951058a208e798266988bb28eb48 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/runs/events.out.tfevents.1737995978.kml-task-540432-record-9981983-prod-worker-0.21280.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b88558a3cd946b321eccf012afda6a2a48fe17126efee9fa3f3f5a9bd4b8f9ae +size 23705 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/args.json new file mode 100644 index 0000000000000000000000000000000000000000..efc7069ad1fbeb7ba8b7d21f876c17672c7df0a2 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/Marco-o1", + "model_type": "marco_o1", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "marco_o1", + "system": "You are a helpful assistant.", + "max_length": 4200, + "truncation_strategy": "delete", + 
"max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_what_system.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 
0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 200, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + 
"device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + 
"include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + 
"llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 4, + "local_world_size": 4, + "model_suffix": "Marco-o1", + "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], 
torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, 
run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, 
hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/README.md b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..6be0c96a909959c265a0ab602cd067e0d3fbbde6 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/Marco-o1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/adapter_config.json 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ea3100564f8f73f51fc508a1408e494e912544c8 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "down_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4e76809f01a498fe02cd62a07b36a15db217c933 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4159bbc77660983c58e8f43a1c115af4f4ccff9c7155a2b2b8da97bf491c3162 +size 40422208 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/additional_config.json 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/args.json new file mode 100644 index 0000000000000000000000000000000000000000..efc7069ad1fbeb7ba8b7d21f876c17672c7df0a2 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/Marco-o1", + "model_type": "marco_o1", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "marco_o1", + "system": "You are a helpful assistant.", + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_what_system.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + 
"bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 200, + "save_safetensors": 
true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": 
false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + 
"lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": 
null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 4, + "local_world_size": 4, + "model_suffix": "Marco-o1", + "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, 
per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), 
deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, 
optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1dfb163f468b909fe0c58ce444e12c589f55b50 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b246226f7517e1095b0cba8cb6c3fb461d12b9753114a79bcce3664deafa2385 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64215930e29c2a65027c1f36017d1b7273ca4434 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a82cc0f9cb5f56ef17584cddc4eb25c077dce638eda0d5b11b8acad396d63f8 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b498de605c3c33a1f6020c204e0fa403fa96366 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2d1689336d82c68b9ffba45814111a23c49bbe0053646ce8bc2632924453aad +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..adf014d92fdbe44fb3e075639643fd2c72ac1aee --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ce3eb8132cb6d0cef9b1a069aa8252e700e37918ceae0a5faa00e0752ddd8cce +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f00124e5a0ea627b8addbbf3644b01ed2e2e835 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3052be4c3806c2550bace460681d1af2b3beeb7973b270c598ff4095198a4fc +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0099b653969fd7512c604969ec5d2e07300ef118 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89a650af753c3f9853d157a148cbf7292ea7973999c01035dc62eaa2ffd1a958 +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7b0f50bc9d960908ba49f55ba4c088080152779 --- /dev/null +++ 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffdb58c9f32cb1cc2fec92b915d06c00271f6c23b3b162644a43d8bc322fad23 +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f694c5feaaca0ce72187371fc6ee20956a7d7ecb --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55b9d5a7067f65dce1a3a7f5943fce25d44ab9d56b2097dfea4aafe4bf85b681 +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..37ac50652a3badbfb1bdeaccb8b1934575b584eb --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bbe0d720c4c75a6a04213fa3b64bacbe794718a53e2b56ebb67a1a795014dfad +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0bc3650851dae439677613c9e23a5528de47b679 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72452d3138d0ca2ff89429e3294a834ae7a68e8596fc757735ca56ae52509d57 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0e00a6e8b4b743026f68d749a8cb3bdd4b746838 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f36e306fb8ebcf53a167bfd6c9af74db410a269ada1e619e3e816f5269543b9d +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..5354141d42e077c356f9ca8c6b12bd7e5e41f2af --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb47ce0c6f815a6f8302b0e3819b4c2315ca71dae3138d97fdceb765cdd0a039 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/scheduler.pt 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9db8a98ca69cd5bfebe102039231d58d7ea374e --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c42b1ef948ce2918e44502f72db30bc09f3c40f0dfa68050c22f884d1aac4ff5 +size 1064 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/trainer_state.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9e5fdb291bcd69a047683e120683c57884c066c6 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.58154297, + "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20", + "epoch": 0.8080808080808081, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 2.3671343726657543, + "learning_rate": 2.5e-05, + "logits/chosen": -0.46875, + "logits/rejected": 0.228515625, + "logps/chosen": -286.0, + "logps/rejected": -272.0, + "loss": 1.8359375, + "memory(GiB)": 13.63, + "nll_loss": 1.7109375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.067542 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 2.798293390214536, + "learning_rate": 9.994664874011863e-05, + "logits/chosen": -0.7421875, + "logits/rejected": -0.185546875, + "logps/chosen": -362.0, + "logps/rejected": -512.0, 
+ "loss": 1.83404541015625, + "memory(GiB)": 30.5, + "nll_loss": 1.1015625, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.228515625, + "rewards/margins": 0.1005859375, + "rewards/rejected": 0.1279296875, + "step": 5, + "train_speed(iter/s)": 0.088059 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 2.205296809705217, + "learning_rate": 9.809128215864097e-05, + "logits/chosen": -0.421875, + "logits/rejected": -0.012451171875, + "logps/chosen": -350.0, + "logps/rejected": -548.0, + "loss": 1.5821044921875, + "memory(GiB)": 42.9, + "nll_loss": 0.8515625, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.765625, + "rewards/margins": 0.63671875, + "rewards/rejected": 1.1328125, + "step": 10, + "train_speed(iter/s)": 0.090254 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 1.4006731478550383, + "learning_rate": 9.368111953231848e-05, + "logits/chosen": -0.14453125, + "logits/rejected": -0.609375, + "logps/chosen": -366.0, + "logps/rejected": -260.0, + "loss": 1.147705078125, + "memory(GiB)": 42.9, + "nll_loss": 0.75, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.53125, + "rewards/margins": 1.625, + "rewards/rejected": 1.90625, + "step": 15, + "train_speed(iter/s)": 0.09299 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.5239399131286955, + "learning_rate": 8.695044586103296e-05, + "logits/chosen": -0.033203125, + "logits/rejected": -0.53515625, + "logps/chosen": -452.0, + "logps/rejected": -280.0, + "loss": 0.72830810546875, + "memory(GiB)": 42.9, + "nll_loss": 0.609375, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.125, + "rewards/margins": 3.0625, + "rewards/rejected": 1.0703125, + "step": 20, + "train_speed(iter/s)": 0.093763 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.8671875, + "eval_logits/rejected": 0.349609375, + "eval_logps/chosen": -17.5, + "eval_logps/rejected": -172.0, + "eval_loss": 0.58154296875, + "eval_nll_loss": 0.76171875, + 
"eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 5.5, + "eval_rewards/margins": 6.3125, + "eval_rewards/rejected": -0.80078125, + "eval_runtime": 1.2034, + "eval_samples_per_second": 3.324, + "eval_steps_per_second": 0.831, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 72, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8682004316160.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bdbd933eb77fb414a188444c0c44e522ae588c3 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65114710056e86d2565b3845f7913b58e4fc16f367cc03ffb9f3a9d09187f96d +size 8888 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. 
It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if 
not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + 
param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/README.md b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6be0c96a909959c265a0ab602cd067e0d3fbbde6 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/Marco-o1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) 
should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/adapter_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ea3100564f8f73f51fc508a1408e494e912544c8 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + 
"revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "down_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c6a9e141468c2e7aa30d50198bf0fc3c3baabd9d --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606674b8361c60bba8260ac0b95666f47368fb0cb1f386f6b25f1610a5ad7c1b +size 40422208 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/additional_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/args.json new file mode 100644 index 0000000000000000000000000000000000000000..efc7069ad1fbeb7ba8b7d21f876c17672c7df0a2 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/Marco-o1", + "model_type": "marco_o1", + "model_revision": null, 
+ "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "marco_o1", + "system": "You are a helpful assistant.", + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_what_system.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 
1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 200, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + 
"dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + 
"full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + 
"galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 4, + "local_world_size": 4, + "model_suffix": "Marco-o1", + "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='marco_o1', 
model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, 
fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', 
report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, 
galore_config=None)" +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bb96e8a686b1c62d928e4d48061a040ea5d87ec --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dc17147f84b9dfc398ff1d1f46c8a8aa7003211e1925d4d06ad06c7c2dd3558 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15e7cea4c2332d19f9c8fefad50ceb6fc0b1dc11 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2825587ca9746368793c0c0852399766eab3f1c29ae7d6742583f7b30ae81ca1 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d13fe824c6a9eaa2dc149dde07d891f913e81f7 --- /dev/null +++ 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:656d9f585026bb51286c480d43ba0bd5fc46ead30f916682f7c7c2b5e129c6b0 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa07aef4759012465b54e6e79c200e277c108544 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9981c1d4f779336e387c48f93caf59757a8856bde2431fabbdae899d23d8a9db +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..521cff52824a521f9600db9459948a11f52abf09 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2caf3158f1f6f446801f4d26836267d7f8eace7459ddd0add1a9abd1b83631aa +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_1_mp_rank_00_model_states.pt 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a726173f4adb98fe2e2b46df9a3a7a5386be326 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4dde91841e38e32dbb367a21a6ab2d644ee359442a82e929982af7d81f1339b +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ded5c487b0af4c917cf833cdafa2261290cfc53a --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42bebe291bfd2d5039060b95d8080361a7981aa9abfcb39bce2d72a9c2ebef8e +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6cc4d81a0b906cb49404ddc23dce222d0007551 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d27803d4e916264476fdba88d87a2e03b998cd73b3ea77e32f59ac3bde61a55 +size 388374 diff 
--git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..67f1c55b2b0a3119f2287d39e40e22b4f158741b --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/latest @@ -0,0 +1 @@ +global_step39 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..f8799407442db08820f995bcf1b9158f696af19f --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70cc56408014c410353d4dd58ae9b03f4be043f5f800324f66fd8e20e99b840e +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..aa0c3c6aeaabc038c714a3fcc9b78d186a4cab59 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d1438e98cc9c53a6852464635ce62e9788e61eb3646b73e33813f487c4b6ae +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0f39416636e7990907141a415603582d33812fc9 --- /dev/null 
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4388add9cec90932f8ff0100d27a0574d98e1bad52ff89d44e31967d2b4fbfde +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d3775bcd497f8ad74ece6675e0bbda89fb7ee6f4 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a705d6dfaae4f2c1b4b2be6b25a6eb521ffae6fcba21cc1531e97b60037ed079 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/scheduler.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f8e5c420bc296502c335bcadd512d01972f28a0 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2ab72c0a7472f98efb1865889d6039f3ae7d12fc3c8e7bfeea52279fc333219 +size 1064 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/trainer_state.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c1b14557eea9691f7f69da1096d677e374b950af --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.43310547, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40", + "epoch": 1.606060606060606, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 2.3671343726657543, + "learning_rate": 2.5e-05, + "logits/chosen": -0.46875, + "logits/rejected": 0.228515625, + "logps/chosen": -286.0, + "logps/rejected": -272.0, + "loss": 1.8359375, + "memory(GiB)": 13.63, + "nll_loss": 1.7109375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.067542 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 2.798293390214536, + "learning_rate": 9.994664874011863e-05, + "logits/chosen": -0.7421875, + "logits/rejected": -0.185546875, + "logps/chosen": -362.0, + "logps/rejected": -512.0, + "loss": 1.83404541015625, + "memory(GiB)": 30.5, + "nll_loss": 1.1015625, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.228515625, + "rewards/margins": 0.1005859375, + "rewards/rejected": 0.1279296875, + "step": 5, + "train_speed(iter/s)": 0.088059 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 2.205296809705217, + "learning_rate": 9.809128215864097e-05, + "logits/chosen": -0.421875, + "logits/rejected": -0.012451171875, + "logps/chosen": -350.0, + "logps/rejected": -548.0, + "loss": 1.5821044921875, + "memory(GiB)": 42.9, + "nll_loss": 0.8515625, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.765625, + "rewards/margins": 0.63671875, + "rewards/rejected": 1.1328125, + "step": 10, + "train_speed(iter/s)": 0.090254 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 1.4006731478550383, + "learning_rate": 9.368111953231848e-05, + "logits/chosen": -0.14453125, + "logits/rejected": -0.609375, + "logps/chosen": -366.0, + "logps/rejected": -260.0, + "loss": 
1.147705078125, + "memory(GiB)": 42.9, + "nll_loss": 0.75, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.53125, + "rewards/margins": 1.625, + "rewards/rejected": 1.90625, + "step": 15, + "train_speed(iter/s)": 0.09299 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.5239399131286955, + "learning_rate": 8.695044586103296e-05, + "logits/chosen": -0.033203125, + "logits/rejected": -0.53515625, + "logps/chosen": -452.0, + "logps/rejected": -280.0, + "loss": 0.72830810546875, + "memory(GiB)": 42.9, + "nll_loss": 0.609375, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.125, + "rewards/margins": 3.0625, + "rewards/rejected": 1.0703125, + "step": 20, + "train_speed(iter/s)": 0.093763 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.8671875, + "eval_logits/rejected": 0.349609375, + "eval_logps/chosen": -17.5, + "eval_logps/rejected": -172.0, + "eval_loss": 0.58154296875, + "eval_nll_loss": 0.76171875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 5.5, + "eval_rewards/margins": 6.3125, + "eval_rewards/rejected": -0.80078125, + "eval_runtime": 1.2034, + "eval_samples_per_second": 3.324, + "eval_steps_per_second": 0.831, + "step": 20 + }, + { + "epoch": 1.0, + "grad_norm": 0.5826068375172234, + "learning_rate": 7.82568207211296e-05, + "logits/chosen": 0.08447265625, + "logits/rejected": -0.142578125, + "logps/chosen": -434.0, + "logps/rejected": -496.0, + "loss": 0.532879638671875, + "memory(GiB)": 42.9, + "nll_loss": 0.5625, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.0625, + "rewards/margins": 5.1875, + "rewards/rejected": -0.1357421875, + "step": 25, + "train_speed(iter/s)": 0.094482 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.1173239600840837, + "learning_rate": 6.806208330935766e-05, + "logits/chosen": -0.158203125, + "logits/rejected": -0.0703125, + "logps/chosen": -282.0, + "logps/rejected": -492.0, + "loss": 0.484796142578125, + "memory(GiB)": 42.9, + "nll_loss": 0.40625, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 6.71875, + "rewards/margins": 9.8125, + "rewards/rejected": -3.078125, + "step": 30, + "train_speed(iter/s)": 0.09487 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.10486166807457631, + "learning_rate": 5.6907817747594116e-05, + "logits/chosen": -0.47265625, + "logits/rejected": 0.05126953125, + "logps/chosen": -217.0, + "logps/rejected": -524.0, + "loss": 0.477923583984375, + "memory(GiB)": 42.9, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.03125, + "rewards/margins": 10.5625, + "rewards/rejected": -3.53125, + "step": 35, + "train_speed(iter/s)": 0.095273 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.13908151012153538, + "learning_rate": 4.5386582026834906e-05, + "logits/chosen": -0.005706787109375, + "logits/rejected": -0.498046875, + "logps/chosen": -344.0, + "logps/rejected": -276.0, + "loss": 0.45271148681640627, + "memory(GiB)": 42.9, + "nll_loss": 0.408203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.0, + "rewards/margins": 10.5, + "rewards/rejected": -3.515625, + "step": 40, + "train_speed(iter/s)": 0.095656 + }, + { + "epoch": 1.606060606060606, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": 0.62109375, + "eval_logps/chosen": -5.34375, + "eval_logps/rejected": -175.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.232421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.6875, + "eval_rewards/margins": 7.78125, + "eval_rewards/rejected": -1.1015625, + "eval_runtime": 1.3521, + "eval_samples_per_second": 2.958, + "eval_steps_per_second": 0.74, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 72, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 
17048500207616.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bdbd933eb77fb414a188444c0c44e522ae588c3 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65114710056e86d2565b3845f7913b58e4fc16f367cc03ffb9f3a9d09187f96d +size 8888 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load the optimizer-state files of a ZeRO checkpoint and pull out the fp32 weight partitions.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` files to load
        - ``ds_checkpoint_dir``: checkpoint folder the files belong to (only used in error messages)

    Returns:
        - ``(zero_stage, world_size, fp32_flat_groups)``; ``fp32_flat_groups`` holds one entry per
          loaded file, in the order of ``files``

    Raises:
        - ``ValueError``: if ``files`` is empty, the first file carries no zero stage, the number of
          files differs from the recorded partition count, or the zero stage is unknown
    """
    total_files = len(files)
    if total_files == 0:
        # fail with a readable message instead of an IndexError on state_dicts[0] below
        raise ValueError(f"no '*_optim_states.pt' files were given for '{ds_checkpoint_dir}'")

    state_dicts = []
    for f in tqdm(files, desc='Loading checkpoint shards'):
        # NOTE(security): weights_only=False unpickles arbitrary objects - only run this on
        # checkpoints that come from a trusted source
        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if isinstance(world_size, list):
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][fp32_groups_key] for sd in state_dicts]
    return zero_stage, world_size, fp32_flat_groups
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Compute how a parameter of ``unpartitioned_numel`` elements is split over ``world_size`` ranks.

    Args:
        - ``unpartitioned_numel``: number of elements of the full parameter
        - ``world_size``: number of partitions the parameter is split into

    Returns:
        - ``(partitioned_numel, padding_numel)``: elements per partition (rounded up) and the number
          of elements missing to make ``unpartitioned_numel`` a multiple of ``world_size``
    """
    # integer arithmetic only: ``math.ceil(a / b)`` goes through a float and silently returns a
    # wrong partition size once the element count no longer fits into 53 bits
    quotient, remainder = divmod(unpartitioned_numel, world_size)
    padding_numel = (world_size - remainder) if remainder else 0
    partitioned_numel = quotient + 1 if remainder else quotient
    return partitioned_numel, padding_numel
class GatheredTensor:
    """
    A pseudo tensor that collects partitioned weights.
    It is more memory efficient when there are multiple groups.

    Only ``shape`` and ``dtype`` are available up front; the actual data is assembled from the
    per-rank flat groups when ``contiguous()`` is called.
    """

    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
        # flat_groups[rank][group] is sliced as a 1-d tensor
        self.flat_groups = flat_groups
        # group ``g`` covers [flat_groups_offset[g], flat_groups_offset[g + 1]) of a rank's elements
        self.flat_groups_offset = flat_groups_offset
        # first element of this parameter's partition inside a rank
        self.offset = offset
        # number of elements each rank holds for this parameter
        self.partitioned_numel = partitioned_numel
        self.shape = shape
        self.dtype = self.flat_groups[0][0].dtype

    def _locate_groups(self, end_idx):
        """Return the ids of the first and last flat group touched by [offset, end_idx)."""
        start_group_id = None
        end_group_id = None
        # the last entry of flat_groups_offset is only an end marker, it doesn't start a group
        for group_id in range(len(self.flat_groups_offset) - 1):
            group_start = self.flat_groups_offset[group_id]
            group_end = self.flat_groups_offset[group_id + 1]
            if group_start <= self.offset < group_end:
                start_group_id = group_id
            if group_start < end_idx <= group_end:
                end_group_id = group_id
                break
        if start_group_id is None or end_group_id is None:
            raise ValueError(f"partition [{self.offset}, {end_idx}) is outside of the flat groups")
        return start_group_id, end_group_id

    def contiguous(self):
        """
        Merge partitioned weights from flat_groups into a single tensor.

        Returns:
            - a contiguous tensor of shape ``self.shape``; trailing padding elements are dropped

        Raises:
            - ``ValueError``: if the partition doesn't lie inside the flat groups
        """
        if self.partitioned_numel == 0:
            # an empty parameter owns no elements, so there is no group to look it up in
            return torch.empty(self.shape, dtype=self.dtype, device=self.flat_groups[0][0].device)

        end_idx = self.offset + self.partitioned_numel
        # the touched groups are the same for every rank, so look them up only once
        start_group_id, end_group_id = self._locate_groups(end_idx)

        pad_flat_param_chunks = []
        for flat_groups_at_rank_i in self.flat_groups:
            # for each rank, we need to collect weights from related group/groups
            for group_id in range(start_group_id, end_group_id + 1):
                flat_tensor = flat_groups_at_rank_i[group_id]
                group_start = self.flat_groups_offset[group_id]
                # clamp to 0: in every group after the first one the partition continues at the
                # beginning of the group, a negative start would slice from the end of the tensor
                start_offset = max(self.offset - group_start, 0)
                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - group_start
                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])

        # collect weights from all ranks
        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
        return param
def to_torch_tensor(state_dict, return_empty_tensor=False):
    """
    Turn a state_dict of lazy (``GatheredTensor``-like) entries into a dict of torch tensors.

    Entries that are the very same object in ``state_dict`` (shared tensors) are converted once and
    map to the very same tensor in the result. With ``return_empty_tensor=True`` no data is
    gathered, an uninitialized tensor with the entry's shape and dtype is created instead.
    """
    materialized = {}
    first_name_by_id = {}
    for key, lazy in state_dict.items():
        obj_id = id(lazy)
        if obj_id in first_name_by_id:
            # shared tensors: reuse what was produced for the first name of this object
            materialized[key] = materialized[first_name_by_id[obj_id]]
            continue

        first_name_by_id[obj_id] = key
        if return_empty_tensor:
            materialized[key] = torch.empty(lazy.shape, dtype=lazy.dtype)
        else:
            materialized[key] = lazy.contiguous()
    return materialized
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
                                               output_dir,
                                               max_shard_size="5GB",
                                               safe_serialization=False,
                                               tag=None,
                                               exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) that can be
    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    The weights are written shard by shard into ``output_dir``; when the result is split into more
    than one shard an additional ``*.index.json`` file maps every tensor name to its shard file.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_dir``: directory to the pytorch fp32 state_dict output files
        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` writes a single file
        - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters

    Raises:
        - ``ImportError``: if ``safetensors`` (for ``safe_serialization``) or ``huggingface_hub``
          (for ``max_shard_size``) is needed but not installed
    """

    # Dependency pre-check: fail before the (expensive) checkpoint loading starts
    if safe_serialization:
        try:
            from safetensors.torch import save_file
        except ImportError:
            print('If you want to use `safe_serialization`, please `pip install safetensors`')
            raise
    if max_shard_size is not None:
        try:
            from huggingface_hub import split_torch_state_dict_into_shards
        except ImportError:
            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
            raise

    # Convert zero checkpoint to state_dict
    # lazy_mode=True: the entries are only gathered into real tensors shard by shard further down
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                                          tag,
                                                          exclude_frozen_parameters,
                                                          lazy_mode=True)

    # Shard the model if it is too big.
    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
    if max_shard_size is not None:
        # only one of the two replace() calls matches, depending on the extension of weights_name
        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
        # a memory-efficient approach for sharding: the split is planned on empty tensors that
        # only carry shape and dtype
        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
                                                              filename_pattern=filename_pattern,
                                                              max_shard_size=max_shard_size)
    else:
        # no sharding requested: build a minimal stand-in that puts every tensor into one file
        from collections import namedtuple
        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
        state_dict_split = StateDictSplit(is_sharded=False,
                                          filename_to_tensors={weights_name: list(state_dict.keys())})

    # Save the model by shard
    os.makedirs(output_dir, exist_ok=True)
    filename_to_tensors = state_dict_split.filename_to_tensors.items()
    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
        # gather the real tensors of this shard only
        shard_state_dict = to_torch_tensor(shard_state_dict)
        output_path = os.path.join(output_dir, shard_file)
        if safe_serialization:
            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
        else:
            torch.save(shard_state_dict, output_path)
        # release the memory of current shard
        for tensor_name in list(shard_state_dict.keys()):
            del state_dict[tensor_name]
            del shard_state_dict[tensor_name]
        del shard_state_dict
        gc.collect()

    # Save index if sharded
    if state_dict_split.is_sharded:
        index = {
            "metadata": state_dict_split.metadata,
            "weight_map": state_dict_split.tensor_to_filename,
        }
        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
        save_index_file = os.path.join(output_dir, save_index_file)
        with open(save_index_file, "w", encoding="utf-8") as f:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            f.write(content)
if __name__ == "__main__":
    # Command line entry point: parse the arguments and convert the given ZeRO checkpoint into
    # fp32 state_dict file(s) in the output directory.
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    # adjacent string literals are joined without a separator, so every fragment of a help text
    # has to end with its own space
    parser.add_argument("output_dir",
                        type=str,
                        help="directory to the pytorch fp32 state_dict output files "
                        "(e.g. path/checkpoint-12-output/)")
    parser.add_argument(
        "--max_shard_size",
        type=str,
        default="5GB",
        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
        "without CPU OOM issues.")
    parser.add_argument(
        "--safe_serialization",
        default=False,
        action='store_true',
        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # rebinds the module-level ``debug`` flag that the helper functions read
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_dir,
                                               max_shard_size=args.max_shard_size,
                                               safe_serialization=args.safe_serialization,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/adapter_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ea3100564f8f73f51fc508a1408e494e912544c8 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + 
"revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "down_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3ed77d7a0f1181182cf7693e023a2eb1e120e450 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a684115bfa3fb108fb2545f221efd333a9896e688740202490c8c2a6be1f27ec +size 40422208 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/additional_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/args.json new file mode 100644 index 0000000000000000000000000000000000000000..efc7069ad1fbeb7ba8b7d21f876c17672c7df0a2 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/Marco-o1", + "model_type": "marco_o1", + "model_revision": null, 
+ "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "marco_o1", + "system": "You are a helpful assistant.", + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_what_system.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 
1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 200, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + 
"dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + 
"full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + 
"galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 4, + "local_world_size": 4, + "model_suffix": "Marco-o1", + "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='marco_o1', 
model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, 
fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', 
report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, 
galore_config=None)" +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffb91e28c7385342a16c39d19bc6686dd65a8688 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0f28849a47c3709b61ee79f2a597a2f0919cb0f7e6af861d5380fbc3a0c4ca3 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cac9f6bb648587d2a3b570fd4ee90f907390e5fd --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b1d7090b00b7d986b8c044d7ee440dd7f1e07c5651b76a8bb23c8e7b0f09356 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1de3ca3f3be68e0b799dc048d14af9189d25534a --- /dev/null +++ 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f11cbe419db26dd81a200168126e634587ac942367bba71edfb14cc1b9764c37 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91408596191e7ed067d633143da5d47bfa826148 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f50e18fc41b88cad068f7e66295e39525d6739667e3563ed550d31533e19e9b1 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5011349fbf7bc231576172746d34f19d8be3a03 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ae87355a9241c0ea4edf21abd0fb8d38012f4aa9723febcef39bbc7901c36db +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_1_mp_rank_00_model_states.pt 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f27a9e89ff30e1b7e5fc2b97fce8cf811b4d1cb --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84abe8df1ad89f1801c346cbe87bf7fead94c9cb08b42f184d0dfb8f1c359ccc +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6a5432b84d6286d0b574866aaab3d0e6352ed4f --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:860eafdd10ecf7c0eaad3c6aca57b888ad1a5f906134d493e84eec89940369c2 +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db55b37d6a437c84db4a22626123c43786e59401 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:124f015af78bbf627b57d97070f950f843b2b5ef2f1886ddbeae34a9ce2a2d10 +size 388374 diff 
--git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..099fa08342218cca7c00fb7043635561ebda9695 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/latest @@ -0,0 +1 @@ +global_step59 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c54ea122b283c04f6b60c1eedefeb301763a8f9f --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:418a5f105ae834c3075024076916b2a9475918fe034c12d0dd5b6d91f1aba467 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea57ead2533e587fe50f62107d7cb32945fe1354 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e07ace389d24bc1307b74f42a1e7b8f0117b0db853e2df64ff3f15cb92916a2 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..4689a9445d07528dc4fd91011a7f034c11773a68 --- /dev/null 
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da6a990f346d7014dffb28fa2bc7d3b890bd3c53712503fce3656da48d3d6e50 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..919b5e43a96a9afdeb196f402142bc3aab67f247 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e95f356ca38179b05993f55daece0223e96fa10b9a1b9ea2102a739211333f63 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/scheduler.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..95126866042baa544d6bc4555d944440b37fdb21 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3e1521c1c8dfc88bc6566a95cc91f42709693a765076997f6318af86035c445 +size 1064 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/trainer_state.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cab50a66dcea9a9cdb64373c69130e11f283d2e3 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.42553711, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60", + "epoch": 2.404040404040404, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 2.3671343726657543, + "learning_rate": 2.5e-05, + "logits/chosen": -0.46875, + "logits/rejected": 0.228515625, + "logps/chosen": -286.0, + "logps/rejected": -272.0, + "loss": 1.8359375, + "memory(GiB)": 13.63, + "nll_loss": 1.7109375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.067542 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 2.798293390214536, + "learning_rate": 9.994664874011863e-05, + "logits/chosen": -0.7421875, + "logits/rejected": -0.185546875, + "logps/chosen": -362.0, + "logps/rejected": -512.0, + "loss": 1.83404541015625, + "memory(GiB)": 30.5, + "nll_loss": 1.1015625, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.228515625, + "rewards/margins": 0.1005859375, + "rewards/rejected": 0.1279296875, + "step": 5, + "train_speed(iter/s)": 0.088059 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 2.205296809705217, + "learning_rate": 9.809128215864097e-05, + "logits/chosen": -0.421875, + "logits/rejected": -0.012451171875, + "logps/chosen": -350.0, + "logps/rejected": -548.0, + "loss": 1.5821044921875, + "memory(GiB)": 42.9, + "nll_loss": 0.8515625, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.765625, + "rewards/margins": 0.63671875, + "rewards/rejected": 1.1328125, + "step": 10, + "train_speed(iter/s)": 0.090254 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 1.4006731478550383, + "learning_rate": 9.368111953231848e-05, + "logits/chosen": -0.14453125, + "logits/rejected": -0.609375, + "logps/chosen": -366.0, + "logps/rejected": -260.0, + "loss": 
1.147705078125, + "memory(GiB)": 42.9, + "nll_loss": 0.75, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.53125, + "rewards/margins": 1.625, + "rewards/rejected": 1.90625, + "step": 15, + "train_speed(iter/s)": 0.09299 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.5239399131286955, + "learning_rate": 8.695044586103296e-05, + "logits/chosen": -0.033203125, + "logits/rejected": -0.53515625, + "logps/chosen": -452.0, + "logps/rejected": -280.0, + "loss": 0.72830810546875, + "memory(GiB)": 42.9, + "nll_loss": 0.609375, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.125, + "rewards/margins": 3.0625, + "rewards/rejected": 1.0703125, + "step": 20, + "train_speed(iter/s)": 0.093763 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.8671875, + "eval_logits/rejected": 0.349609375, + "eval_logps/chosen": -17.5, + "eval_logps/rejected": -172.0, + "eval_loss": 0.58154296875, + "eval_nll_loss": 0.76171875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 5.5, + "eval_rewards/margins": 6.3125, + "eval_rewards/rejected": -0.80078125, + "eval_runtime": 1.2034, + "eval_samples_per_second": 3.324, + "eval_steps_per_second": 0.831, + "step": 20 + }, + { + "epoch": 1.0, + "grad_norm": 0.5826068375172234, + "learning_rate": 7.82568207211296e-05, + "logits/chosen": 0.08447265625, + "logits/rejected": -0.142578125, + "logps/chosen": -434.0, + "logps/rejected": -496.0, + "loss": 0.532879638671875, + "memory(GiB)": 42.9, + "nll_loss": 0.5625, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.0625, + "rewards/margins": 5.1875, + "rewards/rejected": -0.1357421875, + "step": 25, + "train_speed(iter/s)": 0.094482 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.1173239600840837, + "learning_rate": 6.806208330935766e-05, + "logits/chosen": -0.158203125, + "logits/rejected": -0.0703125, + "logps/chosen": -282.0, + "logps/rejected": -492.0, + "loss": 0.484796142578125, + "memory(GiB)": 42.9, + "nll_loss": 0.40625, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 6.71875, + "rewards/margins": 9.8125, + "rewards/rejected": -3.078125, + "step": 30, + "train_speed(iter/s)": 0.09487 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.10486166807457631, + "learning_rate": 5.6907817747594116e-05, + "logits/chosen": -0.47265625, + "logits/rejected": 0.05126953125, + "logps/chosen": -217.0, + "logps/rejected": -524.0, + "loss": 0.477923583984375, + "memory(GiB)": 42.9, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.03125, + "rewards/margins": 10.5625, + "rewards/rejected": -3.53125, + "step": 35, + "train_speed(iter/s)": 0.095273 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.13908151012153538, + "learning_rate": 4.5386582026834906e-05, + "logits/chosen": -0.005706787109375, + "logits/rejected": -0.498046875, + "logps/chosen": -344.0, + "logps/rejected": -276.0, + "loss": 0.45271148681640627, + "memory(GiB)": 42.9, + "nll_loss": 0.408203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.0, + "rewards/margins": 10.5, + "rewards/rejected": -3.515625, + "step": 40, + "train_speed(iter/s)": 0.095656 + }, + { + "epoch": 1.606060606060606, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": 0.62109375, + "eval_logps/chosen": -5.34375, + "eval_logps/rejected": -175.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.232421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.6875, + "eval_rewards/margins": 7.78125, + "eval_rewards/rejected": -1.1015625, + "eval_runtime": 1.3521, + "eval_samples_per_second": 2.958, + "eval_steps_per_second": 0.74, + "step": 40 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 0.15003750533951385, + "learning_rate": 3.411042902090492e-05, + "logits/chosen": -0.1572265625, + "logits/rejected": 0.1650390625, + "logps/chosen": -314.0, + "logps/rejected": -496.0, + "loss": 0.546685791015625, + "memory(GiB)": 42.9, + "nll_loss": 0.5234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 
7.65625, + "rewards/margins": 10.875, + "rewards/rejected": -3.234375, + "step": 45, + "train_speed(iter/s)": 0.095055 + }, + { + "epoch": 2.0, + "grad_norm": 0.17349498363870808, + "learning_rate": 2.3678391856132204e-05, + "logits/chosen": -0.0074462890625, + "logits/rejected": -0.140625, + "logps/chosen": -304.0, + "logps/rejected": -274.0, + "loss": 0.44422264099121095, + "memory(GiB)": 42.9, + "nll_loss": 0.41015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.59375, + "rewards/margins": 10.4375, + "rewards/rejected": -2.859375, + "step": 50, + "train_speed(iter/s)": 0.095326 + }, + { + "epoch": 2.202020202020202, + "grad_norm": 0.21138809828743063, + "learning_rate": 1.4644660940672627e-05, + "logits/chosen": -0.09521484375, + "logits/rejected": 0.1259765625, + "logps/chosen": -300.0, + "logps/rejected": -616.0, + "loss": 0.49451904296875, + "memory(GiB)": 42.9, + "nll_loss": 0.474609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.25, + "rewards/margins": 9.3125, + "rewards/rejected": -2.078125, + "step": 55, + "train_speed(iter/s)": 0.095115 + }, + { + "epoch": 2.404040404040404, + "grad_norm": 0.06468135061973618, + "learning_rate": 7.489143213519301e-06, + "logits/chosen": -0.12353515625, + "logits/rejected": -0.482421875, + "logps/chosen": -420.0, + "logps/rejected": -470.0, + "loss": 0.460784912109375, + "memory(GiB)": 42.9, + "nll_loss": 0.51171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.8125, + "rewards/margins": 12.125, + "rewards/rejected": -3.328125, + "step": 60, + "train_speed(iter/s)": 0.095377 + }, + { + "epoch": 2.404040404040404, + "eval_logits/chosen": -1.9921875, + "eval_logits/rejected": 0.87890625, + "eval_logps/chosen": -5.1875, + "eval_logps/rejected": -179.0, + "eval_loss": 0.425537109375, + "eval_nll_loss": 0.2255859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.75, + "eval_rewards/margins": 8.25, + "eval_rewards/rejected": -1.5, + "eval_runtime": 1.3466, + "eval_samples_per_second": 2.97, 
+ "eval_steps_per_second": 0.743, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 72, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 26025733423104.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bdbd933eb77fb414a188444c0c44e522ae588c3 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65114710056e86d2565b3845f7913b58e4fc16f367cc03ffb9f3a9d09187f96d +size 8888 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. 
Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def 
get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + 
frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/README.md b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6be0c96a909959c265a0ab602cd067e0d3fbbde6 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/wangruotong/LLM_test/Models/Marco-o1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) 
should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/adapter_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ea3100564f8f73f51fc508a1408e494e912544c8 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + 
"revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "down_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7d83da27a2c8d998634dacb835a64eb00d3dd2e8 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75002a00d611e4875b86199e228d1535b69dae7bedd01ae668f2824526694ce8 +size 40422208 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/additional_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/args.json new file mode 100644 index 0000000000000000000000000000000000000000..efc7069ad1fbeb7ba8b7d21f876c17672c7df0a2 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/args.json @@ -0,0 +1,371 @@ +{ + "model": "/home/wangruotong/LLM_test/Models/Marco-o1", + "model_type": "marco_o1", + "model_revision": null, 
+ "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "marco_o1", + "system": "You are a helpful assistant.", + "max_length": 4200, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_what_system.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 1, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 
1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 200, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + 
"dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + 
"full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + 
"galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 4, + "local_world_size": 4, + "model_suffix": "Marco-o1", + "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='marco_o1', 
model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])", + "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1", + "hub": "", + "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, 
fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', 
report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, 
galore_config=None)" +} \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd1302acc77b849e3573039184c4a969c41784ea --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4870218bc5905167b4fdf693995be469a3f88fde5feec15eeac27c87a38117ff +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33fe548d8b646d5e438f35e2d5273e6273b28f81 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d423e087ba616b7cfbe7c4a687dea5c1165f732ee64fc0d2d912a8477c5f548 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e927fa6822aebf010559d7eb083bba70ade6a5d3 --- /dev/null +++ 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b64199c7f51ee6f7d6a3ee0e21de586030d7d0dac6100b8371b92ac6dc3fcd4 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cc0219d7b61da1d8baa80bf8f492b3740f01c1d --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d009cc82d34e2036e23f4a4a8791bb1f50b240ccacf4e168e8f57671ead9ca59 +size 60559280 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a524f27b92645e5645dfcc9b75110e322a57a6b2 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89f7e868315deb2e0bb0064d1d57817ce3429b318a03d3a3dd62a9dfce05bbef +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_1_mp_rank_00_model_states.pt 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e47b1e355f370a9f30c7c77ba856b6efddbb168f --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b64cb77e899835cb8a51cfb333ad499ca2901e784ea087a7c21b702d596df647 +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7726800cef0fc2d62c65ee7dd3f9e0cb17c01440 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3afc72b3e2aaabd252874436aacbe2ce7bff38be7845ea0f1ffb58987003f91e +size 388374 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b02e5338327c0d40effa89d5444bb462c2d1d89c --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c3089aac8b20a3c6b4d889ebbe8a52e9c331dd4d49e66893559428bee931945 +size 388374 diff 
--git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/latest new file mode 100644 index 0000000000000000000000000000000000000000..bbeadc7466d2728e3046120a012ebc37c29267cb --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/latest @@ -0,0 +1 @@ +global_step71 \ No newline at end of file diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..be2e24cc9d9ef8857272cec1451c810e205ec4e9 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef002048764051a71fb00f8f978e9ec32b780dc850bdb059af362cc56494234b +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..efcf4dd2e74596ac28af81f9f8bd0be9a807deb3 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37194a6d48612e1a46a2d5d317ead97c70d9fc4569b0118fcd5f84c3dc9daa5a +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..4c9222e37d4e9d1745c0e126e0fe0c4a348e298d --- /dev/null 
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17c179483659a784aa1ace2427daff48c556a6bcc3c330e6f3274e4dc95e4b49 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..7821bf0f5f0621fd0159152432f0a7bc66aa6823 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b56857c9b117629f35af2c3d64f522d33a9d8aa94faa81ec6956380a895118c4 +size 15024 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/scheduler.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d0d6f336655bdacf5eb53294b71e20f2d0edb17 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2ba16a2cd6668009497101c7aa1ee348685f1df2d9a2a20c23be3737c813063 +size 1064 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/trainer_state.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ccd49325958fc683960562dc1cf8d3bd5ef38687 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/trainer_state.json @@ -0,0 +1,371 @@ +{ + "best_metric": 0.42553711, + "best_model_checkpoint": 
"/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60", + "epoch": 2.888888888888889, + "eval_steps": 20, + "global_step": 72, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04040404040404041, + "grad_norm": 2.3671343726657543, + "learning_rate": 2.5e-05, + "logits/chosen": -0.46875, + "logits/rejected": 0.228515625, + "logps/chosen": -286.0, + "logps/rejected": -272.0, + "loss": 1.8359375, + "memory(GiB)": 13.63, + "nll_loss": 1.7109375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.067542 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 2.798293390214536, + "learning_rate": 9.994664874011863e-05, + "logits/chosen": -0.7421875, + "logits/rejected": -0.185546875, + "logps/chosen": -362.0, + "logps/rejected": -512.0, + "loss": 1.83404541015625, + "memory(GiB)": 30.5, + "nll_loss": 1.1015625, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.228515625, + "rewards/margins": 0.1005859375, + "rewards/rejected": 0.1279296875, + "step": 5, + "train_speed(iter/s)": 0.088059 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 2.205296809705217, + "learning_rate": 9.809128215864097e-05, + "logits/chosen": -0.421875, + "logits/rejected": -0.012451171875, + "logps/chosen": -350.0, + "logps/rejected": -548.0, + "loss": 1.5821044921875, + "memory(GiB)": 42.9, + "nll_loss": 0.8515625, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.765625, + "rewards/margins": 0.63671875, + "rewards/rejected": 1.1328125, + "step": 10, + "train_speed(iter/s)": 0.090254 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 1.4006731478550383, + "learning_rate": 9.368111953231848e-05, + "logits/chosen": -0.14453125, + "logits/rejected": -0.609375, + "logps/chosen": -366.0, + "logps/rejected": -260.0, + "loss": 
1.147705078125, + "memory(GiB)": 42.9, + "nll_loss": 0.75, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.53125, + "rewards/margins": 1.625, + "rewards/rejected": 1.90625, + "step": 15, + "train_speed(iter/s)": 0.09299 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.5239399131286955, + "learning_rate": 8.695044586103296e-05, + "logits/chosen": -0.033203125, + "logits/rejected": -0.53515625, + "logps/chosen": -452.0, + "logps/rejected": -280.0, + "loss": 0.72830810546875, + "memory(GiB)": 42.9, + "nll_loss": 0.609375, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.125, + "rewards/margins": 3.0625, + "rewards/rejected": 1.0703125, + "step": 20, + "train_speed(iter/s)": 0.093763 + }, + { + "epoch": 0.8080808080808081, + "eval_logits/chosen": -1.8671875, + "eval_logits/rejected": 0.349609375, + "eval_logps/chosen": -17.5, + "eval_logps/rejected": -172.0, + "eval_loss": 0.58154296875, + "eval_nll_loss": 0.76171875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 5.5, + "eval_rewards/margins": 6.3125, + "eval_rewards/rejected": -0.80078125, + "eval_runtime": 1.2034, + "eval_samples_per_second": 3.324, + "eval_steps_per_second": 0.831, + "step": 20 + }, + { + "epoch": 1.0, + "grad_norm": 0.5826068375172234, + "learning_rate": 7.82568207211296e-05, + "logits/chosen": 0.08447265625, + "logits/rejected": -0.142578125, + "logps/chosen": -434.0, + "logps/rejected": -496.0, + "loss": 0.532879638671875, + "memory(GiB)": 42.9, + "nll_loss": 0.5625, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.0625, + "rewards/margins": 5.1875, + "rewards/rejected": -0.1357421875, + "step": 25, + "train_speed(iter/s)": 0.094482 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.1173239600840837, + "learning_rate": 6.806208330935766e-05, + "logits/chosen": -0.158203125, + "logits/rejected": -0.0703125, + "logps/chosen": -282.0, + "logps/rejected": -492.0, + "loss": 0.484796142578125, + "memory(GiB)": 42.9, + "nll_loss": 0.40625, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 6.71875, + "rewards/margins": 9.8125, + "rewards/rejected": -3.078125, + "step": 30, + "train_speed(iter/s)": 0.09487 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.10486166807457631, + "learning_rate": 5.6907817747594116e-05, + "logits/chosen": -0.47265625, + "logits/rejected": 0.05126953125, + "logps/chosen": -217.0, + "logps/rejected": -524.0, + "loss": 0.477923583984375, + "memory(GiB)": 42.9, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.03125, + "rewards/margins": 10.5625, + "rewards/rejected": -3.53125, + "step": 35, + "train_speed(iter/s)": 0.095273 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.13908151012153538, + "learning_rate": 4.5386582026834906e-05, + "logits/chosen": -0.005706787109375, + "logits/rejected": -0.498046875, + "logps/chosen": -344.0, + "logps/rejected": -276.0, + "loss": 0.45271148681640627, + "memory(GiB)": 42.9, + "nll_loss": 0.408203125, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.0, + "rewards/margins": 10.5, + "rewards/rejected": -3.515625, + "step": 40, + "train_speed(iter/s)": 0.095656 + }, + { + "epoch": 1.606060606060606, + "eval_logits/chosen": -1.921875, + "eval_logits/rejected": 0.62109375, + "eval_logps/chosen": -5.34375, + "eval_logps/rejected": -175.0, + "eval_loss": 0.43310546875, + "eval_nll_loss": 0.232421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.6875, + "eval_rewards/margins": 7.78125, + "eval_rewards/rejected": -1.1015625, + "eval_runtime": 1.3521, + "eval_samples_per_second": 2.958, + "eval_steps_per_second": 0.74, + "step": 40 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 0.15003750533951385, + "learning_rate": 3.411042902090492e-05, + "logits/chosen": -0.1572265625, + "logits/rejected": 0.1650390625, + "logps/chosen": -314.0, + "logps/rejected": -496.0, + "loss": 0.546685791015625, + "memory(GiB)": 42.9, + "nll_loss": 0.5234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 
7.65625, + "rewards/margins": 10.875, + "rewards/rejected": -3.234375, + "step": 45, + "train_speed(iter/s)": 0.095055 + }, + { + "epoch": 2.0, + "grad_norm": 0.17349498363870808, + "learning_rate": 2.3678391856132204e-05, + "logits/chosen": -0.0074462890625, + "logits/rejected": -0.140625, + "logps/chosen": -304.0, + "logps/rejected": -274.0, + "loss": 0.44422264099121095, + "memory(GiB)": 42.9, + "nll_loss": 0.41015625, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.59375, + "rewards/margins": 10.4375, + "rewards/rejected": -2.859375, + "step": 50, + "train_speed(iter/s)": 0.095326 + }, + { + "epoch": 2.202020202020202, + "grad_norm": 0.21138809828743063, + "learning_rate": 1.4644660940672627e-05, + "logits/chosen": -0.09521484375, + "logits/rejected": 0.1259765625, + "logps/chosen": -300.0, + "logps/rejected": -616.0, + "loss": 0.49451904296875, + "memory(GiB)": 42.9, + "nll_loss": 0.474609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.25, + "rewards/margins": 9.3125, + "rewards/rejected": -2.078125, + "step": 55, + "train_speed(iter/s)": 0.095115 + }, + { + "epoch": 2.404040404040404, + "grad_norm": 0.06468135061973618, + "learning_rate": 7.489143213519301e-06, + "logits/chosen": -0.12353515625, + "logits/rejected": -0.482421875, + "logps/chosen": -420.0, + "logps/rejected": -470.0, + "loss": 0.460784912109375, + "memory(GiB)": 42.9, + "nll_loss": 0.51171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.8125, + "rewards/margins": 12.125, + "rewards/rejected": -3.328125, + "step": 60, + "train_speed(iter/s)": 0.095377 + }, + { + "epoch": 2.404040404040404, + "eval_logits/chosen": -1.9921875, + "eval_logits/rejected": 0.87890625, + "eval_logps/chosen": -5.1875, + "eval_logps/rejected": -179.0, + "eval_loss": 0.425537109375, + "eval_nll_loss": 0.2255859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.75, + "eval_rewards/margins": 8.25, + "eval_rewards/rejected": -1.5, + "eval_runtime": 1.3466, + "eval_samples_per_second": 2.97, 
+ "eval_steps_per_second": 0.743, + "step": 60 + }, + { + "epoch": 2.606060606060606, + "grad_norm": 0.1291076581823192, + "learning_rate": 2.591967620451707e-06, + "logits/chosen": -0.55078125, + "logits/rejected": 0.322265625, + "logps/chosen": -207.0, + "logps/rejected": -480.0, + "loss": 0.4864105224609375, + "memory(GiB)": 42.9, + "nll_loss": 0.5703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.875, + "rewards/margins": 11.3125, + "rewards/rejected": -3.4375, + "step": 65, + "train_speed(iter/s)": 0.095248 + }, + { + "epoch": 2.808080808080808, + "grad_norm": 0.1432822314318524, + "learning_rate": 2.1329118524827662e-07, + "logits/chosen": -0.1259765625, + "logits/rejected": -0.10009765625, + "logps/chosen": -278.0, + "logps/rejected": -436.0, + "loss": 0.4423820495605469, + "memory(GiB)": 42.9, + "nll_loss": 0.482421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.5, + "rewards/margins": 10.5625, + "rewards/rejected": -3.0625, + "step": 70, + "train_speed(iter/s)": 0.095483 + }, + { + "epoch": 2.888888888888889, + "eval_logits/chosen": -1.9765625, + "eval_logits/rejected": 0.89453125, + "eval_logps/chosen": -5.15625, + "eval_logps/rejected": -179.0, + "eval_loss": 0.42578125, + "eval_nll_loss": 0.224609375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 6.75, + "eval_rewards/margins": 8.25, + "eval_rewards/rejected": -1.5, + "eval_runtime": 1.3679, + "eval_samples_per_second": 2.924, + "eval_steps_per_second": 0.731, + "step": 72 + } + ], + "logging_steps": 5, + "max_steps": 72, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 31145554509824.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git 
a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bdbd933eb77fb414a188444c0c44e522ae588c3 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65114710056e86d2565b3845f7913b58e4fc16f367cc03ffb9f3a9d09187f96d +size 8888 diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logits_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..e6b7ee5d152a6b8f27bec936454c2927c132300e Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logits_chosen.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logits_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..30e50c872219c436cafa190c9bea9519772f3fb0 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logits_rejected.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logps_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..b67211038c92be8ff3a68ad63f189c77e1e385de Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logps_chosen.png differ diff --git 
a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logps_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..be3a1a9506fc8be15fab37bf1465ddcffd5a6772 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logps_rejected.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..fb7503df5d30e918d16383ae6d4af436dc0b7d48 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_loss.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_nll_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..ac85650e3f453a38eecfd153751619d7dca8bd49 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_nll_loss.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_accuracies.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..2b88ed37a3c651cbf54899f712bcd1515e2be8fc Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_accuracies.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_chosen.png 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..867b3d823fb6ce1bdf230b62ef5608afbfa0d085 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_chosen.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_margins.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..0da5b0c4f1d6a8e6d2a30974ead7ebce86963cfe Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_margins.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..ff513fa6bbb162af8abfa4f8dd4af6b4dfd77b78 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_rejected.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_runtime.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..5442b175cf9f1c352ee5bfbeb1b68309ed1aad82 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_runtime.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_samples_per_second.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_samples_per_second.png new file mode 100644 index 
0000000000000000000000000000000000000000..61f05ca5dfb66c19ff266e9f472579591473854a Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_samples_per_second.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_steps_per_second.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..2b40d6b9d5eed2e2aecaff1c0fce6fd5abd6fc04 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_steps_per_second.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_epoch.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..bf6c6bacafca8d256f07d43a8851a9463ac6e566 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_epoch.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_grad_norm.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..108e9a91b60e0304f3fdb5997cd151380a4a483d Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_grad_norm.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_learning_rate.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..9d80e686cd9522b9efdfd0b12d052a155efc0d3b Binary files /dev/null and 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_learning_rate.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logits_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..213255979dfcf020eabb97f9110732f04a40bdbe Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logits_chosen.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logits_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..183420c06679bc7968c3d1887878fbea5a5709ef Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logits_rejected.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logps_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..1a5e422d51e863b1562a64416a382b5209f0c737 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logps_chosen.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logps_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..d67d5806aeb5c0bd381d3fed296143c27788d63e Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logps_rejected.png differ diff --git 
a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..73f143e15bf3137e1943190229c724fe8d22d82a Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_loss.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_memory(GiB).png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_memory(GiB).png new file mode 100644 index 0000000000000000000000000000000000000000..73b48531ac749afe5ca124bde45012ffd7a89fdb Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_memory(GiB).png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_nll_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..2c05168f74b1a3dfa55a4969a3c5dbdbcf7961ac Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_nll_loss.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_accuracies.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..2b871edb45164987d9e330c19a86545b33c5d9c0 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_accuracies.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_chosen.png 
b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..a4031d46f34606f43b779ac4561ef170a0538f01 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_chosen.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_margins.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..f236c5c70dad1d44d3852b1c0305a434ec8f4b3d Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_margins.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..0e27f8a3f6fde8cabd190fa72e00571e681d468e Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_rejected.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_total_flos.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..9f7941da6a30938add08e508ee1df498e25cb4c6 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_total_flos.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_loss.png new file mode 100644 index 
0000000000000000000000000000000000000000..c43bac51918a40dc45b17d8ae2d125ea99c17db5 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_loss.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_runtime.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..983293655c86a195af96e12cdd729b7922bba799 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_runtime.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_samples_per_second.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..5b747e7565dee73bb5420f4b686c8167da741906 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_samples_per_second.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_speed(iter_s).png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..5590ed64260db5923b03ca4de7f2de69fb7764c6 Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_speed(iter_s).png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_steps_per_second.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_steps_per_second.png new file mode 100644 index 
0000000000000000000000000000000000000000..2bb26c7406b8e93d563d928d553468bb97cbf0df Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_steps_per_second.png differ diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/logging.jsonl b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e0bfe643dfea6871d3a0582158e029089ceb273e --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/logging.jsonl @@ -0,0 +1,21 @@ +{"loss": 1.8359375, "grad_norm": 2.36713437, "learning_rate": 2.5e-05, "memory(GiB)": 13.63, "train_speed(iter/s)": 0.067542, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -272.0, "logps/chosen": -286.0, "logits/rejected": 0.22851562, "logits/chosen": -0.46875, "nll_loss": 1.7109375, "epoch": 0.04040404, "global_step/max_steps": "1/72", "percentage": "1.39%", "elapsed_time": "12s", "remaining_time": "14m 13s"} +{"loss": 1.83404541, "grad_norm": 2.79829339, "learning_rate": 9.995e-05, "memory(GiB)": 30.5, "train_speed(iter/s)": 0.088059, "rewards/chosen": 0.22851562, "rewards/rejected": 0.12792969, "rewards/accuracies": 0.375, "rewards/margins": 0.10058594, "logps/rejected": -512.0, "logps/chosen": -362.0, "logits/rejected": -0.18554688, "logits/chosen": -0.7421875, "nll_loss": 1.1015625, "epoch": 0.2020202, "global_step/max_steps": "5/72", "percentage": "6.94%", "elapsed_time": "54s", "remaining_time": "12m 3s"} +{"loss": 1.58210449, "grad_norm": 2.20529681, "learning_rate": 9.809e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.090254, "rewards/chosen": 1.765625, "rewards/rejected": 1.1328125, "rewards/accuracies": 0.69999999, "rewards/margins": 0.63671875, "logps/rejected": -548.0, "logps/chosen": -350.0, "logits/rejected": -0.01245117, "logits/chosen": 
-0.421875, "nll_loss": 0.8515625, "epoch": 0.4040404, "global_step/max_steps": "10/72", "percentage": "13.89%", "elapsed_time": "1m 48s", "remaining_time": "11m 9s"} +{"loss": 1.14770508, "grad_norm": 1.40067315, "learning_rate": 9.368e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.09299, "rewards/chosen": 3.53125, "rewards/rejected": 1.90625, "rewards/accuracies": 0.94999999, "rewards/margins": 1.625, "logps/rejected": -260.0, "logps/chosen": -366.0, "logits/rejected": -0.609375, "logits/chosen": -0.14453125, "nll_loss": 0.75, "epoch": 0.60606061, "global_step/max_steps": "15/72", "percentage": "20.83%", "elapsed_time": "2m 38s", "remaining_time": "10m 2s"} +{"loss": 0.72830811, "grad_norm": 0.52393991, "learning_rate": 8.695e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.093763, "rewards/chosen": 4.125, "rewards/rejected": 1.0703125, "rewards/accuracies": 0.89999998, "rewards/margins": 3.0625, "logps/rejected": -280.0, "logps/chosen": -452.0, "logits/rejected": -0.53515625, "logits/chosen": -0.03320312, "nll_loss": 0.609375, "epoch": 0.80808081, "global_step/max_steps": "20/72", "percentage": "27.78%", "elapsed_time": "3m 30s", "remaining_time": "9m 7s"} +{"eval_loss": 0.58154297, "eval_runtime": 1.2034, "eval_samples_per_second": 3.324, "eval_steps_per_second": 0.831, "eval_rewards/chosen": 5.5, "eval_rewards/rejected": -0.80078125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.3125, "eval_logps/rejected": -172.0, "eval_logps/chosen": -17.5, "eval_logits/rejected": 0.34960938, "eval_logits/chosen": -1.8671875, "eval_nll_loss": 0.76171875, "epoch": 0.80808081, "global_step/max_steps": "20/72", "percentage": "27.78%", "elapsed_time": "3m 31s", "remaining_time": "9m 10s"} +{"loss": 0.53287964, "grad_norm": 0.58260684, "learning_rate": 7.826e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.094482, "rewards/chosen": 5.0625, "rewards/rejected": -0.13574219, "rewards/accuracies": 1.0, "rewards/margins": 5.1875, "logps/rejected": -496.0, 
"logps/chosen": -434.0, "logits/rejected": -0.14257812, "logits/chosen": 0.08447266, "nll_loss": 0.5625, "epoch": 1.0, "global_step/max_steps": "25/72", "percentage": "34.72%", "elapsed_time": "4m 21s", "remaining_time": "8m 12s"} +{"loss": 0.48479614, "grad_norm": 0.11732396, "learning_rate": 6.806e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.09487, "rewards/chosen": 6.71875, "rewards/rejected": -3.078125, "rewards/accuracies": 1.0, "rewards/margins": 9.8125, "logps/rejected": -492.0, "logps/chosen": -282.0, "logits/rejected": -0.0703125, "logits/chosen": -0.15820312, "nll_loss": 0.40625, "epoch": 1.2020202, "global_step/max_steps": "30/72", "percentage": "41.67%", "elapsed_time": "5m 13s", "remaining_time": "7m 18s"} +{"loss": 0.47792358, "grad_norm": 0.10486167, "learning_rate": 5.691e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095273, "rewards/chosen": 7.03125, "rewards/rejected": -3.53125, "rewards/accuracies": 1.0, "rewards/margins": 10.5625, "logps/rejected": -524.0, "logps/chosen": -217.0, "logits/rejected": 0.05126953, "logits/chosen": -0.47265625, "nll_loss": 0.48242188, "epoch": 1.4040404, "global_step/max_steps": "35/72", "percentage": "48.61%", "elapsed_time": "6m 4s", "remaining_time": "6m 25s"} +{"loss": 0.45271149, "grad_norm": 0.13908151, "learning_rate": 4.539e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095656, "rewards/chosen": 7.0, "rewards/rejected": -3.515625, "rewards/accuracies": 1.0, "rewards/margins": 10.5, "logps/rejected": -276.0, "logps/chosen": -344.0, "logits/rejected": -0.49804688, "logits/chosen": -0.00570679, "nll_loss": 0.40820312, "epoch": 1.60606061, "global_step/max_steps": "40/72", "percentage": "55.56%", "elapsed_time": "6m 55s", "remaining_time": "5m 32s"} +{"eval_loss": 0.43310547, "eval_runtime": 1.3521, "eval_samples_per_second": 2.958, "eval_steps_per_second": 0.74, "eval_rewards/chosen": 6.6875, "eval_rewards/rejected": -1.1015625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.78125, 
"eval_logps/rejected": -175.0, "eval_logps/chosen": -5.34375, "eval_logits/rejected": 0.62109375, "eval_logits/chosen": -1.921875, "eval_nll_loss": 0.23242188, "epoch": 1.60606061, "global_step/max_steps": "40/72", "percentage": "55.56%", "elapsed_time": "6m 56s", "remaining_time": "5m 33s"} +{"loss": 0.54668579, "grad_norm": 0.15003751, "learning_rate": 3.411e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095055, "rewards/chosen": 7.65625, "rewards/rejected": -3.234375, "rewards/accuracies": 1.0, "rewards/margins": 10.875, "logps/rejected": -496.0, "logps/chosen": -314.0, "logits/rejected": 0.16503906, "logits/chosen": -0.15722656, "nll_loss": 0.5234375, "epoch": 1.80808081, "global_step/max_steps": "45/72", "percentage": "62.50%", "elapsed_time": "7m 50s", "remaining_time": "4m 42s"} +{"loss": 0.44422264, "grad_norm": 0.17349498, "learning_rate": 2.368e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095326, "rewards/chosen": 7.59375, "rewards/rejected": -2.859375, "rewards/accuracies": 1.0, "rewards/margins": 10.4375, "logps/rejected": -274.0, "logps/chosen": -304.0, "logits/rejected": -0.140625, "logits/chosen": -0.00744629, "nll_loss": 0.41015625, "epoch": 2.0, "global_step/max_steps": "50/72", "percentage": "69.44%", "elapsed_time": "8m 41s", "remaining_time": "3m 49s"} +{"loss": 0.49451904, "grad_norm": 0.2113881, "learning_rate": 1.464e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095115, "rewards/chosen": 7.25, "rewards/rejected": -2.078125, "rewards/accuracies": 1.0, "rewards/margins": 9.3125, "logps/rejected": -616.0, "logps/chosen": -300.0, "logits/rejected": 0.12597656, "logits/chosen": -0.09521484, "nll_loss": 0.47460938, "epoch": 2.2020202, "global_step/max_steps": "55/72", "percentage": "76.39%", "elapsed_time": "9m 35s", "remaining_time": "2m 57s"} +{"loss": 0.46078491, "grad_norm": 0.06468135, "learning_rate": 7.49e-06, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095377, "rewards/chosen": 8.8125, "rewards/rejected": -3.328125, 
"rewards/accuracies": 1.0, "rewards/margins": 12.125, "logps/rejected": -470.0, "logps/chosen": -420.0, "logits/rejected": -0.48242188, "logits/chosen": -0.12353516, "nll_loss": 0.51171875, "epoch": 2.4040404, "global_step/max_steps": "60/72", "percentage": "83.33%", "elapsed_time": "10m 26s", "remaining_time": "2m 5s"} +{"eval_loss": 0.42553711, "eval_runtime": 1.3466, "eval_samples_per_second": 2.97, "eval_steps_per_second": 0.743, "eval_rewards/chosen": 6.75, "eval_rewards/rejected": -1.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.25, "eval_logps/rejected": -179.0, "eval_logps/chosen": -5.1875, "eval_logits/rejected": 0.87890625, "eval_logits/chosen": -1.9921875, "eval_nll_loss": 0.22558594, "epoch": 2.4040404, "global_step/max_steps": "60/72", "percentage": "83.33%", "elapsed_time": "10m 27s", "remaining_time": "2m 5s"} +{"loss": 0.48641052, "grad_norm": 0.12910766, "learning_rate": 2.59e-06, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095248, "rewards/chosen": 7.875, "rewards/rejected": -3.4375, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -480.0, "logps/chosen": -207.0, "logits/rejected": 0.32226562, "logits/chosen": -0.55078125, "nll_loss": 0.5703125, "epoch": 2.60606061, "global_step/max_steps": "65/72", "percentage": "90.28%", "elapsed_time": "11m 19s", "remaining_time": "1m 13s"} +{"loss": 0.44238205, "grad_norm": 0.14328223, "learning_rate": 2.1e-07, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095483, "rewards/chosen": 7.5, "rewards/rejected": -3.0625, "rewards/accuracies": 1.0, "rewards/margins": 10.5625, "logps/rejected": -436.0, "logps/chosen": -278.0, "logits/rejected": -0.10009766, "logits/chosen": -0.12597656, "nll_loss": 0.48242188, "epoch": 2.80808081, "global_step/max_steps": "70/72", "percentage": "97.22%", "elapsed_time": "12m 10s", "remaining_time": "20s"} +{"eval_loss": 0.42578125, "eval_runtime": 1.3679, "eval_samples_per_second": 2.924, "eval_steps_per_second": 0.731, "eval_rewards/chosen": 
6.75, "eval_rewards/rejected": -1.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.25, "eval_logps/rejected": -179.0, "eval_logps/chosen": -5.15625, "eval_logits/rejected": 0.89453125, "eval_logits/chosen": -1.9765625, "eval_nll_loss": 0.22460938, "epoch": 2.88888889, "global_step/max_steps": "72/72", "percentage": "100.00%", "elapsed_time": "12m 32s", "remaining_time": "0s"} +{"train_runtime": 753.8745, "train_samples_per_second": 1.572, "train_steps_per_second": 0.096, "total_flos": 31145554509824.0, "train_loss": 0.71831854, "epoch": 2.88888889, "global_step/max_steps": "72/72", "percentage": "100.00%", "elapsed_time": "12m 33s", "remaining_time": "0s"} +{"train_dataset": "1180.088608±494.952093, min=317.000000, max=4171.000000, size=395", "val_dataset": "1196.000000±512.550973, min=715.000000, max=2041.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 7635.8016M Params (20.1851M Trainable [0.2643%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60", "best_metric": 0.42553711, "global_step": 72, "log_history": [{"loss": 1.8359375, "grad_norm": 2.3671343726657543, "learning_rate": 2.5e-05, "memory(GiB)": 13.63, "train_speed(iter/s)": 0.067542, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -272.0, "logps/chosen": -286.0, "logits/rejected": 0.228515625, "logits/chosen": -0.46875, "nll_loss": 1.7109375, "epoch": 0.04040404040404041, "step": 1}, {"loss": 1.83404541015625, "grad_norm": 2.798293390214536, "learning_rate": 9.994664874011863e-05, "memory(GiB)": 30.5, "train_speed(iter/s)": 0.088059, "rewards/chosen": 0.228515625, "rewards/rejected": 0.1279296875, "rewards/accuracies": 0.375, "rewards/margins": 
0.1005859375, "logps/rejected": -512.0, "logps/chosen": -362.0, "logits/rejected": -0.185546875, "logits/chosen": -0.7421875, "nll_loss": 1.1015625, "epoch": 0.20202020202020202, "step": 5}, {"loss": 1.5821044921875, "grad_norm": 2.205296809705217, "learning_rate": 9.809128215864097e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.090254, "rewards/chosen": 1.765625, "rewards/rejected": 1.1328125, "rewards/accuracies": 0.699999988079071, "rewards/margins": 0.63671875, "logps/rejected": -548.0, "logps/chosen": -350.0, "logits/rejected": -0.012451171875, "logits/chosen": -0.421875, "nll_loss": 0.8515625, "epoch": 0.40404040404040403, "step": 10}, {"loss": 1.147705078125, "grad_norm": 1.4006731478550383, "learning_rate": 9.368111953231848e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.09299, "rewards/chosen": 3.53125, "rewards/rejected": 1.90625, "rewards/accuracies": 0.949999988079071, "rewards/margins": 1.625, "logps/rejected": -260.0, "logps/chosen": -366.0, "logits/rejected": -0.609375, "logits/chosen": -0.14453125, "nll_loss": 0.75, "epoch": 0.6060606060606061, "step": 15}, {"loss": 0.72830810546875, "grad_norm": 0.5239399131286955, "learning_rate": 8.695044586103296e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.093763, "rewards/chosen": 4.125, "rewards/rejected": 1.0703125, "rewards/accuracies": 0.8999999761581421, "rewards/margins": 3.0625, "logps/rejected": -280.0, "logps/chosen": -452.0, "logits/rejected": -0.53515625, "logits/chosen": -0.033203125, "nll_loss": 0.609375, "epoch": 0.8080808080808081, "step": 20}, {"eval_loss": 0.58154296875, "eval_runtime": 1.2034, "eval_samples_per_second": 3.324, "eval_steps_per_second": 0.831, "eval_rewards/chosen": 5.5, "eval_rewards/rejected": -0.80078125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.3125, "eval_logps/rejected": -172.0, "eval_logps/chosen": -17.5, "eval_logits/rejected": 0.349609375, "eval_logits/chosen": -1.8671875, "eval_nll_loss": 0.76171875, "epoch": 0.8080808080808081, "step": 
20}, {"loss": 0.532879638671875, "grad_norm": 0.5826068375172234, "learning_rate": 7.82568207211296e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.094482, "rewards/chosen": 5.0625, "rewards/rejected": -0.1357421875, "rewards/accuracies": 1.0, "rewards/margins": 5.1875, "logps/rejected": -496.0, "logps/chosen": -434.0, "logits/rejected": -0.142578125, "logits/chosen": 0.08447265625, "nll_loss": 0.5625, "epoch": 1.0, "step": 25}, {"loss": 0.484796142578125, "grad_norm": 0.1173239600840837, "learning_rate": 6.806208330935766e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.09487, "rewards/chosen": 6.71875, "rewards/rejected": -3.078125, "rewards/accuracies": 1.0, "rewards/margins": 9.8125, "logps/rejected": -492.0, "logps/chosen": -282.0, "logits/rejected": -0.0703125, "logits/chosen": -0.158203125, "nll_loss": 0.40625, "epoch": 1.202020202020202, "step": 30}, {"loss": 0.477923583984375, "grad_norm": 0.10486166807457631, "learning_rate": 5.6907817747594116e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095273, "rewards/chosen": 7.03125, "rewards/rejected": -3.53125, "rewards/accuracies": 1.0, "rewards/margins": 10.5625, "logps/rejected": -524.0, "logps/chosen": -217.0, "logits/rejected": 0.05126953125, "logits/chosen": -0.47265625, "nll_loss": 0.482421875, "epoch": 1.404040404040404, "step": 35}, {"loss": 0.45271148681640627, "grad_norm": 0.13908151012153538, "learning_rate": 4.5386582026834906e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095656, "rewards/chosen": 7.0, "rewards/rejected": -3.515625, "rewards/accuracies": 1.0, "rewards/margins": 10.5, "logps/rejected": -276.0, "logps/chosen": -344.0, "logits/rejected": -0.498046875, "logits/chosen": -0.005706787109375, "nll_loss": 0.408203125, "epoch": 1.606060606060606, "step": 40}, {"eval_loss": 0.43310546875, "eval_runtime": 1.3521, "eval_samples_per_second": 2.958, "eval_steps_per_second": 0.74, "eval_rewards/chosen": 6.6875, "eval_rewards/rejected": -1.1015625, "eval_rewards/accuracies": 1.0, 
"eval_rewards/margins": 7.78125, "eval_logps/rejected": -175.0, "eval_logps/chosen": -5.34375, "eval_logits/rejected": 0.62109375, "eval_logits/chosen": -1.921875, "eval_nll_loss": 0.232421875, "epoch": 1.606060606060606, "step": 40}, {"loss": 0.546685791015625, "grad_norm": 0.15003750533951385, "learning_rate": 3.411042902090492e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095055, "rewards/chosen": 7.65625, "rewards/rejected": -3.234375, "rewards/accuracies": 1.0, "rewards/margins": 10.875, "logps/rejected": -496.0, "logps/chosen": -314.0, "logits/rejected": 0.1650390625, "logits/chosen": -0.1572265625, "nll_loss": 0.5234375, "epoch": 1.808080808080808, "step": 45}, {"loss": 0.44422264099121095, "grad_norm": 0.17349498363870808, "learning_rate": 2.3678391856132204e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095326, "rewards/chosen": 7.59375, "rewards/rejected": -2.859375, "rewards/accuracies": 1.0, "rewards/margins": 10.4375, "logps/rejected": -274.0, "logps/chosen": -304.0, "logits/rejected": -0.140625, "logits/chosen": -0.0074462890625, "nll_loss": 0.41015625, "epoch": 2.0, "step": 50}, {"loss": 0.49451904296875, "grad_norm": 0.21138809828743063, "learning_rate": 1.4644660940672627e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095115, "rewards/chosen": 7.25, "rewards/rejected": -2.078125, "rewards/accuracies": 1.0, "rewards/margins": 9.3125, "logps/rejected": -616.0, "logps/chosen": -300.0, "logits/rejected": 0.1259765625, "logits/chosen": -0.09521484375, "nll_loss": 0.474609375, "epoch": 2.202020202020202, "step": 55}, {"loss": 0.460784912109375, "grad_norm": 0.06468135061973618, "learning_rate": 7.489143213519301e-06, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095377, "rewards/chosen": 8.8125, "rewards/rejected": -3.328125, "rewards/accuracies": 1.0, "rewards/margins": 12.125, "logps/rejected": -470.0, "logps/chosen": -420.0, "logits/rejected": -0.482421875, "logits/chosen": -0.12353515625, "nll_loss": 0.51171875, "epoch": 
2.404040404040404, "step": 60}, {"eval_loss": 0.425537109375, "eval_runtime": 1.3466, "eval_samples_per_second": 2.97, "eval_steps_per_second": 0.743, "eval_rewards/chosen": 6.75, "eval_rewards/rejected": -1.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.25, "eval_logps/rejected": -179.0, "eval_logps/chosen": -5.1875, "eval_logits/rejected": 0.87890625, "eval_logits/chosen": -1.9921875, "eval_nll_loss": 0.2255859375, "epoch": 2.404040404040404, "step": 60}, {"loss": 0.4864105224609375, "grad_norm": 0.1291076581823192, "learning_rate": 2.591967620451707e-06, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095248, "rewards/chosen": 7.875, "rewards/rejected": -3.4375, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -480.0, "logps/chosen": -207.0, "logits/rejected": 0.322265625, "logits/chosen": -0.55078125, "nll_loss": 0.5703125, "epoch": 2.606060606060606, "step": 65}, {"loss": 0.4423820495605469, "grad_norm": 0.1432822314318524, "learning_rate": 2.1329118524827662e-07, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095483, "rewards/chosen": 7.5, "rewards/rejected": -3.0625, "rewards/accuracies": 1.0, "rewards/margins": 10.5625, "logps/rejected": -436.0, "logps/chosen": -278.0, "logits/rejected": -0.10009765625, "logits/chosen": -0.1259765625, "nll_loss": 0.482421875, "epoch": 2.808080808080808, "step": 70}, {"eval_loss": 0.42578125, "eval_runtime": 1.3679, "eval_samples_per_second": 2.924, "eval_steps_per_second": 0.731, "eval_rewards/chosen": 6.75, "eval_rewards/rejected": -1.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.25, "eval_logps/rejected": -179.0, "eval_logps/chosen": -5.15625, "eval_logits/rejected": 0.89453125, "eval_logits/chosen": -1.9765625, "eval_nll_loss": 0.224609375, "epoch": 2.888888888888889, "step": 72}, {"train_runtime": 753.8745, "train_samples_per_second": 1.572, "train_steps_per_second": 0.096, "total_flos": 31145554509824.0, "train_loss": 0.7183185418446859, "epoch": 2.888888888888889, 
"step": 72}], "memory": 42.8984375} diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs/events.out.tfevents.1737961881.kml-task-547024-record-9975763-prod-worker-0.21092.0 b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs/events.out.tfevents.1737961881.kml-task-547024-record-9975763-prod-worker-0.21092.0 new file mode 100644 index 0000000000000000000000000000000000000000..36efed6dc11e1ef3d65fd6b82dc17ce715401753 --- /dev/null +++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs/events.out.tfevents.1737961881.kml-task-547024-record-9975763-prod-worker-0.21092.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f24324580e4cc61a421b16bf7736b01a10b1584ed690487134d262bf24f7ecb +size 23693