diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..06588a503b1e0705854d0c7d4cbe357eda953de3 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +completions.jsonl filter=lfs diff=lfs merge=lfs -text diff --git a/args.json b/args.json new file mode 100644 index 0000000000000000000000000000000000000000..e5fddc18fe4219cc621c4285a0faf4591e553d74 --- /dev/null +++ b/args.json @@ -0,0 +1,416 @@ +{ + "model": "/mnt/nvme5n1p1/distill-14b-rl-70", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "template": "deepseek_r1", + "system": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\\\boxed{}.", + "max_length": 16384, + "truncation_strategy": "left", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "stage2_aime.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 52, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 1.0, + "top_k": 50, + "top_p": 0.9, + "repetition_penalty": 1.1, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": true, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 15.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.1, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 1, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 2.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": true, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 6.0, + "dataloader_num_workers": 52, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "reward", + "greater_is_better": true, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "wandb" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 32, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "zero_hpz_partition_size": null, + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "num_infer_workers": 8, + "vllm_max_num_seqs": 256, + "vllm_enforce_eager": false, + "vllm_limit_mm_per_prompt": null, + "vllm_enable_prefix_caching": true, + "cosine_min_len_value_wrong": 0.0, + "cosine_max_len_value_wrong": -0.2, + "cosine_min_len_value_correct": 0.8, + "cosine_max_len_value_correct": 0.4, + "cosine_max_len": 12288, + "repetition_n_grams": 40, + "repetition_max_penalty": -0.05, + "use_lmdeploy": false, + "lmdeploy_device": "auto", + "lmdeploy_session_len": null, + "lmdeploy_cache_max_entry_count": 0.8, + "async_generate": false, + "tensor_parallel_size": 1, + "sleep_level": 0, + "move_model_batches": null, + "offload_optimizer": false, + "offload_model": false, + "gc_collect_after_offload": false, + "num_generations": 8, + "max_completion_length": 12288, + "ds3_gather_for_generation": true, + "reward_funcs": [ + "cosine", + "repetition" + ], + "reward_weights": null, + "log_completions": true, + "use_vllm": false, + "vllm_device": [ + "auto" + ], + "vllm_gpu_memory_utilization": 0.9, + "vllm_max_model_len": null, + "num_iterations": 2, + "epsilon": 0.2, + "rlhf_type": "grpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.04, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 32, + "local_world_size": 8, + "model_suffix": "distill-14b-rl-70", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/mnt/nvme5n1p1/distill-14b-rl-70', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "hub": "", + "training_args": "GRPOConfig(output_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=15.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs', logging_strategy=, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=, save_steps=2, save_total_limit=100, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=6, dataloader_num_workers=52, dataloader_prefetch_factor=None, past_index=-1, run_name='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='reward', greater_is_better=True, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, max_prompt_length=512, num_generations=8, max_completion_length=12288, ds3_gather_for_generation=True, temperature=1.0, top_p=0.9, top_k=50, min_p=None, repetition_penalty=1.1, cache_implementation=None, use_vllm=False, vllm_server_host='0.0.0.0', vllm_server_port=8000, vllm_server_timeout=120.0, vllm_guided_decoding_regex=None, beta=0.04, num_iterations=2, epsilon=0.2, epsilon_high=None, reward_weights=None, scale_rewards=True, sync_ref_model=False, ref_model_mixup_alpha=0.6, ref_model_sync_steps=512, log_completions=True, vllm_device=['auto'], vllm_gpu_memory_utilization=0.9, vllm_dtype=None, vllm_max_model_len=None, vllm_enable_prefix_caching=True, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None, num_infer_workers=8, vllm_max_num_seqs=256, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, cosine_min_len_value_wrong=0.0, cosine_max_len_value_wrong=-0.2, cosine_min_len_value_correct=0.8, cosine_max_len_value_correct=0.4, cosine_max_len=12288, repetition_n_grams=40, repetition_max_penalty=-0.05, use_lmdeploy=False, lmdeploy_device='auto', lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, async_generate=False, tensor_parallel_size=1, sleep_level=0, move_model_batches=None, offload_optimizer=False, offload_model=False, gc_collect_after_offload=False, stop_words=[])" +} \ No newline at end of file diff --git a/checkpoint-10/README.md b/checkpoint-10/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4dbd19e3cdbcefaa269791fa922afe075b806c8 --- /dev/null +++ b/checkpoint-10/README.md @@ -0,0 +1,202 @@ +--- +base_model: /mnt/nvme5n1p1/distill-14b-rl-70 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-10/adapter_config.json b/checkpoint-10/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8d06e83be137d4af154849ed1686625c42e280 --- /dev/null +++ b/checkpoint-10/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/mnt/nvme5n1p1/distill-14b-rl-70", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-10/adapter_model.safetensors b/checkpoint-10/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3be713918d53bb926b8c6299a078fad707d5e85c --- /dev/null +++ b/checkpoint-10/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb9c1d89154d04a75fc801f9e0bf97a4fd4018d949501d1197194032bd689800 +size 275342392 diff --git a/checkpoint-10/additional_config.json b/checkpoint-10/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/checkpoint-10/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/checkpoint-10/args.json b/checkpoint-10/args.json new file mode 100644 index 0000000000000000000000000000000000000000..e5fddc18fe4219cc621c4285a0faf4591e553d74 --- /dev/null +++ b/checkpoint-10/args.json @@ -0,0 +1,416 @@ +{ + "model": "/mnt/nvme5n1p1/distill-14b-rl-70", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "template": "deepseek_r1", + "system": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\\\boxed{}.", + "max_length": 16384, + "truncation_strategy": "left", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "stage2_aime.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 52, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 1.0, + "top_k": 50, + "top_p": 0.9, + "repetition_penalty": 1.1, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": true, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 15.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.1, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 1, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 2.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": true, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 6.0, + "dataloader_num_workers": 52, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "reward", + "greater_is_better": true, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "wandb" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 32, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "zero_hpz_partition_size": null, + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "num_infer_workers": 8, + "vllm_max_num_seqs": 256, + "vllm_enforce_eager": false, + "vllm_limit_mm_per_prompt": null, + "vllm_enable_prefix_caching": true, + "cosine_min_len_value_wrong": 0.0, + "cosine_max_len_value_wrong": -0.2, + "cosine_min_len_value_correct": 0.8, + "cosine_max_len_value_correct": 0.4, + "cosine_max_len": 12288, + "repetition_n_grams": 40, + "repetition_max_penalty": -0.05, + "use_lmdeploy": false, + "lmdeploy_device": "auto", + "lmdeploy_session_len": null, + "lmdeploy_cache_max_entry_count": 0.8, + "async_generate": false, + "tensor_parallel_size": 1, + "sleep_level": 0, + "move_model_batches": null, + "offload_optimizer": false, + "offload_model": false, + "gc_collect_after_offload": false, + "num_generations": 8, + "max_completion_length": 12288, + "ds3_gather_for_generation": true, + "reward_funcs": [ + "cosine", + "repetition" + ], + "reward_weights": null, + "log_completions": true, + "use_vllm": false, + "vllm_device": [ + "auto" + ], + "vllm_gpu_memory_utilization": 0.9, + "vllm_max_model_len": null, + "num_iterations": 2, + "epsilon": 0.2, + "rlhf_type": "grpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.04, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 32, + "local_world_size": 8, + "model_suffix": "distill-14b-rl-70", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/mnt/nvme5n1p1/distill-14b-rl-70', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "hub": "", + "training_args": "GRPOConfig(output_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=15.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs', logging_strategy=, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=, save_steps=2, save_total_limit=100, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=6, dataloader_num_workers=52, dataloader_prefetch_factor=None, past_index=-1, run_name='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='reward', greater_is_better=True, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, max_prompt_length=512, num_generations=8, max_completion_length=12288, ds3_gather_for_generation=True, temperature=1.0, top_p=0.9, top_k=50, min_p=None, repetition_penalty=1.1, cache_implementation=None, use_vllm=False, vllm_server_host='0.0.0.0', vllm_server_port=8000, vllm_server_timeout=120.0, vllm_guided_decoding_regex=None, beta=0.04, num_iterations=2, epsilon=0.2, epsilon_high=None, reward_weights=None, scale_rewards=True, sync_ref_model=False, ref_model_mixup_alpha=0.6, ref_model_sync_steps=512, log_completions=True, vllm_device=['auto'], vllm_gpu_memory_utilization=0.9, vllm_dtype=None, vllm_max_model_len=None, vllm_enable_prefix_caching=True, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None, num_infer_workers=8, vllm_max_num_seqs=256, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, cosine_min_len_value_wrong=0.0, cosine_max_len_value_wrong=-0.2, cosine_min_len_value_correct=0.8, cosine_max_len_value_correct=0.4, cosine_max_len=12288, repetition_n_grams=40, repetition_max_penalty=-0.05, use_lmdeploy=False, lmdeploy_device='auto', lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, async_generate=False, tensor_parallel_size=1, sleep_level=0, move_model_batches=None, offload_optimizer=False, offload_model=False, gc_collect_after_offload=False, stop_words=[])" +} \ No newline at end of file diff --git a/checkpoint-10/global_step10/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-10/global_step10/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b12928e478eb3549811c17b4e046a7a85c55bf69 --- /dev/null +++ b/checkpoint-10/global_step10/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afa6e1268e36f299bdfa10673f61defdcc132d6cb00a4ca590af89de20368f7f +size 51616517 diff --git a/checkpoint-10/global_step10/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-10/global_step10/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8154686dd369d32f7907499a662531d2bbf92620 --- /dev/null +++ b/checkpoint-10/global_step10/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ed3a1801f1d44eb9be80aa13b575a6b788b187af6aabc5f38bb11d766bc197b +size 51616005 diff --git a/checkpoint-10/global_step10/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-10/global_step10/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cdf810f954484329d403c11aa1d1054101016de9 --- /dev/null +++ b/checkpoint-10/global_step10/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42427473761fee2d850c05a1ddbffa24fe26783d8128b39836a939c04e24707b +size 51616517 diff --git a/checkpoint-10/global_step10/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-10/global_step10/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a58ef046b68f9d4a3fabdc64542eb40d8bcbff8 --- /dev/null +++ b/checkpoint-10/global_step10/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5f638f8cd911d925fde79d595f0f66fc2c44eee5603754dbb4c5d648988a073 +size 51616005 diff --git a/checkpoint-10/global_step10/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-10/global_step10/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c64abf557d8bd0adad8df0bd18a9371bcbebe75 --- /dev/null +++ b/checkpoint-10/global_step10/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5302da6d9fbfc46ccbdfd28167634bb5554ba4f8fcb33c984103e7d98ff05a10 +size 51616517 diff --git a/checkpoint-10/global_step10/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-10/global_step10/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f370143e1cfb106a831ac7fcd08b894bfb01900f --- /dev/null +++ b/checkpoint-10/global_step10/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d24c33fc4aa2e1d8bd03d4cd3a7cf120c1835148fe7bc025ed0b9deeec98757b +size 51616005 diff --git a/checkpoint-10/global_step10/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-10/global_step10/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..90fe59b9cf5c9a19e4d97f7601a63230c9c6aaf2 --- /dev/null +++ b/checkpoint-10/global_step10/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f31a928a0aa371e667b6187bc36d734936ce57bc9ffe9ba6d4cc3a8f45729d7a +size 51616517 diff --git a/checkpoint-10/global_step10/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-10/global_step10/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7e088b80b952d282d0f5da66d83234146e4cdef --- /dev/null +++ b/checkpoint-10/global_step10/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09ee34677203b97443ac3e21d6933c8d2ed0553d98a9ff65b629b5806df9a9fe +size 51616005 diff --git a/checkpoint-10/global_step10/mp_rank_00_model_states.pt b/checkpoint-10/global_step10/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68aec567f86a66b7eccd2e8e98119fabc644b0b1 --- /dev/null +++ b/checkpoint-10/global_step10/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83944ba3b620a1ac2db019d37a3e1360d49c2e0fdd614fa3161f2d71b9b4798c +size 275768601 diff --git a/checkpoint-10/latest b/checkpoint-10/latest new file mode 100644 index 0000000000000000000000000000000000000000..e23122de54baa1dd9f514f25b8a62c6026b72e10 --- /dev/null +++ b/checkpoint-10/latest @@ -0,0 +1 @@ +global_step10 \ No newline at end of file diff --git a/checkpoint-10/rng_state_0.pth b/checkpoint-10/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..edc331168ae2661669e0ffcf76a36773ac09bcf6 --- /dev/null +++ b/checkpoint-10/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfdf928f4078946884211d832c8499ac8934ee401e75a97f57691538634636e7 +size 16389 diff --git a/checkpoint-10/rng_state_1.pth b/checkpoint-10/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..305eb06567d841626f173d1ca692c8ba2a95fb3f --- /dev/null +++ b/checkpoint-10/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:988b2d027d6ce2685233cdd6cf67e9e5cfed222a3829f6f72513877f37c5671f +size 16389 diff --git a/checkpoint-10/rng_state_2.pth b/checkpoint-10/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..664ee7e30f8c079d8f2c8c4d8464fcb8eef71c01 --- /dev/null +++ b/checkpoint-10/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50f058984c1e9796e062208ce8fc16ad8f9f1ccafbe7ca1b6ca94bf64bc6b649 +size 16389 diff --git a/checkpoint-10/rng_state_3.pth b/checkpoint-10/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..e8d7333a2536a61de0d21d1420a3f5fd19899ccd --- /dev/null +++ b/checkpoint-10/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15349681570ba97fc625c4606a245d9b3a99ef7a0f34998499035a805d1b6460 +size 16389 diff --git a/checkpoint-10/rng_state_4.pth b/checkpoint-10/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..61ee79515c356f882a46ad5683b36881630effc4 --- /dev/null +++ b/checkpoint-10/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b60af00b96ce89b043a2396cabfd858ae73643c96d9ff85357fc011481070849 +size 16389 diff --git a/checkpoint-10/rng_state_5.pth b/checkpoint-10/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..3cb1cf71df2613daf7fc079c3841189f113a2175 --- /dev/null +++ b/checkpoint-10/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:880a3f5ec515f3a7c413b1d278292242d57086cc621ca12e229fdf5fdc46a5b0 +size 16325 diff --git a/checkpoint-10/rng_state_6.pth b/checkpoint-10/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..7f2f8ba27a9c3d9de139c8c838b75d980a9ad90b --- /dev/null +++ b/checkpoint-10/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afe4d86e50bcf5e3eb06e17756fdcb7429aeb38f6b74a7708e8bc434f4d9fd7b +size 16389 diff --git a/checkpoint-10/rng_state_7.pth b/checkpoint-10/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..783a86d734863089dbe52135e62028a6e43896f9 --- /dev/null +++ b/checkpoint-10/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b14b98013e3ccaa40f1b1f75ce8ba32fa9dde6bf4dea0608a5fad673e5c8aaf8 +size 16453 diff --git a/checkpoint-10/scheduler.pt b/checkpoint-10/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..50f8c42e4b112eaa8d87727c25cf7f83fefd481d --- /dev/null +++ b/checkpoint-10/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6063c6570a3a985c4cbd3f42a6dc1cf20aa828fd54a063513ff8d12c0f13e951 +size 1401 diff --git a/checkpoint-10/trainer_state.json b/checkpoint-10/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d9543360277532e9321b29edfc3279b65f6493cb --- /dev/null +++ b/checkpoint-10/trainer_state.json @@ -0,0 +1,187 @@ +{ + "best_metric": 0.012996690347790718, + "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-6", + "epoch": 2.4210526315789473, + "eval_steps": 6, + "global_step": 10, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 10352.974609375, + "epoch": 0.21052631578947367, + "grad_norm": 0.13259537518024445, + "kl": 0.0, + "learning_rate": 1.6666666666666667e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.11328125, + "reward": -0.002658387296833098, + "reward_std": 0.06134121119976044, + "rewards/CosineReward": -0.0026579967816360295, + "rewards/RepetitionPenalty": -3.8975886695880035e-07, + "step": 1, + "train_speed(iter/s)": 0.000242 + }, + { + "clip_ratio": 0.0, + "epoch": 0.42105263157894735, + "grad_norm": 0.1320001482963562, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "step": 2, + "train_speed(iter/s)": 0.000467 + }, + { + "clip_ratio": 1.3441811461234465e-05, + "completion_length": 10439.369140625, + "epoch": 0.631578947368421, + "grad_norm": 0.08990391343832016, + "kl": 9.50181856751442e-07, + "learning_rate": 5e-05, + "loss": -0.06604708731174469, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.13671875, + "reward": 0.0006296975770965219, + "reward_std": 0.07172460854053497, + "rewards/CosineReward": 0.0006298604130279273, + "rewards/RepetitionPenalty": -1.6200439745261974e-07, + "step": 3, + "train_speed(iter/s)": 0.00035 + }, + { + "clip_ratio": 1.70210253145342e-05, + "epoch": 0.8421052631578947, + "grad_norm": 0.0967094898223877, + "kl": 1.1101365089416504e-05, + "learning_rate": 6.666666666666667e-05, + "loss": -0.06727766245603561, + "memory(GiB)": 182.91, + "step": 4, + "train_speed(iter/s)": 0.000458 + }, + { + "clip_ratio": 1.675608473306056e-05, + "completion_length": 10092.408203125, + "epoch": 1.2105263157894737, + "grad_norm": 0.142837256193161, + "kl": 0.00017762184143066406, + "learning_rate": 8.333333333333334e-05, + "loss": -0.09315311908721924, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.119140625, + "reward": -0.005135859013535082, + "reward_std": 0.07994875870645046, + "rewards/CosineReward": -0.005134060338605195, + "rewards/RepetitionPenalty": -1.7973881654143042e-06, + "step": 5, + "train_speed(iter/s)": 0.000387 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.18263348937034607, + "learning_rate": 0.0001, + "loss": -0.1041698157787323, + "memory(GiB)": 182.91, + "step": 6, + "train_speed(iter/s)": 0.000459 + }, + { + "epoch": 1.4210526315789473, + "eval_clip_ratio": 4.069424539920874e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.04833984375, + "eval_loss": -0.5377416610717773, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.012996690347790718, + "eval_reward_std": 0.08769983053207397, + "eval_rewards/CosineReward": 0.012996694073081017, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1030.1127, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 6 + }, + { + "clip_ratio": 0.0005237623976199757, + "completion_length": 10448.94921875, + "epoch": 1.631578947368421, + "grad_norm": 0.1291271299123764, + "kl": 0.017406463623046875, + "learning_rate": 9.991540791356342e-05, + "loss": -0.051375165581703186, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.1484375, + "reward": 0.004909618757665157, + "reward_std": 0.08167182095348835, + "rewards/CosineReward": 0.004909833543933928, + "rewards/RepetitionPenalty": -2.1478646772266075e-07, + "step": 7, + "train_speed(iter/s)": 0.000382 + }, + { + "clip_ratio": 0.1706484742462635, + "epoch": 1.8421052631578947, + "grad_norm": 0.26641014218330383, + "kl": 0.089599609375, + "learning_rate": 9.966191788709716e-05, + "loss": -0.05105742812156677, + "memory(GiB)": 182.91, + "step": 8, + "train_speed(iter/s)": 0.000433 + }, + { + "clip_ratio": 9.482144946559856e-06, + "completion_length": 10432.384765625, + "epoch": 2.2105263157894735, + "grad_norm": 0.10375155508518219, + "kl": 0.0963134765625, + "learning_rate": 9.924038765061042e-05, + "loss": -0.05842069163918495, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.255859375, + "reward": 0.03643610421568155, + "reward_std": 0.11898956261575222, + "rewards/CosineReward": 0.03643618477508426, + "rewards/RepetitionPenalty": -7.898860587829404e-08, + "step": 9, + "train_speed(iter/s)": 0.000396 + }, + { + "clip_ratio": 0.0036088433116674423, + "epoch": 2.4210526315789473, + "grad_norm": 0.09477333724498749, + "kl": 0.1185302734375, + "learning_rate": 9.865224352899119e-05, + "loss": -0.06491819024085999, + "memory(GiB)": 182.91, + "step": 10, + "train_speed(iter/s)": 0.000436 + } + ], + "logging_steps": 1, + "max_steps": 60, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 2, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-10/training_args.bin b/checkpoint-10/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a5c75ad76398d7f403ccc1a74c463d2dab6465ca --- /dev/null +++ b/checkpoint-10/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb +size 9809 diff --git a/checkpoint-10/zero_to_fp32.py b/checkpoint-10/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-10/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-12/README.md b/checkpoint-12/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4dbd19e3cdbcefaa269791fa922afe075b806c8 --- /dev/null +++ b/checkpoint-12/README.md @@ -0,0 +1,202 @@ +--- +base_model: /mnt/nvme5n1p1/distill-14b-rl-70 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-12/adapter_config.json b/checkpoint-12/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8d06e83be137d4af154849ed1686625c42e280 --- /dev/null +++ b/checkpoint-12/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/mnt/nvme5n1p1/distill-14b-rl-70", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-12/adapter_model.safetensors b/checkpoint-12/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2bb30c368387a514cfa3cc3dd25fa903b3a890c1 --- /dev/null +++ b/checkpoint-12/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf7cbe4a10c8d793f698bc973dabf6d9c7113f5ecaf17792b30fadc884e3e228 +size 275342392 diff --git a/checkpoint-12/additional_config.json b/checkpoint-12/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/checkpoint-12/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/checkpoint-12/args.json b/checkpoint-12/args.json new file mode 100644 index 0000000000000000000000000000000000000000..e5fddc18fe4219cc621c4285a0faf4591e553d74 --- /dev/null +++ b/checkpoint-12/args.json @@ -0,0 +1,416 @@ +{ + "model": "/mnt/nvme5n1p1/distill-14b-rl-70", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "template": "deepseek_r1", + "system": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\\\boxed{}.", + "max_length": 16384, + "truncation_strategy": "left", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "stage2_aime.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 52, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 1.0, + "top_k": 50, + "top_p": 0.9, + "repetition_penalty": 1.1, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": true, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 15.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.1, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 1, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 2.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": true, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 6.0, + "dataloader_num_workers": 52, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "reward", + "greater_is_better": true, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "wandb" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 32, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "zero_hpz_partition_size": null, + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "num_infer_workers": 8, + "vllm_max_num_seqs": 256, + "vllm_enforce_eager": false, + "vllm_limit_mm_per_prompt": null, + "vllm_enable_prefix_caching": true, + "cosine_min_len_value_wrong": 0.0, + "cosine_max_len_value_wrong": -0.2, + "cosine_min_len_value_correct": 0.8, + "cosine_max_len_value_correct": 0.4, + "cosine_max_len": 12288, + "repetition_n_grams": 40, + "repetition_max_penalty": -0.05, + "use_lmdeploy": false, + "lmdeploy_device": "auto", + "lmdeploy_session_len": null, + "lmdeploy_cache_max_entry_count": 0.8, + "async_generate": false, + "tensor_parallel_size": 1, + "sleep_level": 0, + "move_model_batches": null, + "offload_optimizer": false, + "offload_model": false, + "gc_collect_after_offload": false, + "num_generations": 8, + "max_completion_length": 12288, + "ds3_gather_for_generation": true, + "reward_funcs": [ + "cosine", + "repetition" + ], + "reward_weights": null, + "log_completions": true, + "use_vllm": false, + "vllm_device": [ + "auto" + ], + "vllm_gpu_memory_utilization": 0.9, + "vllm_max_model_len": null, + "num_iterations": 2, + "epsilon": 0.2, + "rlhf_type": "grpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.04, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 32, + "local_world_size": 8, + "model_suffix": "distill-14b-rl-70", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/mnt/nvme5n1p1/distill-14b-rl-70', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "hub": "", + "training_args": "GRPOConfig(output_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=15.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs', logging_strategy=, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=, save_steps=2, save_total_limit=100, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=6, dataloader_num_workers=52, dataloader_prefetch_factor=None, past_index=-1, run_name='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='reward', greater_is_better=True, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, max_prompt_length=512, num_generations=8, max_completion_length=12288, ds3_gather_for_generation=True, temperature=1.0, top_p=0.9, top_k=50, min_p=None, repetition_penalty=1.1, cache_implementation=None, use_vllm=False, vllm_server_host='0.0.0.0', vllm_server_port=8000, vllm_server_timeout=120.0, vllm_guided_decoding_regex=None, beta=0.04, num_iterations=2, epsilon=0.2, epsilon_high=None, reward_weights=None, scale_rewards=True, sync_ref_model=False, ref_model_mixup_alpha=0.6, ref_model_sync_steps=512, log_completions=True, vllm_device=['auto'], vllm_gpu_memory_utilization=0.9, vllm_dtype=None, vllm_max_model_len=None, vllm_enable_prefix_caching=True, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None, num_infer_workers=8, vllm_max_num_seqs=256, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, cosine_min_len_value_wrong=0.0, cosine_max_len_value_wrong=-0.2, cosine_min_len_value_correct=0.8, cosine_max_len_value_correct=0.4, cosine_max_len=12288, repetition_n_grams=40, repetition_max_penalty=-0.05, use_lmdeploy=False, lmdeploy_device='auto', lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, async_generate=False, tensor_parallel_size=1, sleep_level=0, move_model_batches=None, offload_optimizer=False, offload_model=False, gc_collect_after_offload=False, stop_words=[])" +} \ No newline at end of file diff --git a/checkpoint-12/global_step12/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-12/global_step12/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8bd4b8c129bfb8839e08831c42dac1b80b14b38 --- /dev/null +++ b/checkpoint-12/global_step12/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfd0a151d42c25f8ad6ae407ab16cbe0c6c3da97f410c29982c06f024431ac28 +size 51616517 diff --git a/checkpoint-12/global_step12/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-12/global_step12/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d92bbc9a0ae029aab4ace05d7ddebac71befa83 --- /dev/null +++ b/checkpoint-12/global_step12/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8be84b5a29e3af0e0330375875affb5995022790020cea19934d6b849d50baf0 +size 51616005 diff --git a/checkpoint-12/global_step12/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-12/global_step12/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fae3700e1f6a6afd37e266df1e11eee92a7dddef --- /dev/null +++ b/checkpoint-12/global_step12/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb6eb7e8b571add24017ce3b2199a41fd761562ecce465bc20b410428c4b6703 +size 51616517 diff --git a/checkpoint-12/global_step12/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-12/global_step12/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fed9985fe9e8ee448f821252c05bd4663093ff8 --- /dev/null +++ b/checkpoint-12/global_step12/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b73619a38e4bd2f96a4717a36686cbaa5fdb73f695a75fce1d69754cdd39a8f7 +size 51616005 diff --git a/checkpoint-12/global_step12/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-12/global_step12/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c7cfae571c2d6d1bd3f8a088171ecf93585cae2 --- /dev/null +++ b/checkpoint-12/global_step12/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:152252a1c2c0003269de721bba1b6880e9fcca73fb2d496a7ece9c77279ad827 +size 51616517 diff --git a/checkpoint-12/global_step12/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-12/global_step12/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0aa70f1356c8e6426507320e6a61bc4c30ddf6c5 --- /dev/null +++ b/checkpoint-12/global_step12/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50f6f3543ebf41a13a81cdec14296d42b56c063db6ca610111cef2a923866a7d +size 51616005 diff --git a/checkpoint-12/global_step12/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-12/global_step12/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9d4a369f1809d37636db127be0953410f3de3ee --- /dev/null +++ b/checkpoint-12/global_step12/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cf3d6792c4461b67897f49232c96d520f6416c29ff44e46e075b07fd7dd8142 +size 51616517 diff --git a/checkpoint-12/global_step12/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-12/global_step12/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d0504c5a4137fdd96a81549768ce2a0e157cfff --- /dev/null +++ b/checkpoint-12/global_step12/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3047a009cbca6b77c980781e9eba01c47066f9d7dc1e015bc086228b1ed0c86 +size 51616005 diff --git a/checkpoint-12/global_step12/mp_rank_00_model_states.pt b/checkpoint-12/global_step12/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b96e35a74145d094203fde4aa0ec9290428577c3 --- /dev/null +++ b/checkpoint-12/global_step12/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3f112411932eb1b212f9e8206ccd5f54e8c8ab53638be18ae51c7bc4e907640 +size 275768601 diff --git a/checkpoint-12/latest b/checkpoint-12/latest new file mode 100644 index 0000000000000000000000000000000000000000..c83343c42b3f49cdb62181b171bac8d8520bc1f5 --- /dev/null +++ b/checkpoint-12/latest @@ -0,0 +1 @@ +global_step12 \ No newline at end of file diff --git a/checkpoint-12/rng_state_0.pth b/checkpoint-12/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..7e8348543986d48ecf2e726a1e1b2c8f1728e098 --- /dev/null +++ b/checkpoint-12/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6723582fe5cbe8430aa3ddc5b2155347e1e9c4fae19aaf45277d607277303c4 +size 16389 diff --git a/checkpoint-12/rng_state_1.pth b/checkpoint-12/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..785733d79b5ac72c3f663cee6c70439dd690aad6 --- /dev/null +++ b/checkpoint-12/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efb4ed0769e8275023eae5dba809266df2ad8caf8b67d328569b34b6622f381e +size 16389 diff --git a/checkpoint-12/rng_state_2.pth b/checkpoint-12/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..c97df4bd8bfd66e198296e80c978af7f67b57145 --- /dev/null +++ b/checkpoint-12/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e77ffd6c06075db3f422331c7b0849d5f9372186bcb1a54d36e5b11b232433c0 +size 16389 diff --git a/checkpoint-12/rng_state_3.pth b/checkpoint-12/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c24098a99d45d93269a0fe255ee1e934a7de2680 --- /dev/null +++ b/checkpoint-12/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5c929e790c269b7966bd4f0f60b66086394acb31df23d60ef73a083e5c29020 +size 16389 diff --git a/checkpoint-12/rng_state_4.pth b/checkpoint-12/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..c5115a0362a864acfed53ccd0dbfd597f0d7887c --- /dev/null +++ b/checkpoint-12/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:281a53b6873813e4ee2b057e2568675989e06828aab6175ffd69916c9cc695c2 +size 16389 diff --git a/checkpoint-12/rng_state_5.pth b/checkpoint-12/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..caa0e9f41b55dd7dc180b112bfe285afb9521c59 --- /dev/null +++ b/checkpoint-12/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfce7b2a1fbdc105f180a8e60b02644db8c4c82244a8d31856246b658ead7850 +size 16325 diff --git a/checkpoint-12/rng_state_6.pth b/checkpoint-12/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ce3c45439a0ed0c8ed0c9a0fcc2a3a0ac252902 --- /dev/null +++ b/checkpoint-12/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05604ca09a09b4637d0be7c4fc0864122be7370475ccc96a9c85bfb5cfa9fedd +size 16389 diff --git a/checkpoint-12/rng_state_7.pth b/checkpoint-12/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..b58afa5b03069f7d7aa89246e4f9a9a72bcb3835 --- /dev/null +++ b/checkpoint-12/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f37877eb87038c4ca9a441122bdae8465bbd3158880ad8f48486ea14166645d +size 16453 diff --git a/checkpoint-12/scheduler.pt b/checkpoint-12/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..988ef78229da967c25c1b98623c0850f90029a11 --- /dev/null +++ b/checkpoint-12/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cf0072c59fa85f221fa9ad7281bbc53d2d16de177affaaee748baedf21bd26c +size 1401 diff --git a/checkpoint-12/trainer_state.json b/checkpoint-12/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..63af88b38df0307c22be3faf58351ed68fccbb37 --- /dev/null +++ b/checkpoint-12/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.03234308212995529, + "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-12", + "epoch": 2.8421052631578947, + "eval_steps": 6, + "global_step": 12, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 10352.974609375, + "epoch": 0.21052631578947367, + "grad_norm": 0.13259537518024445, + "kl": 0.0, + "learning_rate": 1.6666666666666667e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.11328125, + "reward": -0.002658387296833098, + "reward_std": 0.06134121119976044, + "rewards/CosineReward": -0.0026579967816360295, + "rewards/RepetitionPenalty": -3.8975886695880035e-07, + "step": 1, + "train_speed(iter/s)": 0.000242 + }, + { + "clip_ratio": 0.0, + "epoch": 0.42105263157894735, + "grad_norm": 0.1320001482963562, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "step": 2, + "train_speed(iter/s)": 0.000467 + }, + { + "clip_ratio": 1.3441811461234465e-05, + "completion_length": 10439.369140625, + "epoch": 0.631578947368421, + "grad_norm": 0.08990391343832016, + "kl": 9.50181856751442e-07, + "learning_rate": 5e-05, + "loss": -0.06604708731174469, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.13671875, + "reward": 0.0006296975770965219, + "reward_std": 0.07172460854053497, + "rewards/CosineReward": 0.0006298604130279273, + "rewards/RepetitionPenalty": -1.6200439745261974e-07, + "step": 3, + "train_speed(iter/s)": 0.00035 + }, + { + "clip_ratio": 1.70210253145342e-05, + "epoch": 0.8421052631578947, + "grad_norm": 0.0967094898223877, + "kl": 1.1101365089416504e-05, + "learning_rate": 6.666666666666667e-05, + "loss": -0.06727766245603561, + "memory(GiB)": 182.91, + "step": 4, + "train_speed(iter/s)": 0.000458 + }, + { + "clip_ratio": 1.675608473306056e-05, + "completion_length": 10092.408203125, + "epoch": 1.2105263157894737, + "grad_norm": 0.142837256193161, + "kl": 0.00017762184143066406, + "learning_rate": 8.333333333333334e-05, + "loss": -0.09315311908721924, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.119140625, + "reward": -0.005135859013535082, + "reward_std": 0.07994875870645046, + "rewards/CosineReward": -0.005134060338605195, + "rewards/RepetitionPenalty": -1.7973881654143042e-06, + "step": 5, + "train_speed(iter/s)": 0.000387 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.18263348937034607, + "learning_rate": 0.0001, + "loss": -0.1041698157787323, + "memory(GiB)": 182.91, + "step": 6, + "train_speed(iter/s)": 0.000459 + }, + { + "epoch": 1.4210526315789473, + "eval_clip_ratio": 4.069424539920874e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.04833984375, + "eval_loss": -0.5377416610717773, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.012996690347790718, + "eval_reward_std": 0.08769983053207397, + "eval_rewards/CosineReward": 0.012996694073081017, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1030.1127, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 6 + }, + { + "clip_ratio": 0.0005237623976199757, + "completion_length": 10448.94921875, + "epoch": 1.631578947368421, + "grad_norm": 0.1291271299123764, + "kl": 0.017406463623046875, + "learning_rate": 9.991540791356342e-05, + "loss": -0.051375165581703186, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.1484375, + "reward": 0.004909618757665157, + "reward_std": 0.08167182095348835, + "rewards/CosineReward": 0.004909833543933928, + "rewards/RepetitionPenalty": -2.1478646772266075e-07, + "step": 7, + "train_speed(iter/s)": 0.000382 + }, + { + "clip_ratio": 0.1706484742462635, + "epoch": 1.8421052631578947, + "grad_norm": 0.26641014218330383, + "kl": 0.089599609375, + "learning_rate": 9.966191788709716e-05, + "loss": -0.05105742812156677, + "memory(GiB)": 182.91, + "step": 8, + "train_speed(iter/s)": 0.000433 + }, + { + "clip_ratio": 9.482144946559856e-06, + "completion_length": 10432.384765625, + "epoch": 2.2105263157894735, + "grad_norm": 0.10375155508518219, + "kl": 0.0963134765625, + "learning_rate": 9.924038765061042e-05, + "loss": -0.05842069163918495, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.255859375, + "reward": 0.03643610421568155, + "reward_std": 0.11898956261575222, + "rewards/CosineReward": 0.03643618477508426, + "rewards/RepetitionPenalty": -7.898860587829404e-08, + "step": 9, + "train_speed(iter/s)": 0.000396 + }, + { + "clip_ratio": 0.0036088433116674423, + "epoch": 2.4210526315789473, + "grad_norm": 0.09477333724498749, + "kl": 0.1185302734375, + "learning_rate": 9.865224352899119e-05, + "loss": -0.06491819024085999, + "memory(GiB)": 182.91, + "step": 10, + "train_speed(iter/s)": 0.000436 + }, + { + "clip_ratio": 1.2955343891007942e-05, + "completion_length": 10559.296875, + "epoch": 2.6315789473684212, + "grad_norm": 0.06739140301942825, + "kl": 0.1275634765625, + "learning_rate": 9.789947561577445e-05, + "loss": -0.04600231721997261, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.361328125, + "reward": 0.023204635945148766, + "reward_std": 0.10593634657561779, + "rewards/CosineReward": 0.02320496749598533, + "rewards/RepetitionPenalty": -3.3051759373847744e-07, + "step": 11, + "train_speed(iter/s)": 0.000405 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.05781339108943939, + "learning_rate": 9.698463103929542e-05, + "loss": -0.05069056898355484, + "memory(GiB)": 182.91, + "step": 12, + "train_speed(iter/s)": 0.000439 + }, + { + "epoch": 2.8421052631578947, + "eval_clip_ratio": 4.392032860778272e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.2275390625, + "eval_loss": 0.17524278163909912, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.03234308212995529, + "eval_reward_std": 0.10685288906097412, + "eval_rewards/CosineReward": 0.03234308212995529, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1025.9041, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 12 + } + ], + "logging_steps": 1, + "max_steps": 60, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 2, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12/training_args.bin b/checkpoint-12/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a5c75ad76398d7f403ccc1a74c463d2dab6465ca --- /dev/null +++ b/checkpoint-12/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb +size 9809 diff --git a/checkpoint-12/zero_to_fp32.py b/checkpoint-12/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-12/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-14/README.md b/checkpoint-14/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4dbd19e3cdbcefaa269791fa922afe075b806c8 --- /dev/null +++ b/checkpoint-14/README.md @@ -0,0 +1,202 @@ +--- +base_model: /mnt/nvme5n1p1/distill-14b-rl-70 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-14/adapter_config.json b/checkpoint-14/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8d06e83be137d4af154849ed1686625c42e280 --- /dev/null +++ b/checkpoint-14/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/mnt/nvme5n1p1/distill-14b-rl-70", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-14/adapter_model.safetensors b/checkpoint-14/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a61e1750b2a2ca76348417309a5dca0bc0de1ac0 --- /dev/null +++ b/checkpoint-14/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4db52982143372a40b34927af295a167e3e78b81592e256906af4c611ec879e7 +size 275342392 diff --git a/checkpoint-14/additional_config.json b/checkpoint-14/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/checkpoint-14/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/checkpoint-14/args.json b/checkpoint-14/args.json new file mode 100644 index 0000000000000000000000000000000000000000..e5fddc18fe4219cc621c4285a0faf4591e553d74 --- /dev/null +++ b/checkpoint-14/args.json @@ -0,0 +1,416 @@ +{ + "model": "/mnt/nvme5n1p1/distill-14b-rl-70", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "template": "deepseek_r1", + "system": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\\\boxed{}.", + "max_length": 16384, + "truncation_strategy": "left", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "stage2_aime.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 52, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 1.0, + "top_k": 50, + "top_p": 0.9, + "repetition_penalty": 1.1, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": true, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 15.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.1, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 1, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 2.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": true, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 6.0, + "dataloader_num_workers": 52, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "reward", + "greater_is_better": true, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "wandb" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 32, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "zero_hpz_partition_size": null, + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "num_infer_workers": 8, + "vllm_max_num_seqs": 256, + "vllm_enforce_eager": false, + "vllm_limit_mm_per_prompt": null, + "vllm_enable_prefix_caching": true, + "cosine_min_len_value_wrong": 0.0, + "cosine_max_len_value_wrong": -0.2, + "cosine_min_len_value_correct": 0.8, + "cosine_max_len_value_correct": 0.4, + "cosine_max_len": 12288, + "repetition_n_grams": 40, + "repetition_max_penalty": -0.05, + "use_lmdeploy": false, + "lmdeploy_device": "auto", + "lmdeploy_session_len": null, + "lmdeploy_cache_max_entry_count": 0.8, + "async_generate": false, + "tensor_parallel_size": 1, + "sleep_level": 0, + "move_model_batches": null, + "offload_optimizer": false, + "offload_model": false, + "gc_collect_after_offload": false, + "num_generations": 8, + "max_completion_length": 12288, + "ds3_gather_for_generation": true, + "reward_funcs": [ + "cosine", + "repetition" + ], + "reward_weights": null, + "log_completions": true, + "use_vllm": false, + "vllm_device": [ + "auto" + ], + "vllm_gpu_memory_utilization": 0.9, + "vllm_max_model_len": null, + "num_iterations": 2, + "epsilon": 0.2, + "rlhf_type": "grpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.04, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 32, + "local_world_size": 8, + "model_suffix": "distill-14b-rl-70", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/mnt/nvme5n1p1/distill-14b-rl-70', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "hub": "", + "training_args": "GRPOConfig(output_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=15.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs', logging_strategy=, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=, save_steps=2, save_total_limit=100, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=6, dataloader_num_workers=52, dataloader_prefetch_factor=None, past_index=-1, run_name='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='reward', greater_is_better=True, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, max_prompt_length=512, num_generations=8, max_completion_length=12288, ds3_gather_for_generation=True, temperature=1.0, top_p=0.9, top_k=50, min_p=None, repetition_penalty=1.1, cache_implementation=None, use_vllm=False, vllm_server_host='0.0.0.0', vllm_server_port=8000, vllm_server_timeout=120.0, vllm_guided_decoding_regex=None, beta=0.04, num_iterations=2, epsilon=0.2, epsilon_high=None, reward_weights=None, scale_rewards=True, sync_ref_model=False, ref_model_mixup_alpha=0.6, ref_model_sync_steps=512, log_completions=True, vllm_device=['auto'], vllm_gpu_memory_utilization=0.9, vllm_dtype=None, vllm_max_model_len=None, vllm_enable_prefix_caching=True, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None, num_infer_workers=8, vllm_max_num_seqs=256, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, cosine_min_len_value_wrong=0.0, cosine_max_len_value_wrong=-0.2, cosine_min_len_value_correct=0.8, cosine_max_len_value_correct=0.4, cosine_max_len=12288, repetition_n_grams=40, repetition_max_penalty=-0.05, use_lmdeploy=False, lmdeploy_device='auto', lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, async_generate=False, tensor_parallel_size=1, sleep_level=0, move_model_batches=None, offload_optimizer=False, offload_model=False, gc_collect_after_offload=False, stop_words=[])" +} \ No newline at end of file diff --git a/checkpoint-14/global_step14/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-14/global_step14/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ced3fc1e08c41c188e4c646b903931b0c5acbe50 --- /dev/null +++ b/checkpoint-14/global_step14/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cb93290a7c492521f45f9a6039d2a19afb2bdcafc94829f0c11e2715a771818 +size 51616517 diff --git a/checkpoint-14/global_step14/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-14/global_step14/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa65c42129b5c8ad15ca905d7af8c3b7110829e2 --- /dev/null +++ b/checkpoint-14/global_step14/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d00182cdd61e7dbb77057bea0424d842ffb572c2ea79675f14b8db06a574c27c +size 51616005 diff --git a/checkpoint-14/global_step14/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-14/global_step14/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78f75c8a07ea82c7f2b8e83015af25e0ab43a9c0 --- /dev/null +++ b/checkpoint-14/global_step14/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d14bc327f195f57966877c0f7b0ada86f80ccb877d623dc400126ff35b3b53b +size 51616517 diff --git a/checkpoint-14/global_step14/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-14/global_step14/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d54b00846aa4d7a14b0bfabe453dd077c8d19cd8 --- /dev/null +++ b/checkpoint-14/global_step14/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c81c187b2f072bef91bd997ff397c41ffd517f1787cc3ae3b904c1ba94ad2ff +size 51616005 diff --git a/checkpoint-14/global_step14/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-14/global_step14/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88066d3da5839e58e901c075b03b79725e531e4e --- /dev/null +++ b/checkpoint-14/global_step14/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3352ebc328f3ba2f30c5938993beefbf9515681001ca10ffa2c01a7cc9ddd88c +size 51616517 diff --git a/checkpoint-14/global_step14/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-14/global_step14/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bf041639c2a8f4c78e5f323659e6defdfafcf4c --- /dev/null +++ b/checkpoint-14/global_step14/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9de966b67ce0032b3d341180cf37af189df1ae1b14ef15e9bfb2f47473ca741 +size 51616005 diff --git a/checkpoint-14/global_step14/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-14/global_step14/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1880232ea9a8d908390b15fcf8e38f6d32e7e073 --- /dev/null +++ b/checkpoint-14/global_step14/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54f2b9c370f308055fb095f91868a8878c1619000183595b676cbac3af10f505 +size 51616517 diff --git a/checkpoint-14/global_step14/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-14/global_step14/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d9d93d7c64e742405ca54b5d5f43c00fa4dc718 --- /dev/null +++ b/checkpoint-14/global_step14/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e458003c5cb239cba4d660f554abea73307a4df4581a70b0bdf3da537347266d +size 51616005 diff --git a/checkpoint-14/global_step14/mp_rank_00_model_states.pt b/checkpoint-14/global_step14/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46cabc123db664e33dd354beb8a473f8e7c6edfd --- /dev/null +++ b/checkpoint-14/global_step14/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d88403937ff32664fa5a2f9f11b8341a014ffe1c569fcd944a9a5e5b996f323 +size 275768601 diff --git a/checkpoint-14/latest b/checkpoint-14/latest new file mode 100644 index 0000000000000000000000000000000000000000..0702eb07832ff8d8b5e215fcdd222f6e610eb766 --- /dev/null +++ b/checkpoint-14/latest @@ -0,0 +1 @@ +global_step14 \ No newline at end of file diff --git a/checkpoint-14/rng_state_0.pth b/checkpoint-14/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c736642e2d81983d6c191e2473aac0cf058fd20c --- /dev/null +++ b/checkpoint-14/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9a7d071071c37c49de40db4aff68cc2f9de31bb0fddecd8a03237115f738cf3 +size 16389 diff --git a/checkpoint-14/rng_state_1.pth b/checkpoint-14/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..72081ae06d899e83897a857c967f63f118d8c7bc --- /dev/null +++ b/checkpoint-14/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5526ea848b4f1d23d47a1f39606859d15a2d42fbf6c7b798953475ee116a797a +size 16389 diff --git a/checkpoint-14/rng_state_2.pth b/checkpoint-14/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf29bdbf4a98762e7e77d23bb3f3349b44a82d5a --- /dev/null +++ b/checkpoint-14/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8165cdaae66082b679ee29a06f7b3744df9f14cb05346400a3785f2fc3fa3e85 +size 16389 diff --git a/checkpoint-14/rng_state_3.pth b/checkpoint-14/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8cb78f79f46ea0aa313f721a188cca87c59dac9b --- /dev/null +++ b/checkpoint-14/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3143b55c0b9d9439e0c31fcee460b506269470b820585b782c0da0f55df02e2 +size 16389 diff --git a/checkpoint-14/rng_state_4.pth b/checkpoint-14/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..eb3026ba4ca0b393b2034b6b9787e414c8c37f86 --- /dev/null +++ b/checkpoint-14/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a463256ca276163385efe4ff9351b9280d18735b7d36af857c884a3151fe085 +size 16389 diff --git a/checkpoint-14/rng_state_5.pth b/checkpoint-14/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c268180f0ed523d3d9999b78d46ea4ea57c0277f --- /dev/null +++ b/checkpoint-14/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2abdcbc3c3dbe2a686db3338b2f6bacaa214b7d7afe6522a237b081c5639c53 +size 16325 diff --git a/checkpoint-14/rng_state_6.pth b/checkpoint-14/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..92d7ebdcf1055fc0f7175f3af9b01509d3cbf2e5 --- /dev/null +++ b/checkpoint-14/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18b9ae6fc2a9beeb9d3b15e652a6c508d78b38813f5993d41eb31efa648f689d +size 16389 diff --git a/checkpoint-14/rng_state_7.pth b/checkpoint-14/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..167747606659c3913551d7c6b10650422d94bf49 --- /dev/null +++ b/checkpoint-14/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c9769bb136df4efd7240ee05c0265995972ddd4f1a0ec82a54d256727d2fb07 +size 16453 diff --git a/checkpoint-14/scheduler.pt b/checkpoint-14/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8452bd2374bd9902b885e0a031b5c2533507073c --- /dev/null +++ b/checkpoint-14/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcecea90ca26ba206660b1086a923a631631c8861c3f117850e25c400d5382dd +size 1401 diff --git a/checkpoint-14/trainer_state.json b/checkpoint-14/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0283ff9c783b7aa5ca912544b2dd35fcf23f9a99 --- /dev/null +++ b/checkpoint-14/trainer_state.json @@ -0,0 +1,257 @@ +{ + "best_metric": 0.03234308212995529, + "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-12", + "epoch": 3.4210526315789473, + "eval_steps": 6, + "global_step": 14, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 10352.974609375, + "epoch": 0.21052631578947367, + "grad_norm": 0.13259537518024445, + "kl": 0.0, + "learning_rate": 1.6666666666666667e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.11328125, + "reward": -0.002658387296833098, + "reward_std": 0.06134121119976044, + "rewards/CosineReward": -0.0026579967816360295, + "rewards/RepetitionPenalty": -3.8975886695880035e-07, + "step": 1, + "train_speed(iter/s)": 0.000242 + }, + { + "clip_ratio": 0.0, + "epoch": 0.42105263157894735, + "grad_norm": 0.1320001482963562, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "step": 2, + "train_speed(iter/s)": 0.000467 + }, + { + "clip_ratio": 1.3441811461234465e-05, + "completion_length": 10439.369140625, + "epoch": 0.631578947368421, + "grad_norm": 0.08990391343832016, + "kl": 9.50181856751442e-07, + "learning_rate": 5e-05, + "loss": -0.06604708731174469, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.13671875, + "reward": 0.0006296975770965219, + "reward_std": 0.07172460854053497, + "rewards/CosineReward": 0.0006298604130279273, + "rewards/RepetitionPenalty": -1.6200439745261974e-07, + "step": 3, + "train_speed(iter/s)": 0.00035 + }, + { + "clip_ratio": 1.70210253145342e-05, + "epoch": 0.8421052631578947, + "grad_norm": 0.0967094898223877, + "kl": 1.1101365089416504e-05, + "learning_rate": 6.666666666666667e-05, + "loss": -0.06727766245603561, + "memory(GiB)": 182.91, + "step": 4, + "train_speed(iter/s)": 0.000458 + }, + { + "clip_ratio": 1.675608473306056e-05, + "completion_length": 10092.408203125, + "epoch": 1.2105263157894737, + "grad_norm": 0.142837256193161, + "kl": 0.00017762184143066406, + "learning_rate": 8.333333333333334e-05, + "loss": -0.09315311908721924, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.119140625, + "reward": -0.005135859013535082, + "reward_std": 0.07994875870645046, + "rewards/CosineReward": -0.005134060338605195, + "rewards/RepetitionPenalty": -1.7973881654143042e-06, + "step": 5, + "train_speed(iter/s)": 0.000387 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.18263348937034607, + "learning_rate": 0.0001, + "loss": -0.1041698157787323, + "memory(GiB)": 182.91, + "step": 6, + "train_speed(iter/s)": 0.000459 + }, + { + "epoch": 1.4210526315789473, + "eval_clip_ratio": 4.069424539920874e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.04833984375, + "eval_loss": -0.5377416610717773, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.012996690347790718, + "eval_reward_std": 0.08769983053207397, + "eval_rewards/CosineReward": 0.012996694073081017, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1030.1127, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 6 + }, + { + "clip_ratio": 0.0005237623976199757, + "completion_length": 10448.94921875, + "epoch": 1.631578947368421, + "grad_norm": 0.1291271299123764, + "kl": 0.017406463623046875, + "learning_rate": 9.991540791356342e-05, + "loss": -0.051375165581703186, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.1484375, + "reward": 0.004909618757665157, + "reward_std": 0.08167182095348835, + "rewards/CosineReward": 0.004909833543933928, + "rewards/RepetitionPenalty": -2.1478646772266075e-07, + "step": 7, + "train_speed(iter/s)": 0.000382 + }, + { + "clip_ratio": 0.1706484742462635, + "epoch": 1.8421052631578947, + "grad_norm": 0.26641014218330383, + "kl": 0.089599609375, + "learning_rate": 9.966191788709716e-05, + "loss": -0.05105742812156677, + "memory(GiB)": 182.91, + "step": 8, + "train_speed(iter/s)": 0.000433 + }, + { + "clip_ratio": 9.482144946559856e-06, + "completion_length": 10432.384765625, + "epoch": 2.2105263157894735, + "grad_norm": 0.10375155508518219, + "kl": 0.0963134765625, + "learning_rate": 9.924038765061042e-05, + "loss": -0.05842069163918495, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.255859375, + "reward": 0.03643610421568155, + "reward_std": 0.11898956261575222, + "rewards/CosineReward": 0.03643618477508426, + "rewards/RepetitionPenalty": -7.898860587829404e-08, + "step": 9, + "train_speed(iter/s)": 0.000396 + }, + { + "clip_ratio": 0.0036088433116674423, + "epoch": 2.4210526315789473, + "grad_norm": 0.09477333724498749, + "kl": 0.1185302734375, + "learning_rate": 9.865224352899119e-05, + "loss": -0.06491819024085999, + "memory(GiB)": 182.91, + "step": 10, + "train_speed(iter/s)": 0.000436 + }, + { + "clip_ratio": 1.2955343891007942e-05, + "completion_length": 10559.296875, + "epoch": 2.6315789473684212, + "grad_norm": 0.06739140301942825, + "kl": 0.1275634765625, + "learning_rate": 9.789947561577445e-05, + "loss": -0.04600231721997261, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.361328125, + "reward": 0.023204635945148766, + "reward_std": 0.10593634657561779, + "rewards/CosineReward": 0.02320496749598533, + "rewards/RepetitionPenalty": -3.3051759373847744e-07, + "step": 11, + "train_speed(iter/s)": 0.000405 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.05781339108943939, + "learning_rate": 9.698463103929542e-05, + "loss": -0.05069056898355484, + "memory(GiB)": 182.91, + "step": 12, + "train_speed(iter/s)": 0.000439 + }, + { + "epoch": 2.8421052631578947, + "eval_clip_ratio": 4.392032860778272e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.2275390625, + "eval_loss": 0.17524278163909912, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.03234308212995529, + "eval_reward_std": 0.10685288906097412, + "eval_rewards/CosineReward": 0.03234308212995529, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1025.9041, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 12 + }, + { + "clip_ratio": 0.0007908324183745208, + "completion_length": 10652.939453125, + "epoch": 3.2105263157894735, + "grad_norm": 0.01199417095631361, + "kl": 0.151123046875, + "learning_rate": 9.591080534401371e-05, + "loss": -0.02191038429737091, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.419921875, + "reward": 0.035983758978545666, + "reward_std": 0.11553369648754597, + "rewards/CosineReward": 0.03598417737521231, + "rewards/RepetitionPenalty": -4.176556771540163e-07, + "step": 13, + "train_speed(iter/s)": 0.000399 + }, + { + "clip_ratio": 0.0004821276670554653, + "epoch": 3.4210526315789473, + "grad_norm": 0.01075426209717989, + "kl": 0.169189453125, + "learning_rate": 9.468163201617062e-05, + "loss": -0.022672578692436218, + "memory(GiB)": 182.91, + "step": 14, + "train_speed(iter/s)": 0.000427 + } + ], + "logging_steps": 1, + "max_steps": 60, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 2, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-14/training_args.bin b/checkpoint-14/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a5c75ad76398d7f403ccc1a74c463d2dab6465ca --- /dev/null +++ b/checkpoint-14/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb +size 9809 diff --git a/checkpoint-14/zero_to_fp32.py b/checkpoint-14/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-14/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-16/README.md b/checkpoint-16/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4dbd19e3cdbcefaa269791fa922afe075b806c8 --- /dev/null +++ b/checkpoint-16/README.md @@ -0,0 +1,202 @@ +--- +base_model: /mnt/nvme5n1p1/distill-14b-rl-70 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-16/adapter_config.json b/checkpoint-16/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8d06e83be137d4af154849ed1686625c42e280 --- /dev/null +++ b/checkpoint-16/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/mnt/nvme5n1p1/distill-14b-rl-70", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-16/adapter_model.safetensors b/checkpoint-16/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3d7a3e0bf8dee4f822ad1514ba08d5c7a5f0f8f4 --- /dev/null +++ b/checkpoint-16/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02e4275298b7a47db99dc030c5650cf99feb4428da4989f107705f625afb343c +size 275342392 diff --git a/checkpoint-16/additional_config.json b/checkpoint-16/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/checkpoint-16/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/checkpoint-16/args.json b/checkpoint-16/args.json new file mode 100644 index 0000000000000000000000000000000000000000..e5fddc18fe4219cc621c4285a0faf4591e553d74 --- /dev/null +++ b/checkpoint-16/args.json @@ -0,0 +1,416 @@ +{ + "model": "/mnt/nvme5n1p1/distill-14b-rl-70", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "template": "deepseek_r1", + "system": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\\\boxed{}.", + "max_length": 16384, + "truncation_strategy": "left", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "stage2_aime.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 52, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 1.0, + "top_k": 50, + "top_p": 0.9, + "repetition_penalty": 1.1, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": true, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 15.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.1, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 1, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 2.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": true, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 6.0, + "dataloader_num_workers": 52, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "reward", + "greater_is_better": true, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "wandb" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 32, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "zero_hpz_partition_size": null, + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "num_infer_workers": 8, + "vllm_max_num_seqs": 256, + "vllm_enforce_eager": false, + "vllm_limit_mm_per_prompt": null, + "vllm_enable_prefix_caching": true, + "cosine_min_len_value_wrong": 0.0, + "cosine_max_len_value_wrong": -0.2, + "cosine_min_len_value_correct": 0.8, + "cosine_max_len_value_correct": 0.4, + "cosine_max_len": 12288, + "repetition_n_grams": 40, + "repetition_max_penalty": -0.05, + "use_lmdeploy": false, + "lmdeploy_device": "auto", + "lmdeploy_session_len": null, + "lmdeploy_cache_max_entry_count": 0.8, + "async_generate": false, + "tensor_parallel_size": 1, + "sleep_level": 0, + "move_model_batches": null, + "offload_optimizer": false, + "offload_model": false, + "gc_collect_after_offload": false, + "num_generations": 8, + "max_completion_length": 12288, + "ds3_gather_for_generation": true, + "reward_funcs": [ + "cosine", + "repetition" + ], + "reward_weights": null, + "log_completions": true, + "use_vllm": false, + "vllm_device": [ + "auto" + ], + "vllm_gpu_memory_utilization": 0.9, + "vllm_max_model_len": null, + "num_iterations": 2, + "epsilon": 0.2, + "rlhf_type": "grpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.04, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 32, + "local_world_size": 8, + "model_suffix": "distill-14b-rl-70", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/mnt/nvme5n1p1/distill-14b-rl-70', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "hub": "", + "training_args": "GRPOConfig(output_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=15.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs', logging_strategy=, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=, save_steps=2, save_total_limit=100, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=6, dataloader_num_workers=52, dataloader_prefetch_factor=None, past_index=-1, run_name='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='reward', greater_is_better=True, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, max_prompt_length=512, num_generations=8, max_completion_length=12288, ds3_gather_for_generation=True, temperature=1.0, top_p=0.9, top_k=50, min_p=None, repetition_penalty=1.1, cache_implementation=None, use_vllm=False, vllm_server_host='0.0.0.0', vllm_server_port=8000, vllm_server_timeout=120.0, vllm_guided_decoding_regex=None, beta=0.04, num_iterations=2, epsilon=0.2, epsilon_high=None, reward_weights=None, scale_rewards=True, sync_ref_model=False, ref_model_mixup_alpha=0.6, ref_model_sync_steps=512, log_completions=True, vllm_device=['auto'], vllm_gpu_memory_utilization=0.9, vllm_dtype=None, vllm_max_model_len=None, vllm_enable_prefix_caching=True, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None, num_infer_workers=8, vllm_max_num_seqs=256, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, cosine_min_len_value_wrong=0.0, cosine_max_len_value_wrong=-0.2, cosine_min_len_value_correct=0.8, cosine_max_len_value_correct=0.4, cosine_max_len=12288, repetition_n_grams=40, repetition_max_penalty=-0.05, use_lmdeploy=False, lmdeploy_device='auto', lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, async_generate=False, tensor_parallel_size=1, sleep_level=0, move_model_batches=None, offload_optimizer=False, offload_model=False, gc_collect_after_offload=False, stop_words=[])" +} \ No newline at end of file diff --git a/checkpoint-16/global_step16/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-16/global_step16/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..815e60698311e86b0999b6e37007a44141cf34ce --- /dev/null +++ b/checkpoint-16/global_step16/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:125c1516745595e529f62df8a6dc919af84f90aa0ac15fb62652f6c68013bedc +size 51616517 diff --git a/checkpoint-16/global_step16/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-16/global_step16/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f0a1a4665a6d54015f42c4214d5e7abf90f5883 --- /dev/null +++ b/checkpoint-16/global_step16/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:101607f59f878de2bda9d8f81ec6d68ab6138e1f382f879d3dada58d46f38552 +size 51616005 diff --git a/checkpoint-16/global_step16/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-16/global_step16/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f149f8cf974aa539dcec6b0b7948760b5e753c2d --- /dev/null +++ b/checkpoint-16/global_step16/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d257a9e85c1a1f35d99090cf4f0c23ddbd90ff78a47dd366de5f3be786e7381 +size 51616517 diff --git a/checkpoint-16/global_step16/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-16/global_step16/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..17037ad360236f04079ef4b768700b68dfb98c30 --- /dev/null +++ b/checkpoint-16/global_step16/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e8db14aed39689db4a50e73d925a8eb8e6dd7e0cd82abb0ac1c09e46b776e2e +size 51616005 diff --git a/checkpoint-16/global_step16/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-16/global_step16/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0199e19420e1d80e3107e579b77c3fcb1d8fb331 --- /dev/null +++ b/checkpoint-16/global_step16/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a9637cd69f9b6a8e4d397460672b33dd5410a4711c9f294a8d802502b0de4c4 +size 51616517 diff --git a/checkpoint-16/global_step16/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-16/global_step16/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac3ea396933dde3a0296ec005b385d6677203f5e --- /dev/null +++ b/checkpoint-16/global_step16/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ee901b15dc7a631faca1241d53eca6cf055e021d33de6cf0ac1ed996f265ba3 +size 51616005 diff --git a/checkpoint-16/global_step16/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-16/global_step16/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..508ba6ebadf460824c201aaaf0dd1229234122ab --- /dev/null +++ b/checkpoint-16/global_step16/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6b04e0c778215567009f833489bcc7856de6f7df3ed83d3c4ee005c2c6856ae +size 51616517 diff --git a/checkpoint-16/global_step16/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-16/global_step16/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3262e098ff2703d9d4aa8dca612b4a946512bdd --- /dev/null +++ b/checkpoint-16/global_step16/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bba5712119db87c2222e67d14392d3eea71fc984e46c1ab4cf8e732ad322cf36 +size 51616005 diff --git a/checkpoint-16/global_step16/mp_rank_00_model_states.pt b/checkpoint-16/global_step16/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64a91e60e2b5bdf86069da24f818d54b76061caa --- /dev/null +++ b/checkpoint-16/global_step16/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ada40ead251219fea698a98b6a96b77ac53afeb7fc0b07706e31d552466b114 +size 275768601 diff --git a/checkpoint-16/latest b/checkpoint-16/latest new file mode 100644 index 0000000000000000000000000000000000000000..2d27cba4a6e222d4d494f31f45a18f8f444863e0 --- /dev/null +++ b/checkpoint-16/latest @@ -0,0 +1 @@ +global_step16 \ No newline at end of file diff --git a/checkpoint-16/rng_state_0.pth b/checkpoint-16/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..26269a3697e1f68b85fde19a8600b8ecb1bbf689 --- /dev/null +++ b/checkpoint-16/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa11e88bad2554304d133bcb6dab324e4ac8ec0939e2b0cafe7b2c32592d89f1 +size 16389 diff --git a/checkpoint-16/rng_state_1.pth b/checkpoint-16/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..c20f61a267d243e82133a175d00616ae3cd3c311 --- /dev/null +++ b/checkpoint-16/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea3a765e1f5510be4d63c0afec5a0c48547ab24ca2856df22c3e405ceef672e3 +size 16389 diff --git a/checkpoint-16/rng_state_2.pth b/checkpoint-16/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..73475e3bc37e6e4f1092de8c59991e977d8edebc --- /dev/null +++ b/checkpoint-16/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:400f26eee08326b27b3462f41ccd3795ce63b3ae6eaeb264795fbd75e23fdc70 +size 16389 diff --git a/checkpoint-16/rng_state_3.pth b/checkpoint-16/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..248ae9b102002dcfd30d9112f8bb23978a0b1d9b --- /dev/null +++ b/checkpoint-16/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb967db599d650f89d6abed522e292b8e1e1038bc0fd6f5fbc7c2e35ab736d41 +size 16389 diff --git a/checkpoint-16/rng_state_4.pth b/checkpoint-16/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..ad1027899fd44efd1c2300b9ed3e71947fcd1330 --- /dev/null +++ b/checkpoint-16/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80a6e3d19048e9956ec7ed4436bc6255aafcd04e91f156a7af8ac248fc76f658 +size 16389 diff --git a/checkpoint-16/rng_state_5.pth b/checkpoint-16/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..8f1af18874467a89c81c55d3801e23c8e4770c3f --- /dev/null +++ b/checkpoint-16/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb942b686c66b4b042754582ad4d2c75e9e0d513e23dd7d9bce3d55a8c40aa38 +size 16325 diff --git a/checkpoint-16/rng_state_6.pth b/checkpoint-16/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f59909c29f748a2f5c3a9786d58d8462234a8aa --- /dev/null +++ b/checkpoint-16/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1bd340a7dca5c28144cad4a6ebf922ecd7a58668da5663acc14ebe04c5ab4be +size 16389 diff --git a/checkpoint-16/rng_state_7.pth b/checkpoint-16/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..8741d627bc0ae3721314daf4b0e62a7a6508b63d --- /dev/null +++ b/checkpoint-16/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:341cfead866973e59f612b4b1f51a130a969de0e3a459d2a66a0b4bd0326c07b +size 16453 diff --git a/checkpoint-16/scheduler.pt b/checkpoint-16/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..db47704b9603eba9ed1119739e70e01543cf9390 --- /dev/null +++ b/checkpoint-16/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f568b3ba3a52464a53db18c9e7d11439b5789929dbb19774ca189da51b08333a +size 1401 diff --git a/checkpoint-16/trainer_state.json b/checkpoint-16/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9227c90abf65fb0517b0d1c7eb78e18f0426365e --- /dev/null +++ b/checkpoint-16/trainer_state.json @@ -0,0 +1,285 @@ +{ + "best_metric": 0.03234308212995529, + "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-12", + "epoch": 3.8421052631578947, + "eval_steps": 6, + "global_step": 16, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 10352.974609375, + "epoch": 0.21052631578947367, + "grad_norm": 0.13259537518024445, + "kl": 0.0, + "learning_rate": 1.6666666666666667e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.11328125, + "reward": -0.002658387296833098, + "reward_std": 0.06134121119976044, + "rewards/CosineReward": -0.0026579967816360295, + "rewards/RepetitionPenalty": -3.8975886695880035e-07, + "step": 1, + "train_speed(iter/s)": 0.000242 + }, + { + "clip_ratio": 0.0, + "epoch": 0.42105263157894735, + "grad_norm": 0.1320001482963562, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "step": 2, + "train_speed(iter/s)": 0.000467 + }, + { + "clip_ratio": 1.3441811461234465e-05, + "completion_length": 10439.369140625, + "epoch": 0.631578947368421, + "grad_norm": 0.08990391343832016, + "kl": 9.50181856751442e-07, + "learning_rate": 5e-05, + "loss": -0.06604708731174469, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.13671875, + "reward": 0.0006296975770965219, + "reward_std": 0.07172460854053497, + "rewards/CosineReward": 0.0006298604130279273, + "rewards/RepetitionPenalty": -1.6200439745261974e-07, + "step": 3, + "train_speed(iter/s)": 0.00035 + }, + { + "clip_ratio": 1.70210253145342e-05, + "epoch": 0.8421052631578947, + "grad_norm": 0.0967094898223877, + "kl": 1.1101365089416504e-05, + "learning_rate": 6.666666666666667e-05, + "loss": -0.06727766245603561, + "memory(GiB)": 182.91, + "step": 4, + "train_speed(iter/s)": 0.000458 + }, + { + "clip_ratio": 1.675608473306056e-05, + "completion_length": 10092.408203125, + "epoch": 1.2105263157894737, + "grad_norm": 0.142837256193161, + "kl": 0.00017762184143066406, + "learning_rate": 8.333333333333334e-05, + "loss": -0.09315311908721924, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.119140625, + "reward": -0.005135859013535082, + "reward_std": 0.07994875870645046, + "rewards/CosineReward": -0.005134060338605195, + "rewards/RepetitionPenalty": -1.7973881654143042e-06, + "step": 5, + "train_speed(iter/s)": 0.000387 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.18263348937034607, + "learning_rate": 0.0001, + "loss": -0.1041698157787323, + "memory(GiB)": 182.91, + "step": 6, + "train_speed(iter/s)": 0.000459 + }, + { + "epoch": 1.4210526315789473, + "eval_clip_ratio": 4.069424539920874e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.04833984375, + "eval_loss": -0.5377416610717773, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.012996690347790718, + "eval_reward_std": 0.08769983053207397, + "eval_rewards/CosineReward": 0.012996694073081017, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1030.1127, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 6 + }, + { + "clip_ratio": 0.0005237623976199757, + "completion_length": 10448.94921875, + "epoch": 1.631578947368421, + "grad_norm": 0.1291271299123764, + "kl": 0.017406463623046875, + "learning_rate": 9.991540791356342e-05, + "loss": -0.051375165581703186, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.1484375, + "reward": 0.004909618757665157, + "reward_std": 0.08167182095348835, + "rewards/CosineReward": 0.004909833543933928, + "rewards/RepetitionPenalty": -2.1478646772266075e-07, + "step": 7, + "train_speed(iter/s)": 0.000382 + }, + { + "clip_ratio": 0.1706484742462635, + "epoch": 1.8421052631578947, + "grad_norm": 0.26641014218330383, + "kl": 0.089599609375, + "learning_rate": 9.966191788709716e-05, + "loss": -0.05105742812156677, + "memory(GiB)": 182.91, + "step": 8, + "train_speed(iter/s)": 0.000433 + }, + { + "clip_ratio": 9.482144946559856e-06, + "completion_length": 10432.384765625, + "epoch": 2.2105263157894735, + "grad_norm": 0.10375155508518219, + "kl": 0.0963134765625, + "learning_rate": 9.924038765061042e-05, + "loss": -0.05842069163918495, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.255859375, + "reward": 0.03643610421568155, + "reward_std": 0.11898956261575222, + "rewards/CosineReward": 0.03643618477508426, + "rewards/RepetitionPenalty": -7.898860587829404e-08, + "step": 9, + "train_speed(iter/s)": 0.000396 + }, + { + "clip_ratio": 0.0036088433116674423, + "epoch": 2.4210526315789473, + "grad_norm": 0.09477333724498749, + "kl": 0.1185302734375, + "learning_rate": 9.865224352899119e-05, + "loss": -0.06491819024085999, + "memory(GiB)": 182.91, + "step": 10, + "train_speed(iter/s)": 0.000436 + }, + { + "clip_ratio": 1.2955343891007942e-05, + "completion_length": 10559.296875, + "epoch": 2.6315789473684212, + "grad_norm": 0.06739140301942825, + "kl": 0.1275634765625, + "learning_rate": 9.789947561577445e-05, + "loss": -0.04600231721997261, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.361328125, + "reward": 0.023204635945148766, + "reward_std": 0.10593634657561779, + "rewards/CosineReward": 0.02320496749598533, + "rewards/RepetitionPenalty": -3.3051759373847744e-07, + "step": 11, + "train_speed(iter/s)": 0.000405 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.05781339108943939, + "learning_rate": 9.698463103929542e-05, + "loss": -0.05069056898355484, + "memory(GiB)": 182.91, + "step": 12, + "train_speed(iter/s)": 0.000439 + }, + { + "epoch": 2.8421052631578947, + "eval_clip_ratio": 4.392032860778272e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.2275390625, + "eval_loss": 0.17524278163909912, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.03234308212995529, + "eval_reward_std": 0.10685288906097412, + "eval_rewards/CosineReward": 0.03234308212995529, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1025.9041, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 12 + }, + { + "clip_ratio": 0.0007908324183745208, + "completion_length": 10652.939453125, + "epoch": 3.2105263157894735, + "grad_norm": 0.01199417095631361, + "kl": 0.151123046875, + "learning_rate": 9.591080534401371e-05, + "loss": -0.02191038429737091, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.419921875, + "reward": 0.035983758978545666, + "reward_std": 0.11553369648754597, + "rewards/CosineReward": 0.03598417737521231, + "rewards/RepetitionPenalty": -4.176556771540163e-07, + "step": 13, + "train_speed(iter/s)": 0.000399 + }, + { + "clip_ratio": 0.0004821276670554653, + "epoch": 3.4210526315789473, + "grad_norm": 0.01075426209717989, + "kl": 0.169189453125, + "learning_rate": 9.468163201617062e-05, + "loss": -0.022672578692436218, + "memory(GiB)": 182.91, + "step": 14, + "train_speed(iter/s)": 0.000427 + }, + { + "clip_ratio": 1.9617403040683712e-05, + "completion_length": 10482.146484375, + "epoch": 3.6315789473684212, + "grad_norm": 0.01779361069202423, + "kl": 0.166748046875, + "learning_rate": 9.330127018922194e-05, + "loss": -0.059799157083034515, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.4765625, + "reward": 0.03584331553429365, + "reward_std": 0.11829411797225475, + "rewards/CosineReward": 0.03584346390562132, + "rewards/RepetitionPenalty": -1.4977952389472193e-07, + "step": 15, + "train_speed(iter/s)": 0.000406 + }, + { + "clip_ratio": 0.00011349086707923561, + "epoch": 3.8421052631578947, + "grad_norm": 0.013216385617852211, + "kl": 0.16748046875, + "learning_rate": 9.177439057064683e-05, + "loss": -0.06071458384394646, + "memory(GiB)": 182.91, + "step": 16, + "train_speed(iter/s)": 0.000431 + } + ], + "logging_steps": 1, + "max_steps": 60, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 2, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-16/training_args.bin b/checkpoint-16/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a5c75ad76398d7f403ccc1a74c463d2dab6465ca --- /dev/null +++ b/checkpoint-16/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb +size 9809 diff --git a/checkpoint-16/zero_to_fp32.py b/checkpoint-16/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-16/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-18/README.md b/checkpoint-18/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4dbd19e3cdbcefaa269791fa922afe075b806c8 --- /dev/null +++ b/checkpoint-18/README.md @@ -0,0 +1,202 @@ +--- +base_model: /mnt/nvme5n1p1/distill-14b-rl-70 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-18/adapter_config.json b/checkpoint-18/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8d06e83be137d4af154849ed1686625c42e280 --- /dev/null +++ b/checkpoint-18/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/mnt/nvme5n1p1/distill-14b-rl-70", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-18/adapter_model.safetensors b/checkpoint-18/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7043c7c850dbf5d935077d518fc25d3ec870b1e6 --- /dev/null +++ b/checkpoint-18/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3d92166143d1bf4ff04df507e54de7d9cfa9535b638846c0933bb3991c78ed1 +size 275342392 diff --git a/checkpoint-18/additional_config.json b/checkpoint-18/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/checkpoint-18/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/checkpoint-18/args.json b/checkpoint-18/args.json new file mode 100644 index 0000000000000000000000000000000000000000..e5fddc18fe4219cc621c4285a0faf4591e553d74 --- /dev/null +++ b/checkpoint-18/args.json @@ -0,0 +1,416 @@ +{ + "model": "/mnt/nvme5n1p1/distill-14b-rl-70", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "template": "deepseek_r1", + "system": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\\\boxed{}.", + "max_length": 16384, + "truncation_strategy": "left", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "stage2_aime.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 52, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 1.0, + "top_k": 50, + "top_p": 0.9, + "repetition_penalty": 1.1, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": true, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 15.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.1, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 1, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 2.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": true, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 6.0, + "dataloader_num_workers": 52, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "reward", + "greater_is_better": true, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "wandb" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 32, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "zero_hpz_partition_size": null, + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "num_infer_workers": 8, + "vllm_max_num_seqs": 256, + "vllm_enforce_eager": false, + "vllm_limit_mm_per_prompt": null, + "vllm_enable_prefix_caching": true, + "cosine_min_len_value_wrong": 0.0, + "cosine_max_len_value_wrong": -0.2, + "cosine_min_len_value_correct": 0.8, + "cosine_max_len_value_correct": 0.4, + "cosine_max_len": 12288, + "repetition_n_grams": 40, + "repetition_max_penalty": -0.05, + "use_lmdeploy": false, + "lmdeploy_device": "auto", + "lmdeploy_session_len": null, + "lmdeploy_cache_max_entry_count": 0.8, + "async_generate": false, + "tensor_parallel_size": 1, + "sleep_level": 0, + "move_model_batches": null, + "offload_optimizer": false, + "offload_model": false, + "gc_collect_after_offload": false, + "num_generations": 8, + "max_completion_length": 12288, + "ds3_gather_for_generation": true, + "reward_funcs": [ + "cosine", + "repetition" + ], + "reward_weights": null, + "log_completions": true, + "use_vllm": false, + "vllm_device": [ + "auto" + ], + "vllm_gpu_memory_utilization": 0.9, + "vllm_max_model_len": null, + "num_iterations": 2, + "epsilon": 0.2, + "rlhf_type": "grpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.04, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 32, + "local_world_size": 8, + "model_suffix": "distill-14b-rl-70", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/mnt/nvme5n1p1/distill-14b-rl-70', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "hub": "", + "training_args": "GRPOConfig(output_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=15.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs', logging_strategy=, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=, save_steps=2, save_total_limit=100, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=6, dataloader_num_workers=52, dataloader_prefetch_factor=None, past_index=-1, run_name='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='reward', greater_is_better=True, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, max_prompt_length=512, num_generations=8, max_completion_length=12288, ds3_gather_for_generation=True, temperature=1.0, top_p=0.9, top_k=50, min_p=None, repetition_penalty=1.1, cache_implementation=None, use_vllm=False, vllm_server_host='0.0.0.0', vllm_server_port=8000, vllm_server_timeout=120.0, vllm_guided_decoding_regex=None, beta=0.04, num_iterations=2, epsilon=0.2, epsilon_high=None, reward_weights=None, scale_rewards=True, sync_ref_model=False, ref_model_mixup_alpha=0.6, ref_model_sync_steps=512, log_completions=True, vllm_device=['auto'], vllm_gpu_memory_utilization=0.9, vllm_dtype=None, vllm_max_model_len=None, vllm_enable_prefix_caching=True, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None, num_infer_workers=8, vllm_max_num_seqs=256, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, cosine_min_len_value_wrong=0.0, cosine_max_len_value_wrong=-0.2, cosine_min_len_value_correct=0.8, cosine_max_len_value_correct=0.4, cosine_max_len=12288, repetition_n_grams=40, repetition_max_penalty=-0.05, use_lmdeploy=False, lmdeploy_device='auto', lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, async_generate=False, tensor_parallel_size=1, sleep_level=0, move_model_batches=None, offload_optimizer=False, offload_model=False, gc_collect_after_offload=False, stop_words=[])" +} \ No newline at end of file diff --git a/checkpoint-18/global_step18/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-18/global_step18/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d4c9409795857db88698578c1734602f3bd2175 --- /dev/null +++ b/checkpoint-18/global_step18/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c913550ce81587d92c66fc2a20a67631752eb2ee1ba76aa3fde6d88de1503fef +size 51616517 diff --git a/checkpoint-18/global_step18/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-18/global_step18/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..694f3c7c8f2f13953963840f3c4b0460a485cd16 --- /dev/null +++ b/checkpoint-18/global_step18/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c24fa6666887ed7ae72eafeb718f289f10e0733407b1c7cf97736df68dbe5a4 +size 51616005 diff --git a/checkpoint-18/global_step18/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-18/global_step18/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..504d476cf96d792e2657d80746d8ecc285e7fd52 --- /dev/null +++ b/checkpoint-18/global_step18/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e54823b966e33776411e1169d66b0921abf49c63143aa458c77c125a57305d22 +size 51616517 diff --git a/checkpoint-18/global_step18/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-18/global_step18/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5cc7111ad1908697290fa7ab243b404065afb01d --- /dev/null +++ b/checkpoint-18/global_step18/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7ff999caa49dd526fb2f38a2aa9333f9c891d59f49ba3675ebbb3fcaa94ea2d +size 51616005 diff --git a/checkpoint-18/global_step18/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-18/global_step18/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d04c21fe8d0805138b2a93dfe2e0cdb2a958561 --- /dev/null +++ b/checkpoint-18/global_step18/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:170915b25d89ac5e81b9d9b163cb63f2906794ec03ae88e35ca7f620e52e5740 +size 51616517 diff --git a/checkpoint-18/global_step18/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-18/global_step18/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74a0eb57ab096f8ca4d335a326103c1bda82062e --- /dev/null +++ b/checkpoint-18/global_step18/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c0385e56a88f06af225959abacda5e8ca6d77292540171f5bd0263e817e04c1 +size 51616005 diff --git a/checkpoint-18/global_step18/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-18/global_step18/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62c320791603569b90798a8079b1deb41ff44f31 --- /dev/null +++ b/checkpoint-18/global_step18/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:015b8c24ccb98e06da9d4f0391c8847357bc93b5e12d416853e1467cf3f695de +size 51616517 diff --git a/checkpoint-18/global_step18/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-18/global_step18/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fec6715fffc912b39132b6832c3f890beb35778 --- /dev/null +++ b/checkpoint-18/global_step18/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:223531348c604d7b5b7a8737fc0d04d5a65b07e17ab9cfc9daf228195402ba32 +size 51616005 diff --git a/checkpoint-18/global_step18/mp_rank_00_model_states.pt b/checkpoint-18/global_step18/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb7bfa4a4af85ac0a237db88971929902cbbc900 --- /dev/null +++ b/checkpoint-18/global_step18/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c53fa64fd4ae2a7e87496927eaabbf46b54d57c754e14f250a99d42a5483aee9 +size 275768601 diff --git a/checkpoint-18/latest b/checkpoint-18/latest new file mode 100644 index 0000000000000000000000000000000000000000..9c0258eb8becd88be3c3c47c7548c606060ef544 --- /dev/null +++ b/checkpoint-18/latest @@ -0,0 +1 @@ +global_step18 \ No newline at end of file diff --git a/checkpoint-18/rng_state_0.pth b/checkpoint-18/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..eec3d30aef07fc37ce950ef322aae2d3217de2cc --- /dev/null +++ b/checkpoint-18/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d724d9d2d75f97366a51d17c24a16c593e3246997e32d2fa48df72f139a12da +size 16389 diff --git a/checkpoint-18/rng_state_1.pth b/checkpoint-18/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1b7a8182a8ea065eae5b14285f42a3251841e2ae --- /dev/null +++ b/checkpoint-18/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6be05d7f51bedd88ea1aff36e78f49231591a7df18edbf7e5b1cfaf388855363 +size 16389 diff --git a/checkpoint-18/rng_state_2.pth b/checkpoint-18/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..5b451664315a6d92afc943ddf83ce5702435620c --- /dev/null +++ b/checkpoint-18/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e50c9ada351af5a8d9f85ab510cfb043bf31dfd0a55b537531da34fd749bb51 +size 16389 diff --git a/checkpoint-18/rng_state_3.pth b/checkpoint-18/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..317a8677828aebb1d6174e23ae65929e7d15a10c --- /dev/null +++ b/checkpoint-18/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d44b1067738c1c8164f78787843aeef950388d2732a105445a2c554590102ae0 +size 16389 diff --git a/checkpoint-18/rng_state_4.pth b/checkpoint-18/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..94170745d1ea0e68677bbc4d9b202b21f27f0f26 --- /dev/null +++ b/checkpoint-18/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4511de4cd87f135158c208ffe0d25caf5221d193abc5b180e351c1143adb6122 +size 16389 diff --git a/checkpoint-18/rng_state_5.pth b/checkpoint-18/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..6cee8d8a1d9a928c61a967906fb739d2006019e1 --- /dev/null +++ b/checkpoint-18/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d72cb9a6a71588eeb21dc28ab383298953b6ff00dfc8f8d64d0e1429984e1116 +size 16325 diff --git a/checkpoint-18/rng_state_6.pth b/checkpoint-18/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..ddc154b716aa024df95893383bc6f149970bb902 --- /dev/null +++ b/checkpoint-18/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:013e0be91f6b0caa8d9080ba69157e13133bafffe3ebad6d0a2c8bac1345cf5f +size 16389 diff --git a/checkpoint-18/rng_state_7.pth b/checkpoint-18/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..369721c4d6529cb1f01b4f1553db1fd6e27c0027 --- /dev/null +++ b/checkpoint-18/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:659b00c84cf97caedbddeb1dcb6b237e932c3673f23efe9bdd1ed064e697d7bd +size 16453 diff --git a/checkpoint-18/scheduler.pt b/checkpoint-18/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7f815ba364f0ed4de0d3e1a600fab6a0a8a9867 --- /dev/null +++ b/checkpoint-18/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b05b372f2e5a6925fe28df1cbf55c563a55c6372ebe9dff3775856b698725b81 +size 1401 diff --git a/checkpoint-18/trainer_state.json b/checkpoint-18/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5e552ea3183048034e8097ff86a4f6d1c899830d --- /dev/null +++ b/checkpoint-18/trainer_state.json @@ -0,0 +1,327 @@ +{ + "best_metric": 0.03729328140616417, + "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-18", + "epoch": 4.421052631578947, + "eval_steps": 6, + "global_step": 18, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 10352.974609375, + "epoch": 0.21052631578947367, + "grad_norm": 0.13259537518024445, + "kl": 0.0, + "learning_rate": 1.6666666666666667e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.11328125, + "reward": -0.002658387296833098, + "reward_std": 0.06134121119976044, + "rewards/CosineReward": -0.0026579967816360295, + "rewards/RepetitionPenalty": -3.8975886695880035e-07, + "step": 1, + "train_speed(iter/s)": 0.000242 + }, + { + "clip_ratio": 0.0, + "epoch": 0.42105263157894735, + "grad_norm": 0.1320001482963562, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "step": 2, + "train_speed(iter/s)": 0.000467 + }, + { + "clip_ratio": 1.3441811461234465e-05, + "completion_length": 10439.369140625, + "epoch": 0.631578947368421, + "grad_norm": 0.08990391343832016, + "kl": 9.50181856751442e-07, + "learning_rate": 5e-05, + "loss": -0.06604708731174469, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.13671875, + "reward": 0.0006296975770965219, + "reward_std": 0.07172460854053497, + "rewards/CosineReward": 0.0006298604130279273, + "rewards/RepetitionPenalty": -1.6200439745261974e-07, + "step": 3, + "train_speed(iter/s)": 0.00035 + }, + { + "clip_ratio": 1.70210253145342e-05, + "epoch": 0.8421052631578947, + "grad_norm": 0.0967094898223877, + "kl": 1.1101365089416504e-05, + "learning_rate": 6.666666666666667e-05, + "loss": -0.06727766245603561, + "memory(GiB)": 182.91, + "step": 4, + "train_speed(iter/s)": 0.000458 + }, + { + "clip_ratio": 1.675608473306056e-05, + "completion_length": 10092.408203125, + "epoch": 1.2105263157894737, + "grad_norm": 0.142837256193161, + "kl": 0.00017762184143066406, + "learning_rate": 8.333333333333334e-05, + "loss": -0.09315311908721924, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.119140625, + "reward": -0.005135859013535082, + "reward_std": 0.07994875870645046, + "rewards/CosineReward": -0.005134060338605195, + "rewards/RepetitionPenalty": -1.7973881654143042e-06, + "step": 5, + "train_speed(iter/s)": 0.000387 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.18263348937034607, + "learning_rate": 0.0001, + "loss": -0.1041698157787323, + "memory(GiB)": 182.91, + "step": 6, + "train_speed(iter/s)": 0.000459 + }, + { + "epoch": 1.4210526315789473, + "eval_clip_ratio": 4.069424539920874e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.04833984375, + "eval_loss": -0.5377416610717773, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.012996690347790718, + "eval_reward_std": 0.08769983053207397, + "eval_rewards/CosineReward": 0.012996694073081017, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1030.1127, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 6 + }, + { + "clip_ratio": 0.0005237623976199757, + "completion_length": 10448.94921875, + "epoch": 1.631578947368421, + "grad_norm": 0.1291271299123764, + "kl": 0.017406463623046875, + "learning_rate": 9.991540791356342e-05, + "loss": -0.051375165581703186, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.1484375, + "reward": 0.004909618757665157, + "reward_std": 0.08167182095348835, + "rewards/CosineReward": 0.004909833543933928, + "rewards/RepetitionPenalty": -2.1478646772266075e-07, + "step": 7, + "train_speed(iter/s)": 0.000382 + }, + { + "clip_ratio": 0.1706484742462635, + "epoch": 1.8421052631578947, + "grad_norm": 0.26641014218330383, + "kl": 0.089599609375, + "learning_rate": 9.966191788709716e-05, + "loss": -0.05105742812156677, + "memory(GiB)": 182.91, + "step": 8, + "train_speed(iter/s)": 0.000433 + }, + { + "clip_ratio": 9.482144946559856e-06, + "completion_length": 10432.384765625, + "epoch": 2.2105263157894735, + "grad_norm": 0.10375155508518219, + "kl": 0.0963134765625, + "learning_rate": 9.924038765061042e-05, + "loss": -0.05842069163918495, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.255859375, + "reward": 0.03643610421568155, + "reward_std": 0.11898956261575222, + "rewards/CosineReward": 0.03643618477508426, + "rewards/RepetitionPenalty": -7.898860587829404e-08, + "step": 9, + "train_speed(iter/s)": 0.000396 + }, + { + "clip_ratio": 0.0036088433116674423, + "epoch": 2.4210526315789473, + "grad_norm": 0.09477333724498749, + "kl": 0.1185302734375, + "learning_rate": 9.865224352899119e-05, + "loss": -0.06491819024085999, + "memory(GiB)": 182.91, + "step": 10, + "train_speed(iter/s)": 0.000436 + }, + { + "clip_ratio": 1.2955343891007942e-05, + "completion_length": 10559.296875, + "epoch": 2.6315789473684212, + "grad_norm": 0.06739140301942825, + "kl": 0.1275634765625, + "learning_rate": 9.789947561577445e-05, + "loss": -0.04600231721997261, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.361328125, + "reward": 0.023204635945148766, + "reward_std": 0.10593634657561779, + "rewards/CosineReward": 0.02320496749598533, + "rewards/RepetitionPenalty": -3.3051759373847744e-07, + "step": 11, + "train_speed(iter/s)": 0.000405 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.05781339108943939, + "learning_rate": 9.698463103929542e-05, + "loss": -0.05069056898355484, + "memory(GiB)": 182.91, + "step": 12, + "train_speed(iter/s)": 0.000439 + }, + { + "epoch": 2.8421052631578947, + "eval_clip_ratio": 4.392032860778272e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.2275390625, + "eval_loss": 0.17524278163909912, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.03234308212995529, + "eval_reward_std": 0.10685288906097412, + "eval_rewards/CosineReward": 0.03234308212995529, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1025.9041, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 12 + }, + { + "clip_ratio": 0.0007908324183745208, + "completion_length": 10652.939453125, + "epoch": 3.2105263157894735, + "grad_norm": 0.01199417095631361, + "kl": 0.151123046875, + "learning_rate": 9.591080534401371e-05, + "loss": -0.02191038429737091, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.419921875, + "reward": 0.035983758978545666, + "reward_std": 0.11553369648754597, + "rewards/CosineReward": 0.03598417737521231, + "rewards/RepetitionPenalty": -4.176556771540163e-07, + "step": 13, + "train_speed(iter/s)": 0.000399 + }, + { + "clip_ratio": 0.0004821276670554653, + "epoch": 3.4210526315789473, + "grad_norm": 0.01075426209717989, + "kl": 0.169189453125, + "learning_rate": 9.468163201617062e-05, + "loss": -0.022672578692436218, + "memory(GiB)": 182.91, + "step": 14, + "train_speed(iter/s)": 0.000427 + }, + { + "clip_ratio": 1.9617403040683712e-05, + "completion_length": 10482.146484375, + "epoch": 3.6315789473684212, + "grad_norm": 0.01779361069202423, + "kl": 0.166748046875, + "learning_rate": 9.330127018922194e-05, + "loss": -0.059799157083034515, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.4765625, + "reward": 0.03584331553429365, + "reward_std": 0.11829411797225475, + "rewards/CosineReward": 0.03584346390562132, + "rewards/RepetitionPenalty": -1.4977952389472193e-07, + "step": 15, + "train_speed(iter/s)": 0.000406 + }, + { + "clip_ratio": 0.00011349086707923561, + "epoch": 3.8421052631578947, + "grad_norm": 0.013216385617852211, + "kl": 0.16748046875, + "learning_rate": 9.177439057064683e-05, + "loss": -0.06071458384394646, + "memory(GiB)": 182.91, + "step": 16, + "train_speed(iter/s)": 0.000431 + }, + { + "clip_ratio": 2.4864069928298704e-05, + "completion_length": 10822.3515625, + "epoch": 4.2105263157894735, + "grad_norm": 0.008352754637598991, + "kl": 0.1787109375, + "learning_rate": 9.01061596377522e-05, + "loss": -0.04504441097378731, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.5625, + "reward": 0.027318883687257767, + "reward_std": 0.10441224090754986, + "rewards/CosineReward": 0.027319116634316742, + "rewards/RepetitionPenalty": -2.338138500590503e-07, + "step": 17, + "train_speed(iter/s)": 0.00041 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 0.005998397711664438, + "learning_rate": 8.83022221559489e-05, + "loss": -0.045487549155950546, + "memory(GiB)": 182.91, + "step": 18, + "train_speed(iter/s)": 0.000432 + }, + { + "epoch": 4.421052631578947, + "eval_clip_ratio": 2.286707422172185e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.18359375, + "eval_loss": -0.38219889998435974, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.03729328140616417, + "eval_reward_std": 0.10691346973180771, + "eval_rewards/CosineReward": 0.03729327768087387, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1041.231, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 18 + } + ], + "logging_steps": 1, + "max_steps": 60, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 2, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-18/training_args.bin b/checkpoint-18/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a5c75ad76398d7f403ccc1a74c463d2dab6465ca --- /dev/null +++ b/checkpoint-18/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb +size 9809 diff --git a/checkpoint-18/zero_to_fp32.py b/checkpoint-18/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-18/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-2/README.md b/checkpoint-2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4dbd19e3cdbcefaa269791fa922afe075b806c8 --- /dev/null +++ b/checkpoint-2/README.md @@ -0,0 +1,202 @@ +--- +base_model: /mnt/nvme5n1p1/distill-14b-rl-70 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-2/adapter_config.json b/checkpoint-2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8d06e83be137d4af154849ed1686625c42e280 --- /dev/null +++ b/checkpoint-2/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/mnt/nvme5n1p1/distill-14b-rl-70", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2/adapter_model.safetensors b/checkpoint-2/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8d4e0effab7218a2bda7ecd55f9cc44d50a3269c --- /dev/null +++ b/checkpoint-2/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eed44c3a66f88f5f127c1601d0dd7f0239b848e97b8d2e37085bcadd59a2204 +size 275342392 diff --git a/checkpoint-2/additional_config.json b/checkpoint-2/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/checkpoint-2/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/checkpoint-2/args.json b/checkpoint-2/args.json new file mode 100644 index 0000000000000000000000000000000000000000..e5fddc18fe4219cc621c4285a0faf4591e553d74 --- /dev/null +++ b/checkpoint-2/args.json @@ -0,0 +1,416 @@ +{ + "model": "/mnt/nvme5n1p1/distill-14b-rl-70", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "template": "deepseek_r1", + "system": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\\\boxed{}.", + "max_length": 16384, + "truncation_strategy": "left", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "stage2_aime.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 52, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 1.0, + "top_k": 50, + "top_p": 0.9, + "repetition_penalty": 1.1, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": true, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 15.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.1, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 1, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 2.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": true, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 6.0, + "dataloader_num_workers": 52, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "reward", + "greater_is_better": true, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "wandb" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 32, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "zero_hpz_partition_size": null, + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "num_infer_workers": 8, + "vllm_max_num_seqs": 256, + "vllm_enforce_eager": false, + "vllm_limit_mm_per_prompt": null, + "vllm_enable_prefix_caching": true, + "cosine_min_len_value_wrong": 0.0, + "cosine_max_len_value_wrong": -0.2, + "cosine_min_len_value_correct": 0.8, + "cosine_max_len_value_correct": 0.4, + "cosine_max_len": 12288, + "repetition_n_grams": 40, + "repetition_max_penalty": -0.05, + "use_lmdeploy": false, + "lmdeploy_device": "auto", + "lmdeploy_session_len": null, + "lmdeploy_cache_max_entry_count": 0.8, + "async_generate": false, + "tensor_parallel_size": 1, + "sleep_level": 0, + "move_model_batches": null, + "offload_optimizer": false, + "offload_model": false, + "gc_collect_after_offload": false, + "num_generations": 8, + "max_completion_length": 12288, + "ds3_gather_for_generation": true, + "reward_funcs": [ + "cosine", + "repetition" + ], + "reward_weights": null, + "log_completions": true, + "use_vllm": false, + "vllm_device": [ + "auto" + ], + "vllm_gpu_memory_utilization": 0.9, + "vllm_max_model_len": null, + "num_iterations": 2, + "epsilon": 0.2, + "rlhf_type": "grpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.04, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 32, + "local_world_size": 8, + "model_suffix": "distill-14b-rl-70", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/mnt/nvme5n1p1/distill-14b-rl-70', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "hub": "", + "training_args": "GRPOConfig(output_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=15.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs', logging_strategy=, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=, save_steps=2, save_total_limit=100, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=6, dataloader_num_workers=52, dataloader_prefetch_factor=None, past_index=-1, run_name='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='reward', greater_is_better=True, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, max_prompt_length=512, num_generations=8, max_completion_length=12288, ds3_gather_for_generation=True, temperature=1.0, top_p=0.9, top_k=50, min_p=None, repetition_penalty=1.1, cache_implementation=None, use_vllm=False, vllm_server_host='0.0.0.0', vllm_server_port=8000, vllm_server_timeout=120.0, vllm_guided_decoding_regex=None, beta=0.04, num_iterations=2, epsilon=0.2, epsilon_high=None, reward_weights=None, scale_rewards=True, sync_ref_model=False, ref_model_mixup_alpha=0.6, ref_model_sync_steps=512, log_completions=True, vllm_device=['auto'], vllm_gpu_memory_utilization=0.9, vllm_dtype=None, vllm_max_model_len=None, vllm_enable_prefix_caching=True, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None, num_infer_workers=8, vllm_max_num_seqs=256, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, cosine_min_len_value_wrong=0.0, cosine_max_len_value_wrong=-0.2, cosine_min_len_value_correct=0.8, cosine_max_len_value_correct=0.4, cosine_max_len=12288, repetition_n_grams=40, repetition_max_penalty=-0.05, use_lmdeploy=False, lmdeploy_device='auto', lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, async_generate=False, tensor_parallel_size=1, sleep_level=0, move_model_batches=None, offload_optimizer=False, offload_model=False, gc_collect_after_offload=False, stop_words=[])" +} \ No newline at end of file diff --git a/checkpoint-2/global_step2/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-2/global_step2/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e87fe032a3b10c6635b985a68bf964707338188f --- /dev/null +++ b/checkpoint-2/global_step2/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93872fbeaa8c37749da2844dc5848248f175433839e9a7636e8fb9211a7e7edc +size 51616517 diff --git a/checkpoint-2/global_step2/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-2/global_step2/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b46bea3ab2bab1630da71a0dd355af5aaf34a5e2 --- /dev/null +++ b/checkpoint-2/global_step2/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e78c33e31f360e39a9f29325e660f9b8c489e1c9586e3f7cc1beaf95c53245ae +size 51616005 diff --git a/checkpoint-2/global_step2/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-2/global_step2/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9a1adebeaedecd38e2a01434ddf0a0667f97435 --- /dev/null +++ b/checkpoint-2/global_step2/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e67f18b06689c2b58285538e103d2ecc44c044f61f9c4d0cedeb63af257c0972 +size 51616517 diff --git a/checkpoint-2/global_step2/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-2/global_step2/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a19451611af0c35d7e5cf47217cd32111f31d432 --- /dev/null +++ b/checkpoint-2/global_step2/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29cb3b911edfd54413bd87b8e440980891b5e6e3edd58cd30a5f754a713d712c +size 51616005 diff --git a/checkpoint-2/global_step2/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-2/global_step2/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fef32b4a9d31c2867668952298e28076be31d7b2 --- /dev/null +++ b/checkpoint-2/global_step2/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:479a6a235471a10cf947500101641f0eebeb3b4af28464cd80bf4d304a1ec992 +size 51616517 diff --git a/checkpoint-2/global_step2/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-2/global_step2/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e6025f26830522def3a46328f690c8bd8e19588 --- /dev/null +++ b/checkpoint-2/global_step2/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f751bc2c62cbd00177e776ad20be8e942e5fe1f7f23ea502a09cebb09e3654d6 +size 51616005 diff --git a/checkpoint-2/global_step2/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-2/global_step2/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a36e2e5bd83333d4f9db476f557cecf955a899d --- /dev/null +++ b/checkpoint-2/global_step2/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6db5237c9c55d4ca7957d529f293190863061ab48fb1ed3af4d78563db6afe42 +size 51616517 diff --git a/checkpoint-2/global_step2/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-2/global_step2/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7f2c22aa20917eb7af9d08b855b4ff4fba1c886 --- /dev/null +++ b/checkpoint-2/global_step2/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e5455637b580c0ed633e31aca01a18621461e8667287333a2e1b90be74a3c2d +size 51616005 diff --git a/checkpoint-2/global_step2/mp_rank_00_model_states.pt b/checkpoint-2/global_step2/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..402b06ed2d5266a3afe4b3a916fa8b242b312db8 --- /dev/null +++ b/checkpoint-2/global_step2/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d57d58631987ae88487fc533bd65f0106745185c3f1f00b64f01f1d33f45e103 +size 275768601 diff --git a/checkpoint-2/latest b/checkpoint-2/latest new file mode 100644 index 0000000000000000000000000000000000000000..1cede9788299a52cffe568b9411b318cc8cdf096 --- /dev/null +++ b/checkpoint-2/latest @@ -0,0 +1 @@ +global_step2 \ No newline at end of file diff --git a/checkpoint-2/rng_state_0.pth b/checkpoint-2/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..f2b3ec8b03fe7185595c681c3267b0738331d1b1 --- /dev/null +++ b/checkpoint-2/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc3054dbe464c2e42e555704c2825a6a59fdfff55c5d1f114f9c3af6e49dadb1 +size 16389 diff --git a/checkpoint-2/rng_state_1.pth b/checkpoint-2/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..933080a7fafb8301d298b9e60b5d7b42e2014b87 --- /dev/null +++ b/checkpoint-2/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efcb376d4daa201a093cc388f01e9f595d8fbcf05cad0d7222f92a478b1ed8fc +size 16389 diff --git a/checkpoint-2/rng_state_2.pth b/checkpoint-2/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..4061cef15327bb45c53653e8174c35021c98948c --- /dev/null +++ b/checkpoint-2/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c87adcbf4a23b8788c36e454be93e553f5fb097f526292cb465cd333bb1072d +size 16389 diff --git a/checkpoint-2/rng_state_3.pth b/checkpoint-2/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..651c0824a8185fbb69afca24a4ea0985f4d0863e --- /dev/null +++ b/checkpoint-2/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90678c0a035b30e922e78b4a0f9fd0573cd9909f35ea9708dc1421f2e2ce3be5 +size 16389 diff --git a/checkpoint-2/rng_state_4.pth b/checkpoint-2/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..3be443720f2c94db28eef535a11696849cd2edaa --- /dev/null +++ b/checkpoint-2/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:829732be3cea68ec5357183bd1107fc4772530efcf982689e1e8f6dd40598d31 +size 16389 diff --git a/checkpoint-2/rng_state_5.pth b/checkpoint-2/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..8d5861cfb5b88954666e84d3850e9d844b2d689c --- /dev/null +++ b/checkpoint-2/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07a0a493a8551d65ebb5acbbbd58dc4106ff40df9abd2dbd6c75e8239cb550ca +size 16325 diff --git a/checkpoint-2/rng_state_6.pth b/checkpoint-2/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..5d50352751661a51e2fa3ec8e67e07a9e3d9929e --- /dev/null +++ b/checkpoint-2/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c753e6faa07d14a3f666cf1ff71ff6729b9eb78313511aca429c386619a2449d +size 16389 diff --git a/checkpoint-2/rng_state_7.pth b/checkpoint-2/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..18583a304a1646b6b4b726b540cc3d4554f5a067 --- /dev/null +++ b/checkpoint-2/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5cf8b459ad9af17f640b30d9b964d6bfd30427de86b9fa1a08a63b694f79182 +size 16453 diff --git a/checkpoint-2/scheduler.pt b/checkpoint-2/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8445fec337f672b1c1878769ec6990503ea75789 --- /dev/null +++ b/checkpoint-2/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44e1515cb5e8a202a6371ef4fac73ed5442126f42cba213356b38d88ebb07420 +size 1401 diff --git a/checkpoint-2/trainer_state.json b/checkpoint-2/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..abb60dd05407f920c1cb0c23f346741d8e8e4205 --- /dev/null +++ b/checkpoint-2/trainer_state.json @@ -0,0 +1,61 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.42105263157894735, + "eval_steps": 6, + "global_step": 2, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 10352.974609375, + "epoch": 0.21052631578947367, + "grad_norm": 0.13259537518024445, + "kl": 0.0, + "learning_rate": 1.6666666666666667e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.11328125, + "reward": -0.002658387296833098, + "reward_std": 0.06134121119976044, + "rewards/CosineReward": -0.0026579967816360295, + "rewards/RepetitionPenalty": -3.8975886695880035e-07, + "step": 1, + "train_speed(iter/s)": 0.000242 + }, + { + "clip_ratio": 0.0, + "epoch": 0.42105263157894735, + "grad_norm": 0.1320001482963562, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "step": 2, + "train_speed(iter/s)": 0.000467 + } + ], + "logging_steps": 1, + "max_steps": 60, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 2, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2/training_args.bin b/checkpoint-2/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a5c75ad76398d7f403ccc1a74c463d2dab6465ca --- /dev/null +++ b/checkpoint-2/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb +size 9809 diff --git a/checkpoint-2/zero_to_fp32.py b/checkpoint-2/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-2/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-20/README.md b/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4dbd19e3cdbcefaa269791fa922afe075b806c8 --- /dev/null +++ b/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /mnt/nvme5n1p1/distill-14b-rl-70 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-20/adapter_config.json b/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8d06e83be137d4af154849ed1686625c42e280 --- /dev/null +++ b/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/mnt/nvme5n1p1/distill-14b-rl-70", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-20/adapter_model.safetensors b/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eead267da95ae39fa5e5b44ff697f6cb09220ded --- /dev/null +++ b/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5b6e8d5ca00d04b6b7ff211718910353830814523501223167167779641e607 +size 275342392 diff --git a/checkpoint-20/additional_config.json b/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/checkpoint-20/args.json b/checkpoint-20/args.json new file mode 100644 index 0000000000000000000000000000000000000000..e5fddc18fe4219cc621c4285a0faf4591e553d74 --- /dev/null +++ b/checkpoint-20/args.json @@ -0,0 +1,416 @@ +{ + "model": "/mnt/nvme5n1p1/distill-14b-rl-70", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "template": "deepseek_r1", + "system": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\\\boxed{}.", + "max_length": 16384, + "truncation_strategy": "left", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "stage2_aime.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 52, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 1.0, + "top_k": 50, + "top_p": 0.9, + "repetition_penalty": 1.1, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": true, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 15.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.1, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 1, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 2.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": true, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 6.0, + "dataloader_num_workers": 52, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "reward", + "greater_is_better": true, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "wandb" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 32, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "zero_hpz_partition_size": null, + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "num_infer_workers": 8, + "vllm_max_num_seqs": 256, + "vllm_enforce_eager": false, + "vllm_limit_mm_per_prompt": null, + "vllm_enable_prefix_caching": true, + "cosine_min_len_value_wrong": 0.0, + "cosine_max_len_value_wrong": -0.2, + "cosine_min_len_value_correct": 0.8, + "cosine_max_len_value_correct": 0.4, + "cosine_max_len": 12288, + "repetition_n_grams": 40, + "repetition_max_penalty": -0.05, + "use_lmdeploy": false, + "lmdeploy_device": "auto", + "lmdeploy_session_len": null, + "lmdeploy_cache_max_entry_count": 0.8, + "async_generate": false, + "tensor_parallel_size": 1, + "sleep_level": 0, + "move_model_batches": null, + "offload_optimizer": false, + "offload_model": false, + "gc_collect_after_offload": false, + "num_generations": 8, + "max_completion_length": 12288, + "ds3_gather_for_generation": true, + "reward_funcs": [ + "cosine", + "repetition" + ], + "reward_weights": null, + "log_completions": true, + "use_vllm": false, + "vllm_device": [ + "auto" + ], + "vllm_gpu_memory_utilization": 0.9, + "vllm_max_model_len": null, + "num_iterations": 2, + "epsilon": 0.2, + "rlhf_type": "grpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.04, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 32, + "local_world_size": 8, + "model_suffix": "distill-14b-rl-70", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/mnt/nvme5n1p1/distill-14b-rl-70', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "hub": "", + "training_args": "GRPOConfig(output_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=15.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs', logging_strategy=, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=, save_steps=2, save_total_limit=100, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=6, dataloader_num_workers=52, dataloader_prefetch_factor=None, past_index=-1, run_name='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='reward', greater_is_better=True, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, max_prompt_length=512, num_generations=8, max_completion_length=12288, ds3_gather_for_generation=True, temperature=1.0, top_p=0.9, top_k=50, min_p=None, repetition_penalty=1.1, cache_implementation=None, use_vllm=False, vllm_server_host='0.0.0.0', vllm_server_port=8000, vllm_server_timeout=120.0, vllm_guided_decoding_regex=None, beta=0.04, num_iterations=2, epsilon=0.2, epsilon_high=None, reward_weights=None, scale_rewards=True, sync_ref_model=False, ref_model_mixup_alpha=0.6, ref_model_sync_steps=512, log_completions=True, vllm_device=['auto'], vllm_gpu_memory_utilization=0.9, vllm_dtype=None, vllm_max_model_len=None, vllm_enable_prefix_caching=True, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None, num_infer_workers=8, vllm_max_num_seqs=256, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, cosine_min_len_value_wrong=0.0, cosine_max_len_value_wrong=-0.2, cosine_min_len_value_correct=0.8, cosine_max_len_value_correct=0.4, cosine_max_len=12288, repetition_n_grams=40, repetition_max_penalty=-0.05, use_lmdeploy=False, lmdeploy_device='auto', lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, async_generate=False, tensor_parallel_size=1, sleep_level=0, move_model_batches=None, offload_optimizer=False, offload_model=False, gc_collect_after_offload=False, stop_words=[])" +} \ No newline at end of file diff --git a/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3cdbd6eb8c0d8601869a18bff4f722ae258aa43e --- /dev/null +++ b/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3d06712a386feee7272265ac077ffa146337cbe39bde41aa79c3db405ec40b3 +size 51616517 diff --git a/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c62cfaa85a87a2daf515daf3a99b8b45ce3bd7e --- /dev/null +++ b/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2a023fa77b056e8fb9c4fc363f6ebba8f5c07145572acf085eb5d2cf850f74a +size 51616005 diff --git a/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..66ba8aa1fcc1cefc25b5f0ff877271eb68833e44 --- /dev/null +++ b/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:527c10fdb847eb89307b592c36087f278a46991972c8a7cbb8e42eb62da9a299 +size 51616517 diff --git a/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8751f65412b5b1963812f9e3d3df0ab4bae390f --- /dev/null +++ b/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3359bf4311bdd01ac5967d402f8ae84eafff999a88b3019e48e4616d3b2ecf57 +size 51616005 diff --git a/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..37892dac0242048e0c7a391ad51ff5eff4dfc603 --- /dev/null +++ b/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb04f097e6278a0911d566a1a7440ff0649b79c9211b64b52b8affe1841dd5c8 +size 51616517 diff --git a/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf002233c1e3141d7a14da48980017d594ed112e --- /dev/null +++ b/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5674972c9e6e749335ce01865ad92a220d4f6b49e4ca1b52c394a1ae2b7291ca +size 51616005 diff --git a/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cd63ee6357b6b4dec4623b41583a29764a60c25 --- /dev/null +++ b/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b929baba57c41eddfdc80be52f42099769e54bd50a62583472105898a8af47e6 +size 51616517 diff --git a/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b119ac9488a278b2b90e40ac127edb3ef0981329 --- /dev/null +++ b/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcf117925147df180f0e3437a26b08483553d20821d7c5dd0e216012e5bf7adb +size 51616005 diff --git a/checkpoint-20/global_step20/mp_rank_00_model_states.pt b/checkpoint-20/global_step20/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2797874e7f41f8604ef932a9ee49f68bc2e472e8 --- /dev/null +++ b/checkpoint-20/global_step20/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b40e00e755217c2b63a83be5809d9f2370b6030fedcabbd074b30af1cb35c7b +size 275768601 diff --git a/checkpoint-20/latest b/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/checkpoint-20/rng_state_0.pth b/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..58c8f712193d2e80a8fe5dac91dcb92d1bd38a78 --- /dev/null +++ b/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19ba7c9fc0b06c75f823d89d14c254b8b1821e183ef9a39276dd56b4a240757f +size 16389 diff --git a/checkpoint-20/rng_state_1.pth b/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..66b4d4e00066118eae64cf65d08a9eb1e0f2b814 --- /dev/null +++ b/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:553da4510b8d98b1b815e7befe5806ba5b3176bfe58a4a900a8e897ae509c701 +size 16389 diff --git a/checkpoint-20/rng_state_2.pth b/checkpoint-20/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..27db780074b9c07dd0f79f51f2fc5f2e1a83179c --- /dev/null +++ b/checkpoint-20/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5734484fcb5b64478a57f4faced248e3542370cb3631e17a559dadf66e0b34fc +size 16389 diff --git a/checkpoint-20/rng_state_3.pth b/checkpoint-20/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..94b120c49f13d520958db6a1145781a462d9d6fd --- /dev/null +++ b/checkpoint-20/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eee69b2e09ead4cc1d245e31f5bdae0ec19b66eb0f77750ac43b38436b9c0cd +size 16389 diff --git a/checkpoint-20/rng_state_4.pth b/checkpoint-20/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..4a2cc1e4e9d3eb893e6dee417e6c9cee8537e3fa --- /dev/null +++ b/checkpoint-20/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4169ff394580b47d7a997bcf320b406fee100598097050ffb16defb1ab8b00f +size 16389 diff --git a/checkpoint-20/rng_state_5.pth b/checkpoint-20/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..9f660679c2f64ed6cbaa10fb5732bb570862a661 --- /dev/null +++ b/checkpoint-20/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48a8dc5038ef4ba785b29551e1e5367dffc8e759018827dc38bdf2680158a662 +size 16325 diff --git a/checkpoint-20/rng_state_6.pth b/checkpoint-20/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..706360d47c68770b2bcd8212f9c1906b51c50071 --- /dev/null +++ b/checkpoint-20/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3815bd1e8978edf50b8753c544cfc1b20a3d4f2419333e397e29205a94f94f7c +size 16389 diff --git a/checkpoint-20/rng_state_7.pth b/checkpoint-20/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..42b62e1201bee1ff035043c0dcd912f37fb88e52 --- /dev/null +++ b/checkpoint-20/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b6c1bce46fc90f4949376ac76f2be2aeb8c7e76348abca9ae4e787dfdfa3398 +size 16453 diff --git a/checkpoint-20/scheduler.pt b/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f757b456beec1db76f919965dffa83ddf023fc43 --- /dev/null +++ b/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d099ada0984e22e98ca446260a7482217056c25a85b6cd5ed342e3cf5921d84 +size 1401 diff --git a/checkpoint-20/trainer_state.json b/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..939e242f4ac78f4a828aaf48a8eafdabdd7268dd --- /dev/null +++ b/checkpoint-20/trainer_state.json @@ -0,0 +1,355 @@ +{ + "best_metric": 0.03729328140616417, + "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-18", + "epoch": 4.842105263157895, + "eval_steps": 6, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 10352.974609375, + "epoch": 0.21052631578947367, + "grad_norm": 0.13259537518024445, + "kl": 0.0, + "learning_rate": 1.6666666666666667e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.11328125, + "reward": -0.002658387296833098, + "reward_std": 0.06134121119976044, + "rewards/CosineReward": -0.0026579967816360295, + "rewards/RepetitionPenalty": -3.8975886695880035e-07, + "step": 1, + "train_speed(iter/s)": 0.000242 + }, + { + "clip_ratio": 0.0, + "epoch": 0.42105263157894735, + "grad_norm": 0.1320001482963562, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "step": 2, + "train_speed(iter/s)": 0.000467 + }, + { + "clip_ratio": 1.3441811461234465e-05, + "completion_length": 10439.369140625, + "epoch": 0.631578947368421, + "grad_norm": 0.08990391343832016, + "kl": 9.50181856751442e-07, + "learning_rate": 5e-05, + "loss": -0.06604708731174469, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.13671875, + "reward": 0.0006296975770965219, + "reward_std": 0.07172460854053497, + "rewards/CosineReward": 0.0006298604130279273, + "rewards/RepetitionPenalty": -1.6200439745261974e-07, + "step": 3, + "train_speed(iter/s)": 0.00035 + }, + { + "clip_ratio": 1.70210253145342e-05, + "epoch": 0.8421052631578947, + "grad_norm": 0.0967094898223877, + "kl": 1.1101365089416504e-05, + "learning_rate": 6.666666666666667e-05, + "loss": -0.06727766245603561, + "memory(GiB)": 182.91, + "step": 4, + "train_speed(iter/s)": 0.000458 + }, + { + "clip_ratio": 1.675608473306056e-05, + "completion_length": 10092.408203125, + "epoch": 1.2105263157894737, + "grad_norm": 0.142837256193161, + "kl": 0.00017762184143066406, + "learning_rate": 8.333333333333334e-05, + "loss": -0.09315311908721924, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.119140625, + "reward": -0.005135859013535082, + "reward_std": 0.07994875870645046, + "rewards/CosineReward": -0.005134060338605195, + "rewards/RepetitionPenalty": -1.7973881654143042e-06, + "step": 5, + "train_speed(iter/s)": 0.000387 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.18263348937034607, + "learning_rate": 0.0001, + "loss": -0.1041698157787323, + "memory(GiB)": 182.91, + "step": 6, + "train_speed(iter/s)": 0.000459 + }, + { + "epoch": 1.4210526315789473, + "eval_clip_ratio": 4.069424539920874e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.04833984375, + "eval_loss": -0.5377416610717773, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.012996690347790718, + "eval_reward_std": 0.08769983053207397, + "eval_rewards/CosineReward": 0.012996694073081017, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1030.1127, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 6 + }, + { + "clip_ratio": 0.0005237623976199757, + "completion_length": 10448.94921875, + "epoch": 1.631578947368421, + "grad_norm": 0.1291271299123764, + "kl": 0.017406463623046875, + "learning_rate": 9.991540791356342e-05, + "loss": -0.051375165581703186, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.1484375, + "reward": 0.004909618757665157, + "reward_std": 0.08167182095348835, + "rewards/CosineReward": 0.004909833543933928, + "rewards/RepetitionPenalty": -2.1478646772266075e-07, + "step": 7, + "train_speed(iter/s)": 0.000382 + }, + { + "clip_ratio": 0.1706484742462635, + "epoch": 1.8421052631578947, + "grad_norm": 0.26641014218330383, + "kl": 0.089599609375, + "learning_rate": 9.966191788709716e-05, + "loss": -0.05105742812156677, + "memory(GiB)": 182.91, + "step": 8, + "train_speed(iter/s)": 0.000433 + }, + { + "clip_ratio": 9.482144946559856e-06, + "completion_length": 10432.384765625, + "epoch": 2.2105263157894735, + "grad_norm": 0.10375155508518219, + "kl": 0.0963134765625, + "learning_rate": 9.924038765061042e-05, + "loss": -0.05842069163918495, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.255859375, + "reward": 0.03643610421568155, + "reward_std": 0.11898956261575222, + "rewards/CosineReward": 0.03643618477508426, + "rewards/RepetitionPenalty": -7.898860587829404e-08, + "step": 9, + "train_speed(iter/s)": 0.000396 + }, + { + "clip_ratio": 0.0036088433116674423, + "epoch": 2.4210526315789473, + "grad_norm": 0.09477333724498749, + "kl": 0.1185302734375, + "learning_rate": 9.865224352899119e-05, + "loss": -0.06491819024085999, + "memory(GiB)": 182.91, + "step": 10, + "train_speed(iter/s)": 0.000436 + }, + { + "clip_ratio": 1.2955343891007942e-05, + "completion_length": 10559.296875, + "epoch": 2.6315789473684212, + "grad_norm": 0.06739140301942825, + "kl": 0.1275634765625, + "learning_rate": 9.789947561577445e-05, + "loss": -0.04600231721997261, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.361328125, + "reward": 0.023204635945148766, + "reward_std": 0.10593634657561779, + "rewards/CosineReward": 0.02320496749598533, + "rewards/RepetitionPenalty": -3.3051759373847744e-07, + "step": 11, + "train_speed(iter/s)": 0.000405 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.05781339108943939, + "learning_rate": 9.698463103929542e-05, + "loss": -0.05069056898355484, + "memory(GiB)": 182.91, + "step": 12, + "train_speed(iter/s)": 0.000439 + }, + { + "epoch": 2.8421052631578947, + "eval_clip_ratio": 4.392032860778272e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.2275390625, + "eval_loss": 0.17524278163909912, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.03234308212995529, + "eval_reward_std": 0.10685288906097412, + "eval_rewards/CosineReward": 0.03234308212995529, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1025.9041, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 12 + }, + { + "clip_ratio": 0.0007908324183745208, + "completion_length": 10652.939453125, + "epoch": 3.2105263157894735, + "grad_norm": 0.01199417095631361, + "kl": 0.151123046875, + "learning_rate": 9.591080534401371e-05, + "loss": -0.02191038429737091, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.419921875, + "reward": 0.035983758978545666, + "reward_std": 0.11553369648754597, + "rewards/CosineReward": 0.03598417737521231, + "rewards/RepetitionPenalty": -4.176556771540163e-07, + "step": 13, + "train_speed(iter/s)": 0.000399 + }, + { + "clip_ratio": 0.0004821276670554653, + "epoch": 3.4210526315789473, + "grad_norm": 0.01075426209717989, + "kl": 0.169189453125, + "learning_rate": 9.468163201617062e-05, + "loss": -0.022672578692436218, + "memory(GiB)": 182.91, + "step": 14, + "train_speed(iter/s)": 0.000427 + }, + { + "clip_ratio": 1.9617403040683712e-05, + "completion_length": 10482.146484375, + "epoch": 3.6315789473684212, + "grad_norm": 0.01779361069202423, + "kl": 0.166748046875, + "learning_rate": 9.330127018922194e-05, + "loss": -0.059799157083034515, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.4765625, + "reward": 0.03584331553429365, + "reward_std": 0.11829411797225475, + "rewards/CosineReward": 0.03584346390562132, + "rewards/RepetitionPenalty": -1.4977952389472193e-07, + "step": 15, + "train_speed(iter/s)": 0.000406 + }, + { + "clip_ratio": 0.00011349086707923561, + "epoch": 3.8421052631578947, + "grad_norm": 0.013216385617852211, + "kl": 0.16748046875, + "learning_rate": 9.177439057064683e-05, + "loss": -0.06071458384394646, + "memory(GiB)": 182.91, + "step": 16, + "train_speed(iter/s)": 0.000431 + }, + { + "clip_ratio": 2.4864069928298704e-05, + "completion_length": 10822.3515625, + "epoch": 4.2105263157894735, + "grad_norm": 0.008352754637598991, + "kl": 0.1787109375, + "learning_rate": 9.01061596377522e-05, + "loss": -0.04504441097378731, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.5625, + "reward": 0.027318883687257767, + "reward_std": 0.10441224090754986, + "rewards/CosineReward": 0.027319116634316742, + "rewards/RepetitionPenalty": -2.338138500590503e-07, + "step": 17, + "train_speed(iter/s)": 0.00041 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 0.005998397711664438, + "learning_rate": 8.83022221559489e-05, + "loss": -0.045487549155950546, + "memory(GiB)": 182.91, + "step": 18, + "train_speed(iter/s)": 0.000432 + }, + { + "epoch": 4.421052631578947, + "eval_clip_ratio": 2.286707422172185e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.18359375, + "eval_loss": -0.38219889998435974, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.03729328140616417, + "eval_reward_std": 0.10691346973180771, + "eval_rewards/CosineReward": 0.03729327768087387, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1041.231, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 18 + }, + { + "clip_ratio": 6.176384295031312e-05, + "completion_length": 10454.50390625, + "epoch": 4.631578947368421, + "grad_norm": 0.007075619418174028, + "kl": 0.1820068359375, + "learning_rate": 8.636868207865244e-05, + "loss": -0.03466903418302536, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.466796875, + "reward": 0.04069916973821819, + "reward_std": 0.11991005763411522, + "rewards/CosineReward": 0.04070046404376626, + "rewards/RepetitionPenalty": -1.294118249006715e-06, + "step": 19, + "train_speed(iter/s)": 0.000404 + }, + { + "clip_ratio": 6.06911453360226e-05, + "epoch": 4.842105263157895, + "grad_norm": 0.005896567367017269, + "kl": 0.19287109375, + "learning_rate": 8.43120818934367e-05, + "loss": -0.03502114117145538, + "memory(GiB)": 182.91, + "step": 20, + "train_speed(iter/s)": 0.000424 + } + ], + "logging_steps": 1, + "max_steps": 60, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 2, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-20/training_args.bin b/checkpoint-20/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a5c75ad76398d7f403ccc1a74c463d2dab6465ca --- /dev/null +++ b/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb +size 9809 diff --git a/checkpoint-20/zero_to_fp32.py b/checkpoint-20/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-22/README.md b/checkpoint-22/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4dbd19e3cdbcefaa269791fa922afe075b806c8 --- /dev/null +++ b/checkpoint-22/README.md @@ -0,0 +1,202 @@ +--- +base_model: /mnt/nvme5n1p1/distill-14b-rl-70 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-22/adapter_config.json b/checkpoint-22/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8d06e83be137d4af154849ed1686625c42e280 --- /dev/null +++ b/checkpoint-22/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/mnt/nvme5n1p1/distill-14b-rl-70", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-22/adapter_model.safetensors b/checkpoint-22/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..832d19219009fa7f774e896bc2afb72f51bc51a7 --- /dev/null +++ b/checkpoint-22/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e7f2a2ad7ab4bcbd354808e1842a891328cbbaa3632de71b174700a5f02ba1b +size 275342392 diff --git a/checkpoint-22/additional_config.json b/checkpoint-22/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/checkpoint-22/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/checkpoint-22/args.json b/checkpoint-22/args.json new file mode 100644 index 0000000000000000000000000000000000000000..e5fddc18fe4219cc621c4285a0faf4591e553d74 --- /dev/null +++ b/checkpoint-22/args.json @@ -0,0 +1,416 @@ +{ + "model": "/mnt/nvme5n1p1/distill-14b-rl-70", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "template": "deepseek_r1", + "system": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\\\boxed{}.", + "max_length": 16384, + "truncation_strategy": "left", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "stage2_aime.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 52, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 1.0, + "top_k": 50, + "top_p": 0.9, + "repetition_penalty": 1.1, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": true, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 15.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.1, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 1, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 2.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": true, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 6.0, + "dataloader_num_workers": 52, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "reward", + "greater_is_better": true, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "wandb" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 32, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "zero_hpz_partition_size": null, + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "num_infer_workers": 8, + "vllm_max_num_seqs": 256, + "vllm_enforce_eager": false, + "vllm_limit_mm_per_prompt": null, + "vllm_enable_prefix_caching": true, + "cosine_min_len_value_wrong": 0.0, + "cosine_max_len_value_wrong": -0.2, + "cosine_min_len_value_correct": 0.8, + "cosine_max_len_value_correct": 0.4, + "cosine_max_len": 12288, + "repetition_n_grams": 40, + "repetition_max_penalty": -0.05, + "use_lmdeploy": false, + "lmdeploy_device": "auto", + "lmdeploy_session_len": null, + "lmdeploy_cache_max_entry_count": 0.8, + "async_generate": false, + "tensor_parallel_size": 1, + "sleep_level": 0, + "move_model_batches": null, + "offload_optimizer": false, + "offload_model": false, + "gc_collect_after_offload": false, + "num_generations": 8, + "max_completion_length": 12288, + "ds3_gather_for_generation": true, + "reward_funcs": [ + "cosine", + "repetition" + ], + "reward_weights": null, + "log_completions": true, + "use_vllm": false, + "vllm_device": [ + "auto" + ], + "vllm_gpu_memory_utilization": 0.9, + "vllm_max_model_len": null, + "num_iterations": 2, + "epsilon": 0.2, + "rlhf_type": "grpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.04, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 32, + "local_world_size": 8, + "model_suffix": "distill-14b-rl-70", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/mnt/nvme5n1p1/distill-14b-rl-70', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "hub": "", + "training_args": "GRPOConfig(output_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=15.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs', logging_strategy=, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=, save_steps=2, save_total_limit=100, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=6, dataloader_num_workers=52, dataloader_prefetch_factor=None, past_index=-1, run_name='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='reward', greater_is_better=True, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, max_prompt_length=512, num_generations=8, max_completion_length=12288, ds3_gather_for_generation=True, temperature=1.0, top_p=0.9, top_k=50, min_p=None, repetition_penalty=1.1, cache_implementation=None, use_vllm=False, vllm_server_host='0.0.0.0', vllm_server_port=8000, vllm_server_timeout=120.0, vllm_guided_decoding_regex=None, beta=0.04, num_iterations=2, epsilon=0.2, epsilon_high=None, reward_weights=None, scale_rewards=True, sync_ref_model=False, ref_model_mixup_alpha=0.6, ref_model_sync_steps=512, log_completions=True, vllm_device=['auto'], vllm_gpu_memory_utilization=0.9, vllm_dtype=None, vllm_max_model_len=None, vllm_enable_prefix_caching=True, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None, num_infer_workers=8, vllm_max_num_seqs=256, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, cosine_min_len_value_wrong=0.0, cosine_max_len_value_wrong=-0.2, cosine_min_len_value_correct=0.8, cosine_max_len_value_correct=0.4, cosine_max_len=12288, repetition_n_grams=40, repetition_max_penalty=-0.05, use_lmdeploy=False, lmdeploy_device='auto', lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, async_generate=False, tensor_parallel_size=1, sleep_level=0, move_model_batches=None, offload_optimizer=False, offload_model=False, gc_collect_after_offload=False, stop_words=[])" +} \ No newline at end of file diff --git a/checkpoint-22/global_step22/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-22/global_step22/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81cb0379608998a43c279ffaacaeea267d8f9d38 --- /dev/null +++ b/checkpoint-22/global_step22/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:404d224ad79f0e9c30dc56a3e52d72c84b2ddf9b09126811da719c07d9d73f76 +size 51616517 diff --git a/checkpoint-22/global_step22/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-22/global_step22/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b2aaf031c3da31aa8d15cc711ee5eba98bd96b7 --- /dev/null +++ b/checkpoint-22/global_step22/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:070ed21950e189c3d86e2e1a3bda73f3506f5ffbecc8914812629604d71445c9 +size 51616005 diff --git a/checkpoint-22/global_step22/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-22/global_step22/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdfe072a12b6f6edd763010b11a7e3b93f6250be --- /dev/null +++ b/checkpoint-22/global_step22/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d47cbd0bc7da7ce1031d606eaee0d5bfa26f93f2ef3b1bc84fc3df2c0274ee24 +size 51616517 diff --git a/checkpoint-22/global_step22/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-22/global_step22/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc040945763f0ed74db342ae0c0a52782afebb47 --- /dev/null +++ b/checkpoint-22/global_step22/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5de7c93f8160f016ae691ddcf6a627f1fdb74a6afef4f8ac0113e53e5ce5c98 +size 51616005 diff --git a/checkpoint-22/global_step22/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-22/global_step22/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc2023ab500b037c79e308838e8d38e7b122907b --- /dev/null +++ b/checkpoint-22/global_step22/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b3d324da7ece665322dc39b23a979a03b50e9f8538ad96c1fe29772dbb15232 +size 51616517 diff --git a/checkpoint-22/global_step22/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-22/global_step22/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b60ca78f960a8e8302eb70c07384d54502bdb9f7 --- /dev/null +++ b/checkpoint-22/global_step22/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed4bd5c3dbaa54d1a27bb5e70f1078e47cd0855a5b1d0a46ea9526a8ae286ddf +size 51616005 diff --git a/checkpoint-22/global_step22/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-22/global_step22/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bde6b6b8894bdb43dac1c66cb483424b87ef718c --- /dev/null +++ b/checkpoint-22/global_step22/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61c1aa167ca94be13811a3937d60235313d3aecb94019d99336b1372c2c36ed0 +size 51616517 diff --git a/checkpoint-22/global_step22/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-22/global_step22/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a7f3a6c5a142d5744f0ce6d615b1e2137a9bd5e --- /dev/null +++ b/checkpoint-22/global_step22/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60e47748ea33e48772b30ddc13ec6df1e642317da85096c2d59f97a38e5d1dde +size 51616005 diff --git a/checkpoint-22/global_step22/mp_rank_00_model_states.pt b/checkpoint-22/global_step22/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09c580522c84ea0940ad6c2033e4eae76e211377 --- /dev/null +++ b/checkpoint-22/global_step22/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7baec0b4a85ad07a31e6649ac5b10948533c345f3ad7d368fbbf1d9817629821 +size 275768601 diff --git a/checkpoint-22/latest b/checkpoint-22/latest new file mode 100644 index 0000000000000000000000000000000000000000..29b43a0a9f5b85f01b0a6f91fea39fd52e78071c --- /dev/null +++ b/checkpoint-22/latest @@ -0,0 +1 @@ +global_step22 \ No newline at end of file diff --git a/checkpoint-22/rng_state_0.pth b/checkpoint-22/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b0dd2f1a1ec3dfeacf4cfccee32dd8175bd04a1d --- /dev/null +++ b/checkpoint-22/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53a5b9f587de53ec3f368b62b00cea1af5f4c7cf57ffc85dadcba398532226ca +size 16389 diff --git a/checkpoint-22/rng_state_1.pth b/checkpoint-22/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f006671a7038997b86a8125e52969d70da1bc2b1 --- /dev/null +++ b/checkpoint-22/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:534274a76b56284045b56dbf715980f1a4eb9a4e515d7b55115078ea6b417b90 +size 16389 diff --git a/checkpoint-22/rng_state_2.pth b/checkpoint-22/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0eaf4714027c81b6717642768bf25045e783705 --- /dev/null +++ b/checkpoint-22/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43a0bcf2c2142327639256bb25dcccf44a6cda316e35187cc3737357f640d9e +size 16389 diff --git a/checkpoint-22/rng_state_3.pth b/checkpoint-22/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ca2ffef769091dc173362eca027f3c449a0e04ca --- /dev/null +++ b/checkpoint-22/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1f301d165042d3cab3f69d0874480bc55497fb6fdd74cb5b59dee6f359e65b6 +size 16389 diff --git a/checkpoint-22/rng_state_4.pth b/checkpoint-22/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..a52d99207d68b262b24c6f7947747c759089d564 --- /dev/null +++ b/checkpoint-22/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac48f49d4ea004e48e202f84ce1e679b32d5c4aff904920db75e32998bdd4850 +size 16389 diff --git a/checkpoint-22/rng_state_5.pth b/checkpoint-22/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..667e1c1ed8d05adb191a749978614e4df281d497 --- /dev/null +++ b/checkpoint-22/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddcd08bb1d6aa63c4ec2a8312bc6cf86d7ccc0e29cbe4ba68efddf728a5412f3 +size 16325 diff --git a/checkpoint-22/rng_state_6.pth b/checkpoint-22/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..281596bd7c0c0f7a224f0a0cdb573cc2d94b03ba --- /dev/null +++ b/checkpoint-22/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:987a19c4e59d43b35cbda3c18ee5349ff5e321ad4038fb497f54261481f5b2bd +size 16389 diff --git a/checkpoint-22/rng_state_7.pth b/checkpoint-22/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..07e543363dfa3e38326faad5409e3de07f1b9fbe --- /dev/null +++ b/checkpoint-22/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:723ac972516c9e67065ee3b717933ea99c5c747fbebf940357748d4f6cd2f5d9 +size 16453 diff --git a/checkpoint-22/scheduler.pt b/checkpoint-22/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0dca33b1a414343549eceeda3ca83058f62eec87 --- /dev/null +++ b/checkpoint-22/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5473ef829f05cd3969afbd7dfd3ac559dfb2dd3c4e232890fc1a8cfbe4ca982 +size 1401 diff --git a/checkpoint-22/trainer_state.json b/checkpoint-22/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f25a50fd2117664a679744f3e1068c255dd295e1 --- /dev/null +++ b/checkpoint-22/trainer_state.json @@ -0,0 +1,383 @@ +{ + "best_metric": 0.03729328140616417, + "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-18", + "epoch": 5.421052631578947, + "eval_steps": 6, + "global_step": 22, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 10352.974609375, + "epoch": 0.21052631578947367, + "grad_norm": 0.13259537518024445, + "kl": 0.0, + "learning_rate": 1.6666666666666667e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.11328125, + "reward": -0.002658387296833098, + "reward_std": 0.06134121119976044, + "rewards/CosineReward": -0.0026579967816360295, + "rewards/RepetitionPenalty": -3.8975886695880035e-07, + "step": 1, + "train_speed(iter/s)": 0.000242 + }, + { + "clip_ratio": 0.0, + "epoch": 0.42105263157894735, + "grad_norm": 0.1320001482963562, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "step": 2, + "train_speed(iter/s)": 0.000467 + }, + { + "clip_ratio": 1.3441811461234465e-05, + "completion_length": 10439.369140625, + "epoch": 0.631578947368421, + "grad_norm": 0.08990391343832016, + "kl": 9.50181856751442e-07, + "learning_rate": 5e-05, + "loss": -0.06604708731174469, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.13671875, + "reward": 0.0006296975770965219, + "reward_std": 0.07172460854053497, + "rewards/CosineReward": 0.0006298604130279273, + "rewards/RepetitionPenalty": -1.6200439745261974e-07, + "step": 3, + "train_speed(iter/s)": 0.00035 + }, + { + "clip_ratio": 1.70210253145342e-05, + "epoch": 0.8421052631578947, + "grad_norm": 0.0967094898223877, + "kl": 1.1101365089416504e-05, + "learning_rate": 6.666666666666667e-05, + "loss": -0.06727766245603561, + "memory(GiB)": 182.91, + "step": 4, + "train_speed(iter/s)": 0.000458 + }, + { + "clip_ratio": 1.675608473306056e-05, + "completion_length": 10092.408203125, + "epoch": 1.2105263157894737, + "grad_norm": 0.142837256193161, + "kl": 0.00017762184143066406, + "learning_rate": 8.333333333333334e-05, + "loss": -0.09315311908721924, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.119140625, + "reward": -0.005135859013535082, + "reward_std": 0.07994875870645046, + "rewards/CosineReward": -0.005134060338605195, + "rewards/RepetitionPenalty": -1.7973881654143042e-06, + "step": 5, + "train_speed(iter/s)": 0.000387 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.18263348937034607, + "learning_rate": 0.0001, + "loss": -0.1041698157787323, + "memory(GiB)": 182.91, + "step": 6, + "train_speed(iter/s)": 0.000459 + }, + { + "epoch": 1.4210526315789473, + "eval_clip_ratio": 4.069424539920874e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.04833984375, + "eval_loss": -0.5377416610717773, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.012996690347790718, + "eval_reward_std": 0.08769983053207397, + "eval_rewards/CosineReward": 0.012996694073081017, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1030.1127, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 6 + }, + { + "clip_ratio": 0.0005237623976199757, + "completion_length": 10448.94921875, + "epoch": 1.631578947368421, + "grad_norm": 0.1291271299123764, + "kl": 0.017406463623046875, + "learning_rate": 9.991540791356342e-05, + "loss": -0.051375165581703186, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.1484375, + "reward": 0.004909618757665157, + "reward_std": 0.08167182095348835, + "rewards/CosineReward": 0.004909833543933928, + "rewards/RepetitionPenalty": -2.1478646772266075e-07, + "step": 7, + "train_speed(iter/s)": 0.000382 + }, + { + "clip_ratio": 0.1706484742462635, + "epoch": 1.8421052631578947, + "grad_norm": 0.26641014218330383, + "kl": 0.089599609375, + "learning_rate": 9.966191788709716e-05, + "loss": -0.05105742812156677, + "memory(GiB)": 182.91, + "step": 8, + "train_speed(iter/s)": 0.000433 + }, + { + "clip_ratio": 9.482144946559856e-06, + "completion_length": 10432.384765625, + "epoch": 2.2105263157894735, + "grad_norm": 0.10375155508518219, + "kl": 0.0963134765625, + "learning_rate": 9.924038765061042e-05, + "loss": -0.05842069163918495, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.255859375, + "reward": 0.03643610421568155, + "reward_std": 0.11898956261575222, + "rewards/CosineReward": 0.03643618477508426, + "rewards/RepetitionPenalty": -7.898860587829404e-08, + "step": 9, + "train_speed(iter/s)": 0.000396 + }, + { + "clip_ratio": 0.0036088433116674423, + "epoch": 2.4210526315789473, + "grad_norm": 0.09477333724498749, + "kl": 0.1185302734375, + "learning_rate": 9.865224352899119e-05, + "loss": -0.06491819024085999, + "memory(GiB)": 182.91, + "step": 10, + "train_speed(iter/s)": 0.000436 + }, + { + "clip_ratio": 1.2955343891007942e-05, + "completion_length": 10559.296875, + "epoch": 2.6315789473684212, + "grad_norm": 0.06739140301942825, + "kl": 0.1275634765625, + "learning_rate": 9.789947561577445e-05, + "loss": -0.04600231721997261, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.361328125, + "reward": 0.023204635945148766, + "reward_std": 0.10593634657561779, + "rewards/CosineReward": 0.02320496749598533, + "rewards/RepetitionPenalty": -3.3051759373847744e-07, + "step": 11, + "train_speed(iter/s)": 0.000405 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.05781339108943939, + "learning_rate": 9.698463103929542e-05, + "loss": -0.05069056898355484, + "memory(GiB)": 182.91, + "step": 12, + "train_speed(iter/s)": 0.000439 + }, + { + "epoch": 2.8421052631578947, + "eval_clip_ratio": 4.392032860778272e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.2275390625, + "eval_loss": 0.17524278163909912, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.03234308212995529, + "eval_reward_std": 0.10685288906097412, + "eval_rewards/CosineReward": 0.03234308212995529, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1025.9041, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 12 + }, + { + "clip_ratio": 0.0007908324183745208, + "completion_length": 10652.939453125, + "epoch": 3.2105263157894735, + "grad_norm": 0.01199417095631361, + "kl": 0.151123046875, + "learning_rate": 9.591080534401371e-05, + "loss": -0.02191038429737091, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.419921875, + "reward": 0.035983758978545666, + "reward_std": 0.11553369648754597, + "rewards/CosineReward": 0.03598417737521231, + "rewards/RepetitionPenalty": -4.176556771540163e-07, + "step": 13, + "train_speed(iter/s)": 0.000399 + }, + { + "clip_ratio": 0.0004821276670554653, + "epoch": 3.4210526315789473, + "grad_norm": 0.01075426209717989, + "kl": 0.169189453125, + "learning_rate": 9.468163201617062e-05, + "loss": -0.022672578692436218, + "memory(GiB)": 182.91, + "step": 14, + "train_speed(iter/s)": 0.000427 + }, + { + "clip_ratio": 1.9617403040683712e-05, + "completion_length": 10482.146484375, + "epoch": 3.6315789473684212, + "grad_norm": 0.01779361069202423, + "kl": 0.166748046875, + "learning_rate": 9.330127018922194e-05, + "loss": -0.059799157083034515, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.4765625, + "reward": 0.03584331553429365, + "reward_std": 0.11829411797225475, + "rewards/CosineReward": 0.03584346390562132, + "rewards/RepetitionPenalty": -1.4977952389472193e-07, + "step": 15, + "train_speed(iter/s)": 0.000406 + }, + { + "clip_ratio": 0.00011349086707923561, + "epoch": 3.8421052631578947, + "grad_norm": 0.013216385617852211, + "kl": 0.16748046875, + "learning_rate": 9.177439057064683e-05, + "loss": -0.06071458384394646, + "memory(GiB)": 182.91, + "step": 16, + "train_speed(iter/s)": 0.000431 + }, + { + "clip_ratio": 2.4864069928298704e-05, + "completion_length": 10822.3515625, + "epoch": 4.2105263157894735, + "grad_norm": 0.008352754637598991, + "kl": 0.1787109375, + "learning_rate": 9.01061596377522e-05, + "loss": -0.04504441097378731, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.5625, + "reward": 0.027318883687257767, + "reward_std": 0.10441224090754986, + "rewards/CosineReward": 0.027319116634316742, + "rewards/RepetitionPenalty": -2.338138500590503e-07, + "step": 17, + "train_speed(iter/s)": 0.00041 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 0.005998397711664438, + "learning_rate": 8.83022221559489e-05, + "loss": -0.045487549155950546, + "memory(GiB)": 182.91, + "step": 18, + "train_speed(iter/s)": 0.000432 + }, + { + "epoch": 4.421052631578947, + "eval_clip_ratio": 2.286707422172185e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.18359375, + "eval_loss": -0.38219889998435974, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.03729328140616417, + "eval_reward_std": 0.10691346973180771, + "eval_rewards/CosineReward": 0.03729327768087387, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1041.231, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 18 + }, + { + "clip_ratio": 6.176384295031312e-05, + "completion_length": 10454.50390625, + "epoch": 4.631578947368421, + "grad_norm": 0.007075619418174028, + "kl": 0.1820068359375, + "learning_rate": 8.636868207865244e-05, + "loss": -0.03466903418302536, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.466796875, + "reward": 0.04069916973821819, + "reward_std": 0.11991005763411522, + "rewards/CosineReward": 0.04070046404376626, + "rewards/RepetitionPenalty": -1.294118249006715e-06, + "step": 19, + "train_speed(iter/s)": 0.000404 + }, + { + "clip_ratio": 6.06911453360226e-05, + "epoch": 4.842105263157895, + "grad_norm": 0.005896567367017269, + "kl": 0.19287109375, + "learning_rate": 8.43120818934367e-05, + "loss": -0.03502114117145538, + "memory(GiB)": 182.91, + "step": 20, + "train_speed(iter/s)": 0.000424 + }, + { + "clip_ratio": 3.8725801914551994e-05, + "completion_length": 10645.056640625, + "epoch": 5.2105263157894735, + "grad_norm": 0.004154536407440901, + "kl": 0.17626953125, + "learning_rate": 8.213938048432697e-05, + "loss": -0.008662773296236992, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.5625, + "reward": 0.04996980866417289, + "reward_std": 0.13849420100450516, + "rewards/CosineReward": 0.049969930201768875, + "rewards/RepetitionPenalty": -1.1864573679076784e-07, + "step": 21, + "train_speed(iter/s)": 0.000408 + }, + { + "clip_ratio": 5.869188044016482e-05, + "epoch": 5.421052631578947, + "grad_norm": 0.004300669766962528, + "kl": 0.178955078125, + "learning_rate": 7.985792958513931e-05, + "loss": -0.008743642829358578, + "memory(GiB)": 182.91, + "step": 22, + "train_speed(iter/s)": 0.000426 + } + ], + "logging_steps": 1, + "max_steps": 60, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 2, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-22/training_args.bin b/checkpoint-22/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a5c75ad76398d7f403ccc1a74c463d2dab6465ca --- /dev/null +++ b/checkpoint-22/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb +size 9809 diff --git a/checkpoint-22/zero_to_fp32.py b/checkpoint-22/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-22/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-24/README.md b/checkpoint-24/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4dbd19e3cdbcefaa269791fa922afe075b806c8 --- /dev/null +++ b/checkpoint-24/README.md @@ -0,0 +1,202 @@ +--- +base_model: /mnt/nvme5n1p1/distill-14b-rl-70 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-24/adapter_config.json b/checkpoint-24/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8d06e83be137d4af154849ed1686625c42e280 --- /dev/null +++ b/checkpoint-24/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/mnt/nvme5n1p1/distill-14b-rl-70", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-24/adapter_model.safetensors b/checkpoint-24/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec4e3aa1faa26fea7d7c37a2b3754e2d51f373aa --- /dev/null +++ b/checkpoint-24/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba7becfd769351365c953bca1094b2bc7bb4bd99fd8ea912d897451e5c1b6ee0 +size 275342392 diff --git a/checkpoint-24/additional_config.json b/checkpoint-24/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/checkpoint-24/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/checkpoint-24/args.json b/checkpoint-24/args.json new file mode 100644 index 0000000000000000000000000000000000000000..e5fddc18fe4219cc621c4285a0faf4591e553d74 --- /dev/null +++ b/checkpoint-24/args.json @@ -0,0 +1,416 @@ +{ + "model": "/mnt/nvme5n1p1/distill-14b-rl-70", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "template": "deepseek_r1", + "system": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\\\boxed{}.", + "max_length": 16384, + "truncation_strategy": "left", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "stage2_aime.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 52, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 1.0, + "top_k": 50, + "top_p": 0.9, + "repetition_penalty": 1.1, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": true, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 15.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.1, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 1, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 2.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": true, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 6.0, + "dataloader_num_workers": 52, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "reward", + "greater_is_better": true, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "wandb" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 32, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "zero_hpz_partition_size": null, + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "num_infer_workers": 8, + "vllm_max_num_seqs": 256, + "vllm_enforce_eager": false, + "vllm_limit_mm_per_prompt": null, + "vllm_enable_prefix_caching": true, + "cosine_min_len_value_wrong": 0.0, + "cosine_max_len_value_wrong": -0.2, + "cosine_min_len_value_correct": 0.8, + "cosine_max_len_value_correct": 0.4, + "cosine_max_len": 12288, + "repetition_n_grams": 40, + "repetition_max_penalty": -0.05, + "use_lmdeploy": false, + "lmdeploy_device": "auto", + "lmdeploy_session_len": null, + "lmdeploy_cache_max_entry_count": 0.8, + "async_generate": false, + "tensor_parallel_size": 1, + "sleep_level": 0, + "move_model_batches": null, + "offload_optimizer": false, + "offload_model": false, + "gc_collect_after_offload": false, + "num_generations": 8, + "max_completion_length": 12288, + "ds3_gather_for_generation": true, + "reward_funcs": [ + "cosine", + "repetition" + ], + "reward_weights": null, + "log_completions": true, + "use_vllm": false, + "vllm_device": [ + "auto" + ], + "vllm_gpu_memory_utilization": 0.9, + "vllm_max_model_len": null, + "num_iterations": 2, + "epsilon": 0.2, + "rlhf_type": "grpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.04, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 32, + "local_world_size": 8, + "model_suffix": "distill-14b-rl-70", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/mnt/nvme5n1p1/distill-14b-rl-70', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "hub": "", + "training_args": "GRPOConfig(output_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=15.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs', logging_strategy=, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=, save_steps=2, save_total_limit=100, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=6, dataloader_num_workers=52, dataloader_prefetch_factor=None, past_index=-1, run_name='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='reward', greater_is_better=True, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, max_prompt_length=512, num_generations=8, max_completion_length=12288, ds3_gather_for_generation=True, temperature=1.0, top_p=0.9, top_k=50, min_p=None, repetition_penalty=1.1, cache_implementation=None, use_vllm=False, vllm_server_host='0.0.0.0', vllm_server_port=8000, vllm_server_timeout=120.0, vllm_guided_decoding_regex=None, beta=0.04, num_iterations=2, epsilon=0.2, epsilon_high=None, reward_weights=None, scale_rewards=True, sync_ref_model=False, ref_model_mixup_alpha=0.6, ref_model_sync_steps=512, log_completions=True, vllm_device=['auto'], vllm_gpu_memory_utilization=0.9, vllm_dtype=None, vllm_max_model_len=None, vllm_enable_prefix_caching=True, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None, num_infer_workers=8, vllm_max_num_seqs=256, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, cosine_min_len_value_wrong=0.0, cosine_max_len_value_wrong=-0.2, cosine_min_len_value_correct=0.8, cosine_max_len_value_correct=0.4, cosine_max_len=12288, repetition_n_grams=40, repetition_max_penalty=-0.05, use_lmdeploy=False, lmdeploy_device='auto', lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, async_generate=False, tensor_parallel_size=1, sleep_level=0, move_model_batches=None, offload_optimizer=False, offload_model=False, gc_collect_after_offload=False, stop_words=[])" +} \ No newline at end of file diff --git a/checkpoint-24/global_step24/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-24/global_step24/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..523e7787e3e7098486be092fa986994194e8f5ad --- /dev/null +++ b/checkpoint-24/global_step24/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08cc103104ff243b6c102836e4109729982b3894da4463a864f00a534d715a4a +size 51616517 diff --git a/checkpoint-24/global_step24/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-24/global_step24/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58a0ad9e213bc0f4d80dc41f2fad2df37534bd95 --- /dev/null +++ b/checkpoint-24/global_step24/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3199563d00c3c2a5bea0f9c5ab5bd84f7c20215a7f303953ac4c9c7fb65180c0 +size 51616005 diff --git a/checkpoint-24/global_step24/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-24/global_step24/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6649b02b4ece117619fd36fc61aac765d8e0f5a8 --- /dev/null +++ b/checkpoint-24/global_step24/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfb296bae0bd26d5b7c5f37c19a055348b7b61a73d3013fb6496bc5989a703a8 +size 51616517 diff --git a/checkpoint-24/global_step24/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-24/global_step24/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9878d98340a5c75b2019124933d999e697486540 --- /dev/null +++ b/checkpoint-24/global_step24/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ea22e387be23dda5ada93bf1cbb883d5b1fe9b2cec5c3c9c611947609025164 +size 51616005 diff --git a/checkpoint-24/global_step24/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-24/global_step24/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03405beb13151629b0f986c56b374e1f5171008c --- /dev/null +++ b/checkpoint-24/global_step24/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1db47fae105a102664fb60a45f0570aec10aebe3876c53c83d1d7c6f7eb1ca1f +size 51616517 diff --git a/checkpoint-24/global_step24/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-24/global_step24/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..47413304b238c11b89dc3ebf6738ff19c71dbdaf --- /dev/null +++ b/checkpoint-24/global_step24/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18bd8ecc455a7df975e12d7a093140433adc06eb244ee8f6ab61d5303b1bcce7 +size 51616005 diff --git a/checkpoint-24/global_step24/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-24/global_step24/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00bfa95c12dc8022e794ca2ec2be521ac6e66f25 --- /dev/null +++ b/checkpoint-24/global_step24/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b9ebaa77d2bff037cf68922ac01980c0ed34de6d417f21e80576aa24c90adb6 +size 51616517 diff --git a/checkpoint-24/global_step24/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-24/global_step24/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62b3e515bfdbc4a5b077961dff92cbb8941058c2 --- /dev/null +++ b/checkpoint-24/global_step24/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:876827001dbe72be274e9caf2693990a6d673d5829e2146a0d4b4da229ead134 +size 51616005 diff --git a/checkpoint-24/global_step24/mp_rank_00_model_states.pt b/checkpoint-24/global_step24/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3aba7798144dfb62bb8182e65347c861922d02df --- /dev/null +++ b/checkpoint-24/global_step24/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:842f801b27eac137b86d068c9b43ce0b9a8cee61c36f9e1747d14475e1b00ed0 +size 275768601 diff --git a/checkpoint-24/latest b/checkpoint-24/latest new file mode 100644 index 0000000000000000000000000000000000000000..73987812e51bc1ddb3a7831c1546db2f65868179 --- /dev/null +++ b/checkpoint-24/latest @@ -0,0 +1 @@ +global_step24 \ No newline at end of file diff --git a/checkpoint-24/rng_state_0.pth b/checkpoint-24/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..95a85a9e607c8d412f8ec207a01f177de52392b8 --- /dev/null +++ b/checkpoint-24/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:001843131228361979ca7fe74bdefc2606b415e3730a1009a7d8a172aae352c1 +size 16389 diff --git a/checkpoint-24/rng_state_1.pth b/checkpoint-24/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..c74e1c2ad5f350ef8d8c7f523acaf63c51496d0d --- /dev/null +++ b/checkpoint-24/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e312f27ce2977b02df03934c9c8beb49ca83fe03fb91bde7582bcd7a791db330 +size 16389 diff --git a/checkpoint-24/rng_state_2.pth b/checkpoint-24/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..dbd7ccb6f3e5726c858cd3679e7a9bc56f645956 --- /dev/null +++ b/checkpoint-24/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6768dbbebf083fd28f35933fab27eaf039c35a4acd926298e9b3fd2b6c27c235 +size 16389 diff --git a/checkpoint-24/rng_state_3.pth b/checkpoint-24/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..5f897d601b061602d64b8f18f8f1b1d50f4ad6a8 --- /dev/null +++ b/checkpoint-24/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:515817bde77a1ad2b0448ee8d52f5a043833e8d0bb8790ef4458cf7f770072ab +size 16389 diff --git a/checkpoint-24/rng_state_4.pth b/checkpoint-24/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8ce69165d1cb108a2ea2ef366a993a477e022e4d --- /dev/null +++ b/checkpoint-24/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1e04652d7c5663646d149ea63919374d8711a3cc34cdf83e0c7b53367652ed9 +size 16389 diff --git a/checkpoint-24/rng_state_5.pth b/checkpoint-24/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e3d582d20ea5e8dffc1bddc47003d25ffafe283 --- /dev/null +++ b/checkpoint-24/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd1cd65e45d80e74426adebabb86d40eb0f0fd98ca5cbac5d72e9f2efc160220 +size 16325 diff --git a/checkpoint-24/rng_state_6.pth b/checkpoint-24/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5c2f8e0bc4c61a8460a122e912b694327e4c594 --- /dev/null +++ b/checkpoint-24/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34a9fec192ddf524c7a3a4c465375057e258c5080ab9d8bfcc9dbb5ef6a485a8 +size 16389 diff --git a/checkpoint-24/rng_state_7.pth b/checkpoint-24/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..8dca0565ca830a34b3d071b50307c75b60192072 --- /dev/null +++ b/checkpoint-24/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ed664556cc3513bb3bce5a730d112547e84b2f06217b9cb6bd5422cdb73f0b2 +size 16453 diff --git a/checkpoint-24/scheduler.pt b/checkpoint-24/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..98e961db865b98ee0bb272d7960c521d45149703 --- /dev/null +++ b/checkpoint-24/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea6f7888372853e31f4e598407c7e5e33416b491809a3ee3e6bdb73d3aba9ff5 +size 1401 diff --git a/checkpoint-24/trainer_state.json b/checkpoint-24/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..28c98e01019184c604a3825cec4b497fc6482848 --- /dev/null +++ b/checkpoint-24/trainer_state.json @@ -0,0 +1,425 @@ +{ + "best_metric": 0.04339282959699631, + "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-24", + "epoch": 5.842105263157895, + "eval_steps": 6, + "global_step": 24, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 10352.974609375, + "epoch": 0.21052631578947367, + "grad_norm": 0.13259537518024445, + "kl": 0.0, + "learning_rate": 1.6666666666666667e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.11328125, + "reward": -0.002658387296833098, + "reward_std": 0.06134121119976044, + "rewards/CosineReward": -0.0026579967816360295, + "rewards/RepetitionPenalty": -3.8975886695880035e-07, + "step": 1, + "train_speed(iter/s)": 0.000242 + }, + { + "clip_ratio": 0.0, + "epoch": 0.42105263157894735, + "grad_norm": 0.1320001482963562, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "step": 2, + "train_speed(iter/s)": 0.000467 + }, + { + "clip_ratio": 1.3441811461234465e-05, + "completion_length": 10439.369140625, + "epoch": 0.631578947368421, + "grad_norm": 0.08990391343832016, + "kl": 9.50181856751442e-07, + "learning_rate": 5e-05, + "loss": -0.06604708731174469, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.13671875, + "reward": 0.0006296975770965219, + "reward_std": 0.07172460854053497, + "rewards/CosineReward": 0.0006298604130279273, + "rewards/RepetitionPenalty": -1.6200439745261974e-07, + "step": 3, + "train_speed(iter/s)": 0.00035 + }, + { + "clip_ratio": 1.70210253145342e-05, + "epoch": 0.8421052631578947, + "grad_norm": 0.0967094898223877, + "kl": 1.1101365089416504e-05, + "learning_rate": 6.666666666666667e-05, + "loss": -0.06727766245603561, + "memory(GiB)": 182.91, + "step": 4, + "train_speed(iter/s)": 0.000458 + }, + { + "clip_ratio": 1.675608473306056e-05, + "completion_length": 10092.408203125, + "epoch": 1.2105263157894737, + "grad_norm": 0.142837256193161, + "kl": 0.00017762184143066406, + "learning_rate": 8.333333333333334e-05, + "loss": -0.09315311908721924, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.119140625, + "reward": -0.005135859013535082, + "reward_std": 0.07994875870645046, + "rewards/CosineReward": -0.005134060338605195, + "rewards/RepetitionPenalty": -1.7973881654143042e-06, + "step": 5, + "train_speed(iter/s)": 0.000387 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.18263348937034607, + "learning_rate": 0.0001, + "loss": -0.1041698157787323, + "memory(GiB)": 182.91, + "step": 6, + "train_speed(iter/s)": 0.000459 + }, + { + "epoch": 1.4210526315789473, + "eval_clip_ratio": 4.069424539920874e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.04833984375, + "eval_loss": -0.5377416610717773, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.012996690347790718, + "eval_reward_std": 0.08769983053207397, + "eval_rewards/CosineReward": 0.012996694073081017, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1030.1127, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 6 + }, + { + "clip_ratio": 0.0005237623976199757, + "completion_length": 10448.94921875, + "epoch": 1.631578947368421, + "grad_norm": 0.1291271299123764, + "kl": 0.017406463623046875, + "learning_rate": 9.991540791356342e-05, + "loss": -0.051375165581703186, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.1484375, + "reward": 0.004909618757665157, + "reward_std": 0.08167182095348835, + "rewards/CosineReward": 0.004909833543933928, + "rewards/RepetitionPenalty": -2.1478646772266075e-07, + "step": 7, + "train_speed(iter/s)": 0.000382 + }, + { + "clip_ratio": 0.1706484742462635, + "epoch": 1.8421052631578947, + "grad_norm": 0.26641014218330383, + "kl": 0.089599609375, + "learning_rate": 9.966191788709716e-05, + "loss": -0.05105742812156677, + "memory(GiB)": 182.91, + "step": 8, + "train_speed(iter/s)": 0.000433 + }, + { + "clip_ratio": 9.482144946559856e-06, + "completion_length": 10432.384765625, + "epoch": 2.2105263157894735, + "grad_norm": 0.10375155508518219, + "kl": 0.0963134765625, + "learning_rate": 9.924038765061042e-05, + "loss": -0.05842069163918495, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.255859375, + "reward": 0.03643610421568155, + "reward_std": 0.11898956261575222, + "rewards/CosineReward": 0.03643618477508426, + "rewards/RepetitionPenalty": -7.898860587829404e-08, + "step": 9, + "train_speed(iter/s)": 0.000396 + }, + { + "clip_ratio": 0.0036088433116674423, + "epoch": 2.4210526315789473, + "grad_norm": 0.09477333724498749, + "kl": 0.1185302734375, + "learning_rate": 9.865224352899119e-05, + "loss": -0.06491819024085999, + "memory(GiB)": 182.91, + "step": 10, + "train_speed(iter/s)": 0.000436 + }, + { + "clip_ratio": 1.2955343891007942e-05, + "completion_length": 10559.296875, + "epoch": 2.6315789473684212, + "grad_norm": 0.06739140301942825, + "kl": 0.1275634765625, + "learning_rate": 9.789947561577445e-05, + "loss": -0.04600231721997261, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.361328125, + "reward": 0.023204635945148766, + "reward_std": 0.10593634657561779, + "rewards/CosineReward": 0.02320496749598533, + "rewards/RepetitionPenalty": -3.3051759373847744e-07, + "step": 11, + "train_speed(iter/s)": 0.000405 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.05781339108943939, + "learning_rate": 9.698463103929542e-05, + "loss": -0.05069056898355484, + "memory(GiB)": 182.91, + "step": 12, + "train_speed(iter/s)": 0.000439 + }, + { + "epoch": 2.8421052631578947, + "eval_clip_ratio": 4.392032860778272e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.2275390625, + "eval_loss": 0.17524278163909912, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.03234308212995529, + "eval_reward_std": 0.10685288906097412, + "eval_rewards/CosineReward": 0.03234308212995529, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1025.9041, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 12 + }, + { + "clip_ratio": 0.0007908324183745208, + "completion_length": 10652.939453125, + "epoch": 3.2105263157894735, + "grad_norm": 0.01199417095631361, + "kl": 0.151123046875, + "learning_rate": 9.591080534401371e-05, + "loss": -0.02191038429737091, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.419921875, + "reward": 0.035983758978545666, + "reward_std": 0.11553369648754597, + "rewards/CosineReward": 0.03598417737521231, + "rewards/RepetitionPenalty": -4.176556771540163e-07, + "step": 13, + "train_speed(iter/s)": 0.000399 + }, + { + "clip_ratio": 0.0004821276670554653, + "epoch": 3.4210526315789473, + "grad_norm": 0.01075426209717989, + "kl": 0.169189453125, + "learning_rate": 9.468163201617062e-05, + "loss": -0.022672578692436218, + "memory(GiB)": 182.91, + "step": 14, + "train_speed(iter/s)": 0.000427 + }, + { + "clip_ratio": 1.9617403040683712e-05, + "completion_length": 10482.146484375, + "epoch": 3.6315789473684212, + "grad_norm": 0.01779361069202423, + "kl": 0.166748046875, + "learning_rate": 9.330127018922194e-05, + "loss": -0.059799157083034515, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.4765625, + "reward": 0.03584331553429365, + "reward_std": 0.11829411797225475, + "rewards/CosineReward": 0.03584346390562132, + "rewards/RepetitionPenalty": -1.4977952389472193e-07, + "step": 15, + "train_speed(iter/s)": 0.000406 + }, + { + "clip_ratio": 0.00011349086707923561, + "epoch": 3.8421052631578947, + "grad_norm": 0.013216385617852211, + "kl": 0.16748046875, + "learning_rate": 9.177439057064683e-05, + "loss": -0.06071458384394646, + "memory(GiB)": 182.91, + "step": 16, + "train_speed(iter/s)": 0.000431 + }, + { + "clip_ratio": 2.4864069928298704e-05, + "completion_length": 10822.3515625, + "epoch": 4.2105263157894735, + "grad_norm": 0.008352754637598991, + "kl": 0.1787109375, + "learning_rate": 9.01061596377522e-05, + "loss": -0.04504441097378731, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.5625, + "reward": 0.027318883687257767, + "reward_std": 0.10441224090754986, + "rewards/CosineReward": 0.027319116634316742, + "rewards/RepetitionPenalty": -2.338138500590503e-07, + "step": 17, + "train_speed(iter/s)": 0.00041 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 0.005998397711664438, + "learning_rate": 8.83022221559489e-05, + "loss": -0.045487549155950546, + "memory(GiB)": 182.91, + "step": 18, + "train_speed(iter/s)": 0.000432 + }, + { + "epoch": 4.421052631578947, + "eval_clip_ratio": 2.286707422172185e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.18359375, + "eval_loss": -0.38219889998435974, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.03729328140616417, + "eval_reward_std": 0.10691346973180771, + "eval_rewards/CosineReward": 0.03729327768087387, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1041.231, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 18 + }, + { + "clip_ratio": 6.176384295031312e-05, + "completion_length": 10454.50390625, + "epoch": 4.631578947368421, + "grad_norm": 0.007075619418174028, + "kl": 0.1820068359375, + "learning_rate": 8.636868207865244e-05, + "loss": -0.03466903418302536, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.466796875, + "reward": 0.04069916973821819, + "reward_std": 0.11991005763411522, + "rewards/CosineReward": 0.04070046404376626, + "rewards/RepetitionPenalty": -1.294118249006715e-06, + "step": 19, + "train_speed(iter/s)": 0.000404 + }, + { + "clip_ratio": 6.06911453360226e-05, + "epoch": 4.842105263157895, + "grad_norm": 0.005896567367017269, + "kl": 0.19287109375, + "learning_rate": 8.43120818934367e-05, + "loss": -0.03502114117145538, + "memory(GiB)": 182.91, + "step": 20, + "train_speed(iter/s)": 0.000424 + }, + { + "clip_ratio": 3.8725801914551994e-05, + "completion_length": 10645.056640625, + "epoch": 5.2105263157894735, + "grad_norm": 0.004154536407440901, + "kl": 0.17626953125, + "learning_rate": 8.213938048432697e-05, + "loss": -0.008662773296236992, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.5625, + "reward": 0.04996980866417289, + "reward_std": 0.13849420100450516, + "rewards/CosineReward": 0.049969930201768875, + "rewards/RepetitionPenalty": -1.1864573679076784e-07, + "step": 21, + "train_speed(iter/s)": 0.000408 + }, + { + "clip_ratio": 5.869188044016482e-05, + "epoch": 5.421052631578947, + "grad_norm": 0.004300669766962528, + "kl": 0.178955078125, + "learning_rate": 7.985792958513931e-05, + "loss": -0.008743642829358578, + "memory(GiB)": 182.91, + "step": 22, + "train_speed(iter/s)": 0.000426 + }, + { + "clip_ratio": 4.6346245653694496e-05, + "completion_length": 10538.072265625, + "epoch": 5.631578947368421, + "grad_norm": 0.01327697653323412, + "kl": 0.1796875, + "learning_rate": 7.74754489035403e-05, + "loss": -0.03423420712351799, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.583984375, + "reward": 0.034468831261619925, + "reward_std": 0.11841745302081108, + "rewards/CosineReward": 0.03447544714435935, + "rewards/RepetitionPenalty": -6.612649428916484e-06, + "step": 23, + "train_speed(iter/s)": 0.00041 + }, + { + "epoch": 5.842105263157895, + "grad_norm": 0.014131724834442139, + "learning_rate": 7.500000000000001e-05, + "loss": -0.03426633030176163, + "memory(GiB)": 182.91, + "step": 24, + "train_speed(iter/s)": 0.000427 + }, + { + "epoch": 5.842105263157895, + "eval_clip_ratio": 4.0687620639801025e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.1982421875, + "eval_loss": 0.3612469434738159, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.04339282959699631, + "eval_reward_std": 0.10456253588199615, + "eval_rewards/CosineReward": 0.04339282959699631, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1045.0632, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 24 + } + ], + "logging_steps": 1, + "max_steps": 60, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 2, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-24/training_args.bin b/checkpoint-24/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a5c75ad76398d7f403ccc1a74c463d2dab6465ca --- /dev/null +++ b/checkpoint-24/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb +size 9809 diff --git a/checkpoint-24/zero_to_fp32.py b/checkpoint-24/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-24/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-26/README.md b/checkpoint-26/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4dbd19e3cdbcefaa269791fa922afe075b806c8 --- /dev/null +++ b/checkpoint-26/README.md @@ -0,0 +1,202 @@ +--- +base_model: /mnt/nvme5n1p1/distill-14b-rl-70 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-26/adapter_config.json b/checkpoint-26/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8d06e83be137d4af154849ed1686625c42e280 --- /dev/null +++ b/checkpoint-26/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/mnt/nvme5n1p1/distill-14b-rl-70", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-26/adapter_model.safetensors b/checkpoint-26/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bf946b74fb8bba75fa9b720e6c7e4832db822710 --- /dev/null +++ b/checkpoint-26/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86e248ba84144cf180154aaafb1e08d78f0615bc967458c1f44af66b802b1ab3 +size 275342392 diff --git a/checkpoint-26/additional_config.json b/checkpoint-26/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/checkpoint-26/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/checkpoint-26/args.json b/checkpoint-26/args.json new file mode 100644 index 0000000000000000000000000000000000000000..e5fddc18fe4219cc621c4285a0faf4591e553d74 --- /dev/null +++ b/checkpoint-26/args.json @@ -0,0 +1,416 @@ +{ + "model": "/mnt/nvme5n1p1/distill-14b-rl-70", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "template": "deepseek_r1", + "system": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\\\boxed{}.", + "max_length": 16384, + "truncation_strategy": "left", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "stage2_aime.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 52, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 1.0, + "top_k": 50, + "top_p": 0.9, + "repetition_penalty": 1.1, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": true, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 15.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.1, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 1, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 2.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": true, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 6.0, + "dataloader_num_workers": 52, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "reward", + "greater_is_better": true, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "wandb" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 32, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "zero_hpz_partition_size": null, + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "num_infer_workers": 8, + "vllm_max_num_seqs": 256, + "vllm_enforce_eager": false, + "vllm_limit_mm_per_prompt": null, + "vllm_enable_prefix_caching": true, + "cosine_min_len_value_wrong": 0.0, + "cosine_max_len_value_wrong": -0.2, + "cosine_min_len_value_correct": 0.8, + "cosine_max_len_value_correct": 0.4, + "cosine_max_len": 12288, + "repetition_n_grams": 40, + "repetition_max_penalty": -0.05, + "use_lmdeploy": false, + "lmdeploy_device": "auto", + "lmdeploy_session_len": null, + "lmdeploy_cache_max_entry_count": 0.8, + "async_generate": false, + "tensor_parallel_size": 1, + "sleep_level": 0, + "move_model_batches": null, + "offload_optimizer": false, + "offload_model": false, + "gc_collect_after_offload": false, + "num_generations": 8, + "max_completion_length": 12288, + "ds3_gather_for_generation": true, + "reward_funcs": [ + "cosine", + "repetition" + ], + "reward_weights": null, + "log_completions": true, + "use_vllm": false, + "vllm_device": [ + "auto" + ], + "vllm_gpu_memory_utilization": 0.9, + "vllm_max_model_len": null, + "num_iterations": 2, + "epsilon": 0.2, + "rlhf_type": "grpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.04, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 32, + "local_world_size": 8, + "model_suffix": "distill-14b-rl-70", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/mnt/nvme5n1p1/distill-14b-rl-70', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "hub": "", + "training_args": "GRPOConfig(output_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=15.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs', logging_strategy=, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=, save_steps=2, save_total_limit=100, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=6, dataloader_num_workers=52, dataloader_prefetch_factor=None, past_index=-1, run_name='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='reward', greater_is_better=True, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, max_prompt_length=512, num_generations=8, max_completion_length=12288, ds3_gather_for_generation=True, temperature=1.0, top_p=0.9, top_k=50, min_p=None, repetition_penalty=1.1, cache_implementation=None, use_vllm=False, vllm_server_host='0.0.0.0', vllm_server_port=8000, vllm_server_timeout=120.0, vllm_guided_decoding_regex=None, beta=0.04, num_iterations=2, epsilon=0.2, epsilon_high=None, reward_weights=None, scale_rewards=True, sync_ref_model=False, ref_model_mixup_alpha=0.6, ref_model_sync_steps=512, log_completions=True, vllm_device=['auto'], vllm_gpu_memory_utilization=0.9, vllm_dtype=None, vllm_max_model_len=None, vllm_enable_prefix_caching=True, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None, num_infer_workers=8, vllm_max_num_seqs=256, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, cosine_min_len_value_wrong=0.0, cosine_max_len_value_wrong=-0.2, cosine_min_len_value_correct=0.8, cosine_max_len_value_correct=0.4, cosine_max_len=12288, repetition_n_grams=40, repetition_max_penalty=-0.05, use_lmdeploy=False, lmdeploy_device='auto', lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, async_generate=False, tensor_parallel_size=1, sleep_level=0, move_model_batches=None, offload_optimizer=False, offload_model=False, gc_collect_after_offload=False, stop_words=[])" +} \ No newline at end of file diff --git a/checkpoint-26/global_step26/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-26/global_step26/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0060dd069d1e7c442d2859d5b995f4481d6d58be --- /dev/null +++ b/checkpoint-26/global_step26/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0ae9ef8aa72149174439f3875f6fe75ac8ff4fc16fb8a3e1ef86800edc103d4 +size 51616517 diff --git a/checkpoint-26/global_step26/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-26/global_step26/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..07452d1f7746601bb9e7a4ae1af7c0c7fe91533b --- /dev/null +++ b/checkpoint-26/global_step26/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c18cf4031a45e381acaa856de209421975d82855a30c1169887f8bbdd3c8148 +size 51616005 diff --git a/checkpoint-26/global_step26/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-26/global_step26/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..635463922ef758fa93d7b9522c38ba2b88082570 --- /dev/null +++ b/checkpoint-26/global_step26/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21f552f8474e6b14fd73c6081492e88eddd1da5fb46538d4c91d1a81b8a74662 +size 51616517 diff --git a/checkpoint-26/global_step26/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-26/global_step26/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0322628e93a2b897205a58ef26094b19d3980ff6 --- /dev/null +++ b/checkpoint-26/global_step26/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84e881ef3252869e559b2eef16ee4ea2d5181e89845b83c85279271b2822bce3 +size 51616005 diff --git a/checkpoint-26/global_step26/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-26/global_step26/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69e2b8c752d1c00ec55b7c00f1dd9f006334917f --- /dev/null +++ b/checkpoint-26/global_step26/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e68864b751112ca08856a78032a246ac35fb9eea37a78a4fe87c58bcf86cb39f +size 51616517 diff --git a/checkpoint-26/global_step26/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-26/global_step26/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0156718fc39250f5faf6cc8874cc9ac7a521b54a --- /dev/null +++ b/checkpoint-26/global_step26/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50fb217404837421bf126835d4453eab9c579c669e1ef4cb73a22029400859a6 +size 51616005 diff --git a/checkpoint-26/global_step26/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-26/global_step26/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de9f1ae6eff651a54a36a305dcf81a033ce5748f --- /dev/null +++ b/checkpoint-26/global_step26/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5effb40a03548575913082dbbf2b928dc980716cb1acfe46fbea4bb560924057 +size 51616517 diff --git a/checkpoint-26/global_step26/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-26/global_step26/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..520a72176bc92f9f750611ec13e1a5b327289e99 --- /dev/null +++ b/checkpoint-26/global_step26/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb14bae11849109b84ff924412c6b081ffe81cc1a4f2d41908b8b1e2791d56f5 +size 51616005 diff --git a/checkpoint-26/global_step26/mp_rank_00_model_states.pt b/checkpoint-26/global_step26/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09b06fb3733fc7f23635df1e2fd0438e548e4ef9 --- /dev/null +++ b/checkpoint-26/global_step26/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f458dd7b5ec2c173a55a76eeae35d5606781fda393fe0eac92ea36ca0d59f50 +size 275768601 diff --git a/checkpoint-26/latest b/checkpoint-26/latest new file mode 100644 index 0000000000000000000000000000000000000000..fdccf4dfdc0964529235bf8eeb5b29034a95d131 --- /dev/null +++ b/checkpoint-26/latest @@ -0,0 +1 @@ +global_step26 \ No newline at end of file diff --git a/checkpoint-26/rng_state_0.pth b/checkpoint-26/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..20e48d3fbca2e74c92ac659669020154066f6d77 --- /dev/null +++ b/checkpoint-26/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8d3945821b7612460224e9f157c0e05fb0a703fed125e1df309e31a175292b8 +size 16389 diff --git a/checkpoint-26/rng_state_1.pth b/checkpoint-26/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..03dcf169cf395067ec3676120cb4c7d6c5d2cbe7 --- /dev/null +++ b/checkpoint-26/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f5465ef442e51d48a42e86cc5632741aa94cd87962f1fe4ebca607b8fa92cd +size 16389 diff --git a/checkpoint-26/rng_state_2.pth b/checkpoint-26/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ba3a4e9199bab847e120ed47d6e627b5097def49 --- /dev/null +++ b/checkpoint-26/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae0ecbac8d10b1a32256a92e3eab1c5ff0c2db196f05fed12a16beb0bc7cff89 +size 16389 diff --git a/checkpoint-26/rng_state_3.pth b/checkpoint-26/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..4132f6a68b6e9d4aa5ecb242de5222f1c9df2fe9 --- /dev/null +++ b/checkpoint-26/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8d5bb6df95c4a971ab46531a63b02e451c07c7956d68c2d095f815e6d35da26 +size 16389 diff --git a/checkpoint-26/rng_state_4.pth b/checkpoint-26/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..02fe6f12f0c0cbd6f461538d412d4656de7de0f5 --- /dev/null +++ b/checkpoint-26/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bccf4b444a1b0c461de211837b3f774aab89e333ba698170615bd4731d299224 +size 16389 diff --git a/checkpoint-26/rng_state_5.pth b/checkpoint-26/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..4ec65bc8d7d3d55c9c4bbbfedcf6623898cac599 --- /dev/null +++ b/checkpoint-26/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:742c12ba0450b48b1dfb2bd8b77ff91ffd0cdc76baa86845953dec19658e21fa +size 16325 diff --git a/checkpoint-26/rng_state_6.pth b/checkpoint-26/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..834e3b4bb99f2d83628ecfdb409f88a4489f3ebf --- /dev/null +++ b/checkpoint-26/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:176bda912342e71c45a6e330ca4035152d182060349d93578562820b9768f167 +size 16389 diff --git a/checkpoint-26/rng_state_7.pth b/checkpoint-26/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..348df419d62aaf4c5e28376a01f3415b98450ebe --- /dev/null +++ b/checkpoint-26/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccec3371b30268e38fb76acb6b83ccf647a589181e4e868757b9f9ae7ce30e69 +size 16453 diff --git a/checkpoint-26/scheduler.pt b/checkpoint-26/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..488b360048bf654868c3969367c0aced14f8fc5a --- /dev/null +++ b/checkpoint-26/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7185a2bafc6da35bf3d72f21f38533cc1096af7707f3a00bf62c8ee196ff503 +size 1401 diff --git a/checkpoint-26/trainer_state.json b/checkpoint-26/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5b072b987db923c7897214c7f630cb7ed87c56ab --- /dev/null +++ b/checkpoint-26/trainer_state.json @@ -0,0 +1,453 @@ +{ + "best_metric": 0.04339282959699631, + "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-24", + "epoch": 6.421052631578947, + "eval_steps": 6, + "global_step": 26, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 10352.974609375, + "epoch": 0.21052631578947367, + "grad_norm": 0.13259537518024445, + "kl": 0.0, + "learning_rate": 1.6666666666666667e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.11328125, + "reward": -0.002658387296833098, + "reward_std": 0.06134121119976044, + "rewards/CosineReward": -0.0026579967816360295, + "rewards/RepetitionPenalty": -3.8975886695880035e-07, + "step": 1, + "train_speed(iter/s)": 0.000242 + }, + { + "clip_ratio": 0.0, + "epoch": 0.42105263157894735, + "grad_norm": 0.1320001482963562, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "step": 2, + "train_speed(iter/s)": 0.000467 + }, + { + "clip_ratio": 1.3441811461234465e-05, + "completion_length": 10439.369140625, + "epoch": 0.631578947368421, + "grad_norm": 0.08990391343832016, + "kl": 9.50181856751442e-07, + "learning_rate": 5e-05, + "loss": -0.06604708731174469, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.13671875, + "reward": 0.0006296975770965219, + "reward_std": 0.07172460854053497, + "rewards/CosineReward": 0.0006298604130279273, + "rewards/RepetitionPenalty": -1.6200439745261974e-07, + "step": 3, + "train_speed(iter/s)": 0.00035 + }, + { + "clip_ratio": 1.70210253145342e-05, + "epoch": 0.8421052631578947, + "grad_norm": 0.0967094898223877, + "kl": 1.1101365089416504e-05, + "learning_rate": 6.666666666666667e-05, + "loss": -0.06727766245603561, + "memory(GiB)": 182.91, + "step": 4, + "train_speed(iter/s)": 0.000458 + }, + { + "clip_ratio": 1.675608473306056e-05, + "completion_length": 10092.408203125, + "epoch": 1.2105263157894737, + "grad_norm": 0.142837256193161, + "kl": 0.00017762184143066406, + "learning_rate": 8.333333333333334e-05, + "loss": -0.09315311908721924, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.119140625, + "reward": -0.005135859013535082, + "reward_std": 0.07994875870645046, + "rewards/CosineReward": -0.005134060338605195, + "rewards/RepetitionPenalty": -1.7973881654143042e-06, + "step": 5, + "train_speed(iter/s)": 0.000387 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.18263348937034607, + "learning_rate": 0.0001, + "loss": -0.1041698157787323, + "memory(GiB)": 182.91, + "step": 6, + "train_speed(iter/s)": 0.000459 + }, + { + "epoch": 1.4210526315789473, + "eval_clip_ratio": 4.069424539920874e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.04833984375, + "eval_loss": -0.5377416610717773, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.012996690347790718, + "eval_reward_std": 0.08769983053207397, + "eval_rewards/CosineReward": 0.012996694073081017, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1030.1127, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 6 + }, + { + "clip_ratio": 0.0005237623976199757, + "completion_length": 10448.94921875, + "epoch": 1.631578947368421, + "grad_norm": 0.1291271299123764, + "kl": 0.017406463623046875, + "learning_rate": 9.991540791356342e-05, + "loss": -0.051375165581703186, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.1484375, + "reward": 0.004909618757665157, + "reward_std": 0.08167182095348835, + "rewards/CosineReward": 0.004909833543933928, + "rewards/RepetitionPenalty": -2.1478646772266075e-07, + "step": 7, + "train_speed(iter/s)": 0.000382 + }, + { + "clip_ratio": 0.1706484742462635, + "epoch": 1.8421052631578947, + "grad_norm": 0.26641014218330383, + "kl": 0.089599609375, + "learning_rate": 9.966191788709716e-05, + "loss": -0.05105742812156677, + "memory(GiB)": 182.91, + "step": 8, + "train_speed(iter/s)": 0.000433 + }, + { + "clip_ratio": 9.482144946559856e-06, + "completion_length": 10432.384765625, + "epoch": 2.2105263157894735, + "grad_norm": 0.10375155508518219, + "kl": 0.0963134765625, + "learning_rate": 9.924038765061042e-05, + "loss": -0.05842069163918495, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.255859375, + "reward": 0.03643610421568155, + "reward_std": 0.11898956261575222, + "rewards/CosineReward": 0.03643618477508426, + "rewards/RepetitionPenalty": -7.898860587829404e-08, + "step": 9, + "train_speed(iter/s)": 0.000396 + }, + { + "clip_ratio": 0.0036088433116674423, + "epoch": 2.4210526315789473, + "grad_norm": 0.09477333724498749, + "kl": 0.1185302734375, + "learning_rate": 9.865224352899119e-05, + "loss": -0.06491819024085999, + "memory(GiB)": 182.91, + "step": 10, + "train_speed(iter/s)": 0.000436 + }, + { + "clip_ratio": 1.2955343891007942e-05, + "completion_length": 10559.296875, + "epoch": 2.6315789473684212, + "grad_norm": 0.06739140301942825, + "kl": 0.1275634765625, + "learning_rate": 9.789947561577445e-05, + "loss": -0.04600231721997261, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.361328125, + "reward": 0.023204635945148766, + "reward_std": 0.10593634657561779, + "rewards/CosineReward": 0.02320496749598533, + "rewards/RepetitionPenalty": -3.3051759373847744e-07, + "step": 11, + "train_speed(iter/s)": 0.000405 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.05781339108943939, + "learning_rate": 9.698463103929542e-05, + "loss": -0.05069056898355484, + "memory(GiB)": 182.91, + "step": 12, + "train_speed(iter/s)": 0.000439 + }, + { + "epoch": 2.8421052631578947, + "eval_clip_ratio": 4.392032860778272e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.2275390625, + "eval_loss": 0.17524278163909912, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.03234308212995529, + "eval_reward_std": 0.10685288906097412, + "eval_rewards/CosineReward": 0.03234308212995529, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1025.9041, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 12 + }, + { + "clip_ratio": 0.0007908324183745208, + "completion_length": 10652.939453125, + "epoch": 3.2105263157894735, + "grad_norm": 0.01199417095631361, + "kl": 0.151123046875, + "learning_rate": 9.591080534401371e-05, + "loss": -0.02191038429737091, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.419921875, + "reward": 0.035983758978545666, + "reward_std": 0.11553369648754597, + "rewards/CosineReward": 0.03598417737521231, + "rewards/RepetitionPenalty": -4.176556771540163e-07, + "step": 13, + "train_speed(iter/s)": 0.000399 + }, + { + "clip_ratio": 0.0004821276670554653, + "epoch": 3.4210526315789473, + "grad_norm": 0.01075426209717989, + "kl": 0.169189453125, + "learning_rate": 9.468163201617062e-05, + "loss": -0.022672578692436218, + "memory(GiB)": 182.91, + "step": 14, + "train_speed(iter/s)": 0.000427 + }, + { + "clip_ratio": 1.9617403040683712e-05, + "completion_length": 10482.146484375, + "epoch": 3.6315789473684212, + "grad_norm": 0.01779361069202423, + "kl": 0.166748046875, + "learning_rate": 9.330127018922194e-05, + "loss": -0.059799157083034515, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.4765625, + "reward": 0.03584331553429365, + "reward_std": 0.11829411797225475, + "rewards/CosineReward": 0.03584346390562132, + "rewards/RepetitionPenalty": -1.4977952389472193e-07, + "step": 15, + "train_speed(iter/s)": 0.000406 + }, + { + "clip_ratio": 0.00011349086707923561, + "epoch": 3.8421052631578947, + "grad_norm": 0.013216385617852211, + "kl": 0.16748046875, + "learning_rate": 9.177439057064683e-05, + "loss": -0.06071458384394646, + "memory(GiB)": 182.91, + "step": 16, + "train_speed(iter/s)": 0.000431 + }, + { + "clip_ratio": 2.4864069928298704e-05, + "completion_length": 10822.3515625, + "epoch": 4.2105263157894735, + "grad_norm": 0.008352754637598991, + "kl": 0.1787109375, + "learning_rate": 9.01061596377522e-05, + "loss": -0.04504441097378731, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.5625, + "reward": 0.027318883687257767, + "reward_std": 0.10441224090754986, + "rewards/CosineReward": 0.027319116634316742, + "rewards/RepetitionPenalty": -2.338138500590503e-07, + "step": 17, + "train_speed(iter/s)": 0.00041 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 0.005998397711664438, + "learning_rate": 8.83022221559489e-05, + "loss": -0.045487549155950546, + "memory(GiB)": 182.91, + "step": 18, + "train_speed(iter/s)": 0.000432 + }, + { + "epoch": 4.421052631578947, + "eval_clip_ratio": 2.286707422172185e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.18359375, + "eval_loss": -0.38219889998435974, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.03729328140616417, + "eval_reward_std": 0.10691346973180771, + "eval_rewards/CosineReward": 0.03729327768087387, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1041.231, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 18 + }, + { + "clip_ratio": 6.176384295031312e-05, + "completion_length": 10454.50390625, + "epoch": 4.631578947368421, + "grad_norm": 0.007075619418174028, + "kl": 0.1820068359375, + "learning_rate": 8.636868207865244e-05, + "loss": -0.03466903418302536, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.466796875, + "reward": 0.04069916973821819, + "reward_std": 0.11991005763411522, + "rewards/CosineReward": 0.04070046404376626, + "rewards/RepetitionPenalty": -1.294118249006715e-06, + "step": 19, + "train_speed(iter/s)": 0.000404 + }, + { + "clip_ratio": 6.06911453360226e-05, + "epoch": 4.842105263157895, + "grad_norm": 0.005896567367017269, + "kl": 0.19287109375, + "learning_rate": 8.43120818934367e-05, + "loss": -0.03502114117145538, + "memory(GiB)": 182.91, + "step": 20, + "train_speed(iter/s)": 0.000424 + }, + { + "clip_ratio": 3.8725801914551994e-05, + "completion_length": 10645.056640625, + "epoch": 5.2105263157894735, + "grad_norm": 0.004154536407440901, + "kl": 0.17626953125, + "learning_rate": 8.213938048432697e-05, + "loss": -0.008662773296236992, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.5625, + "reward": 0.04996980866417289, + "reward_std": 0.13849420100450516, + "rewards/CosineReward": 0.049969930201768875, + "rewards/RepetitionPenalty": -1.1864573679076784e-07, + "step": 21, + "train_speed(iter/s)": 0.000408 + }, + { + "clip_ratio": 5.869188044016482e-05, + "epoch": 5.421052631578947, + "grad_norm": 0.004300669766962528, + "kl": 0.178955078125, + "learning_rate": 7.985792958513931e-05, + "loss": -0.008743642829358578, + "memory(GiB)": 182.91, + "step": 22, + "train_speed(iter/s)": 0.000426 + }, + { + "clip_ratio": 4.6346245653694496e-05, + "completion_length": 10538.072265625, + "epoch": 5.631578947368421, + "grad_norm": 0.01327697653323412, + "kl": 0.1796875, + "learning_rate": 7.74754489035403e-05, + "loss": -0.03423420712351799, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.583984375, + "reward": 0.034468831261619925, + "reward_std": 0.11841745302081108, + "rewards/CosineReward": 0.03447544714435935, + "rewards/RepetitionPenalty": -6.612649428916484e-06, + "step": 23, + "train_speed(iter/s)": 0.00041 + }, + { + "epoch": 5.842105263157895, + "grad_norm": 0.014131724834442139, + "learning_rate": 7.500000000000001e-05, + "loss": -0.03426633030176163, + "memory(GiB)": 182.91, + "step": 24, + "train_speed(iter/s)": 0.000427 + }, + { + "epoch": 5.842105263157895, + "eval_clip_ratio": 4.0687620639801025e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.1982421875, + "eval_loss": 0.3612469434738159, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.04339282959699631, + "eval_reward_std": 0.10456253588199615, + "eval_rewards/CosineReward": 0.04339282959699631, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1045.0632, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 24 + }, + { + "clip_ratio": 5.05705434079573e-05, + "completion_length": 10789.259765625, + "epoch": 6.2105263157894735, + "grad_norm": 0.0099335303530097, + "kl": 0.1800537109375, + "learning_rate": 7.243995901002312e-05, + "loss": -0.02097315341234207, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.6171875, + "reward": 0.03010205877944827, + "reward_std": 0.10742511600255966, + "rewards/CosineReward": 0.030102317687124014, + "rewards/RepetitionPenalty": -2.580197531187878e-07, + "step": 25, + "train_speed(iter/s)": 0.000406 + }, + { + "clip_ratio": 4.821802576771006e-05, + "epoch": 6.421052631578947, + "grad_norm": 0.00989576056599617, + "kl": 0.18408203125, + "learning_rate": 6.980398830195785e-05, + "loss": -0.02103913575410843, + "memory(GiB)": 182.91, + "step": 26, + "train_speed(iter/s)": 0.000421 + } + ], + "logging_steps": 1, + "max_steps": 60, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 2, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-26/training_args.bin b/checkpoint-26/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a5c75ad76398d7f403ccc1a74c463d2dab6465ca --- /dev/null +++ b/checkpoint-26/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb +size 9809 diff --git a/checkpoint-26/zero_to_fp32.py b/checkpoint-26/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-26/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-4/README.md b/checkpoint-4/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4dbd19e3cdbcefaa269791fa922afe075b806c8 --- /dev/null +++ b/checkpoint-4/README.md @@ -0,0 +1,202 @@ +--- +base_model: /mnt/nvme5n1p1/distill-14b-rl-70 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-4/adapter_config.json b/checkpoint-4/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8d06e83be137d4af154849ed1686625c42e280 --- /dev/null +++ b/checkpoint-4/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/mnt/nvme5n1p1/distill-14b-rl-70", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4/adapter_model.safetensors b/checkpoint-4/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..047eff46b1c2ac8c992fbce64d3fd95e95d3739a --- /dev/null +++ b/checkpoint-4/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20096c6f0a363cc7636f49a093e080f1c78651afaeeacb7eb50cd98cc664cbc0 +size 275342392 diff --git a/checkpoint-4/additional_config.json b/checkpoint-4/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/checkpoint-4/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/checkpoint-4/args.json b/checkpoint-4/args.json new file mode 100644 index 0000000000000000000000000000000000000000..e5fddc18fe4219cc621c4285a0faf4591e553d74 --- /dev/null +++ b/checkpoint-4/args.json @@ -0,0 +1,416 @@ +{ + "model": "/mnt/nvme5n1p1/distill-14b-rl-70", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "template": "deepseek_r1", + "system": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\\\boxed{}.", + "max_length": 16384, + "truncation_strategy": "left", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "stage2_aime.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 52, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 1.0, + "top_k": 50, + "top_p": 0.9, + "repetition_penalty": 1.1, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": true, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 15.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.1, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 1, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 2.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": true, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 6.0, + "dataloader_num_workers": 52, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "reward", + "greater_is_better": true, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "wandb" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 32, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "zero_hpz_partition_size": null, + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "num_infer_workers": 8, + "vllm_max_num_seqs": 256, + "vllm_enforce_eager": false, + "vllm_limit_mm_per_prompt": null, + "vllm_enable_prefix_caching": true, + "cosine_min_len_value_wrong": 0.0, + "cosine_max_len_value_wrong": -0.2, + "cosine_min_len_value_correct": 0.8, + "cosine_max_len_value_correct": 0.4, + "cosine_max_len": 12288, + "repetition_n_grams": 40, + "repetition_max_penalty": -0.05, + "use_lmdeploy": false, + "lmdeploy_device": "auto", + "lmdeploy_session_len": null, + "lmdeploy_cache_max_entry_count": 0.8, + "async_generate": false, + "tensor_parallel_size": 1, + "sleep_level": 0, + "move_model_batches": null, + "offload_optimizer": false, + "offload_model": false, + "gc_collect_after_offload": false, + "num_generations": 8, + "max_completion_length": 12288, + "ds3_gather_for_generation": true, + "reward_funcs": [ + "cosine", + "repetition" + ], + "reward_weights": null, + "log_completions": true, + "use_vllm": false, + "vllm_device": [ + "auto" + ], + "vllm_gpu_memory_utilization": 0.9, + "vllm_max_model_len": null, + "num_iterations": 2, + "epsilon": 0.2, + "rlhf_type": "grpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.04, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 32, + "local_world_size": 8, + "model_suffix": "distill-14b-rl-70", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/mnt/nvme5n1p1/distill-14b-rl-70', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "hub": "", + "training_args": "GRPOConfig(output_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=15.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs', logging_strategy=, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=, save_steps=2, save_total_limit=100, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=6, dataloader_num_workers=52, dataloader_prefetch_factor=None, past_index=-1, run_name='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='reward', greater_is_better=True, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, max_prompt_length=512, num_generations=8, max_completion_length=12288, ds3_gather_for_generation=True, temperature=1.0, top_p=0.9, top_k=50, min_p=None, repetition_penalty=1.1, cache_implementation=None, use_vllm=False, vllm_server_host='0.0.0.0', vllm_server_port=8000, vllm_server_timeout=120.0, vllm_guided_decoding_regex=None, beta=0.04, num_iterations=2, epsilon=0.2, epsilon_high=None, reward_weights=None, scale_rewards=True, sync_ref_model=False, ref_model_mixup_alpha=0.6, ref_model_sync_steps=512, log_completions=True, vllm_device=['auto'], vllm_gpu_memory_utilization=0.9, vllm_dtype=None, vllm_max_model_len=None, vllm_enable_prefix_caching=True, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None, num_infer_workers=8, vllm_max_num_seqs=256, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, cosine_min_len_value_wrong=0.0, cosine_max_len_value_wrong=-0.2, cosine_min_len_value_correct=0.8, cosine_max_len_value_correct=0.4, cosine_max_len=12288, repetition_n_grams=40, repetition_max_penalty=-0.05, use_lmdeploy=False, lmdeploy_device='auto', lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, async_generate=False, tensor_parallel_size=1, sleep_level=0, move_model_batches=None, offload_optimizer=False, offload_model=False, gc_collect_after_offload=False, stop_words=[])" +} \ No newline at end of file diff --git a/checkpoint-4/global_step4/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-4/global_step4/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d516d40b7edc8335c50536c4cadbeff64667e3f --- /dev/null +++ b/checkpoint-4/global_step4/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14ccf9dade9e69db8d98385eec05fbb3facc3a4a0ea9b5c532335250477ccff1 +size 51616517 diff --git a/checkpoint-4/global_step4/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-4/global_step4/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d3c40197e1322b8f3fb57c8082ac36fba41439f --- /dev/null +++ b/checkpoint-4/global_step4/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b0593365984f0c7dfad35eaae7ff5c33a22c0cdb86f21de328b75b12ccc9b79 +size 51616005 diff --git a/checkpoint-4/global_step4/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-4/global_step4/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9d1cc8dfe8106b6d1bbc7969b4a67c86bed593a --- /dev/null +++ b/checkpoint-4/global_step4/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f23b541b8d385a111ce4dc7ec005006a85170f32e35d8b55a82b108bf2013c3e +size 51616517 diff --git a/checkpoint-4/global_step4/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-4/global_step4/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85935e51d7eaf0d8d4bfdde5989191588e6023dc --- /dev/null +++ b/checkpoint-4/global_step4/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64bc2180d5f1068dbfe812a255e57e536e3a0b34ae04ade2218a23d51ea7345d +size 51616005 diff --git a/checkpoint-4/global_step4/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-4/global_step4/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee3e9a41ceef645a757ebc5a657e0aab922cfe25 --- /dev/null +++ b/checkpoint-4/global_step4/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f42b1235ef59ef3a7ceebee245b33fe1fb1fcf09dcbf9a4bd87d4d02e854c7b +size 51616517 diff --git a/checkpoint-4/global_step4/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-4/global_step4/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..270c6e8354acfc5780a22f14204e246a0825f1cf --- /dev/null +++ b/checkpoint-4/global_step4/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30d785b554c1a40fa2e86c5034d72b0257fbb843cb8adb0f81542d16f1fd3b70 +size 51616005 diff --git a/checkpoint-4/global_step4/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-4/global_step4/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..805486121064ff9a41dd9258065c49211357f6c7 --- /dev/null +++ b/checkpoint-4/global_step4/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81da1712e7ed614e99e5084c313b11d6af83e0db1837856337224dbedd083e7a +size 51616517 diff --git a/checkpoint-4/global_step4/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-4/global_step4/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8345bc32340c4cad39d8ad7db4c0a16d6c49e4a0 --- /dev/null +++ b/checkpoint-4/global_step4/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98988229330cd3547cc1e92d272a1504d4647c984699b8614497d572fd5486fd +size 51616005 diff --git a/checkpoint-4/global_step4/mp_rank_00_model_states.pt b/checkpoint-4/global_step4/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e680415176399447ce81aace45e97d85fd744041 --- /dev/null +++ b/checkpoint-4/global_step4/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97cb97954c5886e9499c6aea96a418dff6117a5dd67b8c8fb34c7d8dc64c351c +size 275768601 diff --git a/checkpoint-4/latest b/checkpoint-4/latest new file mode 100644 index 0000000000000000000000000000000000000000..c43fb54c698c16aacd8ee4ef7cc88a6ae76f4452 --- /dev/null +++ b/checkpoint-4/latest @@ -0,0 +1 @@ +global_step4 \ No newline at end of file diff --git a/checkpoint-4/rng_state_0.pth b/checkpoint-4/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..92958a95d262334581d38c5dcf66f12b7bb86212 --- /dev/null +++ b/checkpoint-4/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdec6e898a48a04d407c2addbc61dca5c277cad38cb1267cbac8c28e469483c3 +size 16389 diff --git a/checkpoint-4/rng_state_1.pth b/checkpoint-4/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f06f0e66037dbdf9156146971710c8bc7545054 --- /dev/null +++ b/checkpoint-4/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31512522956b453173c1e70438695b3a739ff0dc3ba7859fd9e2d2fc1969caec +size 16389 diff --git a/checkpoint-4/rng_state_2.pth b/checkpoint-4/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd2111bfdd9f1c576c2eee8dc637e852f858cfcc --- /dev/null +++ b/checkpoint-4/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16b174181064840ca42abbe756c39f3398e23609c6906b519d91ad951f98e19c +size 16389 diff --git a/checkpoint-4/rng_state_3.pth b/checkpoint-4/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8c762f534e5c33faacbd3348c0d88acd76cc6b87 --- /dev/null +++ b/checkpoint-4/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cd17f237474d007e49675a9010eddb50eea853c9506b7c1aa60c9087229b81a +size 16389 diff --git a/checkpoint-4/rng_state_4.pth b/checkpoint-4/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..e69d0569575fd9df0f2fa2f6e0b98526075069a0 --- /dev/null +++ b/checkpoint-4/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78e6e1d2c545210248762cb45f9d88caa862b3d4d0922c46c23a2f6b9b6f693e +size 16389 diff --git a/checkpoint-4/rng_state_5.pth b/checkpoint-4/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..4a5a35c0040582329761f6c4c6bfc187f7439c7f --- /dev/null +++ b/checkpoint-4/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ace639f7d8a15a083afd410e1de159351aa823f3df9868f41cb482c91a67d25e +size 16325 diff --git a/checkpoint-4/rng_state_6.pth b/checkpoint-4/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2c67c5f1f3f2db5cbb564bb596a4bca5d62f4084 --- /dev/null +++ b/checkpoint-4/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec5bb346807cfb5818c2f9317f1ff509985a16523b58e8396e09513adb521d8f +size 16389 diff --git a/checkpoint-4/rng_state_7.pth b/checkpoint-4/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4c20e80374ba870bafd7a3adcbef37968b451cdc --- /dev/null +++ b/checkpoint-4/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf17989ede66da58f0c8fc25132be77ccc6d6c7539c638aea20308c21162ced8 +size 16453 diff --git a/checkpoint-4/scheduler.pt b/checkpoint-4/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a5c79f97199f4e60c305f99782ebce4500e881c --- /dev/null +++ b/checkpoint-4/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f2c637427516edccbb86d4864bf55e1030f0e5367debc9d6bd0da65d6146c07 +size 1401 diff --git a/checkpoint-4/trainer_state.json b/checkpoint-4/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f595f5961ff8df694c28b2d941589615801e0294 --- /dev/null +++ b/checkpoint-4/trainer_state.json @@ -0,0 +1,89 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8421052631578947, + "eval_steps": 6, + "global_step": 4, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 10352.974609375, + "epoch": 0.21052631578947367, + "grad_norm": 0.13259537518024445, + "kl": 0.0, + "learning_rate": 1.6666666666666667e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.11328125, + "reward": -0.002658387296833098, + "reward_std": 0.06134121119976044, + "rewards/CosineReward": -0.0026579967816360295, + "rewards/RepetitionPenalty": -3.8975886695880035e-07, + "step": 1, + "train_speed(iter/s)": 0.000242 + }, + { + "clip_ratio": 0.0, + "epoch": 0.42105263157894735, + "grad_norm": 0.1320001482963562, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "step": 2, + "train_speed(iter/s)": 0.000467 + }, + { + "clip_ratio": 1.3441811461234465e-05, + "completion_length": 10439.369140625, + "epoch": 0.631578947368421, + "grad_norm": 0.08990391343832016, + "kl": 9.50181856751442e-07, + "learning_rate": 5e-05, + "loss": -0.06604708731174469, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.13671875, + "reward": 0.0006296975770965219, + "reward_std": 0.07172460854053497, + "rewards/CosineReward": 0.0006298604130279273, + "rewards/RepetitionPenalty": -1.6200439745261974e-07, + "step": 3, + "train_speed(iter/s)": 0.00035 + }, + { + "clip_ratio": 1.70210253145342e-05, + "epoch": 0.8421052631578947, + "grad_norm": 0.0967094898223877, + "kl": 1.1101365089416504e-05, + "learning_rate": 6.666666666666667e-05, + "loss": -0.06727766245603561, + "memory(GiB)": 182.91, + "step": 4, + "train_speed(iter/s)": 0.000458 + } + ], + "logging_steps": 1, + "max_steps": 60, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 2, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4/training_args.bin b/checkpoint-4/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a5c75ad76398d7f403ccc1a74c463d2dab6465ca --- /dev/null +++ b/checkpoint-4/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb +size 9809 diff --git a/checkpoint-4/zero_to_fp32.py b/checkpoint-4/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-4/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-6/README.md b/checkpoint-6/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4dbd19e3cdbcefaa269791fa922afe075b806c8 --- /dev/null +++ b/checkpoint-6/README.md @@ -0,0 +1,202 @@ +--- +base_model: /mnt/nvme5n1p1/distill-14b-rl-70 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-6/adapter_config.json b/checkpoint-6/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8d06e83be137d4af154849ed1686625c42e280 --- /dev/null +++ b/checkpoint-6/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/mnt/nvme5n1p1/distill-14b-rl-70", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-6/adapter_model.safetensors b/checkpoint-6/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b2a82b6a76d513580b6856bca1d0870d0cf4327 --- /dev/null +++ b/checkpoint-6/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:101d88106c0a6406c1ccbe5f8d01cbd67d6f7aaccfdcbaeb9d8d0073b692f4a9 +size 275342392 diff --git a/checkpoint-6/additional_config.json b/checkpoint-6/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/checkpoint-6/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/checkpoint-6/args.json b/checkpoint-6/args.json new file mode 100644 index 0000000000000000000000000000000000000000..e5fddc18fe4219cc621c4285a0faf4591e553d74 --- /dev/null +++ b/checkpoint-6/args.json @@ -0,0 +1,416 @@ +{ + "model": "/mnt/nvme5n1p1/distill-14b-rl-70", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "template": "deepseek_r1", + "system": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\\\boxed{}.", + "max_length": 16384, + "truncation_strategy": "left", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "stage2_aime.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 52, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 1.0, + "top_k": 50, + "top_p": 0.9, + "repetition_penalty": 1.1, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": true, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 15.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.1, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 1, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 2.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": true, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 6.0, + "dataloader_num_workers": 52, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "reward", + "greater_is_better": true, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "wandb" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 32, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "zero_hpz_partition_size": null, + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "num_infer_workers": 8, + "vllm_max_num_seqs": 256, + "vllm_enforce_eager": false, + "vllm_limit_mm_per_prompt": null, + "vllm_enable_prefix_caching": true, + "cosine_min_len_value_wrong": 0.0, + "cosine_max_len_value_wrong": -0.2, + "cosine_min_len_value_correct": 0.8, + "cosine_max_len_value_correct": 0.4, + "cosine_max_len": 12288, + "repetition_n_grams": 40, + "repetition_max_penalty": -0.05, + "use_lmdeploy": false, + "lmdeploy_device": "auto", + "lmdeploy_session_len": null, + "lmdeploy_cache_max_entry_count": 0.8, + "async_generate": false, + "tensor_parallel_size": 1, + "sleep_level": 0, + "move_model_batches": null, + "offload_optimizer": false, + "offload_model": false, + "gc_collect_after_offload": false, + "num_generations": 8, + "max_completion_length": 12288, + "ds3_gather_for_generation": true, + "reward_funcs": [ + "cosine", + "repetition" + ], + "reward_weights": null, + "log_completions": true, + "use_vllm": false, + "vllm_device": [ + "auto" + ], + "vllm_gpu_memory_utilization": 0.9, + "vllm_max_model_len": null, + "num_iterations": 2, + "epsilon": 0.2, + "rlhf_type": "grpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.04, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 32, + "local_world_size": 8, + "model_suffix": "distill-14b-rl-70", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/mnt/nvme5n1p1/distill-14b-rl-70', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "hub": "", + "training_args": "GRPOConfig(output_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=15.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs', logging_strategy=, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=, save_steps=2, save_total_limit=100, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=6, dataloader_num_workers=52, dataloader_prefetch_factor=None, past_index=-1, run_name='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='reward', greater_is_better=True, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, max_prompt_length=512, num_generations=8, max_completion_length=12288, ds3_gather_for_generation=True, temperature=1.0, top_p=0.9, top_k=50, min_p=None, repetition_penalty=1.1, cache_implementation=None, use_vllm=False, vllm_server_host='0.0.0.0', vllm_server_port=8000, vllm_server_timeout=120.0, vllm_guided_decoding_regex=None, beta=0.04, num_iterations=2, epsilon=0.2, epsilon_high=None, reward_weights=None, scale_rewards=True, sync_ref_model=False, ref_model_mixup_alpha=0.6, ref_model_sync_steps=512, log_completions=True, vllm_device=['auto'], vllm_gpu_memory_utilization=0.9, vllm_dtype=None, vllm_max_model_len=None, vllm_enable_prefix_caching=True, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None, num_infer_workers=8, vllm_max_num_seqs=256, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, cosine_min_len_value_wrong=0.0, cosine_max_len_value_wrong=-0.2, cosine_min_len_value_correct=0.8, cosine_max_len_value_correct=0.4, cosine_max_len=12288, repetition_n_grams=40, repetition_max_penalty=-0.05, use_lmdeploy=False, lmdeploy_device='auto', lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, async_generate=False, tensor_parallel_size=1, sleep_level=0, move_model_batches=None, offload_optimizer=False, offload_model=False, gc_collect_after_offload=False, stop_words=[])" +} \ No newline at end of file diff --git a/checkpoint-6/global_step6/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-6/global_step6/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..95a6853477ed041473930b28301992e7666e2fc5 --- /dev/null +++ b/checkpoint-6/global_step6/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e7b550bc7c948f0659dd03ae76706e5c054a882645e70d438a20971495ce212 +size 51616517 diff --git a/checkpoint-6/global_step6/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-6/global_step6/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..467a44f0fb904a67d60b733c9134f47223d98d61 --- /dev/null +++ b/checkpoint-6/global_step6/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52e6035ac29de94ef07b382f4deee2825bbc60e99a33fb87a24e23ba502cc7c1 +size 51616005 diff --git a/checkpoint-6/global_step6/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-6/global_step6/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..022e39d9aa50874882e13436a8e40d47d9630235 --- /dev/null +++ b/checkpoint-6/global_step6/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f00168ba2f622125ad244b457b9c44a9e79f404e41661e95aaec5b89ab52e26d +size 51616517 diff --git a/checkpoint-6/global_step6/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-6/global_step6/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ba0a90e29caf637fee55b16786ccc8d37cfac6c --- /dev/null +++ b/checkpoint-6/global_step6/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1538edd6c7dfc89cdd0e715d605c751569278bbe8bbde99382df36ec9f313ff0 +size 51616005 diff --git a/checkpoint-6/global_step6/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-6/global_step6/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f1b6fd803e48080343a88da3cfd94ec9c336f1a --- /dev/null +++ b/checkpoint-6/global_step6/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52bfa2e179f4c993fbca197eb2d576c6da07755d9484e0056f5e30bbf2795d2a +size 51616517 diff --git a/checkpoint-6/global_step6/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-6/global_step6/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b70a2a0c5c37ddef2b3aedf35ffcbb152bddba12 --- /dev/null +++ b/checkpoint-6/global_step6/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:246c312438d142bc91d2982be87cd04ce2917eb13731b1182e19f37e1b76b0f9 +size 51616005 diff --git a/checkpoint-6/global_step6/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-6/global_step6/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78c3f0ef4d25c0861c06e57756ec59d3531a14db --- /dev/null +++ b/checkpoint-6/global_step6/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f3c3facb0b180f0684a97dafac3b7e055d25a87d70f8e0b30e32520f72276bf +size 51616517 diff --git a/checkpoint-6/global_step6/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-6/global_step6/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c530b6796b691c98c3e40b00f2dee0e2059d732 --- /dev/null +++ b/checkpoint-6/global_step6/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11e6b07d61e0264c3374b34c2ec72e979a94d87754cc3caa2753d38d4e4df494 +size 51616005 diff --git a/checkpoint-6/global_step6/mp_rank_00_model_states.pt b/checkpoint-6/global_step6/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b66d5bbcbbfd1bc2906694e0e3d96da499cd49ce --- /dev/null +++ b/checkpoint-6/global_step6/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebb48788f89492c5135a174357cc9950c38cc47697eac66a93c8d8a288c36cf8 +size 275768601 diff --git a/checkpoint-6/latest b/checkpoint-6/latest new file mode 100644 index 0000000000000000000000000000000000000000..5e749609a281289fae4aebc8b51792a324b6a648 --- /dev/null +++ b/checkpoint-6/latest @@ -0,0 +1 @@ +global_step6 \ No newline at end of file diff --git a/checkpoint-6/rng_state_0.pth b/checkpoint-6/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..aef00db2217e710589d62a37795448508961d9a8 --- /dev/null +++ b/checkpoint-6/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89da4685aa01e8ccf3c1e45aef243397fd6f2fa5a9c245056e8b81488043655e +size 16389 diff --git a/checkpoint-6/rng_state_1.pth b/checkpoint-6/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d48f21e180005994fa5154317f7de447778fc4a4 --- /dev/null +++ b/checkpoint-6/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf7e94f34430d5aeac33d535a7b6785a4cd1eb3a5f38f2261da1da9bdf2997c3 +size 16389 diff --git a/checkpoint-6/rng_state_2.pth b/checkpoint-6/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f863ad8706700a01772b917607a127d72af2cb61 --- /dev/null +++ b/checkpoint-6/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fbec3a0c2b7ce76e63d4debd90c79d645ccf108ae68637179bd8d3e6ba09572 +size 16389 diff --git a/checkpoint-6/rng_state_3.pth b/checkpoint-6/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..4affbf0095babf1391f4a45e7f734b27aec0822c --- /dev/null +++ b/checkpoint-6/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca85352fceed060b57f1128990ab65d3e94d776aaf6e2f7b57dd70347929cdeb +size 16389 diff --git a/checkpoint-6/rng_state_4.pth b/checkpoint-6/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..49ad2c6383d617b671b4e0b7b046415c54e67581 --- /dev/null +++ b/checkpoint-6/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f99429aaa5dcbcb4e026432bbb72eef9cd071d4217136151b0149f62121e9692 +size 16389 diff --git a/checkpoint-6/rng_state_5.pth b/checkpoint-6/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b61bf0932a31a1eb74847ce7819cac13c0ae7ee --- /dev/null +++ b/checkpoint-6/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eafe86d3f302611c888882c98207e2dcd6bbaa7bc48cbb750fad21670a87a87a +size 16325 diff --git a/checkpoint-6/rng_state_6.pth b/checkpoint-6/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f49b9003320957c79d5c98dce65a8e6e3178432 --- /dev/null +++ b/checkpoint-6/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e31b48d947e0c69c3b1ac48fc66d0e37fc8eb23ac1b61c11ac1b1e4e9ded46d8 +size 16389 diff --git a/checkpoint-6/rng_state_7.pth b/checkpoint-6/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..ed748367a83e39d4c0468c77eb583a86d21eeb31 --- /dev/null +++ b/checkpoint-6/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f418adb01deb3ce4234f31eda6618a80494ac9a23da7edfe02efa455943484c8 +size 16453 diff --git a/checkpoint-6/scheduler.pt b/checkpoint-6/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..85029112ff4bc69bbfbc194f15f5e35546c74717 --- /dev/null +++ b/checkpoint-6/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f344ac6f893b1a00243de4b3ef12cab72330fa2b0c7cea6ef65a1fa27713e535 +size 1401 diff --git a/checkpoint-6/trainer_state.json b/checkpoint-6/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3667b41328b83c70faf4e00796621b5ccfe72baf --- /dev/null +++ b/checkpoint-6/trainer_state.json @@ -0,0 +1,131 @@ +{ + "best_metric": 0.012996690347790718, + "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-6", + "epoch": 1.4210526315789473, + "eval_steps": 6, + "global_step": 6, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 10352.974609375, + "epoch": 0.21052631578947367, + "grad_norm": 0.13259537518024445, + "kl": 0.0, + "learning_rate": 1.6666666666666667e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.11328125, + "reward": -0.002658387296833098, + "reward_std": 0.06134121119976044, + "rewards/CosineReward": -0.0026579967816360295, + "rewards/RepetitionPenalty": -3.8975886695880035e-07, + "step": 1, + "train_speed(iter/s)": 0.000242 + }, + { + "clip_ratio": 0.0, + "epoch": 0.42105263157894735, + "grad_norm": 0.1320001482963562, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "step": 2, + "train_speed(iter/s)": 0.000467 + }, + { + "clip_ratio": 1.3441811461234465e-05, + "completion_length": 10439.369140625, + "epoch": 0.631578947368421, + "grad_norm": 0.08990391343832016, + "kl": 9.50181856751442e-07, + "learning_rate": 5e-05, + "loss": -0.06604708731174469, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.13671875, + "reward": 0.0006296975770965219, + "reward_std": 0.07172460854053497, + "rewards/CosineReward": 0.0006298604130279273, + "rewards/RepetitionPenalty": -1.6200439745261974e-07, + "step": 3, + "train_speed(iter/s)": 0.00035 + }, + { + "clip_ratio": 1.70210253145342e-05, + "epoch": 0.8421052631578947, + "grad_norm": 0.0967094898223877, + "kl": 1.1101365089416504e-05, + "learning_rate": 6.666666666666667e-05, + "loss": -0.06727766245603561, + "memory(GiB)": 182.91, + "step": 4, + "train_speed(iter/s)": 0.000458 + }, + { + "clip_ratio": 1.675608473306056e-05, + "completion_length": 10092.408203125, + "epoch": 1.2105263157894737, + "grad_norm": 0.142837256193161, + "kl": 0.00017762184143066406, + "learning_rate": 8.333333333333334e-05, + "loss": -0.09315311908721924, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.119140625, + "reward": -0.005135859013535082, + "reward_std": 0.07994875870645046, + "rewards/CosineReward": -0.005134060338605195, + "rewards/RepetitionPenalty": -1.7973881654143042e-06, + "step": 5, + "train_speed(iter/s)": 0.000387 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.18263348937034607, + "learning_rate": 0.0001, + "loss": -0.1041698157787323, + "memory(GiB)": 182.91, + "step": 6, + "train_speed(iter/s)": 0.000459 + }, + { + "epoch": 1.4210526315789473, + "eval_clip_ratio": 4.069424539920874e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.04833984375, + "eval_loss": -0.5377416610717773, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.012996690347790718, + "eval_reward_std": 0.08769983053207397, + "eval_rewards/CosineReward": 0.012996694073081017, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1030.1127, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 6 + } + ], + "logging_steps": 1, + "max_steps": 60, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 2, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6/training_args.bin b/checkpoint-6/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a5c75ad76398d7f403ccc1a74c463d2dab6465ca --- /dev/null +++ b/checkpoint-6/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb +size 9809 diff --git a/checkpoint-6/zero_to_fp32.py b/checkpoint-6/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-6/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-8/README.md b/checkpoint-8/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4dbd19e3cdbcefaa269791fa922afe075b806c8 --- /dev/null +++ b/checkpoint-8/README.md @@ -0,0 +1,202 @@ +--- +base_model: /mnt/nvme5n1p1/distill-14b-rl-70 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-8/adapter_config.json b/checkpoint-8/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8d06e83be137d4af154849ed1686625c42e280 --- /dev/null +++ b/checkpoint-8/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/mnt/nvme5n1p1/distill-14b-rl-70", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-8/adapter_model.safetensors b/checkpoint-8/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6cbe9fe1cf6dfeecd984e6e33acba6083fa8be8b --- /dev/null +++ b/checkpoint-8/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55ae5de7a62733274764831eca2837abcc6b0a0094e058b45df2e99bc88b14d8 +size 275342392 diff --git a/checkpoint-8/additional_config.json b/checkpoint-8/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/checkpoint-8/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/checkpoint-8/args.json b/checkpoint-8/args.json new file mode 100644 index 0000000000000000000000000000000000000000..e5fddc18fe4219cc621c4285a0faf4591e553d74 --- /dev/null +++ b/checkpoint-8/args.json @@ -0,0 +1,416 @@ +{ + "model": "/mnt/nvme5n1p1/distill-14b-rl-70", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "local_repo_path": null, + "template": "deepseek_r1", + "system": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\\\boxed{}.", + "max_length": 16384, + "truncation_strategy": "left", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "response_prefix": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "stage2_aime.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 52, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "remove_unused_columns": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 1.0, + "top_k": 50, + "top_p": 0.9, + "repetition_penalty": 1.1, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "external_plugins": [], + "seed": 42, + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "use_hf": true, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 4, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 15.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.1, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 1, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 2.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": true, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 6.0, + "dataloader_num_workers": 52, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "reward", + "greater_is_better": true, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "wandb" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 32, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "swanlab_token": null, + "swanlab_project": null, + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_mode": "cloud", + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": null, + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "zero_hpz_partition_size": null, + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "num_infer_workers": 8, + "vllm_max_num_seqs": 256, + "vllm_enforce_eager": false, + "vllm_limit_mm_per_prompt": null, + "vllm_enable_prefix_caching": true, + "cosine_min_len_value_wrong": 0.0, + "cosine_max_len_value_wrong": -0.2, + "cosine_min_len_value_correct": 0.8, + "cosine_max_len_value_correct": 0.4, + "cosine_max_len": 12288, + "repetition_n_grams": 40, + "repetition_max_penalty": -0.05, + "use_lmdeploy": false, + "lmdeploy_device": "auto", + "lmdeploy_session_len": null, + "lmdeploy_cache_max_entry_count": 0.8, + "async_generate": false, + "tensor_parallel_size": 1, + "sleep_level": 0, + "move_model_batches": null, + "offload_optimizer": false, + "offload_model": false, + "gc_collect_after_offload": false, + "num_generations": 8, + "max_completion_length": 12288, + "ds3_gather_for_generation": true, + "reward_funcs": [ + "cosine", + "repetition" + ], + "reward_weights": null, + "log_completions": true, + "use_vllm": false, + "vllm_device": [ + "auto" + ], + "vllm_gpu_memory_utilization": 0.9, + "vllm_max_model_len": null, + "num_iterations": 2, + "epsilon": 0.2, + "rlhf_type": "grpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.04, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 32, + "local_world_size": 8, + "model_suffix": "distill-14b-rl-70", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/mnt/nvme5n1p1/distill-14b-rl-70', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/mnt/nvme5n1p1/distill-14b-rl-70", + "hub": "", + "training_args": "GRPOConfig(output_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=15.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/runs', logging_strategy=, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=, save_steps=2, save_total_limit=100, save_safetensors=True, save_on_each_node=True, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=6, dataloader_num_workers=52, dataloader_prefetch_factor=None, past_index=-1, run_name='/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='reward', greater_is_better=True, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, max_prompt_length=512, num_generations=8, max_completion_length=12288, ds3_gather_for_generation=True, temperature=1.0, top_p=0.9, top_k=50, min_p=None, repetition_penalty=1.1, cache_implementation=None, use_vllm=False, vllm_server_host='0.0.0.0', vllm_server_port=8000, vllm_server_timeout=120.0, vllm_guided_decoding_regex=None, beta=0.04, num_iterations=2, epsilon=0.2, epsilon_high=None, reward_weights=None, scale_rewards=True, sync_ref_model=False, ref_model_mixup_alpha=0.6, ref_model_sync_steps=512, log_completions=True, vllm_device=['auto'], vllm_gpu_memory_utilization=0.9, vllm_dtype=None, vllm_max_model_len=None, vllm_enable_prefix_caching=True, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, is_encoder_decoder=False, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None, num_infer_workers=8, vllm_max_num_seqs=256, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, cosine_min_len_value_wrong=0.0, cosine_max_len_value_wrong=-0.2, cosine_min_len_value_correct=0.8, cosine_max_len_value_correct=0.4, cosine_max_len=12288, repetition_n_grams=40, repetition_max_penalty=-0.05, use_lmdeploy=False, lmdeploy_device='auto', lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, async_generate=False, tensor_parallel_size=1, sleep_level=0, move_model_batches=None, offload_optimizer=False, offload_model=False, gc_collect_after_offload=False, stop_words=[])" +} \ No newline at end of file diff --git a/checkpoint-8/global_step8/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-8/global_step8/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..04a53e39c4aa2e6564e552529f5cb00e49f83e9a --- /dev/null +++ b/checkpoint-8/global_step8/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48c5913dd11a5ba693de428012d29284caa135e335424348e2eef7bd98129703 +size 51616517 diff --git a/checkpoint-8/global_step8/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-8/global_step8/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c58e69aaaa718482563ec5036c611949299d0cb --- /dev/null +++ b/checkpoint-8/global_step8/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d7c76d2f6af67c1449032bac268058dfe1872dee023c64a33bc4323a67cdd28 +size 51616005 diff --git a/checkpoint-8/global_step8/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-8/global_step8/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4b6d19e47948517a8ad72c7e5d7db5a566d23bb --- /dev/null +++ b/checkpoint-8/global_step8/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b02e51bf5f456c00ed571b76eaf92fabcda9e1f32b5f8601a5b756184e515bd7 +size 51616517 diff --git a/checkpoint-8/global_step8/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-8/global_step8/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..546b5b3f452a69f8d0358bd6d3165a91b0e6223c --- /dev/null +++ b/checkpoint-8/global_step8/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d29d4934a58511e763ac51a5103e9ab9a365cc6271db9096c4356c1af1900af +size 51616005 diff --git a/checkpoint-8/global_step8/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-8/global_step8/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4780aa312c9979297f3bb50681d29238be93a4d2 --- /dev/null +++ b/checkpoint-8/global_step8/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f59b07a37da88050e7171e06cb261473994421b16fe5ab2062d99586a48f558f +size 51616517 diff --git a/checkpoint-8/global_step8/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-8/global_step8/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64dce52fe846201bdf3dac6196ef801481f9c4b7 --- /dev/null +++ b/checkpoint-8/global_step8/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52583acca28a73751c69ccd19a4d806ed00933366a53742b974f505e59a342a5 +size 51616005 diff --git a/checkpoint-8/global_step8/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-8/global_step8/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..034c6b2afe9ce12fff57b824c98b108d5df9dd87 --- /dev/null +++ b/checkpoint-8/global_step8/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bd175b91d2979f3a490b7b349c7dda3449a944510ea3eaa91979b9ed2d219a2 +size 51616517 diff --git a/checkpoint-8/global_step8/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-8/global_step8/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4c1178cdfd1aeec8096a472cb0a616088a91722 --- /dev/null +++ b/checkpoint-8/global_step8/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9860cc5512981e9accea095d907689283a7ad233890a4bb1e917f3a78c791a2d +size 51616005 diff --git a/checkpoint-8/global_step8/mp_rank_00_model_states.pt b/checkpoint-8/global_step8/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d807088320fb7b2b5cf835c7385adcdbf22b42bb --- /dev/null +++ b/checkpoint-8/global_step8/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1d14b3d4422464385abf7bb7c479c4e556485ced5ac5504652884dbb8c4b918 +size 275768601 diff --git a/checkpoint-8/latest b/checkpoint-8/latest new file mode 100644 index 0000000000000000000000000000000000000000..0de7bd061fc0ecf2083021d54ffaba3c34cab26a --- /dev/null +++ b/checkpoint-8/latest @@ -0,0 +1 @@ +global_step8 \ No newline at end of file diff --git a/checkpoint-8/rng_state_0.pth b/checkpoint-8/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..1bc330e1a7ee1d1638fadf421a79dbc562d9bb4c --- /dev/null +++ b/checkpoint-8/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb9085ca0aef107aba4ca757aff33ba1de996b57471fec333216785879cf157f +size 16389 diff --git a/checkpoint-8/rng_state_1.pth b/checkpoint-8/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..32c1a84ea58c2d2431b93cdc73fb05ee0879bb68 --- /dev/null +++ b/checkpoint-8/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:418ca3c5e6b03fefa33ecaa0368d8df2c1dc98aa03932c1d557a1d36bf41e370 +size 16389 diff --git a/checkpoint-8/rng_state_2.pth b/checkpoint-8/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..509f6bd019a8e8c8cdd6bd418f312de357cee608 --- /dev/null +++ b/checkpoint-8/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77362e3f82962aa52695dfa595c9136f0dc31d866c119acea4bd30a5866b2cae +size 16389 diff --git a/checkpoint-8/rng_state_3.pth b/checkpoint-8/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..33a6ededfae45c753405afe27c22a3b88990fa56 --- /dev/null +++ b/checkpoint-8/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6438107fa8f60ac06990a38c96dcf6b66fca3e3d8128d709822ac4584d000f2 +size 16389 diff --git a/checkpoint-8/rng_state_4.pth b/checkpoint-8/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..6b545879c5df9d332f3117f429df2de2c3dc02ef --- /dev/null +++ b/checkpoint-8/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c32c3d9883685d73a2b360eede1986ccea9843c843b9361b1d6e325c11b2b38 +size 16389 diff --git a/checkpoint-8/rng_state_5.pth b/checkpoint-8/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..360c125a11f77e7a4297dafd52b3638db063b4fb --- /dev/null +++ b/checkpoint-8/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5557a6b62493a54d94d7eaf56141c8b7f34aeb463cb00873b23571eb4c950cf +size 16325 diff --git a/checkpoint-8/rng_state_6.pth b/checkpoint-8/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..aac0cc13ab2206e91e286ef4e2ae74818f600f6a --- /dev/null +++ b/checkpoint-8/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef1ba5661fe254e3ad3b2c653eff69a4a60eb661573188c80e66de36c715f7f0 +size 16389 diff --git a/checkpoint-8/rng_state_7.pth b/checkpoint-8/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..e1ae5491346dc867467771554291b91eb2bbdc43 --- /dev/null +++ b/checkpoint-8/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d18e4c5cb76e76f350dd31ebaf3a356f2ada2cd1fa91e890e523c499131b477 +size 16453 diff --git a/checkpoint-8/scheduler.pt b/checkpoint-8/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..28050b96c373c5a0152f99d97875c24d2137d7da --- /dev/null +++ b/checkpoint-8/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7bc5d7a65c47658fa034b859c688d0d969b1a38a8c8e587dd72d0a12fc727be +size 1401 diff --git a/checkpoint-8/trainer_state.json b/checkpoint-8/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1878466a3ed5546ada8088e46201029d0f48bef5 --- /dev/null +++ b/checkpoint-8/trainer_state.json @@ -0,0 +1,159 @@ +{ + "best_metric": 0.012996690347790718, + "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-6", + "epoch": 1.8421052631578947, + "eval_steps": 6, + "global_step": 8, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 10352.974609375, + "epoch": 0.21052631578947367, + "grad_norm": 0.13259537518024445, + "kl": 0.0, + "learning_rate": 1.6666666666666667e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.11328125, + "reward": -0.002658387296833098, + "reward_std": 0.06134121119976044, + "rewards/CosineReward": -0.0026579967816360295, + "rewards/RepetitionPenalty": -3.8975886695880035e-07, + "step": 1, + "train_speed(iter/s)": 0.000242 + }, + { + "clip_ratio": 0.0, + "epoch": 0.42105263157894735, + "grad_norm": 0.1320001482963562, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-05, + "loss": -0.11016345024108887, + "memory(GiB)": 182.91, + "step": 2, + "train_speed(iter/s)": 0.000467 + }, + { + "clip_ratio": 1.3441811461234465e-05, + "completion_length": 10439.369140625, + "epoch": 0.631578947368421, + "grad_norm": 0.08990391343832016, + "kl": 9.50181856751442e-07, + "learning_rate": 5e-05, + "loss": -0.06604708731174469, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.13671875, + "reward": 0.0006296975770965219, + "reward_std": 0.07172460854053497, + "rewards/CosineReward": 0.0006298604130279273, + "rewards/RepetitionPenalty": -1.6200439745261974e-07, + "step": 3, + "train_speed(iter/s)": 0.00035 + }, + { + "clip_ratio": 1.70210253145342e-05, + "epoch": 0.8421052631578947, + "grad_norm": 0.0967094898223877, + "kl": 1.1101365089416504e-05, + "learning_rate": 6.666666666666667e-05, + "loss": -0.06727766245603561, + "memory(GiB)": 182.91, + "step": 4, + "train_speed(iter/s)": 0.000458 + }, + { + "clip_ratio": 1.675608473306056e-05, + "completion_length": 10092.408203125, + "epoch": 1.2105263157894737, + "grad_norm": 0.142837256193161, + "kl": 0.00017762184143066406, + "learning_rate": 8.333333333333334e-05, + "loss": -0.09315311908721924, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.119140625, + "reward": -0.005135859013535082, + "reward_std": 0.07994875870645046, + "rewards/CosineReward": -0.005134060338605195, + "rewards/RepetitionPenalty": -1.7973881654143042e-06, + "step": 5, + "train_speed(iter/s)": 0.000387 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.18263348937034607, + "learning_rate": 0.0001, + "loss": -0.1041698157787323, + "memory(GiB)": 182.91, + "step": 6, + "train_speed(iter/s)": 0.000459 + }, + { + "epoch": 1.4210526315789473, + "eval_clip_ratio": 4.069424539920874e-05, + "eval_completion_length": 12289.0, + "eval_kl": 0.04833984375, + "eval_loss": -0.5377416610717773, + "eval_response_clip_ratio": 1.0, + "eval_reward": 0.012996690347790718, + "eval_reward_std": 0.08769983053207397, + "eval_rewards/CosineReward": 0.012996694073081017, + "eval_rewards/RepetitionPenalty": 0.0, + "eval_runtime": 1030.1127, + "eval_samples_per_second": 0.001, + "eval_steps_per_second": 0.001, + "step": 6 + }, + { + "clip_ratio": 0.0005237623976199757, + "completion_length": 10448.94921875, + "epoch": 1.631578947368421, + "grad_norm": 0.1291271299123764, + "kl": 0.017406463623046875, + "learning_rate": 9.991540791356342e-05, + "loss": -0.051375165581703186, + "memory(GiB)": 182.91, + "response_clip_ratio": 0.1484375, + "reward": 0.004909618757665157, + "reward_std": 0.08167182095348835, + "rewards/CosineReward": 0.004909833543933928, + "rewards/RepetitionPenalty": -2.1478646772266075e-07, + "step": 7, + "train_speed(iter/s)": 0.000382 + }, + { + "clip_ratio": 0.1706484742462635, + "epoch": 1.8421052631578947, + "grad_norm": 0.26641014218330383, + "kl": 0.089599609375, + "learning_rate": 9.966191788709716e-05, + "loss": -0.05105742812156677, + "memory(GiB)": 182.91, + "step": 8, + "train_speed(iter/s)": 0.000433 + } + ], + "logging_steps": 1, + "max_steps": 60, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 2, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-8/training_args.bin b/checkpoint-8/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a5c75ad76398d7f403ccc1a74c463d2dab6465ca --- /dev/null +++ b/checkpoint-8/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb +size 9809 diff --git a/checkpoint-8/zero_to_fp32.py b/checkpoint-8/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-8/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/completions.jsonl b/completions.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..64d7f11d8bb549e230f74f7be12284ae7fde281c --- /dev/null +++ b/completions.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f346f7b39aa387fa72d61ce3ee2de45b0102460c7388eae4dddcb5467b4d18da +size 240255328 diff --git a/logging.jsonl b/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..328ba4f1d491f854f9a647c4b194d0d6e5f840c8 --- /dev/null +++ b/logging.jsonl @@ -0,0 +1,30 @@ +{"loss": -0.11016345, "grad_norm": 0.13259538, "learning_rate": 1.667e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000242, "completion_length": 10352.97460938, "response_clip_ratio": 0.11328125, "rewards/CosineReward": -0.002658, "rewards/RepetitionPenalty": -3.9e-07, "reward": -0.00265839, "reward_std": 0.06134121, "kl": 0.0, "clip_ratio": 0.0, "epoch": 0.21052632, "global_step/max_steps": "1/60", "percentage": "1.67%", "elapsed_time": "1h 8m 54s", "remaining_time": "2d 19h 45m 49s"} +{"loss": -0.11016345, "grad_norm": 0.13200015, "learning_rate": 3.333e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000467, "kl": 0.0, "clip_ratio": 0.0, "epoch": 0.42105263, "global_step/max_steps": "2/60", "percentage": "3.33%", "elapsed_time": "1h 11m 21s", "remaining_time": "1d 10h 29m 18s"} +{"loss": -0.06604709, "grad_norm": 0.08990391, "learning_rate": 5e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.00035, "completion_length": 10439.36914062, "response_clip_ratio": 0.13671875, "rewards/CosineReward": 0.00062986, "rewards/RepetitionPenalty": -1.6e-07, "reward": 0.0006297, "reward_std": 0.07172461, "kl": 9.5e-07, "clip_ratio": 1.344e-05, "epoch": 0.63157895, "global_step/max_steps": "3/60", "percentage": "5.00%", "elapsed_time": "2h 22m 48s", "remaining_time": "1d 21h 13m 12s"} +{"loss": -0.06727766, "grad_norm": 0.09670949, "learning_rate": 6.667e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000458, "kl": 1.11e-05, "clip_ratio": 1.702e-05, "epoch": 0.84210526, "global_step/max_steps": "4/60", "percentage": "6.67%", "elapsed_time": "2h 25m 27s", "remaining_time": "1d 9h 56m 30s"} +{"loss": -0.09315312, "grad_norm": 0.14283726, "learning_rate": 8.333e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000387, "completion_length": 10092.40820312, "response_clip_ratio": 0.11914062, "rewards/CosineReward": -0.00513406, "rewards/RepetitionPenalty": -1.8e-06, "reward": -0.00513586, "reward_std": 0.07994876, "kl": 0.00017762, "clip_ratio": 1.676e-05, "epoch": 1.21052632, "global_step/max_steps": "5/60", "percentage": "8.33%", "elapsed_time": "3h 35m 22s", "remaining_time": "1d 15h 29m 7s"} +{"loss": -0.10416982, "grad_norm": 0.18263349, "learning_rate": 0.0001, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000459, "epoch": 1.42105263, "global_step/max_steps": "6/60", "percentage": "10.00%", "elapsed_time": "3h 37m 55s", "remaining_time": "1d 8h 41m 23s"} +{"eval_loss": -0.53774166, "eval_completion_length": 12289.0, "eval_response_clip_ratio": 1.0, "eval_rewards/CosineReward": 0.01299669, "eval_rewards/RepetitionPenalty": 0.0, "eval_reward": 0.01299669, "eval_reward_std": 0.08769983, "eval_kl": 0.04833984, "eval_clip_ratio": 4.069e-05, "eval_runtime": 1030.1127, "eval_samples_per_second": 0.001, "eval_steps_per_second": 0.001, "epoch": 1.42105263, "global_step/max_steps": "6/60", "percentage": "10.00%", "elapsed_time": "3h 55m 6s", "remaining_time": "1d 11h 15m 54s"} +{"loss": -0.05137517, "grad_norm": 0.12912713, "learning_rate": 9.992e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000382, "kl": 0.01740646, "clip_ratio": 0.00052376, "completion_length": 10448.94921875, "response_clip_ratio": 0.1484375, "rewards/CosineReward": 0.00490983, "rewards/RepetitionPenalty": -2.1e-07, "reward": 0.00490962, "reward_std": 0.08167182, "epoch": 1.63157895, "global_step/max_steps": "7/60", "percentage": "11.67%", "elapsed_time": "5h 5m 17s", "remaining_time": "1d 14h 31m 31s"} +{"loss": -0.05105743, "grad_norm": 0.26641014, "learning_rate": 9.966e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000433, "kl": 0.08959961, "clip_ratio": 0.17064847, "epoch": 1.84210526, "global_step/max_steps": "8/60", "percentage": "13.33%", "elapsed_time": "5h 7m 58s", "remaining_time": "1d 9h 21m 52s"} +{"loss": -0.05842069, "grad_norm": 0.10375156, "learning_rate": 9.924e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000396, "completion_length": 10432.38476562, "response_clip_ratio": 0.25585938, "rewards/CosineReward": 0.03643618, "rewards/RepetitionPenalty": -8e-08, "reward": 0.0364361, "reward_std": 0.11898956, "kl": 0.09631348, "clip_ratio": 9.48e-06, "epoch": 2.21052632, "global_step/max_steps": "9/60", "percentage": "15.00%", "elapsed_time": "6h 19m 3s", "remaining_time": "1d 11h 47m 59s"} +{"loss": -0.06491819, "grad_norm": 0.09477334, "learning_rate": 9.865e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000436, "kl": 0.11853027, "clip_ratio": 0.00360884, "epoch": 2.42105263, "global_step/max_steps": "10/60", "percentage": "16.67%", "elapsed_time": "6h 21m 47s", "remaining_time": "1d 7h 48m 57s"} +{"loss": -0.04600232, "grad_norm": 0.0673914, "learning_rate": 9.79e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000405, "completion_length": 10559.296875, "response_clip_ratio": 0.36132812, "rewards/CosineReward": 0.02320497, "rewards/RepetitionPenalty": -3.3e-07, "reward": 0.02320464, "reward_std": 0.10593635, "kl": 0.12756348, "clip_ratio": 1.296e-05, "epoch": 2.63157895, "global_step/max_steps": "11/60", "percentage": "18.33%", "elapsed_time": "7h 32m 52s", "remaining_time": "1d 9h 37m 21s"} +{"loss": -0.05069057, "grad_norm": 0.05781339, "learning_rate": 9.698e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000439, "epoch": 2.84210526, "global_step/max_steps": "12/60", "percentage": "20.00%", "elapsed_time": "7h 35m 40s", "remaining_time": "1d 6h 22m 42s"} +{"eval_loss": 0.17524278, "eval_completion_length": 12289.0, "eval_response_clip_ratio": 1.0, "eval_rewards/CosineReward": 0.03234308, "eval_rewards/RepetitionPenalty": 0.0, "eval_reward": 0.03234308, "eval_reward_std": 0.10685289, "eval_kl": 0.22753906, "eval_clip_ratio": 4.392e-05, "eval_runtime": 1025.9041, "eval_samples_per_second": 0.001, "eval_steps_per_second": 0.001, "epoch": 2.84210526, "global_step/max_steps": "12/60", "percentage": "20.00%", "elapsed_time": "7h 52m 46s", "remaining_time": "1d 7h 31m 5s"} +{"loss": -0.02191038, "grad_norm": 0.01199417, "learning_rate": 9.591e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000399, "kl": 0.15112305, "clip_ratio": 0.00079083, "completion_length": 10652.93945312, "response_clip_ratio": 0.41992188, "rewards/CosineReward": 0.03598418, "rewards/RepetitionPenalty": -4.2e-07, "reward": 0.03598376, "reward_std": 0.1155337, "epoch": 3.21052632, "global_step/max_steps": "13/60", "percentage": "21.67%", "elapsed_time": "9h 3m 21s", "remaining_time": "1d 8h 44m 28s"} +{"loss": -0.02267258, "grad_norm": 0.01075426, "learning_rate": 9.468e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000427, "kl": 0.16918945, "clip_ratio": 0.00048213, "epoch": 3.42105263, "global_step/max_steps": "14/60", "percentage": "23.33%", "elapsed_time": "9h 5m 55s", "remaining_time": "1d 5h 53m 44s"} +{"loss": -0.05979916, "grad_norm": 0.01779361, "learning_rate": 9.33e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000406, "completion_length": 10482.14648438, "response_clip_ratio": 0.4765625, "rewards/CosineReward": 0.03584346, "rewards/RepetitionPenalty": -1.5e-07, "reward": 0.03584332, "reward_std": 0.11829412, "kl": 0.16674805, "clip_ratio": 1.962e-05, "epoch": 3.63157895, "global_step/max_steps": "15/60", "percentage": "25.00%", "elapsed_time": "10h 15m 57s", "remaining_time": "1d 6h 47m 53s"} +{"loss": -0.06071458, "grad_norm": 0.01321639, "learning_rate": 9.177e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000431, "kl": 0.16748047, "clip_ratio": 0.00011349, "epoch": 3.84210526, "global_step/max_steps": "16/60", "percentage": "26.67%", "elapsed_time": "10h 18m 45s", "remaining_time": "1d 4h 21m 34s"} +{"loss": -0.04504441, "grad_norm": 0.00835275, "learning_rate": 9.011e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.00041, "completion_length": 10822.3515625, "response_clip_ratio": 0.5625, "rewards/CosineReward": 0.02731912, "rewards/RepetitionPenalty": -2.3e-07, "reward": 0.02731888, "reward_std": 0.10441224, "kl": 0.17871094, "clip_ratio": 2.486e-05, "epoch": 4.21052632, "global_step/max_steps": "17/60", "percentage": "28.33%", "elapsed_time": "11h 31m 31s", "remaining_time": "1d 5h 9m 9s"} +{"loss": -0.04548755, "grad_norm": 0.0059984, "learning_rate": 8.83e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000432, "epoch": 4.42105263, "global_step/max_steps": "18/60", "percentage": "30.00%", "elapsed_time": "11h 34m 19s", "remaining_time": "1d 3h 0m 6s"} +{"eval_loss": -0.3821989, "eval_completion_length": 12289.0, "eval_response_clip_ratio": 1.0, "eval_rewards/CosineReward": 0.03729328, "eval_rewards/RepetitionPenalty": 0.0, "eval_reward": 0.03729328, "eval_reward_std": 0.10691347, "eval_kl": 0.18359375, "eval_clip_ratio": 2.287e-05, "eval_runtime": 1041.231, "eval_samples_per_second": 0.001, "eval_steps_per_second": 0.001, "epoch": 4.42105263, "global_step/max_steps": "18/60", "percentage": "30.00%", "elapsed_time": "11h 51m 41s", "remaining_time": "1d 3h 40m 35s"} +{"loss": -0.03466903, "grad_norm": 0.00707562, "learning_rate": 8.637e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000404, "kl": 0.18200684, "clip_ratio": 6.176e-05, "completion_length": 10454.50390625, "response_clip_ratio": 0.46679688, "rewards/CosineReward": 0.04070046, "rewards/RepetitionPenalty": -1.29e-06, "reward": 0.04069917, "reward_std": 0.11991006, "epoch": 4.63157895, "global_step/max_steps": "19/60", "percentage": "31.67%", "elapsed_time": "13h 2m 57s", "remaining_time": "1d 4h 9m 31s"} +{"loss": -0.03502114, "grad_norm": 0.00589657, "learning_rate": 8.431e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000424, "kl": 0.19287109, "clip_ratio": 6.069e-05, "epoch": 4.84210526, "global_step/max_steps": "20/60", "percentage": "33.33%", "elapsed_time": "13h 5m 40s", "remaining_time": "1d 2h 11m 20s"} +{"loss": -0.00866277, "grad_norm": 0.00415454, "learning_rate": 8.214e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000408, "completion_length": 10645.05664062, "response_clip_ratio": 0.5625, "rewards/CosineReward": 0.04996993, "rewards/RepetitionPenalty": -1.2e-07, "reward": 0.04996981, "reward_std": 0.1384942, "kl": 0.17626953, "clip_ratio": 3.873e-05, "epoch": 5.21052632, "global_step/max_steps": "21/60", "percentage": "35.00%", "elapsed_time": "14h 18m 46s", "remaining_time": "1d 2h 34m 52s"} +{"loss": -0.00874364, "grad_norm": 0.00430067, "learning_rate": 7.986e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000426, "kl": 0.17895508, "clip_ratio": 5.869e-05, "epoch": 5.42105263, "global_step/max_steps": "22/60", "percentage": "36.67%", "elapsed_time": "14h 21m 16s", "remaining_time": "1d 0h 47m 38s"} +{"loss": -0.03423421, "grad_norm": 0.01327698, "learning_rate": 7.748e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.00041, "completion_length": 10538.07226562, "response_clip_ratio": 0.58398438, "rewards/CosineReward": 0.03447545, "rewards/RepetitionPenalty": -6.61e-06, "reward": 0.03446883, "reward_std": 0.11841745, "kl": 0.1796875, "clip_ratio": 4.635e-05, "epoch": 5.63157895, "global_step/max_steps": "23/60", "percentage": "38.33%", "elapsed_time": "15h 34m 17s", "remaining_time": "1d 1h 2m 58s"} +{"loss": -0.03426633, "grad_norm": 0.01413172, "learning_rate": 7.5e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000427, "epoch": 5.84210526, "global_step/max_steps": "24/60", "percentage": "40.00%", "elapsed_time": "15h 37m 2s", "remaining_time": "23h 25m 34s"} +{"eval_loss": 0.36124694, "eval_completion_length": 12289.0, "eval_response_clip_ratio": 1.0, "eval_rewards/CosineReward": 0.04339283, "eval_rewards/RepetitionPenalty": 0.0, "eval_reward": 0.04339283, "eval_reward_std": 0.10456254, "eval_kl": 0.19824219, "eval_clip_ratio": 4.069e-05, "eval_runtime": 1045.0632, "eval_samples_per_second": 0.001, "eval_steps_per_second": 0.001, "epoch": 5.84210526, "global_step/max_steps": "24/60", "percentage": "40.00%", "elapsed_time": "15h 54m 27s", "remaining_time": "23h 51m 41s"} +{"loss": -0.02097315, "grad_norm": 0.00993353, "learning_rate": 7.244e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000406, "kl": 0.18005371, "clip_ratio": 5.057e-05, "completion_length": 10789.25976562, "response_clip_ratio": 0.6171875, "rewards/CosineReward": 0.03010232, "rewards/RepetitionPenalty": -2.6e-07, "reward": 0.03010206, "reward_std": 0.10742512, "epoch": 6.21052632, "global_step/max_steps": "25/60", "percentage": "41.67%", "elapsed_time": "17h 6m 5s", "remaining_time": "23h 56m 31s"} +{"loss": -0.02103914, "grad_norm": 0.00989576, "learning_rate": 6.98e-05, "memory(GiB)": 182.91, "train_speed(iter/s)": 0.000421, "kl": 0.18408203, "clip_ratio": 4.822e-05, "epoch": 6.42105263, "global_step/max_steps": "26/60", "percentage": "43.33%", "elapsed_time": "17h 8m 51s", "remaining_time": "22h 25m 25s"} diff --git a/val_dataset.jsonl b/val_dataset.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c86b3ed772bd43d5c8d9897ae6c0821941a608a1 --- /dev/null +++ b/val_dataset.jsonl @@ -0,0 +1 @@ +{"messages": [{"role": "user", "content": "Let $f(n)$ and $g(n)$ be functions satisfying \\[f(n) = \\begin{cases} \\sqrt{n} & \\text{ if } \\sqrt{n} \\text{ is an integer}\\\\ 1 + f(n+1) & \\text{ otherwise} \\end{cases}\\] and \\[g(n) = \\begin{cases}\\sqrt{n} & \\text{ if } \\sqrt{n} \\text{ is an integer}\\\\ 2 + g(n+2) & \\text{ otherwise} \\end{cases}\\] for positive integers $n$ . Find the least positive integer $n$ such that $\\tfrac{f(n)}{g(n)} = \\tfrac{4}{7}$ ."}, {"role": "assistant", "content": "258"}], "solution": "258"}