diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/args.json
new file mode 100644
index 0000000000000000000000000000000000000000..74b472bbbc8748201a9f8fe1dbb9fc5e9bb7d0a7
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/args.json
@@ -0,0 +1,371 @@
+{
+  "model": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "model_type": "marco_o1",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "num_labels": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "local_repo_path": null,
+  "template": "marco_o1",
+  "system": "You are a helpful assistant.",
+  "max_length": 4200,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "tools_prompt": "react_en",
+  "padding_side": "right",
+  "loss_scale": "last_round",
+  "sequence_parallel_size": 1,
+  "use_chat_template": true,
+  "template_backend": "swift",
+  "dataset": [
+    "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_random20_system.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.01,
+  "data_seed": 42,
+  "dataset_num_proc": 1,
+  "streaming": false,
+  "enable_cache": false,
+  "download_mode": "reuse_dataset_if_exists",
+  "strict": false,
+  "model_name": [
+    null,
+    null
+  ],
+  "model_author": [
+    null,
+    null
+  ],
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.7,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": null,
+  "load_dataset_config": null,
+  "lora_modules": [],
+  "tuner_backend": "peft",
+  "train_type": "lora",
+  "adapters": [],
+  "seed": 42,
+  "model_kwargs": {},
+  "load_args": true,
+  "load_data_args": false,
+  "use_hf": false,
+  "hub_token": null,
+  "custom_register_path": [],
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "steps",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 1,
+  "per_device_eval_batch_size": 1,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 4,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 0.0001,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.999,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 3.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.05,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 5,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "steps",
+  "save_steps": 20.0,
+  "save_total_limit": 200,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": false,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "jit_mode_eval": false,
+  "use_ipex": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 20.0,
+  "dataloader_num_workers": 4,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": null,
+  "disable_tqdm": null,
+  "remove_unused_columns": false,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": "",
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "deepspeed": {
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "overlap_comm": true,
+      "contiguous_gradients": true,
+      "sub_group_size": 1000000000.0,
+      "reduce_bucket_size": "auto",
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1000000000.0,
+      "stage3_max_reuse_distance": 1000000000.0,
+      "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+  },
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "evaluation_strategy": "steps",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 1800,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "dispatch_batches": null,
+  "split_batches": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": false,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "freeze_parameters": [],
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": true,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "use_liger": false,
+  "model_layer_cls_name": null,
+  "metric_warmup_step": 0,
+  "fsdp_num": 1,
+  "acc_steps": 1,
+  "add_version": true,
+  "resume_only_model": false,
+  "check_model": true,
+  "packing": false,
+  "lazy_tokenize": false,
+  "loss_type": "sigmoid",
+  "optimizer": null,
+  "metric": null,
+  "acc_strategy": "token",
+  "reward_model": null,
+  "reward_adapters": [],
+  "reward_model_type": null,
+  "reward_model_revision": null,
+  "num_ppo_epochs": 4,
+  "whiten_rewards": false,
+  "kl_coef": 0.05,
+  "cliprange": 0.2,
+  "vf_coef": 0.1,
+  "cliprange_value": 0.2,
+  "gamma": 1.0,
+  "lam": 0.95,
+  "num_mini_batches": 1,
+  "local_rollout_forward_batch_size": 64,
+  "num_sample_generations": 10,
+  "response_length": 512,
+  "missing_eos_penalty": null,
+  "rlhf_type": "dpo",
+  "ref_model": null,
+  "ref_model_type": null,
+  "ref_model_revision": null,
+  "beta": 0.1,
+  "label_smoothing": 0,
+  "rpo_alpha": 1.0,
+  "cpo_alpha": 1.0,
+  "simpo_gamma": 1,
+  "desirable_weight": 1.0,
+  "undesirable_weight": 1.0,
+  "rank": 0,
+  "global_world_size": 4,
+  "local_world_size": 4,
+  "model_suffix": "Marco-o1",
+  "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fba8f06ca60>, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])",
+  "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, 
metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, 
include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=<FDivergenceType.REVERSE_KL: 'reverse_kl'>, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)"
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/README.md b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6be0c96a909959c265a0ab602cd067e0d3fbbde6
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/README.md
@@ -0,0 +1,202 @@
+---
+base_model: /home/wangruotong/LLM_test/Models/Marco-o1
+library_name: peft
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.14.0
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/adapter_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..19b533627149e49064da1e0499ae385be3ba91cf
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/adapter_config.json
@@ -0,0 +1,37 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": [],
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..769434819ffdba0e62edfbea133a5f3f81419b36
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9e52255857ec7ffa31d4b7a55befca1557263cad59e8426b72706443dc8bf45
+size 40422208
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/additional_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/additional_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/additional_config.json
@@ -0,0 +1 @@
+{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/args.json
new file mode 100644
index 0000000000000000000000000000000000000000..74b472bbbc8748201a9f8fe1dbb9fc5e9bb7d0a7
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/args.json
@@ -0,0 +1,371 @@
+{
+  "model": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "model_type": "marco_o1",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "num_labels": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "local_repo_path": null,
+  "template": "marco_o1",
+  "system": "You are a helpful assistant.",
+  "max_length": 4200,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "tools_prompt": "react_en",
+  "padding_side": "right",
+  "loss_scale": "last_round",
+  "sequence_parallel_size": 1,
+  "use_chat_template": true,
+  "template_backend": "swift",
+  "dataset": [
+    "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_random20_system.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.01,
+  "data_seed": 42,
+  "dataset_num_proc": 1,
+  "streaming": false,
+  "enable_cache": false,
+  "download_mode": "reuse_dataset_if_exists",
+  "strict": false,
+  "model_name": [
+    null,
+    null
+  ],
+  "model_author": [
+    null,
+    null
+  ],
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.7,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": null,
+  "load_dataset_config": null,
+  "lora_modules": [],
+  "tuner_backend": "peft",
+  "train_type": "lora",
+  "adapters": [],
+  "seed": 42,
+  "model_kwargs": {},
+  "load_args": true,
+  "load_data_args": false,
+  "use_hf": false,
+  "hub_token": null,
+  "custom_register_path": [],
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "steps",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 1,
+  "per_device_eval_batch_size": 1,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 4,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 0.0001,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.999,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 3.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.05,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 5,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "steps",
+  "save_steps": 20.0,
+  "save_total_limit": 200,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": false,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "jit_mode_eval": false,
+  "use_ipex": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 20.0,
+  "dataloader_num_workers": 4,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": null,
+  "disable_tqdm": null,
+  "remove_unused_columns": false,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": "",
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "deepspeed": {
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "overlap_comm": true,
+      "contiguous_gradients": true,
+      "sub_group_size": 1000000000.0,
+      "reduce_bucket_size": "auto",
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1000000000.0,
+      "stage3_max_reuse_distance": 1000000000.0,
+      "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+  },
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "evaluation_strategy": "steps",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 1800,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "dispatch_batches": null,
+  "split_batches": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": false,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "freeze_parameters": [],
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": true,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "use_liger": false,
+  "model_layer_cls_name": null,
+  "metric_warmup_step": 0,
+  "fsdp_num": 1,
+  "acc_steps": 1,
+  "add_version": true,
+  "resume_only_model": false,
+  "check_model": true,
+  "packing": false,
+  "lazy_tokenize": false,
+  "loss_type": "sigmoid",
+  "optimizer": null,
+  "metric": null,
+  "acc_strategy": "token",
+  "reward_model": null,
+  "reward_adapters": [],
+  "reward_model_type": null,
+  "reward_model_revision": null,
+  "num_ppo_epochs": 4,
+  "whiten_rewards": false,
+  "kl_coef": 0.05,
+  "cliprange": 0.2,
+  "vf_coef": 0.1,
+  "cliprange_value": 0.2,
+  "gamma": 1.0,
+  "lam": 0.95,
+  "num_mini_batches": 1,
+  "local_rollout_forward_batch_size": 64,
+  "num_sample_generations": 10,
+  "response_length": 512,
+  "missing_eos_penalty": null,
+  "rlhf_type": "dpo",
+  "ref_model": null,
+  "ref_model_type": null,
+  "ref_model_revision": null,
+  "beta": 0.1,
+  "label_smoothing": 0,
+  "rpo_alpha": 1.0,
+  "cpo_alpha": 1.0,
+  "simpo_gamma": 1,
+  "desirable_weight": 1.0,
+  "undesirable_weight": 1.0,
+  "rank": 0,
+  "global_world_size": 4,
+  "local_world_size": 4,
+  "model_suffix": "Marco-o1",
+  "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fba8f06ca60>, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])",
+  "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, 
metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, 
include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=<FDivergenceType.REVERSE_KL: 'reverse_kl'>, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)"
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f451a96f9827b26925ff5d2aea804cd5b88d76d8
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d8a85458b2b2064332e1f0a7f65d87cdcd60c0c659d728f67111e110f934661
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9ae9a2dad6329a572ce9c358363ba3641f862802
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:899c32d86e329a048bc56fc532041672db03a64112324ce74ede0991b822733d
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..946c94728c60c93a444d4187bc77833cb51b8214
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33638aa13e6f9200e30ea1ee43d5a987332a7b81042c25d8f358c427b6f6ca28
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..68009d6a413cc382144abe0ded9c4129e8439b47
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06d72f5b84e6381c6e489dbf6625fcf8bb91d408b25438d55ad7f96ec3fed34b
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3f00124e5a0ea627b8addbbf3644b01ed2e2e835
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3052be4c3806c2550bace460681d1af2b3beeb7973b270c598ff4095198a4fc
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0099b653969fd7512c604969ec5d2e07300ef118
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89a650af753c3f9853d157a148cbf7292ea7973999c01035dc62eaa2ffd1a958
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b7b0f50bc9d960908ba49f55ba4c088080152779
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ffdb58c9f32cb1cc2fec92b915d06c00271f6c23b3b162644a43d8bc322fad23
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f694c5feaaca0ce72187371fc6ee20956a7d7ecb
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55b9d5a7067f65dce1a3a7f5943fce25d44ab9d56b2097dfea4aafe4bf85b681
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/latest
new file mode 100644
index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/latest
@@ -0,0 +1 @@
+global_step20
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..37ac50652a3badbfb1bdeaccb8b1934575b584eb
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bbe0d720c4c75a6a04213fa3b64bacbe794718a53e2b56ebb67a1a795014dfad
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0bc3650851dae439677613c9e23a5528de47b679
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72452d3138d0ca2ff89429e3294a834ae7a68e8596fc757735ca56ae52509d57
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0e00a6e8b4b743026f68d749a8cb3bdd4b746838
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f36e306fb8ebcf53a167bfd6c9af74db410a269ada1e619e3e816f5269543b9d
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..5354141d42e077c356f9ca8c6b12bd7e5e41f2af
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb47ce0c6f815a6f8302b0e3819b4c2315ca71dae3138d97fdceb765cdd0a039
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/scheduler.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c9db8a98ca69cd5bfebe102039231d58d7ea374e
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c42b1ef948ce2918e44502f72db30bc09f3c40f0dfa68050c22f884d1aac4ff5
+size 1064
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/trainer_state.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9fed2b18b2d15786dd38faf631940c18e0068ffa
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/trainer_state.json
@@ -0,0 +1,140 @@
+{
+  "best_metric": 0.43920898,
+  "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/checkpoint-20",
+  "epoch": 0.8080808080808081,
+  "eval_steps": 20,
+  "global_step": 20,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.04040404040404041,
+      "grad_norm": 2.741686768010279,
+      "learning_rate": 2.5e-05,
+      "logits/chosen": -0.3984375,
+      "logits/rejected": 0.2080078125,
+      "logps/chosen": -282.0,
+      "logps/rejected": -272.0,
+      "loss": 1.7568359375,
+      "memory(GiB)": 14.02,
+      "nll_loss": 1.5859375,
+      "rewards/accuracies": 0.0,
+      "rewards/chosen": 0.0,
+      "rewards/margins": 0.0,
+      "rewards/rejected": 0.0,
+      "step": 1,
+      "train_speed(iter/s)": 0.062883
+    },
+    {
+      "epoch": 0.20202020202020202,
+      "grad_norm": 3.179213138025169,
+      "learning_rate": 9.994664874011863e-05,
+      "logits/chosen": -0.734375,
+      "logits/rejected": -0.185546875,
+      "logps/chosen": -358.0,
+      "logps/rejected": -516.0,
+      "loss": 1.7740478515625,
+      "memory(GiB)": 31.0,
+      "nll_loss": 1.0234375,
+      "rewards/accuracies": 0.5,
+      "rewards/chosen": 0.287109375,
+      "rewards/margins": 0.1455078125,
+      "rewards/rejected": 0.1416015625,
+      "step": 5,
+      "train_speed(iter/s)": 0.084048
+    },
+    {
+      "epoch": 0.40404040404040403,
+      "grad_norm": 1.1865745356779005,
+      "learning_rate": 9.809128215864097e-05,
+      "logits/chosen": -0.5078125,
+      "logits/rejected": -0.036376953125,
+      "logps/chosen": -348.0,
+      "logps/rejected": -552.0,
+      "loss": 1.3421142578125,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.8203125,
+      "rewards/accuracies": 0.8500000238418579,
+      "rewards/chosen": 1.6015625,
+      "rewards/margins": 0.91796875,
+      "rewards/rejected": 0.6875,
+      "step": 10,
+      "train_speed(iter/s)": 0.086989
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 0.2081256636056697,
+      "learning_rate": 9.368111953231848e-05,
+      "logits/chosen": -0.23828125,
+      "logits/rejected": -0.703125,
+      "logps/chosen": -368.0,
+      "logps/rejected": -284.0,
+      "loss": 0.77510986328125,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.67578125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 2.984375,
+      "rewards/margins": 3.3125,
+      "rewards/rejected": -0.33203125,
+      "step": 15,
+      "train_speed(iter/s)": 0.09061
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "grad_norm": 0.12928414880774924,
+      "learning_rate": 8.695044586103296e-05,
+      "logits/chosen": -0.2060546875,
+      "logits/rejected": -0.61328125,
+      "logps/chosen": -458.0,
+      "logps/rejected": -310.0,
+      "loss": 0.53414306640625,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.578125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 3.34375,
+      "rewards/margins": 5.1875,
+      "rewards/rejected": -1.859375,
+      "step": 20,
+      "train_speed(iter/s)": 0.091844
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "eval_logits/chosen": -1.4375,
+      "eval_logits/rejected": 0.306640625,
+      "eval_logps/chosen": -4.78125,
+      "eval_logps/rejected": -160.0,
+      "eval_loss": 0.439208984375,
+      "eval_nll_loss": 0.2080078125,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.03125,
+      "eval_rewards/margins": 6.21875,
+      "eval_rewards/rejected": -0.2001953125,
+      "eval_runtime": 1.2472,
+      "eval_samples_per_second": 3.207,
+      "eval_steps_per_second": 0.802,
+      "step": 20
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 72,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8734807261184.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9e28ad510574d5753b3789c724d404c13a49b6c5
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61f6654b83a8539228bcbbfceff4aad6393d55b5d0ea9f547726948c7dfea8ab
+size 8888
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/zero_to_fp32.py
new file mode 100755
index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-20/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    """Pieces of one ``*_model_states.pt`` file, as assembled by ``parse_model_states``."""
+    # buffers taken from the checkpoint's "module" entry, converted with .float()
+    buffers: dict
+    # value stored under PARAM_SHAPES
+    # NOTE(review): parse_model_states iterates this value and calls .keys() on each element, so it
+    # looks like a sequence of dicts rather than a single dict - confirm before tightening the annotation
+    param_shapes: dict
+    # [name, value] pairs built from the checkpoint's "shared_params" mapping
+    shared_params: list
+    # value stored under DS_VERSION; None when the key is absent
+    ds_version: int
+    # value stored under FROZEN_PARAM_SHAPES; None when the key is absent
+    frozen_param_shapes: dict
+    # value stored under FROZEN_PARAM_FRAGMENTS; None when the key is absent
+    frozen_param_fragments: dict
+
+
+# set to a truthy value to make parse_model_states print the buffer names and frozen param shapes it finds
+debug = 0
+
+# load to cpu (passed as map_location to the torch.load calls below)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    """Return ``int(text)`` when ``text.isdigit()`` is true, otherwise return ``text`` unchanged."""
+    if text.isdigit():
+        return int(text)
+    return text
+
+
+def natural_keys(text):
+    '''
+    Sort key for human ("natural") ordering: ``alist.sort(key=natural_keys)``.
+
+    ``text`` is split on runs of digits (the digit runs are kept) and every piece
+    is passed through ``atoi``, so numeric pieces compare as integers.
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    pieces = re.split(r'(\d+)', text)
+    return list(map(atoi, pieces))
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """
+    Return the path of the model-states file expected for ``zero_stage`` inside ``checkpoint_dir``.
+
+    Args:
+        - ``checkpoint_dir``: directory holding the checkpoint files
+        - ``zero_stage``: zero stage of the checkpoint (``<= 2`` or ``3``)
+
+    Raises:
+        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory, or the expected file is missing
+        - ``ValueError``: ``zero_stage`` is neither ``<= 2`` nor ``3``
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        # without this branch `file` stays unbound and the check below dies with UnboundLocalError
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """
+    Return the files in ``checkpoint_dir`` that match ``glob_pattern``, sorted with ``natural_keys``.
+
+    Raises:
+        - ``FileNotFoundError``: nothing matches the pattern
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
+    matches.sort(key=natural_keys)
+
+    if not matches:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return matches
+
+
+def get_optim_files(checkpoint_dir):
+    """Return the ``*_optim_states.pt`` files in ``checkpoint_dir`` via ``get_checkpoint_files``."""
+    optim_glob = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_glob)
+
+
+def get_model_state_files(checkpoint_dir):
+    """Return the ``*_model_states.pt`` files in ``checkpoint_dir`` via ``get_checkpoint_files``."""
+    model_glob = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_glob)
+
+
+def parse_model_states(files):
+    """
+    Load every model-states file and collect the pieces needed to rebuild the fp32 state dict.
+
+    Args:
+        - ``files``: paths of ``*_model_states.pt`` files
+
+    Returns:
+        a list with one ``zero_model_state`` per entry of ``files``, in the same order
+
+    Raises:
+        - ``ValueError``: a file has no ``BUFFER_NAMES`` entry, i.e. it is not a model state checkpoint
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE: weights_only=False unpickles arbitrary objects - only load checkpoints from a trusted source
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # frozen parameters are optional; the key may be absent from the checkpoint
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None and debug:
+            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        zero_model_states.append(
+            zero_model_state(buffers=buffers,
+                             param_shapes=param_shapes,
+                             shared_params=shared_params,
+                             ds_version=ds_version,
+                             frozen_param_shapes=frozen_param_shapes,
+                             frozen_param_fragments=frozen_param_fragments))
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load the optimizer-state shards and pull out the fp32 weight groups.
+
+    Args:
+        - ``files``: paths of the ``*_optim_states.pt`` shards
+        - ``ds_checkpoint_dir``: checkpoint folder; only used in error messages
+
+    Returns:
+        ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` holds one entry per
+        file, in the order of ``files``
+
+    Raises:
+        - ``ValueError``: ``files`` is empty, the first shard is not a zero checkpoint, the number of
+          shards differs from the recorded partition count, or the zero stage is unknown
+    """
+    total_files = len(files)
+    if total_files == 0:
+        # fail with a clear message instead of an IndexError on state_dicts[0] below
+        raise ValueError(f"no '*_optim_states.pt' files given for '{ds_checkpoint_dir}'")
+
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        # NOTE: weights_only=False unpickles arbitrary objects - only load checkpoints from a trusted source
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    # stage and partition count are read from the first shard only
+    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim_state[ZERO_STAGE]
+    world_size = first_optim_state[PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if isinstance(world_size, list):
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][fp32_groups_key] for sd in state_dicts]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    The weights are written to ``output_dir`` shard by shard; when the result is sharded an index
+    json file mapping tensor names to shard files is written as well.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB.
+          ``None`` disables sharding and writes a single weights file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        - ``ImportError`` if ``safetensors`` (for ``safe_serialization``) or ``huggingface_hub``
+          (for ``max_shard_size``) is required but not installed
+    """
+
+    # Dependency pre-check
+    # (the imports are done here, before any checkpoint is loaded, so a missing package fails fast;
+    # the imported names are used further below)
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    # (lazy mode: entries are only materialized shard by shard in the save loop below)
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        # insert a "{suffix}" placeholder in front of the file extension
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: plan the split on empty tensors that only
+        # carry shape and dtype
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # no sharding requested: build a minimal stand-in exposing the two attributes used below,
+        # with every tensor assigned to the single weights file
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        # materialize only the tensors that belong to the current shard
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        # (iterate over a copy of the keys because entries are deleted inside the loop)
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/README.md b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6be0c96a909959c265a0ab602cd067e0d3fbbde6
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/README.md
@@ -0,0 +1,202 @@
+---
+base_model: /home/wangruotong/LLM_test/Models/Marco-o1
+library_name: peft
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.14.0
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/adapter_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..19b533627149e49064da1e0499ae385be3ba91cf
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/adapter_config.json
@@ -0,0 +1,37 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": [],
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..dc019ba4c600200e26b0892741a1594e15feaa03
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96f11fe94391435c7d5420e38387d3c3608d75f3f9be24e441266861cccdc8a8
+size 40422208
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/additional_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/additional_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/additional_config.json
@@ -0,0 +1 @@
+{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/args.json
new file mode 100644
index 0000000000000000000000000000000000000000..74b472bbbc8748201a9f8fe1dbb9fc5e9bb7d0a7
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/args.json
@@ -0,0 +1,371 @@
+{
+  "model": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "model_type": "marco_o1",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "num_labels": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "local_repo_path": null,
+  "template": "marco_o1",
+  "system": "You are a helpful assistant.",
+  "max_length": 4200,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "tools_prompt": "react_en",
+  "padding_side": "right",
+  "loss_scale": "last_round",
+  "sequence_parallel_size": 1,
+  "use_chat_template": true,
+  "template_backend": "swift",
+  "dataset": [
+    "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_random20_system.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.01,
+  "data_seed": 42,
+  "dataset_num_proc": 1,
+  "streaming": false,
+  "enable_cache": false,
+  "download_mode": "reuse_dataset_if_exists",
+  "strict": false,
+  "model_name": [
+    null,
+    null
+  ],
+  "model_author": [
+    null,
+    null
+  ],
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.7,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": null,
+  "load_dataset_config": null,
+  "lora_modules": [],
+  "tuner_backend": "peft",
+  "train_type": "lora",
+  "adapters": [],
+  "seed": 42,
+  "model_kwargs": {},
+  "load_args": true,
+  "load_data_args": false,
+  "use_hf": false,
+  "hub_token": null,
+  "custom_register_path": [],
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "steps",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 1,
+  "per_device_eval_batch_size": 1,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 4,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 0.0001,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.999,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 3.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.05,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 5,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "steps",
+  "save_steps": 20.0,
+  "save_total_limit": 200,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": false,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "jit_mode_eval": false,
+  "use_ipex": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 20.0,
+  "dataloader_num_workers": 4,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": null,
+  "disable_tqdm": null,
+  "remove_unused_columns": false,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": "",
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "deepspeed": {
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "overlap_comm": true,
+      "contiguous_gradients": true,
+      "sub_group_size": 1000000000.0,
+      "reduce_bucket_size": "auto",
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1000000000.0,
+      "stage3_max_reuse_distance": 1000000000.0,
+      "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+  },
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "evaluation_strategy": "steps",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 1800,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "dispatch_batches": null,
+  "split_batches": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": false,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "freeze_parameters": [],
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": true,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "use_liger": false,
+  "model_layer_cls_name": null,
+  "metric_warmup_step": 0,
+  "fsdp_num": 1,
+  "acc_steps": 1,
+  "add_version": true,
+  "resume_only_model": false,
+  "check_model": true,
+  "packing": false,
+  "lazy_tokenize": false,
+  "loss_type": "sigmoid",
+  "optimizer": null,
+  "metric": null,
+  "acc_strategy": "token",
+  "reward_model": null,
+  "reward_adapters": [],
+  "reward_model_type": null,
+  "reward_model_revision": null,
+  "num_ppo_epochs": 4,
+  "whiten_rewards": false,
+  "kl_coef": 0.05,
+  "cliprange": 0.2,
+  "vf_coef": 0.1,
+  "cliprange_value": 0.2,
+  "gamma": 1.0,
+  "lam": 0.95,
+  "num_mini_batches": 1,
+  "local_rollout_forward_batch_size": 64,
+  "num_sample_generations": 10,
+  "response_length": 512,
+  "missing_eos_penalty": null,
+  "rlhf_type": "dpo",
+  "ref_model": null,
+  "ref_model_type": null,
+  "ref_model_revision": null,
+  "beta": 0.1,
+  "label_smoothing": 0,
+  "rpo_alpha": 1.0,
+  "cpo_alpha": 1.0,
+  "simpo_gamma": 1,
+  "desirable_weight": 1.0,
+  "undesirable_weight": 1.0,
+  "rank": 0,
+  "global_world_size": 4,
+  "local_world_size": 4,
+  "model_suffix": "Marco-o1",
+  "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fba8f06ca60>, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])",
+  "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, 
metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, 
include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=<FDivergenceType.REVERSE_KL: 'reverse_kl'>, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)"
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7e7c80fa1f00a97c8c17c13eef7ccab4a5ffc6a0
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:429e5d67e0755a91d26b3af6e20eebf35d48c9712e983c4912b6b2a32c6cdfba
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c5b9e110bc8c35a2630a287afcce7fb3707c9419
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f9f3b35d4369219be01924f6bdfbcdf49a8f0c6e18fe467e925f35aa5a00e47
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b2feccbcf03e3553114091c9f1f3dd1fb932171b
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64ed82bf699d994cbaae135846c7652886d6334cf796ddbc13b454e7e23f354c
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e25756d3539b7ab936a0195dd897b9a76ee80085
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5806f50ed505b0718d02f32d92d343f4ac2ca7e286eacbe1d3692a5768ae223e
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_0_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..521cff52824a521f9600db9459948a11f52abf09
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_0_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2caf3158f1f6f446801f4d26836267d7f8eace7459ddd0add1a9abd1b83631aa
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_1_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_1_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5a726173f4adb98fe2e2b46df9a3a7a5386be326
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_1_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4dde91841e38e32dbb367a21a6ab2d644ee359442a82e929982af7d81f1339b
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_2_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ded5c487b0af4c917cf833cdafa2261290cfc53a
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_2_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42bebe291bfd2d5039060b95d8080361a7981aa9abfcb39bce2d72a9c2ebef8e
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_3_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c6cc4d81a0b906cb49404ddc23dce222d0007551
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/global_step39/zero_pp_rank_3_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d27803d4e916264476fdba88d87a2e03b998cd73b3ea77e32f59ac3bde61a55
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/latest
new file mode 100644
index 0000000000000000000000000000000000000000..67f1c55b2b0a3119f2287d39e40e22b4f158741b
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/latest
@@ -0,0 +1 @@
+global_step39
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f8799407442db08820f995bcf1b9158f696af19f
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70cc56408014c410353d4dd58ae9b03f4be043f5f800324f66fd8e20e99b840e
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..aa0c3c6aeaabc038c714a3fcc9b78d186a4cab59
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49d1438e98cc9c53a6852464635ce62e9788e61eb3646b73e33813f487c4b6ae
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0f39416636e7990907141a415603582d33812fc9
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4388add9cec90932f8ff0100d27a0574d98e1bad52ff89d44e31967d2b4fbfde
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d3775bcd497f8ad74ece6675e0bbda89fb7ee6f4
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a705d6dfaae4f2c1b4b2be6b25a6eb521ffae6fcba21cc1531e97b60037ed079
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/scheduler.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3f8e5c420bc296502c335bcadd512d01972f28a0
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2ab72c0a7472f98efb1865889d6039f3ae7d12fc3c8e7bfeea52279fc333219
+size 1064
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/trainer_state.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..079b90493176b0f049e8711e531298376f026695
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/trainer_state.json
@@ -0,0 +1,229 @@
+{
+  "best_metric": 0.42163086,
+  "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/checkpoint-40",
+  "epoch": 1.606060606060606,
+  "eval_steps": 20,
+  "global_step": 40,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.04040404040404041,
+      "grad_norm": 2.741686768010279,
+      "learning_rate": 2.5e-05,
+      "logits/chosen": -0.3984375,
+      "logits/rejected": 0.2080078125,
+      "logps/chosen": -282.0,
+      "logps/rejected": -272.0,
+      "loss": 1.7568359375,
+      "memory(GiB)": 14.02,
+      "nll_loss": 1.5859375,
+      "rewards/accuracies": 0.0,
+      "rewards/chosen": 0.0,
+      "rewards/margins": 0.0,
+      "rewards/rejected": 0.0,
+      "step": 1,
+      "train_speed(iter/s)": 0.062883
+    },
+    {
+      "epoch": 0.20202020202020202,
+      "grad_norm": 3.179213138025169,
+      "learning_rate": 9.994664874011863e-05,
+      "logits/chosen": -0.734375,
+      "logits/rejected": -0.185546875,
+      "logps/chosen": -358.0,
+      "logps/rejected": -516.0,
+      "loss": 1.7740478515625,
+      "memory(GiB)": 31.0,
+      "nll_loss": 1.0234375,
+      "rewards/accuracies": 0.5,
+      "rewards/chosen": 0.287109375,
+      "rewards/margins": 0.1455078125,
+      "rewards/rejected": 0.1416015625,
+      "step": 5,
+      "train_speed(iter/s)": 0.084048
+    },
+    {
+      "epoch": 0.40404040404040403,
+      "grad_norm": 1.1865745356779005,
+      "learning_rate": 9.809128215864097e-05,
+      "logits/chosen": -0.5078125,
+      "logits/rejected": -0.036376953125,
+      "logps/chosen": -348.0,
+      "logps/rejected": -552.0,
+      "loss": 1.3421142578125,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.8203125,
+      "rewards/accuracies": 0.8500000238418579,
+      "rewards/chosen": 1.6015625,
+      "rewards/margins": 0.91796875,
+      "rewards/rejected": 0.6875,
+      "step": 10,
+      "train_speed(iter/s)": 0.086989
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 0.2081256636056697,
+      "learning_rate": 9.368111953231848e-05,
+      "logits/chosen": -0.23828125,
+      "logits/rejected": -0.703125,
+      "logps/chosen": -368.0,
+      "logps/rejected": -284.0,
+      "loss": 0.77510986328125,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.67578125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 2.984375,
+      "rewards/margins": 3.3125,
+      "rewards/rejected": -0.33203125,
+      "step": 15,
+      "train_speed(iter/s)": 0.09061
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "grad_norm": 0.12928414880774924,
+      "learning_rate": 8.695044586103296e-05,
+      "logits/chosen": -0.2060546875,
+      "logits/rejected": -0.61328125,
+      "logps/chosen": -458.0,
+      "logps/rejected": -310.0,
+      "loss": 0.53414306640625,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.578125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 3.34375,
+      "rewards/margins": 5.1875,
+      "rewards/rejected": -1.859375,
+      "step": 20,
+      "train_speed(iter/s)": 0.091844
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "eval_logits/chosen": -1.4375,
+      "eval_logits/rejected": 0.306640625,
+      "eval_logps/chosen": -4.78125,
+      "eval_logps/rejected": -160.0,
+      "eval_loss": 0.439208984375,
+      "eval_nll_loss": 0.2080078125,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.03125,
+      "eval_rewards/margins": 6.21875,
+      "eval_rewards/rejected": -0.2001953125,
+      "eval_runtime": 1.2472,
+      "eval_samples_per_second": 3.207,
+      "eval_steps_per_second": 0.802,
+      "step": 20
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.19570778554820287,
+      "learning_rate": 7.82568207211296e-05,
+      "logits/chosen": 0.046630859375,
+      "logits/rejected": -0.140625,
+      "logps/chosen": -430.0,
+      "logps/rejected": -516.0,
+      "loss": 0.468353271484375,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.5078125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 5.0,
+      "rewards/margins": 7.0625,
+      "rewards/rejected": -2.0625,
+      "step": 25,
+      "train_speed(iter/s)": 0.092514
+    },
+    {
+      "epoch": 1.202020202020202,
+      "grad_norm": 0.09105661940994438,
+      "learning_rate": 6.806208330935766e-05,
+      "logits/chosen": -0.158203125,
+      "logits/rejected": -0.052490234375,
+      "logps/chosen": -280.0,
+      "logps/rejected": -502.0,
+      "loss": 0.4760528564453125,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.40234375,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 6.53125,
+      "rewards/margins": 10.625,
+      "rewards/rejected": -4.125,
+      "step": 30,
+      "train_speed(iter/s)": 0.09323
+    },
+    {
+      "epoch": 1.404040404040404,
+      "grad_norm": 0.08621388537462939,
+      "learning_rate": 5.6907817747594116e-05,
+      "logits/chosen": -0.45703125,
+      "logits/rejected": 0.1806640625,
+      "logps/chosen": -212.0,
+      "logps/rejected": -528.0,
+      "loss": 0.4629302978515625,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.462890625,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.0,
+      "rewards/margins": 10.625,
+      "rewards/rejected": -3.625,
+      "step": 35,
+      "train_speed(iter/s)": 0.093963
+    },
+    {
+      "epoch": 1.606060606060606,
+      "grad_norm": 0.11934377803721716,
+      "learning_rate": 4.5386582026834906e-05,
+      "logits/chosen": 0.18359375,
+      "logits/rejected": -0.35546875,
+      "logps/chosen": -344.0,
+      "logps/rejected": -282.0,
+      "loss": 0.4460845947265625,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.404296875,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 6.75,
+      "rewards/margins": 10.75,
+      "rewards/rejected": -3.984375,
+      "step": 40,
+      "train_speed(iter/s)": 0.094552
+    },
+    {
+      "epoch": 1.606060606060606,
+      "eval_logits/chosen": -1.4375,
+      "eval_logits/rejected": 0.953125,
+      "eval_logps/chosen": -5.09375,
+      "eval_logps/rejected": -178.0,
+      "eval_loss": 0.421630859375,
+      "eval_nll_loss": 0.2216796875,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.0,
+      "eval_rewards/margins": 8.0,
+      "eval_rewards/rejected": -2.0,
+      "eval_runtime": 1.4128,
+      "eval_samples_per_second": 2.831,
+      "eval_steps_per_second": 0.708,
+      "step": 40
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 72,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 17165975126016.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9e28ad510574d5753b3789c724d404c13a49b6c5
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61f6654b83a8539228bcbbfceff4aad6393d55b5d0ea9f547726948c7dfea8ab
+size 8888
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/zero_to_fp32.py
new file mode 100755
index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-40/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# Set to a truthy value to make the helpers in this file print reconstruction diagnostics.
+debug = 0
+
+# load to cpu (passed as ``map_location`` to ``torch.load``)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters from the per-rank fp32 partitions and store them in
+    ``state_dict`` (in place).
+
+    Args:
+        - ``state_dict``: mapping that receives one entry per parameter name
+        - ``world_size``: number of partitions; also used for the alignment check
+        - ``fp32_flat_groups``: indexed as ``fp32_flat_groups[rank][param_group]``
+        - ``zero_model_states``: parsed model states; only the first one's ``param_shapes`` is read
+
+    Raises:
+        - ``ValueError`` if, after alignment, the consumed element count of a group differs from
+          the number of elements available in it
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # for each param group the partitions of all ranks are concatenated in rank order into one flat
+    # vector, then the parameters are cut out of that vector one after another, following the order
+    # of the shapes recorded for the group
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # gather group i from every rank and join the pieces
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset restarts for every group because each group has its own merged vector
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # shapes either provide numel() or are plain sequences of dimensions
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # narrow() returns a view into the merged vector, no copy is made here
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` on it
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:  # no explicit tag: use the one recorded in the 'latest' file
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict  # returned as-is; the caller materializes entries via .contiguous()
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s), written to ``output_dir`` as one file
+    per shard, that can be loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: plan the split on empty tensors of matching shape/dtype
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)  # convert only this shard's entries to torch tensors
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note that once this has been run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)  # frozen parameters are included (default)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)  # strict=False: missing/unexpected keys are tolerated, not raised
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/README.md b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6be0c96a909959c265a0ab602cd067e0d3fbbde6
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/README.md
@@ -0,0 +1,202 @@
+---
+base_model: /home/wangruotong/LLM_test/Models/Marco-o1
+library_name: peft
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.14.0
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/adapter_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..19b533627149e49064da1e0499ae385be3ba91cf
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/adapter_config.json
@@ -0,0 +1,37 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": [],
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7fbf70f5e9dbab491ad68ef42cab5d47b81bd6f9
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b833ad688223299d12ae89a33634d417b275c016962768f74f6c1b3349400502
+size 40422208
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/additional_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/additional_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/additional_config.json
@@ -0,0 +1 @@
+{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/args.json
new file mode 100644
index 0000000000000000000000000000000000000000..74b472bbbc8748201a9f8fe1dbb9fc5e9bb7d0a7
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/args.json
@@ -0,0 +1,371 @@
+{
+  "model": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "model_type": "marco_o1",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "num_labels": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "local_repo_path": null,
+  "template": "marco_o1",
+  "system": "You are a helpful assistant.",
+  "max_length": 4200,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "tools_prompt": "react_en",
+  "padding_side": "right",
+  "loss_scale": "last_round",
+  "sequence_parallel_size": 1,
+  "use_chat_template": true,
+  "template_backend": "swift",
+  "dataset": [
+    "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_random20_system.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.01,
+  "data_seed": 42,
+  "dataset_num_proc": 1,
+  "streaming": false,
+  "enable_cache": false,
+  "download_mode": "reuse_dataset_if_exists",
+  "strict": false,
+  "model_name": [
+    null,
+    null
+  ],
+  "model_author": [
+    null,
+    null
+  ],
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.7,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": null,
+  "load_dataset_config": null,
+  "lora_modules": [],
+  "tuner_backend": "peft",
+  "train_type": "lora",
+  "adapters": [],
+  "seed": 42,
+  "model_kwargs": {},
+  "load_args": true,
+  "load_data_args": false,
+  "use_hf": false,
+  "hub_token": null,
+  "custom_register_path": [],
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "steps",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 1,
+  "per_device_eval_batch_size": 1,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 4,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 0.0001,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.999,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 3.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.05,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 5,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "steps",
+  "save_steps": 20.0,
+  "save_total_limit": 200,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": false,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "jit_mode_eval": false,
+  "use_ipex": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 20.0,
+  "dataloader_num_workers": 4,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": null,
+  "disable_tqdm": null,
+  "remove_unused_columns": false,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": "",
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "deepspeed": {
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "overlap_comm": true,
+      "contiguous_gradients": true,
+      "sub_group_size": 1000000000.0,
+      "reduce_bucket_size": "auto",
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1000000000.0,
+      "stage3_max_reuse_distance": 1000000000.0,
+      "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+  },
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "evaluation_strategy": "steps",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 1800,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "dispatch_batches": null,
+  "split_batches": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": false,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "freeze_parameters": [],
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": true,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "use_liger": false,
+  "model_layer_cls_name": null,
+  "metric_warmup_step": 0,
+  "fsdp_num": 1,
+  "acc_steps": 1,
+  "add_version": true,
+  "resume_only_model": false,
+  "check_model": true,
+  "packing": false,
+  "lazy_tokenize": false,
+  "loss_type": "sigmoid",
+  "optimizer": null,
+  "metric": null,
+  "acc_strategy": "token",
+  "reward_model": null,
+  "reward_adapters": [],
+  "reward_model_type": null,
+  "reward_model_revision": null,
+  "num_ppo_epochs": 4,
+  "whiten_rewards": false,
+  "kl_coef": 0.05,
+  "cliprange": 0.2,
+  "vf_coef": 0.1,
+  "cliprange_value": 0.2,
+  "gamma": 1.0,
+  "lam": 0.95,
+  "num_mini_batches": 1,
+  "local_rollout_forward_batch_size": 64,
+  "num_sample_generations": 10,
+  "response_length": 512,
+  "missing_eos_penalty": null,
+  "rlhf_type": "dpo",
+  "ref_model": null,
+  "ref_model_type": null,
+  "ref_model_revision": null,
+  "beta": 0.1,
+  "label_smoothing": 0,
+  "rpo_alpha": 1.0,
+  "cpo_alpha": 1.0,
+  "simpo_gamma": 1,
+  "desirable_weight": 1.0,
+  "undesirable_weight": 1.0,
+  "rank": 0,
+  "global_world_size": 4,
+  "local_world_size": 4,
+  "model_suffix": "Marco-o1",
+  "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fba8f06ca60>, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])",
+  "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, 
metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, 
include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=<FDivergenceType.REVERSE_KL: 'reverse_kl'>, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)"
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e783cf95a2c851bb2b9b3af4fa47faf30d81e309
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e2b6e9968138490df7cb73722ffab10b88171cee622305bef82d535a3151dfa
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d1f06cb9814b1bc14102116170c9c05fbcc6c075
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:880944dff0f0c20d4b6eec7c3fcc2270fdccf63d03e13b03c26d61c9d61cb540
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9ca6988f7dbcbadd28909f525682b11a668311bb
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61abab95371bd0b099f908aee125a63e3daddf8bcf7954b4e566c6845bf0713f
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6bd94df4047c4934c3677281e85edf1960a57649
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40fe6c96dfd64b22723661fe3c7d7acd78ff6f8a1ca900eb12d1c9e39a313aa1
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_0_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c5011349fbf7bc231576172746d34f19d8be3a03
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_0_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ae87355a9241c0ea4edf21abd0fb8d38012f4aa9723febcef39bbc7901c36db
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_1_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_1_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7f27a9e89ff30e1b7e5fc2b97fce8cf811b4d1cb
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_1_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84abe8df1ad89f1801c346cbe87bf7fead94c9cb08b42f184d0dfb8f1c359ccc
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_2_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c6a5432b84d6286d0b574866aaab3d0e6352ed4f
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_2_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:860eafdd10ecf7c0eaad3c6aca57b888ad1a5f906134d493e84eec89940369c2
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_3_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..db55b37d6a437c84db4a22626123c43786e59401
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/global_step59/zero_pp_rank_3_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:124f015af78bbf627b57d97070f950f843b2b5ef2f1886ddbeae34a9ce2a2d10
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/latest
new file mode 100644
index 0000000000000000000000000000000000000000..099fa08342218cca7c00fb7043635561ebda9695
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/latest
@@ -0,0 +1 @@
+global_step59
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c54ea122b283c04f6b60c1eedefeb301763a8f9f
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:418a5f105ae834c3075024076916b2a9475918fe034c12d0dd5b6d91f1aba467
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ea57ead2533e587fe50f62107d7cb32945fe1354
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e07ace389d24bc1307b74f42a1e7b8f0117b0db853e2df64ff3f15cb92916a2
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..4689a9445d07528dc4fd91011a7f034c11773a68
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da6a990f346d7014dffb28fa2bc7d3b890bd3c53712503fce3656da48d3d6e50
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..919b5e43a96a9afdeb196f402142bc3aab67f247
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e95f356ca38179b05993f55daece0223e96fa10b9a1b9ea2102a739211333f63
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/scheduler.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..95126866042baa544d6bc4555d944440b37fdb21
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3e1521c1c8dfc88bc6566a95cc91f42709693a765076997f6318af86035c445
+size 1064
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/trainer_state.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1535f6cf60a3e9b40b7e1196105c3c1e2954f335
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/trainer_state.json
@@ -0,0 +1,318 @@
+{
+  "best_metric": 0.41357422,
+  "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/checkpoint-60",
+  "epoch": 2.404040404040404,
+  "eval_steps": 20,
+  "global_step": 60,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.04040404040404041,
+      "grad_norm": 2.741686768010279,
+      "learning_rate": 2.5e-05,
+      "logits/chosen": -0.3984375,
+      "logits/rejected": 0.2080078125,
+      "logps/chosen": -282.0,
+      "logps/rejected": -272.0,
+      "loss": 1.7568359375,
+      "memory(GiB)": 14.02,
+      "nll_loss": 1.5859375,
+      "rewards/accuracies": 0.0,
+      "rewards/chosen": 0.0,
+      "rewards/margins": 0.0,
+      "rewards/rejected": 0.0,
+      "step": 1,
+      "train_speed(iter/s)": 0.062883
+    },
+    {
+      "epoch": 0.20202020202020202,
+      "grad_norm": 3.179213138025169,
+      "learning_rate": 9.994664874011863e-05,
+      "logits/chosen": -0.734375,
+      "logits/rejected": -0.185546875,
+      "logps/chosen": -358.0,
+      "logps/rejected": -516.0,
+      "loss": 1.7740478515625,
+      "memory(GiB)": 31.0,
+      "nll_loss": 1.0234375,
+      "rewards/accuracies": 0.5,
+      "rewards/chosen": 0.287109375,
+      "rewards/margins": 0.1455078125,
+      "rewards/rejected": 0.1416015625,
+      "step": 5,
+      "train_speed(iter/s)": 0.084048
+    },
+    {
+      "epoch": 0.40404040404040403,
+      "grad_norm": 1.1865745356779005,
+      "learning_rate": 9.809128215864097e-05,
+      "logits/chosen": -0.5078125,
+      "logits/rejected": -0.036376953125,
+      "logps/chosen": -348.0,
+      "logps/rejected": -552.0,
+      "loss": 1.3421142578125,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.8203125,
+      "rewards/accuracies": 0.8500000238418579,
+      "rewards/chosen": 1.6015625,
+      "rewards/margins": 0.91796875,
+      "rewards/rejected": 0.6875,
+      "step": 10,
+      "train_speed(iter/s)": 0.086989
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 0.2081256636056697,
+      "learning_rate": 9.368111953231848e-05,
+      "logits/chosen": -0.23828125,
+      "logits/rejected": -0.703125,
+      "logps/chosen": -368.0,
+      "logps/rejected": -284.0,
+      "loss": 0.77510986328125,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.67578125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 2.984375,
+      "rewards/margins": 3.3125,
+      "rewards/rejected": -0.33203125,
+      "step": 15,
+      "train_speed(iter/s)": 0.09061
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "grad_norm": 0.12928414880774924,
+      "learning_rate": 8.695044586103296e-05,
+      "logits/chosen": -0.2060546875,
+      "logits/rejected": -0.61328125,
+      "logps/chosen": -458.0,
+      "logps/rejected": -310.0,
+      "loss": 0.53414306640625,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.578125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 3.34375,
+      "rewards/margins": 5.1875,
+      "rewards/rejected": -1.859375,
+      "step": 20,
+      "train_speed(iter/s)": 0.091844
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "eval_logits/chosen": -1.4375,
+      "eval_logits/rejected": 0.306640625,
+      "eval_logps/chosen": -4.78125,
+      "eval_logps/rejected": -160.0,
+      "eval_loss": 0.439208984375,
+      "eval_nll_loss": 0.2080078125,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.03125,
+      "eval_rewards/margins": 6.21875,
+      "eval_rewards/rejected": -0.2001953125,
+      "eval_runtime": 1.2472,
+      "eval_samples_per_second": 3.207,
+      "eval_steps_per_second": 0.802,
+      "step": 20
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.19570778554820287,
+      "learning_rate": 7.82568207211296e-05,
+      "logits/chosen": 0.046630859375,
+      "logits/rejected": -0.140625,
+      "logps/chosen": -430.0,
+      "logps/rejected": -516.0,
+      "loss": 0.468353271484375,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.5078125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 5.0,
+      "rewards/margins": 7.0625,
+      "rewards/rejected": -2.0625,
+      "step": 25,
+      "train_speed(iter/s)": 0.092514
+    },
+    {
+      "epoch": 1.202020202020202,
+      "grad_norm": 0.09105661940994438,
+      "learning_rate": 6.806208330935766e-05,
+      "logits/chosen": -0.158203125,
+      "logits/rejected": -0.052490234375,
+      "logps/chosen": -280.0,
+      "logps/rejected": -502.0,
+      "loss": 0.4760528564453125,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.40234375,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 6.53125,
+      "rewards/margins": 10.625,
+      "rewards/rejected": -4.125,
+      "step": 30,
+      "train_speed(iter/s)": 0.09323
+    },
+    {
+      "epoch": 1.404040404040404,
+      "grad_norm": 0.08621388537462939,
+      "learning_rate": 5.6907817747594116e-05,
+      "logits/chosen": -0.45703125,
+      "logits/rejected": 0.1806640625,
+      "logps/chosen": -212.0,
+      "logps/rejected": -528.0,
+      "loss": 0.4629302978515625,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.462890625,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.0,
+      "rewards/margins": 10.625,
+      "rewards/rejected": -3.625,
+      "step": 35,
+      "train_speed(iter/s)": 0.093963
+    },
+    {
+      "epoch": 1.606060606060606,
+      "grad_norm": 0.11934377803721716,
+      "learning_rate": 4.5386582026834906e-05,
+      "logits/chosen": 0.18359375,
+      "logits/rejected": -0.35546875,
+      "logps/chosen": -344.0,
+      "logps/rejected": -282.0,
+      "loss": 0.4460845947265625,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.404296875,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 6.75,
+      "rewards/margins": 10.75,
+      "rewards/rejected": -3.984375,
+      "step": 40,
+      "train_speed(iter/s)": 0.094552
+    },
+    {
+      "epoch": 1.606060606060606,
+      "eval_logits/chosen": -1.4375,
+      "eval_logits/rejected": 0.953125,
+      "eval_logps/chosen": -5.09375,
+      "eval_logps/rejected": -178.0,
+      "eval_loss": 0.421630859375,
+      "eval_nll_loss": 0.2216796875,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.0,
+      "eval_rewards/margins": 8.0,
+      "eval_rewards/rejected": -2.0,
+      "eval_runtime": 1.4128,
+      "eval_samples_per_second": 2.831,
+      "eval_steps_per_second": 0.708,
+      "step": 40
+    },
+    {
+      "epoch": 1.808080808080808,
+      "grad_norm": 0.1278829740469663,
+      "learning_rate": 3.411042902090492e-05,
+      "logits/chosen": -0.0233154296875,
+      "logits/rejected": 0.328125,
+      "logps/chosen": -310.0,
+      "logps/rejected": -494.0,
+      "loss": 0.539617919921875,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.51953125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.75,
+      "rewards/margins": 10.625,
+      "rewards/rejected": -2.875,
+      "step": 45,
+      "train_speed(iter/s)": 0.094018
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.20021892626227725,
+      "learning_rate": 2.3678391856132204e-05,
+      "logits/chosen": 0.181640625,
+      "logits/rejected": 0.0159912109375,
+      "logps/chosen": -300.0,
+      "logps/rejected": -280.0,
+      "loss": 0.437335205078125,
+      "memory(GiB)": 45.85,
+      "nll_loss": 0.40234375,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.8125,
+      "rewards/margins": 11.3125,
+      "rewards/rejected": -3.5,
+      "step": 50,
+      "train_speed(iter/s)": 0.094129
+    },
+    {
+      "epoch": 2.202020202020202,
+      "grad_norm": 0.17561192586448465,
+      "learning_rate": 1.4644660940672627e-05,
+      "logits/chosen": 0.0888671875,
+      "logits/rejected": 0.3046875,
+      "logps/chosen": -298.0,
+      "logps/rejected": -620.0,
+      "loss": 0.48487548828125,
+      "memory(GiB)": 45.85,
+      "nll_loss": 0.45703125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 6.96875,
+      "rewards/margins": 9.375,
+      "rewards/rejected": -2.40625,
+      "step": 55,
+      "train_speed(iter/s)": 0.094012
+    },
+    {
+      "epoch": 2.404040404040404,
+      "grad_norm": 0.06159661984448856,
+      "learning_rate": 7.489143213519301e-06,
+      "logits/chosen": 0.0595703125,
+      "logits/rejected": -0.3125,
+      "logps/chosen": -422.0,
+      "logps/rejected": -468.0,
+      "loss": 0.45642852783203125,
+      "memory(GiB)": 45.85,
+      "nll_loss": 0.515625,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 8.125,
+      "rewards/margins": 11.1875,
+      "rewards/rejected": -3.0625,
+      "step": 60,
+      "train_speed(iter/s)": 0.094412
+    },
+    {
+      "epoch": 2.404040404040404,
+      "eval_logits/chosen": -1.4140625,
+      "eval_logits/rejected": 1.171875,
+      "eval_logps/chosen": -4.8125,
+      "eval_logps/rejected": -194.0,
+      "eval_loss": 0.41357421875,
+      "eval_nll_loss": 0.208984375,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.03125,
+      "eval_rewards/margins": 9.625,
+      "eval_rewards/rejected": -3.59375,
+      "eval_runtime": 1.3539,
+      "eval_samples_per_second": 2.954,
+      "eval_steps_per_second": 0.739,
+      "step": 60
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 72,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 26206478368768.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9e28ad510574d5753b3789c724d404c13a49b6c5
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61f6654b83a8539228bcbbfceff4aad6393d55b5d0ea9f547726948c7dfea8ab
+size 8888
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/zero_to_fp32.py
new file mode 100755
index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-60/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) written under
+    ``output_dir``. Files written the PyTorch way can be loaded with ``torch.load(file)`` +
+    ``load_state_dict()`` and used for training without DeepSpeed.
+
+    The weights go to ``pytorch_model*.bin`` (or ``model*.safetensors`` when ``safe_serialization``
+    is set). When the result is sharded, an index file mapping each tensor name to its shard file
+    is written next to the shards.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` writes a single file.
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        - ``ImportError``: re-raised (after printing an install hint) when ``safetensors`` or ``huggingface_hub`` is needed but missing
+    """
+
+    # Dependency pre-check
+    # (the optional packages are imported here, before any conversion work, and the imported
+    # names are used further down)
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    # (lazy mode: the entries are converted to torch tensors shard by shard in the save loop)
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        # "{suffix}" is left in the pattern for the sharding helper to fill in
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: the split is planned on empty tensors that
+        # only carry the shape and dtype of each entry
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # no sharding requested: a minimal stand-in exposing the two attributes read below,
+        # with every tensor assigned to the single weights file
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        # convert only the entries that belong to this shard
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        # (iterate over a copy of the keys because entries are deleted inside the loop)
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/README.md b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6be0c96a909959c265a0ab602cd067e0d3fbbde6
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/README.md
@@ -0,0 +1,202 @@
+---
+base_model: /home/wangruotong/LLM_test/Models/Marco-o1
+library_name: peft
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.14.0
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/adapter_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..19b533627149e49064da1e0499ae385be3ba91cf
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/adapter_config.json
@@ -0,0 +1,37 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": [],
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..07356f87962f58b0a54e8b164cfac213aee29101
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff451d5dacda09cd8d5e772db9d49595fba9aaf2a8d4142ee5134d33549539e6
+size 40422208
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/additional_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/additional_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/additional_config.json
@@ -0,0 +1 @@
+{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/args.json
new file mode 100644
index 0000000000000000000000000000000000000000..74b472bbbc8748201a9f8fe1dbb9fc5e9bb7d0a7
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/args.json
@@ -0,0 +1,371 @@
+{
+  "model": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "model_type": "marco_o1",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "num_labels": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "local_repo_path": null,
+  "template": "marco_o1",
+  "system": "You are a helpful assistant.",
+  "max_length": 4200,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "tools_prompt": "react_en",
+  "padding_side": "right",
+  "loss_scale": "last_round",
+  "sequence_parallel_size": 1,
+  "use_chat_template": true,
+  "template_backend": "swift",
+  "dataset": [
+    "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_random20_system.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.01,
+  "data_seed": 42,
+  "dataset_num_proc": 1,
+  "streaming": false,
+  "enable_cache": false,
+  "download_mode": "reuse_dataset_if_exists",
+  "strict": false,
+  "model_name": [
+    null,
+    null
+  ],
+  "model_author": [
+    null,
+    null
+  ],
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.7,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": null,
+  "load_dataset_config": null,
+  "lora_modules": [],
+  "tuner_backend": "peft",
+  "train_type": "lora",
+  "adapters": [],
+  "seed": 42,
+  "model_kwargs": {},
+  "load_args": true,
+  "load_data_args": false,
+  "use_hf": false,
+  "hub_token": null,
+  "custom_register_path": [],
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "steps",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 1,
+  "per_device_eval_batch_size": 1,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 4,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 0.0001,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.999,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 3.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.05,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 5,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "steps",
+  "save_steps": 20.0,
+  "save_total_limit": 200,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": false,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "jit_mode_eval": false,
+  "use_ipex": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 20.0,
+  "dataloader_num_workers": 4,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": null,
+  "disable_tqdm": null,
+  "remove_unused_columns": false,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": "",
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "deepspeed": {
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "overlap_comm": true,
+      "contiguous_gradients": true,
+      "sub_group_size": 1000000000.0,
+      "reduce_bucket_size": "auto",
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1000000000.0,
+      "stage3_max_reuse_distance": 1000000000.0,
+      "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+  },
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "evaluation_strategy": "steps",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 1800,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "dispatch_batches": null,
+  "split_batches": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": false,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "freeze_parameters": [],
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": true,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "use_liger": false,
+  "model_layer_cls_name": null,
+  "metric_warmup_step": 0,
+  "fsdp_num": 1,
+  "acc_steps": 1,
+  "add_version": true,
+  "resume_only_model": false,
+  "check_model": true,
+  "packing": false,
+  "lazy_tokenize": false,
+  "loss_type": "sigmoid",
+  "optimizer": null,
+  "metric": null,
+  "acc_strategy": "token",
+  "reward_model": null,
+  "reward_adapters": [],
+  "reward_model_type": null,
+  "reward_model_revision": null,
+  "num_ppo_epochs": 4,
+  "whiten_rewards": false,
+  "kl_coef": 0.05,
+  "cliprange": 0.2,
+  "vf_coef": 0.1,
+  "cliprange_value": 0.2,
+  "gamma": 1.0,
+  "lam": 0.95,
+  "num_mini_batches": 1,
+  "local_rollout_forward_batch_size": 64,
+  "num_sample_generations": 10,
+  "response_length": 512,
+  "missing_eos_penalty": null,
+  "rlhf_type": "dpo",
+  "ref_model": null,
+  "ref_model_type": null,
+  "ref_model_revision": null,
+  "beta": 0.1,
+  "label_smoothing": 0,
+  "rpo_alpha": 1.0,
+  "cpo_alpha": 1.0,
+  "simpo_gamma": 1,
+  "desirable_weight": 1.0,
+  "undesirable_weight": 1.0,
+  "rank": 0,
+  "global_world_size": 4,
+  "local_world_size": 4,
+  "model_suffix": "Marco-o1",
+  "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fba8f06ca60>, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])",
+  "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, 
metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, 
include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=<FDivergenceType.REVERSE_KL: 'reverse_kl'>, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)"
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0f2ca773af6eb647432387f94fd0d38244fa9880
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06d5b6b9eb6990c77f4187385e1ecd43690a709b63536d9d4aac87b389862dcc
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f0361ad7512916df91e9d9d5b6776f9b3a2f8aeb
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61c12938e258e9f032221091191c41232e0f66e15b2e2a5b6fb707ad0af4701f
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3bf980a291414bf2ca5063855c4ef33bdf2ee2d2
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b320ed365a63b3109f4755f5bc13522b7b5a881053de0cb9353d4597c4d691d
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2b089aa8a9cf884d93b7e2644bd3f6512ff86a7a
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98dddd7880cafd60833a0e6809618ef6b0d7d22e342d49d7609688cf91a3a09b
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_0_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a524f27b92645e5645dfcc9b75110e322a57a6b2
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_0_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89f7e868315deb2e0bb0064d1d57817ce3429b318a03d3a3dd62a9dfce05bbef
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_1_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_1_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e47b1e355f370a9f30c7c77ba856b6efddbb168f
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_1_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b64cb77e899835cb8a51cfb333ad499ca2901e784ea087a7c21b702d596df647
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_2_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7726800cef0fc2d62c65ee7dd3f9e0cb17c01440
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_2_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3afc72b3e2aaabd252874436aacbe2ce7bff38be7845ea0f1ffb58987003f91e
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_3_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b02e5338327c0d40effa89d5444bb462c2d1d89c
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/global_step71/zero_pp_rank_3_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c3089aac8b20a3c6b4d889ebbe8a52e9c331dd4d49e66893559428bee931945
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/latest
new file mode 100644
index 0000000000000000000000000000000000000000..bbeadc7466d2728e3046120a012ebc37c29267cb
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/latest
@@ -0,0 +1 @@
+global_step71
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..be2e24cc9d9ef8857272cec1451c810e205ec4e9
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef002048764051a71fb00f8f978e9ec32b780dc850bdb059af362cc56494234b
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..efcf4dd2e74596ac28af81f9f8bd0be9a807deb3
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37194a6d48612e1a46a2d5d317ead97c70d9fc4569b0118fcd5f84c3dc9daa5a
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..4c9222e37d4e9d1745c0e126e0fe0c4a348e298d
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17c179483659a784aa1ace2427daff48c556a6bcc3c330e6f3274e4dc95e4b49
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..7821bf0f5f0621fd0159152432f0a7bc66aa6823
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b56857c9b117629f35af2c3d64f522d33a9d8aa94faa81ec6956380a895118c4
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/scheduler.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3d0d6f336655bdacf5eb53294b71e20f2d0edb17
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2ba16a2cd6668009497101c7aa1ee348685f1df2d9a2a20c23be3737c813063
+size 1064
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/trainer_state.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8b0a81da7c7b5decbdc85f2c036c193db226e332
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/trainer_state.json
@@ -0,0 +1,371 @@
+{
+  "best_metric": 0.41357422,
+  "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/checkpoint-60",
+  "epoch": 2.888888888888889,
+  "eval_steps": 20,
+  "global_step": 72,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.04040404040404041,
+      "grad_norm": 2.741686768010279,
+      "learning_rate": 2.5e-05,
+      "logits/chosen": -0.3984375,
+      "logits/rejected": 0.2080078125,
+      "logps/chosen": -282.0,
+      "logps/rejected": -272.0,
+      "loss": 1.7568359375,
+      "memory(GiB)": 14.02,
+      "nll_loss": 1.5859375,
+      "rewards/accuracies": 0.0,
+      "rewards/chosen": 0.0,
+      "rewards/margins": 0.0,
+      "rewards/rejected": 0.0,
+      "step": 1,
+      "train_speed(iter/s)": 0.062883
+    },
+    {
+      "epoch": 0.20202020202020202,
+      "grad_norm": 3.179213138025169,
+      "learning_rate": 9.994664874011863e-05,
+      "logits/chosen": -0.734375,
+      "logits/rejected": -0.185546875,
+      "logps/chosen": -358.0,
+      "logps/rejected": -516.0,
+      "loss": 1.7740478515625,
+      "memory(GiB)": 31.0,
+      "nll_loss": 1.0234375,
+      "rewards/accuracies": 0.5,
+      "rewards/chosen": 0.287109375,
+      "rewards/margins": 0.1455078125,
+      "rewards/rejected": 0.1416015625,
+      "step": 5,
+      "train_speed(iter/s)": 0.084048
+    },
+    {
+      "epoch": 0.40404040404040403,
+      "grad_norm": 1.1865745356779005,
+      "learning_rate": 9.809128215864097e-05,
+      "logits/chosen": -0.5078125,
+      "logits/rejected": -0.036376953125,
+      "logps/chosen": -348.0,
+      "logps/rejected": -552.0,
+      "loss": 1.3421142578125,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.8203125,
+      "rewards/accuracies": 0.8500000238418579,
+      "rewards/chosen": 1.6015625,
+      "rewards/margins": 0.91796875,
+      "rewards/rejected": 0.6875,
+      "step": 10,
+      "train_speed(iter/s)": 0.086989
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 0.2081256636056697,
+      "learning_rate": 9.368111953231848e-05,
+      "logits/chosen": -0.23828125,
+      "logits/rejected": -0.703125,
+      "logps/chosen": -368.0,
+      "logps/rejected": -284.0,
+      "loss": 0.77510986328125,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.67578125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 2.984375,
+      "rewards/margins": 3.3125,
+      "rewards/rejected": -0.33203125,
+      "step": 15,
+      "train_speed(iter/s)": 0.09061
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "grad_norm": 0.12928414880774924,
+      "learning_rate": 8.695044586103296e-05,
+      "logits/chosen": -0.2060546875,
+      "logits/rejected": -0.61328125,
+      "logps/chosen": -458.0,
+      "logps/rejected": -310.0,
+      "loss": 0.53414306640625,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.578125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 3.34375,
+      "rewards/margins": 5.1875,
+      "rewards/rejected": -1.859375,
+      "step": 20,
+      "train_speed(iter/s)": 0.091844
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "eval_logits/chosen": -1.4375,
+      "eval_logits/rejected": 0.306640625,
+      "eval_logps/chosen": -4.78125,
+      "eval_logps/rejected": -160.0,
+      "eval_loss": 0.439208984375,
+      "eval_nll_loss": 0.2080078125,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.03125,
+      "eval_rewards/margins": 6.21875,
+      "eval_rewards/rejected": -0.2001953125,
+      "eval_runtime": 1.2472,
+      "eval_samples_per_second": 3.207,
+      "eval_steps_per_second": 0.802,
+      "step": 20
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.19570778554820287,
+      "learning_rate": 7.82568207211296e-05,
+      "logits/chosen": 0.046630859375,
+      "logits/rejected": -0.140625,
+      "logps/chosen": -430.0,
+      "logps/rejected": -516.0,
+      "loss": 0.468353271484375,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.5078125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 5.0,
+      "rewards/margins": 7.0625,
+      "rewards/rejected": -2.0625,
+      "step": 25,
+      "train_speed(iter/s)": 0.092514
+    },
+    {
+      "epoch": 1.202020202020202,
+      "grad_norm": 0.09105661940994438,
+      "learning_rate": 6.806208330935766e-05,
+      "logits/chosen": -0.158203125,
+      "logits/rejected": -0.052490234375,
+      "logps/chosen": -280.0,
+      "logps/rejected": -502.0,
+      "loss": 0.4760528564453125,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.40234375,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 6.53125,
+      "rewards/margins": 10.625,
+      "rewards/rejected": -4.125,
+      "step": 30,
+      "train_speed(iter/s)": 0.09323
+    },
+    {
+      "epoch": 1.404040404040404,
+      "grad_norm": 0.08621388537462939,
+      "learning_rate": 5.6907817747594116e-05,
+      "logits/chosen": -0.45703125,
+      "logits/rejected": 0.1806640625,
+      "logps/chosen": -212.0,
+      "logps/rejected": -528.0,
+      "loss": 0.4629302978515625,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.462890625,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.0,
+      "rewards/margins": 10.625,
+      "rewards/rejected": -3.625,
+      "step": 35,
+      "train_speed(iter/s)": 0.093963
+    },
+    {
+      "epoch": 1.606060606060606,
+      "grad_norm": 0.11934377803721716,
+      "learning_rate": 4.5386582026834906e-05,
+      "logits/chosen": 0.18359375,
+      "logits/rejected": -0.35546875,
+      "logps/chosen": -344.0,
+      "logps/rejected": -282.0,
+      "loss": 0.4460845947265625,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.404296875,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 6.75,
+      "rewards/margins": 10.75,
+      "rewards/rejected": -3.984375,
+      "step": 40,
+      "train_speed(iter/s)": 0.094552
+    },
+    {
+      "epoch": 1.606060606060606,
+      "eval_logits/chosen": -1.4375,
+      "eval_logits/rejected": 0.953125,
+      "eval_logps/chosen": -5.09375,
+      "eval_logps/rejected": -178.0,
+      "eval_loss": 0.421630859375,
+      "eval_nll_loss": 0.2216796875,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.0,
+      "eval_rewards/margins": 8.0,
+      "eval_rewards/rejected": -2.0,
+      "eval_runtime": 1.4128,
+      "eval_samples_per_second": 2.831,
+      "eval_steps_per_second": 0.708,
+      "step": 40
+    },
+    {
+      "epoch": 1.808080808080808,
+      "grad_norm": 0.1278829740469663,
+      "learning_rate": 3.411042902090492e-05,
+      "logits/chosen": -0.0233154296875,
+      "logits/rejected": 0.328125,
+      "logps/chosen": -310.0,
+      "logps/rejected": -494.0,
+      "loss": 0.539617919921875,
+      "memory(GiB)": 45.84,
+      "nll_loss": 0.51953125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.75,
+      "rewards/margins": 10.625,
+      "rewards/rejected": -2.875,
+      "step": 45,
+      "train_speed(iter/s)": 0.094018
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.20021892626227725,
+      "learning_rate": 2.3678391856132204e-05,
+      "logits/chosen": 0.181640625,
+      "logits/rejected": 0.0159912109375,
+      "logps/chosen": -300.0,
+      "logps/rejected": -280.0,
+      "loss": 0.437335205078125,
+      "memory(GiB)": 45.85,
+      "nll_loss": 0.40234375,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.8125,
+      "rewards/margins": 11.3125,
+      "rewards/rejected": -3.5,
+      "step": 50,
+      "train_speed(iter/s)": 0.094129
+    },
+    {
+      "epoch": 2.202020202020202,
+      "grad_norm": 0.17561192586448465,
+      "learning_rate": 1.4644660940672627e-05,
+      "logits/chosen": 0.0888671875,
+      "logits/rejected": 0.3046875,
+      "logps/chosen": -298.0,
+      "logps/rejected": -620.0,
+      "loss": 0.48487548828125,
+      "memory(GiB)": 45.85,
+      "nll_loss": 0.45703125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 6.96875,
+      "rewards/margins": 9.375,
+      "rewards/rejected": -2.40625,
+      "step": 55,
+      "train_speed(iter/s)": 0.094012
+    },
+    {
+      "epoch": 2.404040404040404,
+      "grad_norm": 0.06159661984448856,
+      "learning_rate": 7.489143213519301e-06,
+      "logits/chosen": 0.0595703125,
+      "logits/rejected": -0.3125,
+      "logps/chosen": -422.0,
+      "logps/rejected": -468.0,
+      "loss": 0.45642852783203125,
+      "memory(GiB)": 45.85,
+      "nll_loss": 0.515625,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 8.125,
+      "rewards/margins": 11.1875,
+      "rewards/rejected": -3.0625,
+      "step": 60,
+      "train_speed(iter/s)": 0.094412
+    },
+    {
+      "epoch": 2.404040404040404,
+      "eval_logits/chosen": -1.4140625,
+      "eval_logits/rejected": 1.171875,
+      "eval_logps/chosen": -4.8125,
+      "eval_logps/rejected": -194.0,
+      "eval_loss": 0.41357421875,
+      "eval_nll_loss": 0.208984375,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.03125,
+      "eval_rewards/margins": 9.625,
+      "eval_rewards/rejected": -3.59375,
+      "eval_runtime": 1.3539,
+      "eval_samples_per_second": 2.954,
+      "eval_steps_per_second": 0.739,
+      "step": 60
+    },
+    {
+      "epoch": 2.606060606060606,
+      "grad_norm": 0.12507359172791535,
+      "learning_rate": 2.591967620451707e-06,
+      "logits/chosen": -0.41796875,
+      "logits/rejected": 0.447265625,
+      "logps/chosen": -206.0,
+      "logps/rejected": -488.0,
+      "loss": 0.47344970703125,
+      "memory(GiB)": 45.85,
+      "nll_loss": 0.55078125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.5,
+      "rewards/margins": 11.75,
+      "rewards/rejected": -4.25,
+      "step": 65,
+      "train_speed(iter/s)": 0.094381
+    },
+    {
+      "epoch": 2.808080808080808,
+      "grad_norm": 0.13343167302632133,
+      "learning_rate": 2.1329118524827662e-07,
+      "logits/chosen": 0.0218505859375,
+      "logits/rejected": 0.055908203125,
+      "logps/chosen": -274.0,
+      "logps/rejected": -446.0,
+      "loss": 0.43180007934570314,
+      "memory(GiB)": 45.85,
+      "nll_loss": 0.458984375,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.5,
+      "rewards/margins": 11.25,
+      "rewards/rejected": -3.734375,
+      "step": 70,
+      "train_speed(iter/s)": 0.094656
+    },
+    {
+      "epoch": 2.888888888888889,
+      "eval_logits/chosen": -1.421875,
+      "eval_logits/rejected": 1.1875,
+      "eval_logps/chosen": -4.84375,
+      "eval_logps/rejected": -196.0,
+      "eval_loss": 0.414306640625,
+      "eval_nll_loss": 0.2099609375,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.03125,
+      "eval_rewards/margins": 9.8125,
+      "eval_rewards/rejected": -3.796875,
+      "eval_runtime": 1.3572,
+      "eval_samples_per_second": 2.947,
+      "eval_steps_per_second": 0.737,
+      "step": 72
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 72,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 31365457739776.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9e28ad510574d5753b3789c724d404c13a49b6c5
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61f6654b83a8539228bcbbfceff4aad6393d55b5d0ea9f547726948c7dfea8ab
+size 8888
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/zero_to_fp32.py
new file mode 100755
index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/checkpoint-72/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    """Metadata extracted from one ``*_model_states.pt`` file (built by ``parse_model_states``)."""
+    # buffer name -> tensor, converted with ``.float()`` when parsed
+    buffers: dict
+    # sequence of ``{param name: shape}`` dicts; the merge helpers zip/flatten it per param group
+    param_shapes: list
+    # ``[key, value]`` pairs copied from the checkpoint's "shared_params" mapping
+    shared_params: list
+    # value stored under DS_VERSION, or None when the checkpoint has no such key
+    ds_version: int
+    # frozen param name -> shape, or None when the checkpoint has no such entry
+    frozen_param_shapes: dict
+    # frozen param name -> saved tensor fragment, or None when the checkpoint has no such entry
+    frozen_param_fragments: dict
+
+
+# set to a truthy value to make the helpers below print verbose reconstruction diagnostics
+debug = 0
+
+# load to cpu (passed as ``map_location`` to every ``torch.load`` call in this script)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    """Return ``int(text)`` when ``text`` consists only of digits, otherwise return ``text`` unchanged."""
+    if text.isdigit():
+        return int(text)
+    return text
+
+
+def natural_keys(text):
+    '''
+    Sort key producing human ("natural") ordering: alist.sort(key=natural_keys)
+
+    The text is split on runs of digits and every piece goes through ``atoi``, so digit runs
+    become ints while the remaining pieces stay strings.
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    pieces = re.split(r'(\d+)', text)
+    return list(map(atoi, pieces))
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """
+    Return the path of the rank-0 model states file inside ``checkpoint_dir``.
+
+    Args:
+        - ``checkpoint_dir``: directory that holds the DeepSpeed checkpoint files
+        - ``zero_stage``: ZeRO stage of the checkpoint; it selects the expected file name
+
+    Raises:
+        FileNotFoundError: if ``checkpoint_dir`` is not a directory or the expected file is missing
+        ValueError: if ``zero_stage`` is neither <= 2 nor 3
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        # without this branch `file` would be unbound below and surface as an UnboundLocalError
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """
+    Return the files in ``checkpoint_dir`` matching ``glob_pattern``, sorted with ``natural_keys``.
+
+    Raises:
+        FileNotFoundError: if nothing matches the pattern
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
+    if not matches:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return sorted(matches, key=natural_keys)
+
+
+def get_optim_files(checkpoint_dir):
+    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    optim_glob = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_glob)
+
+
+def get_model_state_files(checkpoint_dir):
+    """Return the ``*_model_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    model_glob = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_glob)
+
+
+def parse_model_states(files):
+    """
+    Load every model states file and pull out what is needed to rebuild the fp32 weights.
+
+    Args:
+        - ``files``: paths of the ``*_model_states.pt`` files to read
+
+    Returns:
+        a list with one ``zero_model_state`` per entry of ``files``, in the same order
+
+    Raises:
+        ValueError: if a file has no BUFFER_NAMES entry, i.e. it is not a model state checkpoint
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE(security): weights_only=False unpickles arbitrary objects - only run on trusted checkpoints
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # frozen parameters are optional; both entries stay None when the checkpoint has none
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if debug and frozen_param_shapes is not None:
+            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load the optimizer state shards and return what is needed to rebuild the fp32 weights.
+
+    Args:
+        - ``files``: paths of the ``*_optim_states.pt`` shards
+        - ``ds_checkpoint_dir``: checkpoint folder, only used in the error message
+
+    Returns:
+        ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry per shard
+
+    Raises:
+        ValueError: if the first shard is not a zero checkpoint, the number of shards differs from the
+            recorded partition count, or the zero stage is unknown
+    """
+    total_files = len(files)
+    state_dicts = []
+    for shard_path in tqdm(files, desc='Loading checkpoint shards'):
+        shard = torch.load(shard_path, map_location=device, mmap=True, weights_only=False)
+        # only the fp32 master weights are needed, so drop the potentially huge nested optimizer
+        # states right away; pop() with a default also covers shards another helper script has
+        # already stripped
+        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(shard)
+
+    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim_state[ZERO_STAGE]
+    world_size = first_optim_state[PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][fp32_groups_key] for shard in state_dicts]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: when truthy, the stage-specific helper skips merging frozen params
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    # choose the stage-specific reconstruction routine
+    if zero_stage <= 2:
+        reconstruct = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        reconstruct = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+
+    return reconstruct(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy the frozen params recorded in ``zero_model_states[0]`` into ``state_dict``.
+
+    Only the first model state is consulted: each fragment stored there is assigned as-is under
+    its param name. Does nothing when there are no frozen param shapes.
+    """
+    rank0_state = zero_model_states[0]
+    frozen_param_shapes = rank0_state.frozen_param_shapes
+    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
+        return
+
+    frozen_param_fragments = rank0_state.frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for total_params, (name, shape) in enumerate(frozen_param_shapes.items(), start=1):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    """Return True when ``obj`` has an attribute named ``fn`` and that attribute is callable."""
+    return callable(getattr(obj, fn, None))
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable fp32 params of a zero stage <= 2 checkpoint into ``state_dict``.
+
+    For every param group the per-rank partitions are concatenated into one flat vector, which is
+    then cut into views following the shapes in ``zero_model_states[0].param_shapes``.
+
+    Raises:
+        ValueError: if, after alignment, the consumed numel differs from a group's available numel
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_groups = [
+        torch.cat([rank_groups[group_idx] for rank_groups in fp32_flat_groups], 0)
+        for group_idx in range(num_param_groups)
+    ]
+    avail_numel = sum(group_vector.numel() for group_vector in merged_groups)
+
+    if debug:
+        wanted_params = sum(len(shapes) for shapes in param_shapes)
+        wanted_numel = sum(sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes)
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+    # live optimizer object, so we are checking that the numbers are within the right range
+    align_to = 2 * world_size
+
+    def zero2_align(x):
+        return align_to * math.ceil(x / align_to)
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, group_vector in zip(param_shapes, merged_groups):
+        offset = 0
+        avail_numel = group_vector.numel()
+        for name, shape in shapes.items():
+            # shapes may or may not expose numel(); otherwise multiply the dimensions
+            if _has_callable(shape, 'numel'):
+                unpartitioned_numel = shape.numel()
+            else:
+                unpartitioned_numel = math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = group_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the fp32 state dict of a zero stage <= 2 checkpoint.
+
+    Entries are added in this order: buffers, frozen params (unless ``exclude_frozen_parameters``),
+    trainable params, and finally shared params as aliases of already present entries.
+    """
+    rank0_state = zero_model_states[0]
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = rank0_state.buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        alias, target = pair[0], pair[1]
+        if target in state_dict:
+            state_dict[alias] = state_dict[target]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """
+    Return ``(partitioned_numel, padding_numel)`` for a param split across ``world_size`` ranks.
+
+    ``partitioned_numel`` is the per-rank element count (rounded up) and ``padding_numel`` is the
+    number of elements needed to make ``unpartitioned_numel`` a multiple of ``world_size``.
+    """
+    remainder = unpartitioned_numel % world_size
+    if remainder:
+        padding_numel = world_size - remainder
+    else:
+        padding_numel = 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild the frozen params of a zero stage 3 checkpoint into ``state_dict``.
+
+    For each frozen param the fragments of all model states are concatenated, trimmed to the
+    param's numel and viewed with its full shape. Does nothing when there are no frozen param shapes.
+    """
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum(p.numel() for p in zero_model_states[0].frozen_param_fragments.values()) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        # one fragment per model state; anything beyond the param's numel is cut off by narrow()
+        fragments = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        gathered = torch.cat(fragments, 0)
+        state_dict[name] = gathered.narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    Stand-in for a full parameter whose data still lives in the per-rank flat groups.
+    Nothing is copied until ``contiguous()`` is called, which is more memory efficient when
+    there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        # flat_groups is indexed as flat_groups[rank][group]; flat_groups_offset holds the
+        # cumulative group boundaries that ``offset`` and ``partitioned_numel`` refer to
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+
+        Returns the gathered data trimmed to ``shape.numel()`` elements and viewed as ``shape``.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        bounds = self.flat_groups_offset
+        chunks = []
+
+        for groups_at_rank in self.flat_groups:
+            # find the group(s) whose [lower, upper) range holds this rank's slice
+            first_group = None
+            last_group = None
+            for group_id in range(len(bounds)):
+                lower, upper = bounds[group_id], bounds[group_id + 1]
+                if lower <= self.offset < upper:
+                    first_group = group_id
+                if lower < end_idx <= upper:
+                    last_group = group_id
+                    break
+            # take the slice from each of those groups, using group-local indices
+            for group_id in range(first_group, last_group + 1):
+                lower = bounds[group_id]
+                local_start = self.offset - lower
+                local_stop = min(end_idx, bounds[group_id + 1]) - lower
+                chunks.append(groups_at_rank[group_id][local_start:local_stop])
+
+        # stitch the per-rank slices together, then drop whatever exceeds the param's real size
+        gathered = torch.cat(chunks, dim=0)
+        return gathered[:self.shape.numel()].view(self.shape).contiguous()
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Register the trainable params of a zero stage 3 checkpoint in ``state_dict``.
+
+    Each param is stored as a ``GatheredTensor`` that records where its per-rank partitions sit
+    inside ``fp32_flat_groups``; no weight data is merged by this function itself.
+
+    Args:
+        - ``state_dict``: mapping that receives one entry per trainable param
+        - ``world_size``: number of ranks the params were partitioned across
+        - ``fp32_flat_groups``: indexed as ``[rank][group]``, each item a flat fp32 tensor
+        - ``zero_model_states``: parsed model states; only the first one's ``param_shapes`` is used
+
+    Raises:
+        ValueError: if the consumed numel does not match the numel available in the flat groups
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        # each rank holds a sequence of flat groups, so report every group's shape separately
+        for i in range(world_size):
+            for j, flat_group in enumerate(fp32_flat_groups[i]):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={flat_group.shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    # cumulative group boundaries on one rank, starting at 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    # offset counted one rank's share so far; scale it to compare with the all-rank total
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """Assemble the ZeRO-3 state dict: buffers, optional frozen params, trainable params, aliases."""
+    rank0_states = zero_model_states[0]
+    merged = OrderedDict()
+    # buffers are read from the first model-states entry only
+    rank0_buffers = rank0_states.buffers
+    merged.update(rank0_buffers)
+    if debug:
+        print(f"added {len(rank0_buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(merged, world_size, zero_model_states)
+    _zero3_merge_trainable_params(merged, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters: pair[0] becomes another name for the entry stored under pair[1]
+    for pair in rank0_states.shared_params:
+        alias, source = pair[0], pair[1]
+        if source in merged:
+            merged[alias] = merged[source]
+    return merged
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Materialize a state_dict of (lazy) tensors as real torch tensors.
+
+    Entries that are the same object (shared tensors) map to one shared result. With
+    ``return_empty_tensor=True`` only uninitialized tensors of matching shape/dtype are allocated.
+    """
+    result = {}
+    first_name_by_id = {}  # id(source tensor) -> first key under which it was converted
+    for name, tensor in state_dict.items():
+        first_name = first_name_by_id.setdefault(id(tensor), name)
+        if first_name != name:  # shared tensors
+            result[name] = result[first_name]
+        elif return_empty_tensor:
+            result[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+        else:
+            result[name] = tensor.contiguous()
+    return result
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Turn a ZeRO 2 or 3 checkpoint into one consolidated fp32 ``state_dict`` that can be passed to
+    ``load_state_dict()`` and then used for training without DeepSpeed or shared with others, for
+    example via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of
+          torch tensors, which is more memory efficient. Convert a pseudo tensor to a torch tensor
+          with ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Raises:
+        - ``ValueError``: ``tag`` is ``None`` and ``checkpoint_dir`` has no ``latest`` file
+        - ``FileNotFoundError``: the ``checkpoint_dir/tag`` directory doesn't exist
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application, i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if not os.path.isfile(latest_path):
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+        with open(latest_path, 'r') as fd:
+            tag = fd.read().strip()
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    return state_dict if lazy_mode else to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Consolidate a ZeRO 2 or 3 checkpoint into fp32 weight file(s) under ``output_dir``, usable for
+    training without DeepSpeed. The output is either one weights file or, when sharded, several
+    shard files plus an ``*.index.json`` file holding the tensor-name -> file map.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB.
+          ``None`` skips the shard planning and writes a single weights file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        - ``ImportError``: ``safetensors`` / ``huggingface_hub`` is needed by the options but missing
+    """
+
+    # Fail early when an optional dependency needed by the requested options is missing
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    sharding_enabled = max_shard_size is not None
+    if sharding_enabled:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Lazy state_dict: tensors are only gathered when a shard is about to be written
+    lazy_state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                               tag,
+                                                               exclude_frozen_parameters,
+                                                               lazy_mode=True)
+
+    # Decide which tensors go into which file
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if sharding_enabled:
+        stem, extension = os.path.splitext(weights_name)
+        # shard sizes are planned on empty placeholders, so no weights are gathered here
+        placeholders = to_torch_tensor(lazy_state_dict, return_empty_tensor=True)
+        split_plan = split_torch_state_dict_into_shards(placeholders,
+                                                        filename_pattern=stem + "{suffix}" + extension,
+                                                        max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        split_plan = StateDictSplit(is_sharded=False,
+                                    filename_to_tensors={weights_name: list(lazy_state_dict.keys())})
+
+    # Write one file per shard, gathering only that shard's tensors
+    os.makedirs(output_dir, exist_ok=True)
+    for shard_file, tensor_names in tqdm(split_plan.filename_to_tensors.items(), desc="Saving checkpoint shards"):
+        shard = {key: lazy_state_dict[key] for key in tensor_names}
+        shard = to_torch_tensor(shard)
+        shard_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard, shard_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard, shard_path)
+        # release the memory of current shard
+        for key in list(shard.keys()):
+            del lazy_state_dict[key]
+            del shard[key]
+        del shard
+        gc.collect()
+
+    # Sharded output additionally gets an index mapping each tensor name to its file
+    if split_plan.is_sharded:
+        index_path = os.path.join(output_dir, weights_name + ".index.json")
+        index = {"metadata": split_plan.metadata, "weight_map": split_plan.tensor_to_filename}
+        with open(index_path, "w", encoding="utf-8") as f:
+            f.write(json.dumps(index, indent=2, sort_keys=True) + "\n")
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    Load the consolidated fp32 weights of a ZeRO 2 or 3 checkpoint into ``model``:
+
+    1. Convert the checkpoint into a single fp32 consolidated ``state_dict``
+    2. Put the provided model to cpu
+    3. Load the ``state_dict`` into the provided model (with ``strict=False``)
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application, i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info("Extracting fp32 weights")
+    fp32_state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    cpu_model = model.cpu()
+    cpu_model.load_state_dict(fp32_state_dict, strict=False)
+    return cpu_model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files "
+                        "(e.g. path/checkpoint-12-output/)")
+    # adjacent string literals are joined verbatim, so every piece carries its own trailing space
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t", "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug  # module-level flag read by the merge helpers to print diagnostics
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logits_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logits_chosen.png
new file mode 100644
index 0000000000000000000000000000000000000000..4c708d3a5785f9e917401606bb742dc2b3ee8a45
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logits_chosen.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logits_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logits_rejected.png
new file mode 100644
index 0000000000000000000000000000000000000000..50eb007ae00ed25bfee2ae00af74f7fdacecf1bf
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logits_rejected.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logps_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logps_chosen.png
new file mode 100644
index 0000000000000000000000000000000000000000..36cb6379c583bc0a50d2283464f25d5cc623b0c2
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logps_chosen.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logps_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logps_rejected.png
new file mode 100644
index 0000000000000000000000000000000000000000..3a445b37d179a145bc175a21b9b8268965ee752c
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_logps_rejected.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..15368799be0c751785409a72cd4ec74180191bf0
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_loss.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_nll_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_nll_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..c73aef62fd8c77e7da83b81ba38ccd696972c382
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_nll_loss.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_accuracies.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_accuracies.png
new file mode 100644
index 0000000000000000000000000000000000000000..2b88ed37a3c651cbf54899f712bcd1515e2be8fc
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_accuracies.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_chosen.png
new file mode 100644
index 0000000000000000000000000000000000000000..bff48443ab4ba9b4dd6c8578efb9dfee8786c9b6
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_chosen.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_margins.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_margins.png
new file mode 100644
index 0000000000000000000000000000000000000000..f0a039de67bfae0805a1115a43451703297be821
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_margins.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_rejected.png
new file mode 100644
index 0000000000000000000000000000000000000000..ae27e9ee434a4df106c4ee15fe15663ffca4378f
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_rewards_rejected.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_runtime.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_runtime.png
new file mode 100644
index 0000000000000000000000000000000000000000..aba91c89d2521852f673b1924c044d779882d40b
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_runtime.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_samples_per_second.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_samples_per_second.png
new file mode 100644
index 0000000000000000000000000000000000000000..e7220036d6e5983d8e5812f0ac9dbd73f92913b3
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_samples_per_second.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_steps_per_second.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_steps_per_second.png
new file mode 100644
index 0000000000000000000000000000000000000000..a7d287bc2108ba05a9f3f594ea8abe7dd3120dde
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/eval_steps_per_second.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_epoch.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_epoch.png
new file mode 100644
index 0000000000000000000000000000000000000000..bf6c6bacafca8d256f07d43a8851a9463ac6e566
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_epoch.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_grad_norm.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_grad_norm.png
new file mode 100644
index 0000000000000000000000000000000000000000..8378287c7533dfa720b80460c365cffd20f03bab
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_grad_norm.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_learning_rate.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_learning_rate.png
new file mode 100644
index 0000000000000000000000000000000000000000..9d80e686cd9522b9efdfd0b12d052a155efc0d3b
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_learning_rate.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logits_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logits_chosen.png
new file mode 100644
index 0000000000000000000000000000000000000000..bfcce7ff956d379e25b60149b98415666d43ec0e
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logits_chosen.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logits_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logits_rejected.png
new file mode 100644
index 0000000000000000000000000000000000000000..242322449f519ae3288a6bf6b0a1487ff673f4d9
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logits_rejected.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logps_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logps_chosen.png
new file mode 100644
index 0000000000000000000000000000000000000000..2f120ad7c37770ac9c2c78990a80575eb4c890f2
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logps_chosen.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logps_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logps_rejected.png
new file mode 100644
index 0000000000000000000000000000000000000000..2bc7e0b79f310f3149946bef94dd22981e802a88
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_logps_rejected.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..6b3419f3056edd9a42e798ed8a5e5894464f86a2
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_loss.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_memory(GiB).png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_memory(GiB).png
new file mode 100644
index 0000000000000000000000000000000000000000..6281dc99d60d4bb0d0bf0f1103caf4fc5b875e56
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_memory(GiB).png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_nll_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_nll_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..422b81beb4839d188316e2373bd68f55db735fbb
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_nll_loss.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_accuracies.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_accuracies.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a4f76c5f8435fb589b78732175fc70cc6ef802a
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_accuracies.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_chosen.png
new file mode 100644
index 0000000000000000000000000000000000000000..33cce2fdd4b67464b2da8ecd01063ce06a43e716
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_chosen.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_margins.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_margins.png
new file mode 100644
index 0000000000000000000000000000000000000000..cf7f037662da793ec4504cfd52fabef4808cd1da
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_margins.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_rejected.png
new file mode 100644
index 0000000000000000000000000000000000000000..3c9d91519cd50c651e00a0d9d1df9359313c6be5
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_rewards_rejected.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_total_flos.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_total_flos.png
new file mode 100644
index 0000000000000000000000000000000000000000..06cb39f0566cc3c686d80bc540ff624c8e4ba232
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_total_flos.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..0a5bd1d3170d65f876cbeebb92fe21e10853b208
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_loss.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_runtime.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_runtime.png
new file mode 100644
index 0000000000000000000000000000000000000000..7216ef100f53ed5e665aaa9528c4ef127b3b1e50
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_runtime.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_samples_per_second.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_samples_per_second.png
new file mode 100644
index 0000000000000000000000000000000000000000..3546d7732fcd75094eee28a57cf94529760d8d33
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_samples_per_second.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_speed(iter_s).png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_speed(iter_s).png
new file mode 100644
index 0000000000000000000000000000000000000000..8c3c7491fbc21428e2362a18d07661f52331b465
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_speed(iter_s).png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_steps_per_second.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_steps_per_second.png
new file mode 100644
index 0000000000000000000000000000000000000000..6fbaf64d83cd46fd748c06dd7282aa47efa0dc2d
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/images/train_train_steps_per_second.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/logging.jsonl b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/logging.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..8d81c19aacbb83eb86dda03c584ac6518b2b8b59
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/logging.jsonl
@@ -0,0 +1,21 @@
+{"loss": 1.75683594, "grad_norm": 2.74168677, "learning_rate": 2.5e-05, "memory(GiB)": 14.02, "train_speed(iter/s)": 0.062883, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -272.0, "logps/chosen": -282.0, "logits/rejected": 0.20800781, "logits/chosen": -0.3984375, "nll_loss": 1.5859375, "epoch": 0.04040404, "global_step/max_steps": "1/72", "percentage": "1.39%", "elapsed_time": "12s", "remaining_time": "15m 12s"}
+{"loss": 1.77404785, "grad_norm": 3.17921314, "learning_rate": 9.995e-05, "memory(GiB)": 31.0, "train_speed(iter/s)": 0.084048, "rewards/chosen": 0.28710938, "rewards/rejected": 0.14160156, "rewards/accuracies": 0.5, "rewards/margins": 0.14550781, "logps/rejected": -516.0, "logps/chosen": -358.0, "logits/rejected": -0.18554688, "logits/chosen": -0.734375, "nll_loss": 1.0234375, "epoch": 0.2020202, "global_step/max_steps": "5/72", "percentage": "6.94%", "elapsed_time": "56s", "remaining_time": "12m 35s"}
+{"loss": 1.34211426, "grad_norm": 1.18657454, "learning_rate": 9.809e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.086989, "rewards/chosen": 1.6015625, "rewards/rejected": 0.6875, "rewards/accuracies": 0.85000002, "rewards/margins": 0.91796875, "logps/rejected": -552.0, "logps/chosen": -348.0, "logits/rejected": -0.03637695, "logits/chosen": -0.5078125, "nll_loss": 0.8203125, "epoch": 0.4040404, "global_step/max_steps": "10/72", "percentage": "13.89%", "elapsed_time": "1m 51s", "remaining_time": "11m 33s"}
+{"loss": 0.77510986, "grad_norm": 0.20812566, "learning_rate": 9.368e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.09061, "rewards/chosen": 2.984375, "rewards/rejected": -0.33203125, "rewards/accuracies": 1.0, "rewards/margins": 3.3125, "logps/rejected": -284.0, "logps/chosen": -368.0, "logits/rejected": -0.703125, "logits/chosen": -0.23828125, "nll_loss": 0.67578125, "epoch": 0.60606061, "global_step/max_steps": "15/72", "percentage": "20.83%", "elapsed_time": "2m 42s", "remaining_time": "10m 17s"}
+{"loss": 0.53414307, "grad_norm": 0.12928415, "learning_rate": 8.695e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.091844, "rewards/chosen": 3.34375, "rewards/rejected": -1.859375, "rewards/accuracies": 1.0, "rewards/margins": 5.1875, "logps/rejected": -310.0, "logps/chosen": -458.0, "logits/rejected": -0.61328125, "logits/chosen": -0.20605469, "nll_loss": 0.578125, "epoch": 0.80808081, "global_step/max_steps": "20/72", "percentage": "27.78%", "elapsed_time": "3m 34s", "remaining_time": "9m 18s"}
+{"eval_loss": 0.43920898, "eval_runtime": 1.2472, "eval_samples_per_second": 3.207, "eval_steps_per_second": 0.802, "eval_rewards/chosen": 6.03125, "eval_rewards/rejected": -0.20019531, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.21875, "eval_logps/rejected": -160.0, "eval_logps/chosen": -4.78125, "eval_logits/rejected": 0.30664062, "eval_logits/chosen": -1.4375, "eval_nll_loss": 0.20800781, "epoch": 0.80808081, "global_step/max_steps": "20/72", "percentage": "27.78%", "elapsed_time": "3m 35s", "remaining_time": "9m 21s"}
+{"loss": 0.46835327, "grad_norm": 0.19570779, "learning_rate": 7.826e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.092514, "rewards/chosen": 5.0, "rewards/rejected": -2.0625, "rewards/accuracies": 1.0, "rewards/margins": 7.0625, "logps/rejected": -516.0, "logps/chosen": -430.0, "logits/rejected": -0.140625, "logits/chosen": 0.04663086, "nll_loss": 0.5078125, "epoch": 1.0, "global_step/max_steps": "25/72", "percentage": "34.72%", "elapsed_time": "4m 27s", "remaining_time": "8m 22s"}
+{"loss": 0.47605286, "grad_norm": 0.09105662, "learning_rate": 6.806e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.09323, "rewards/chosen": 6.53125, "rewards/rejected": -4.125, "rewards/accuracies": 1.0, "rewards/margins": 10.625, "logps/rejected": -502.0, "logps/chosen": -280.0, "logits/rejected": -0.05249023, "logits/chosen": -0.15820312, "nll_loss": 0.40234375, "epoch": 1.2020202, "global_step/max_steps": "30/72", "percentage": "41.67%", "elapsed_time": "5m 18s", "remaining_time": "7m 26s"}
+{"loss": 0.4629303, "grad_norm": 0.08621389, "learning_rate": 5.691e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.093963, "rewards/chosen": 7.0, "rewards/rejected": -3.625, "rewards/accuracies": 1.0, "rewards/margins": 10.625, "logps/rejected": -528.0, "logps/chosen": -212.0, "logits/rejected": 0.18066406, "logits/chosen": -0.45703125, "nll_loss": 0.46289062, "epoch": 1.4040404, "global_step/max_steps": "35/72", "percentage": "48.61%", "elapsed_time": "6m 9s", "remaining_time": "6m 30s"}
+{"loss": 0.44608459, "grad_norm": 0.11934378, "learning_rate": 4.539e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.094552, "rewards/chosen": 6.75, "rewards/rejected": -3.984375, "rewards/accuracies": 1.0, "rewards/margins": 10.75, "logps/rejected": -282.0, "logps/chosen": -344.0, "logits/rejected": -0.35546875, "logits/chosen": 0.18359375, "nll_loss": 0.40429688, "epoch": 1.60606061, "global_step/max_steps": "40/72", "percentage": "55.56%", "elapsed_time": "6m 59s", "remaining_time": "5m 35s"}
+{"eval_loss": 0.42163086, "eval_runtime": 1.4128, "eval_samples_per_second": 2.831, "eval_steps_per_second": 0.708, "eval_rewards/chosen": 6.0, "eval_rewards/rejected": -2.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.0, "eval_logps/rejected": -178.0, "eval_logps/chosen": -5.09375, "eval_logits/rejected": 0.953125, "eval_logits/chosen": -1.4375, "eval_nll_loss": 0.22167969, "epoch": 1.60606061, "global_step/max_steps": "40/72", "percentage": "55.56%", "elapsed_time": "7m 1s", "remaining_time": "5m 37s"}
+{"loss": 0.53961792, "grad_norm": 0.12788297, "learning_rate": 3.411e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.094018, "rewards/chosen": 7.75, "rewards/rejected": -2.875, "rewards/accuracies": 1.0, "rewards/margins": 10.625, "logps/rejected": -494.0, "logps/chosen": -310.0, "logits/rejected": 0.328125, "logits/chosen": -0.02331543, "nll_loss": 0.51953125, "epoch": 1.80808081, "global_step/max_steps": "45/72", "percentage": "62.50%", "elapsed_time": "7m 55s", "remaining_time": "4m 45s"}
+{"loss": 0.43733521, "grad_norm": 0.20021893, "learning_rate": 2.368e-05, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094129, "rewards/chosen": 7.8125, "rewards/rejected": -3.5, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -280.0, "logps/chosen": -300.0, "logits/rejected": 0.01599121, "logits/chosen": 0.18164062, "nll_loss": 0.40234375, "epoch": 2.0, "global_step/max_steps": "50/72", "percentage": "69.44%", "elapsed_time": "8m 48s", "remaining_time": "3m 52s"}
+{"loss": 0.48487549, "grad_norm": 0.17561193, "learning_rate": 1.464e-05, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094012, "rewards/chosen": 6.96875, "rewards/rejected": -2.40625, "rewards/accuracies": 1.0, "rewards/margins": 9.375, "logps/rejected": -620.0, "logps/chosen": -298.0, "logits/rejected": 0.3046875, "logits/chosen": 0.08886719, "nll_loss": 0.45703125, "epoch": 2.2020202, "global_step/max_steps": "55/72", "percentage": "76.39%", "elapsed_time": "9m 41s", "remaining_time": "2m 59s"}
+{"loss": 0.45642853, "grad_norm": 0.06159662, "learning_rate": 7.49e-06, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094412, "rewards/chosen": 8.125, "rewards/rejected": -3.0625, "rewards/accuracies": 1.0, "rewards/margins": 11.1875, "logps/rejected": -468.0, "logps/chosen": -422.0, "logits/rejected": -0.3125, "logits/chosen": 0.05957031, "nll_loss": 0.515625, "epoch": 2.4040404, "global_step/max_steps": "60/72", "percentage": "83.33%", "elapsed_time": "10m 32s", "remaining_time": "2m 6s"}
+{"eval_loss": 0.41357422, "eval_runtime": 1.3539, "eval_samples_per_second": 2.954, "eval_steps_per_second": 0.739, "eval_rewards/chosen": 6.03125, "eval_rewards/rejected": -3.59375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.625, "eval_logps/rejected": -194.0, "eval_logps/chosen": -4.8125, "eval_logits/rejected": 1.171875, "eval_logits/chosen": -1.4140625, "eval_nll_loss": 0.20898438, "epoch": 2.4040404, "global_step/max_steps": "60/72", "percentage": "83.33%", "elapsed_time": "10m 33s", "remaining_time": "2m 6s"}
+{"loss": 0.47344971, "grad_norm": 0.12507359, "learning_rate": 2.59e-06, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094381, "rewards/chosen": 7.5, "rewards/rejected": -4.25, "rewards/accuracies": 1.0, "rewards/margins": 11.75, "logps/rejected": -488.0, "logps/chosen": -206.0, "logits/rejected": 0.44726562, "logits/chosen": -0.41796875, "nll_loss": 0.55078125, "epoch": 2.60606061, "global_step/max_steps": "65/72", "percentage": "90.28%", "elapsed_time": "11m 25s", "remaining_time": "1m 13s"}
+{"loss": 0.43180008, "grad_norm": 0.13343167, "learning_rate": 2.1e-07, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094656, "rewards/chosen": 7.5, "rewards/rejected": -3.734375, "rewards/accuracies": 1.0, "rewards/margins": 11.25, "logps/rejected": -446.0, "logps/chosen": -274.0, "logits/rejected": 0.0559082, "logits/chosen": 0.02185059, "nll_loss": 0.45898438, "epoch": 2.80808081, "global_step/max_steps": "70/72", "percentage": "97.22%", "elapsed_time": "12m 16s", "remaining_time": "21s"}
+{"eval_loss": 0.41430664, "eval_runtime": 1.3572, "eval_samples_per_second": 2.947, "eval_steps_per_second": 0.737, "eval_rewards/chosen": 6.03125, "eval_rewards/rejected": -3.796875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.8125, "eval_logps/rejected": -196.0, "eval_logps/chosen": -4.84375, "eval_logits/rejected": 1.1875, "eval_logits/chosen": -1.421875, "eval_nll_loss": 0.20996094, "epoch": 2.88888889, "global_step/max_steps": "72/72", "percentage": "100.00%", "elapsed_time": "12m 39s", "remaining_time": "0s"}
+{"train_runtime": 760.0145, "train_samples_per_second": 1.559, "train_steps_per_second": 0.095, "total_flos": 31365457739776.0, "train_loss": 0.64717012, "epoch": 2.88888889, "global_step/max_steps": "72/72", "percentage": "100.00%", "elapsed_time": "12m 39s", "remaining_time": "0s"}
+{"train_dataset": "1189.215190±496.010190, min=317.000000, max=4190.000000, size=395", "val_dataset": "1200.750000±508.140421, min=734.000000, max=2041.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 7635.8016M Params (20.1851M Trainable [0.2643%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/checkpoint-72", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v2-20250127-163825/checkpoint-60", "best_metric": 0.41357422, "global_step": 72, "log_history": [{"loss": 1.7568359375, "grad_norm": 2.741686768010279, "learning_rate": 2.5e-05, "memory(GiB)": 14.02, "train_speed(iter/s)": 0.062883, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -272.0, "logps/chosen": -282.0, "logits/rejected": 0.2080078125, "logits/chosen": -0.3984375, "nll_loss": 1.5859375, "epoch": 0.04040404040404041, "step": 1}, {"loss": 1.7740478515625, "grad_norm": 3.179213138025169, "learning_rate": 9.994664874011863e-05, "memory(GiB)": 31.0, "train_speed(iter/s)": 0.084048, "rewards/chosen": 0.287109375, "rewards/rejected": 0.1416015625, "rewards/accuracies": 0.5, "rewards/margins": 0.1455078125, "logps/rejected": -516.0, "logps/chosen": -358.0, "logits/rejected": -0.185546875, "logits/chosen": -0.734375, "nll_loss": 1.0234375, "epoch": 0.20202020202020202, "step": 5}, {"loss": 1.3421142578125, "grad_norm": 1.1865745356779005, "learning_rate": 9.809128215864097e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.086989, "rewards/chosen": 1.6015625, "rewards/rejected": 0.6875, "rewards/accuracies": 0.8500000238418579, "rewards/margins": 0.91796875, "logps/rejected": -552.0, "logps/chosen": -348.0, "logits/rejected": -0.036376953125, "logits/chosen": -0.5078125, "nll_loss": 0.8203125, "epoch": 0.40404040404040403, "step": 10}, {"loss": 
0.77510986328125, "grad_norm": 0.2081256636056697, "learning_rate": 9.368111953231848e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.09061, "rewards/chosen": 2.984375, "rewards/rejected": -0.33203125, "rewards/accuracies": 1.0, "rewards/margins": 3.3125, "logps/rejected": -284.0, "logps/chosen": -368.0, "logits/rejected": -0.703125, "logits/chosen": -0.23828125, "nll_loss": 0.67578125, "epoch": 0.6060606060606061, "step": 15}, {"loss": 0.53414306640625, "grad_norm": 0.12928414880774924, "learning_rate": 8.695044586103296e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.091844, "rewards/chosen": 3.34375, "rewards/rejected": -1.859375, "rewards/accuracies": 1.0, "rewards/margins": 5.1875, "logps/rejected": -310.0, "logps/chosen": -458.0, "logits/rejected": -0.61328125, "logits/chosen": -0.2060546875, "nll_loss": 0.578125, "epoch": 0.8080808080808081, "step": 20}, {"eval_loss": 0.439208984375, "eval_runtime": 1.2472, "eval_samples_per_second": 3.207, "eval_steps_per_second": 0.802, "eval_rewards/chosen": 6.03125, "eval_rewards/rejected": -0.2001953125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.21875, "eval_logps/rejected": -160.0, "eval_logps/chosen": -4.78125, "eval_logits/rejected": 0.306640625, "eval_logits/chosen": -1.4375, "eval_nll_loss": 0.2080078125, "epoch": 0.8080808080808081, "step": 20}, {"loss": 0.468353271484375, "grad_norm": 0.19570778554820287, "learning_rate": 7.82568207211296e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.092514, "rewards/chosen": 5.0, "rewards/rejected": -2.0625, "rewards/accuracies": 1.0, "rewards/margins": 7.0625, "logps/rejected": -516.0, "logps/chosen": -430.0, "logits/rejected": -0.140625, "logits/chosen": 0.046630859375, "nll_loss": 0.5078125, "epoch": 1.0, "step": 25}, {"loss": 0.4760528564453125, "grad_norm": 0.09105661940994438, "learning_rate": 6.806208330935766e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.09323, "rewards/chosen": 6.53125, "rewards/rejected": -4.125, 
"rewards/accuracies": 1.0, "rewards/margins": 10.625, "logps/rejected": -502.0, "logps/chosen": -280.0, "logits/rejected": -0.052490234375, "logits/chosen": -0.158203125, "nll_loss": 0.40234375, "epoch": 1.202020202020202, "step": 30}, {"loss": 0.4629302978515625, "grad_norm": 0.08621388537462939, "learning_rate": 5.6907817747594116e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.093963, "rewards/chosen": 7.0, "rewards/rejected": -3.625, "rewards/accuracies": 1.0, "rewards/margins": 10.625, "logps/rejected": -528.0, "logps/chosen": -212.0, "logits/rejected": 0.1806640625, "logits/chosen": -0.45703125, "nll_loss": 0.462890625, "epoch": 1.404040404040404, "step": 35}, {"loss": 0.4460845947265625, "grad_norm": 0.11934377803721716, "learning_rate": 4.5386582026834906e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.094552, "rewards/chosen": 6.75, "rewards/rejected": -3.984375, "rewards/accuracies": 1.0, "rewards/margins": 10.75, "logps/rejected": -282.0, "logps/chosen": -344.0, "logits/rejected": -0.35546875, "logits/chosen": 0.18359375, "nll_loss": 0.404296875, "epoch": 1.606060606060606, "step": 40}, {"eval_loss": 0.421630859375, "eval_runtime": 1.4128, "eval_samples_per_second": 2.831, "eval_steps_per_second": 0.708, "eval_rewards/chosen": 6.0, "eval_rewards/rejected": -2.0, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.0, "eval_logps/rejected": -178.0, "eval_logps/chosen": -5.09375, "eval_logits/rejected": 0.953125, "eval_logits/chosen": -1.4375, "eval_nll_loss": 0.2216796875, "epoch": 1.606060606060606, "step": 40}, {"loss": 0.539617919921875, "grad_norm": 0.1278829740469663, "learning_rate": 3.411042902090492e-05, "memory(GiB)": 45.84, "train_speed(iter/s)": 0.094018, "rewards/chosen": 7.75, "rewards/rejected": -2.875, "rewards/accuracies": 1.0, "rewards/margins": 10.625, "logps/rejected": -494.0, "logps/chosen": -310.0, "logits/rejected": 0.328125, "logits/chosen": -0.0233154296875, "nll_loss": 0.51953125, "epoch": 1.808080808080808, "step": 
45}, {"loss": 0.437335205078125, "grad_norm": 0.20021892626227725, "learning_rate": 2.3678391856132204e-05, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094129, "rewards/chosen": 7.8125, "rewards/rejected": -3.5, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -280.0, "logps/chosen": -300.0, "logits/rejected": 0.0159912109375, "logits/chosen": 0.181640625, "nll_loss": 0.40234375, "epoch": 2.0, "step": 50}, {"loss": 0.48487548828125, "grad_norm": 0.17561192586448465, "learning_rate": 1.4644660940672627e-05, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094012, "rewards/chosen": 6.96875, "rewards/rejected": -2.40625, "rewards/accuracies": 1.0, "rewards/margins": 9.375, "logps/rejected": -620.0, "logps/chosen": -298.0, "logits/rejected": 0.3046875, "logits/chosen": 0.0888671875, "nll_loss": 0.45703125, "epoch": 2.202020202020202, "step": 55}, {"loss": 0.45642852783203125, "grad_norm": 0.06159661984448856, "learning_rate": 7.489143213519301e-06, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094412, "rewards/chosen": 8.125, "rewards/rejected": -3.0625, "rewards/accuracies": 1.0, "rewards/margins": 11.1875, "logps/rejected": -468.0, "logps/chosen": -422.0, "logits/rejected": -0.3125, "logits/chosen": 0.0595703125, "nll_loss": 0.515625, "epoch": 2.404040404040404, "step": 60}, {"eval_loss": 0.41357421875, "eval_runtime": 1.3539, "eval_samples_per_second": 2.954, "eval_steps_per_second": 0.739, "eval_rewards/chosen": 6.03125, "eval_rewards/rejected": -3.59375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.625, "eval_logps/rejected": -194.0, "eval_logps/chosen": -4.8125, "eval_logits/rejected": 1.171875, "eval_logits/chosen": -1.4140625, "eval_nll_loss": 0.208984375, "epoch": 2.404040404040404, "step": 60}, {"loss": 0.47344970703125, "grad_norm": 0.12507359172791535, "learning_rate": 2.591967620451707e-06, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094381, "rewards/chosen": 7.5, "rewards/rejected": -4.25, "rewards/accuracies": 
1.0, "rewards/margins": 11.75, "logps/rejected": -488.0, "logps/chosen": -206.0, "logits/rejected": 0.447265625, "logits/chosen": -0.41796875, "nll_loss": 0.55078125, "epoch": 2.606060606060606, "step": 65}, {"loss": 0.43180007934570314, "grad_norm": 0.13343167302632133, "learning_rate": 2.1329118524827662e-07, "memory(GiB)": 45.85, "train_speed(iter/s)": 0.094656, "rewards/chosen": 7.5, "rewards/rejected": -3.734375, "rewards/accuracies": 1.0, "rewards/margins": 11.25, "logps/rejected": -446.0, "logps/chosen": -274.0, "logits/rejected": 0.055908203125, "logits/chosen": 0.0218505859375, "nll_loss": 0.458984375, "epoch": 2.808080808080808, "step": 70}, {"eval_loss": 0.414306640625, "eval_runtime": 1.3572, "eval_samples_per_second": 2.947, "eval_steps_per_second": 0.737, "eval_rewards/chosen": 6.03125, "eval_rewards/rejected": -3.796875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.8125, "eval_logps/rejected": -196.0, "eval_logps/chosen": -4.84375, "eval_logits/rejected": 1.1875, "eval_logits/chosen": -1.421875, "eval_nll_loss": 0.2099609375, "epoch": 2.888888888888889, "step": 72}, {"train_runtime": 760.0145, "train_samples_per_second": 1.559, "train_steps_per_second": 0.095, "total_flos": 31365457739776.0, "train_loss": 0.6471701198154025, "epoch": 2.888888888888889, "step": 72}], "memory": 45.845703125}
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/runs/events.out.tfevents.1737995978.kml-task-540432-record-9981983-prod-worker-0.21280.0 b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/runs/events.out.tfevents.1737995978.kml-task-540432-record-9981983-prod-worker-0.21280.0
new file mode 100644
index 0000000000000000000000000000000000000000..a9ec9284c429951058a208e798266988bb28eb48
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_random20_system/v0-20250127-163825/runs/events.out.tfevents.1737995978.kml-task-540432-record-9981983-prod-worker-0.21280.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b88558a3cd946b321eccf012afda6a2a48fe17126efee9fa3f3f5a9bd4b8f9ae
+size 23705
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/args.json
new file mode 100644
index 0000000000000000000000000000000000000000..efc7069ad1fbeb7ba8b7d21f876c17672c7df0a2
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/args.json
@@ -0,0 +1,371 @@
+{
+  "model": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "model_type": "marco_o1",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "num_labels": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "local_repo_path": null,
+  "template": "marco_o1",
+  "system": "You are a helpful assistant.",
+  "max_length": 4200,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "tools_prompt": "react_en",
+  "padding_side": "right",
+  "loss_scale": "last_round",
+  "sequence_parallel_size": 1,
+  "use_chat_template": true,
+  "template_backend": "swift",
+  "dataset": [
+    "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_what_system.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.01,
+  "data_seed": 42,
+  "dataset_num_proc": 1,
+  "streaming": false,
+  "enable_cache": false,
+  "download_mode": "reuse_dataset_if_exists",
+  "strict": false,
+  "model_name": [
+    null,
+    null
+  ],
+  "model_author": [
+    null,
+    null
+  ],
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.7,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": null,
+  "load_dataset_config": null,
+  "lora_modules": [],
+  "tuner_backend": "peft",
+  "train_type": "lora",
+  "adapters": [],
+  "seed": 42,
+  "model_kwargs": {},
+  "load_args": true,
+  "load_data_args": false,
+  "use_hf": false,
+  "hub_token": null,
+  "custom_register_path": [],
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "steps",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 1,
+  "per_device_eval_batch_size": 1,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 4,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 0.0001,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.999,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 3.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.05,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 5,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "steps",
+  "save_steps": 20.0,
+  "save_total_limit": 200,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": false,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "jit_mode_eval": false,
+  "use_ipex": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 20.0,
+  "dataloader_num_workers": 4,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": null,
+  "disable_tqdm": null,
+  "remove_unused_columns": false,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": "",
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "deepspeed": {
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "overlap_comm": true,
+      "contiguous_gradients": true,
+      "sub_group_size": 1000000000.0,
+      "reduce_bucket_size": "auto",
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1000000000.0,
+      "stage3_max_reuse_distance": 1000000000.0,
+      "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+  },
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "evaluation_strategy": "steps",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 1800,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "dispatch_batches": null,
+  "split_batches": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": false,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "freeze_parameters": [],
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": true,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "use_liger": false,
+  "model_layer_cls_name": null,
+  "metric_warmup_step": 0,
+  "fsdp_num": 1,
+  "acc_steps": 1,
+  "add_version": true,
+  "resume_only_model": false,
+  "check_model": true,
+  "packing": false,
+  "lazy_tokenize": false,
+  "loss_type": "sigmoid",
+  "optimizer": null,
+  "metric": null,
+  "acc_strategy": "token",
+  "reward_model": null,
+  "reward_adapters": [],
+  "reward_model_type": null,
+  "reward_model_revision": null,
+  "num_ppo_epochs": 4,
+  "whiten_rewards": false,
+  "kl_coef": 0.05,
+  "cliprange": 0.2,
+  "vf_coef": 0.1,
+  "cliprange_value": 0.2,
+  "gamma": 1.0,
+  "lam": 0.95,
+  "num_mini_batches": 1,
+  "local_rollout_forward_batch_size": 64,
+  "num_sample_generations": 10,
+  "response_length": 512,
+  "missing_eos_penalty": null,
+  "rlhf_type": "dpo",
+  "ref_model": null,
+  "ref_model_type": null,
+  "ref_model_revision": null,
+  "beta": 0.1,
+  "label_smoothing": 0,
+  "rpo_alpha": 1.0,
+  "cpo_alpha": 1.0,
+  "simpo_gamma": 1,
+  "desirable_weight": 1.0,
+  "undesirable_weight": 1.0,
+  "rank": 0,
+  "global_world_size": 4,
+  "local_world_size": 4,
+  "model_suffix": "Marco-o1",
+  "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fac738a0a60>, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])",
+  "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, 
metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, 
include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=<FDivergenceType.REVERSE_KL: 'reverse_kl'>, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)"
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/README.md b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6be0c96a909959c265a0ab602cd067e0d3fbbde6
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/README.md
@@ -0,0 +1,202 @@
+---
+base_model: /home/wangruotong/LLM_test/Models/Marco-o1
+library_name: peft
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.14.0
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/adapter_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea3100564f8f73f51fc508a1408e494e912544c8
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/adapter_config.json
@@ -0,0 +1,37 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": [],
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4e76809f01a498fe02cd62a07b36a15db217c933
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4159bbc77660983c58e8f43a1c115af4f4ccff9c7155a2b2b8da97bf491c3162
+size 40422208
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/additional_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/additional_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/additional_config.json
@@ -0,0 +1 @@
+{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/args.json
new file mode 100644
index 0000000000000000000000000000000000000000..efc7069ad1fbeb7ba8b7d21f876c17672c7df0a2
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/args.json
@@ -0,0 +1,371 @@
+{
+  "model": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "model_type": "marco_o1",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "num_labels": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "local_repo_path": null,
+  "template": "marco_o1",
+  "system": "You are a helpful assistant.",
+  "max_length": 4200,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "tools_prompt": "react_en",
+  "padding_side": "right",
+  "loss_scale": "last_round",
+  "sequence_parallel_size": 1,
+  "use_chat_template": true,
+  "template_backend": "swift",
+  "dataset": [
+    "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_what_system.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.01,
+  "data_seed": 42,
+  "dataset_num_proc": 1,
+  "streaming": false,
+  "enable_cache": false,
+  "download_mode": "reuse_dataset_if_exists",
+  "strict": false,
+  "model_name": [
+    null,
+    null
+  ],
+  "model_author": [
+    null,
+    null
+  ],
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.7,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": null,
+  "load_dataset_config": null,
+  "lora_modules": [],
+  "tuner_backend": "peft",
+  "train_type": "lora",
+  "adapters": [],
+  "seed": 42,
+  "model_kwargs": {},
+  "load_args": true,
+  "load_data_args": false,
+  "use_hf": false,
+  "hub_token": null,
+  "custom_register_path": [],
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "steps",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 1,
+  "per_device_eval_batch_size": 1,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 4,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 0.0001,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.999,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 3.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.05,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 5,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "steps",
+  "save_steps": 20.0,
+  "save_total_limit": 200,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": false,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "jit_mode_eval": false,
+  "use_ipex": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 20.0,
+  "dataloader_num_workers": 4,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": null,
+  "disable_tqdm": null,
+  "remove_unused_columns": false,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": "",
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "deepspeed": {
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "overlap_comm": true,
+      "contiguous_gradients": true,
+      "sub_group_size": 1000000000.0,
+      "reduce_bucket_size": "auto",
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1000000000.0,
+      "stage3_max_reuse_distance": 1000000000.0,
+      "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+  },
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "evaluation_strategy": "steps",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 1800,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "dispatch_batches": null,
+  "split_batches": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": false,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "freeze_parameters": [],
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": true,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "use_liger": false,
+  "model_layer_cls_name": null,
+  "metric_warmup_step": 0,
+  "fsdp_num": 1,
+  "acc_steps": 1,
+  "add_version": true,
+  "resume_only_model": false,
+  "check_model": true,
+  "packing": false,
+  "lazy_tokenize": false,
+  "loss_type": "sigmoid",
+  "optimizer": null,
+  "metric": null,
+  "acc_strategy": "token",
+  "reward_model": null,
+  "reward_adapters": [],
+  "reward_model_type": null,
+  "reward_model_revision": null,
+  "num_ppo_epochs": 4,
+  "whiten_rewards": false,
+  "kl_coef": 0.05,
+  "cliprange": 0.2,
+  "vf_coef": 0.1,
+  "cliprange_value": 0.2,
+  "gamma": 1.0,
+  "lam": 0.95,
+  "num_mini_batches": 1,
+  "local_rollout_forward_batch_size": 64,
+  "num_sample_generations": 10,
+  "response_length": 512,
+  "missing_eos_penalty": null,
+  "rlhf_type": "dpo",
+  "ref_model": null,
+  "ref_model_type": null,
+  "ref_model_revision": null,
+  "beta": 0.1,
+  "label_smoothing": 0,
+  "rpo_alpha": 1.0,
+  "cpo_alpha": 1.0,
+  "simpo_gamma": 1,
+  "desirable_weight": 1.0,
+  "undesirable_weight": 1.0,
+  "rank": 0,
+  "global_world_size": 4,
+  "local_world_size": 4,
+  "model_suffix": "Marco-o1",
+  "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fac738a0a60>, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])",
+  "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, 
metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, 
include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=<FDivergenceType.REVERSE_KL: 'reverse_kl'>, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)"
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b1dfb163f468b909fe0c58ce444e12c589f55b50
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b246226f7517e1095b0cba8cb6c3fb461d12b9753114a79bcce3664deafa2385
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..64215930e29c2a65027c1f36017d1b7273ca4434
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a82cc0f9cb5f56ef17584cddc4eb25c077dce638eda0d5b11b8acad396d63f8
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4b498de605c3c33a1f6020c204e0fa403fa96366
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2d1689336d82c68b9ffba45814111a23c49bbe0053646ce8bc2632924453aad
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..adf014d92fdbe44fb3e075639643fd2c72ac1aee
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce3eb8132cb6d0cef9b1a069aa8252e700e37918ceae0a5faa00e0752ddd8cce
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3f00124e5a0ea627b8addbbf3644b01ed2e2e835
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3052be4c3806c2550bace460681d1af2b3beeb7973b270c598ff4095198a4fc
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0099b653969fd7512c604969ec5d2e07300ef118
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89a650af753c3f9853d157a148cbf7292ea7973999c01035dc62eaa2ffd1a958
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b7b0f50bc9d960908ba49f55ba4c088080152779
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ffdb58c9f32cb1cc2fec92b915d06c00271f6c23b3b162644a43d8bc322fad23
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f694c5feaaca0ce72187371fc6ee20956a7d7ecb
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55b9d5a7067f65dce1a3a7f5943fce25d44ab9d56b2097dfea4aafe4bf85b681
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/latest
new file mode 100644
index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/latest
@@ -0,0 +1 @@
+global_step20
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..37ac50652a3badbfb1bdeaccb8b1934575b584eb
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bbe0d720c4c75a6a04213fa3b64bacbe794718a53e2b56ebb67a1a795014dfad
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0bc3650851dae439677613c9e23a5528de47b679
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72452d3138d0ca2ff89429e3294a834ae7a68e8596fc757735ca56ae52509d57
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0e00a6e8b4b743026f68d749a8cb3bdd4b746838
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f36e306fb8ebcf53a167bfd6c9af74db410a269ada1e619e3e816f5269543b9d
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..5354141d42e077c356f9ca8c6b12bd7e5e41f2af
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb47ce0c6f815a6f8302b0e3819b4c2315ca71dae3138d97fdceb765cdd0a039
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/scheduler.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c9db8a98ca69cd5bfebe102039231d58d7ea374e
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c42b1ef948ce2918e44502f72db30bc09f3c40f0dfa68050c22f884d1aac4ff5
+size 1064
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/trainer_state.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e5fdb291bcd69a047683e120683c57884c066c6
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/trainer_state.json
@@ -0,0 +1,140 @@
+{
+  "best_metric": 0.58154297,
+  "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20",
+  "epoch": 0.8080808080808081,
+  "eval_steps": 20,
+  "global_step": 20,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.04040404040404041,
+      "grad_norm": 2.3671343726657543,
+      "learning_rate": 2.5e-05,
+      "logits/chosen": -0.46875,
+      "logits/rejected": 0.228515625,
+      "logps/chosen": -286.0,
+      "logps/rejected": -272.0,
+      "loss": 1.8359375,
+      "memory(GiB)": 13.63,
+      "nll_loss": 1.7109375,
+      "rewards/accuracies": 0.0,
+      "rewards/chosen": 0.0,
+      "rewards/margins": 0.0,
+      "rewards/rejected": 0.0,
+      "step": 1,
+      "train_speed(iter/s)": 0.067542
+    },
+    {
+      "epoch": 0.20202020202020202,
+      "grad_norm": 2.798293390214536,
+      "learning_rate": 9.994664874011863e-05,
+      "logits/chosen": -0.7421875,
+      "logits/rejected": -0.185546875,
+      "logps/chosen": -362.0,
+      "logps/rejected": -512.0,
+      "loss": 1.83404541015625,
+      "memory(GiB)": 30.5,
+      "nll_loss": 1.1015625,
+      "rewards/accuracies": 0.375,
+      "rewards/chosen": 0.228515625,
+      "rewards/margins": 0.1005859375,
+      "rewards/rejected": 0.1279296875,
+      "step": 5,
+      "train_speed(iter/s)": 0.088059
+    },
+    {
+      "epoch": 0.40404040404040403,
+      "grad_norm": 2.205296809705217,
+      "learning_rate": 9.809128215864097e-05,
+      "logits/chosen": -0.421875,
+      "logits/rejected": -0.012451171875,
+      "logps/chosen": -350.0,
+      "logps/rejected": -548.0,
+      "loss": 1.5821044921875,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.8515625,
+      "rewards/accuracies": 0.699999988079071,
+      "rewards/chosen": 1.765625,
+      "rewards/margins": 0.63671875,
+      "rewards/rejected": 1.1328125,
+      "step": 10,
+      "train_speed(iter/s)": 0.090254
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 1.4006731478550383,
+      "learning_rate": 9.368111953231848e-05,
+      "logits/chosen": -0.14453125,
+      "logits/rejected": -0.609375,
+      "logps/chosen": -366.0,
+      "logps/rejected": -260.0,
+      "loss": 1.147705078125,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.75,
+      "rewards/accuracies": 0.949999988079071,
+      "rewards/chosen": 3.53125,
+      "rewards/margins": 1.625,
+      "rewards/rejected": 1.90625,
+      "step": 15,
+      "train_speed(iter/s)": 0.09299
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "grad_norm": 0.5239399131286955,
+      "learning_rate": 8.695044586103296e-05,
+      "logits/chosen": -0.033203125,
+      "logits/rejected": -0.53515625,
+      "logps/chosen": -452.0,
+      "logps/rejected": -280.0,
+      "loss": 0.72830810546875,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.609375,
+      "rewards/accuracies": 0.8999999761581421,
+      "rewards/chosen": 4.125,
+      "rewards/margins": 3.0625,
+      "rewards/rejected": 1.0703125,
+      "step": 20,
+      "train_speed(iter/s)": 0.093763
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "eval_logits/chosen": -1.8671875,
+      "eval_logits/rejected": 0.349609375,
+      "eval_logps/chosen": -17.5,
+      "eval_logps/rejected": -172.0,
+      "eval_loss": 0.58154296875,
+      "eval_nll_loss": 0.76171875,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 5.5,
+      "eval_rewards/margins": 6.3125,
+      "eval_rewards/rejected": -0.80078125,
+      "eval_runtime": 1.2034,
+      "eval_samples_per_second": 3.324,
+      "eval_steps_per_second": 0.831,
+      "step": 20
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 72,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8682004316160.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3bdbd933eb77fb414a188444c0c44e522ae588c3
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65114710056e86d2565b3845f7913b58e4fc16f367cc03ffb9f3a9d09187f96d
+size 8888
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/zero_to_fp32.py
new file mode 100755
index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-20/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    """
+    What ``parse_model_states`` keeps from one ``*_model_states.pt`` file.
+
+    The annotations are plain types: ``dict()`` is an (empty) instance, not a type, and
+    ``param_shapes`` is iterated as a sequence of mappings by the merge helpers.
+    """
+    # buffer name -> tensor, converted with ``.float()`` when parsed
+    buffers: dict
+    # sequence of {param name: shape} mappings
+    param_shapes: list
+    # [alias name, source name] pairs; the alias is pointed at the source after merging
+    shared_params: list
+    # read with ``.get(DS_VERSION, None)``, so it can be None
+    ds_version: int
+    # both are read with ``.get(..., None)``, so they can be None
+    frozen_param_shapes: dict
+    frozen_param_fragments: dict
+
+
+# set to a truthy value to make the helpers below print extra diagnostics
+debug = 0
+
+# load to cpu (passed as map_location to torch.load)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    """Return ``text`` as an ``int`` when it is made of digits only, otherwise return it unchanged."""
+    if text.isdigit():
+        return int(text)
+    return text
+
+
+def natural_keys(text):
+    '''
+    Sort key for human ordering: ``text`` is split into digit and non-digit runs and the digit
+    runs are turned into ints, so alist.sort(key=natural_keys) puts "rank_2" before "rank_10".
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    pieces = re.split(r'(\d+)', text)
+    return [int(piece) if piece.isdigit() else piece for piece in pieces]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """
+    Return the path of the model states file expected for ``zero_stage`` inside ``checkpoint_dir``.
+
+    Args:
+        - ``checkpoint_dir``: directory holding the checkpoint files
+        - ``zero_stage``: zero stage of the checkpoint; stages up to 2 and stage 3 use different file names
+
+    Raises:
+        - ``FileNotFoundError``: the directory or the expected file doesn't exist
+        - ``ValueError``: ``zero_stage`` is above 3 (this used to fail on an unbound local variable)
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """
+    Return the files of ``checkpoint_dir`` that match ``glob_pattern``, sorted with ``natural_keys``.
+
+    Raises ``FileNotFoundError`` when nothing matches.
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
+    if not matches:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return sorted(matches, key=natural_keys)
+
+
+def get_optim_files(checkpoint_dir):
+    """Return the ``*_optim_states.pt`` files of ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    optim_glob = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_glob)
+
+
+def get_model_state_files(checkpoint_dir):
+    """Return the ``*_model_states.pt`` files of ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    model_glob = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_glob)
+
+
+def parse_model_states(files):
+    """
+    Load each model states file and keep only what the fp32 reconstruction reads later.
+
+    Args:
+        - ``files``: paths of the ``*_model_states.pt`` files
+
+    Returns:
+        a list with one ``zero_model_state`` per entry of ``files``, in the same order
+
+    Raises:
+        - ``ValueError``: a loaded file has no ``BUFFER_NAMES`` entry
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects, so this
+        # should only be run on checkpoints from a trusted source
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # frozen parameters are optional; the list of parameter names that used to be collected
+        # here was never read, so it is no longer built
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if debug and frozen_param_shapes is not None:
+            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        zero_model_states.append(
+            zero_model_state(buffers=buffers,
+                             param_shapes=param_shapes,
+                             shared_params=shared_params,
+                             ds_version=state_dict.get(DS_VERSION, None),
+                             frozen_param_shapes=frozen_param_shapes,
+                             frozen_param_fragments=state_dict.get(FROZEN_PARAM_FRAGMENTS, None)))
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load every optimizer states file and pull out the fp32 groups stored in each of them.
+
+    Args:
+        - ``files``: paths of the ``*_optim_states.pt`` files
+        - ``ds_checkpoint_dir``: checkpoint folder, only used in the error message
+
+    Returns:
+        ``(zero_stage, world_size, fp32_flat_groups)``, where ``fp32_flat_groups[i]`` comes from ``files[i]``
+
+    Raises:
+        - ``ValueError``: the first file has no zero stage entry, the number of files differs from the
+          recorded partition count, or the zero stage is above 3
+    """
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim_state[ZERO_STAGE]
+    world_size = first_optim_state[PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [rank_state[OPTIMIZER_STATE_DICT][fp32_groups_key] for rank_state in state_dicts]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: when true, frozen parameters are left out of the result
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    # stages 1/2 and stage 3 lay the fp32 weights out differently, so each has its own merger
+    if zero_stage <= 2:
+        merge = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        merge = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+    return merge(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy rank 0's frozen param fragments into ``state_dict``, keyed by param name.
+
+    Does nothing when rank 0 has no frozen param shapes.
+    """
+    rank0_state = zero_model_states[0]
+    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = rank0_state.frozen_param_shapes
+    frozen_param_fragments = rank0_state.frozen_param_fragments
+
+    if debug:
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {wanted_numel}')
+
+        wanted_params = len(frozen_param_shapes)
+        avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for total_params, (name, shape) in enumerate(frozen_param_shapes.items(), start=1):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        # the fragment is stored as is, without being reshaped
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    """Tell whether ``obj`` has an attribute named ``fn`` that can be called."""
+    return callable(getattr(obj, fn, None))
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Fill ``state_dict`` with the trainable params of a zero stage 1/2 checkpoint.
+
+    For every param group the partitions of all ranks are concatenated into one flat vector, which
+    is then cut (``narrow`` + ``view``) into the params listed in rank 0's ``param_shapes``, in order.
+
+    Raises ``ValueError`` if, once both are rounded up to a multiple of ``2 * world_size``, the
+    number of consumed elements of a group differs from the number of available ones.
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    num_param_groups = len(fp32_flat_groups[0])
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for rank in range(world_size):
+            for group in range(num_param_groups):
+                print(f"{FP32_FLAT_GROUPS}[{rank}][{group}].shape={fp32_flat_groups[rank][group].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    merged_single_partition_of_fp32_groups = [
+        torch.cat([rank_groups[group] for rank_groups in fp32_flat_groups], 0) for group in range(num_param_groups)
+    ]
+    avail_numel = sum(flat_vector.numel() for flat_vector in merged_single_partition_of_fp32_groups)
+
+    if debug:
+        wanted_params = sum(len(shapes) for shapes in param_shapes)
+        wanted_numel = sum(sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes)
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+    # live optimizer object, so we are checking that the numbers are within the right range
+    align_to = 2 * world_size
+
+    def zero2_align(x):
+        return align_to * math.ceil(x / align_to)
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, flat_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        for name, shape in shapes.items():
+            # a shape without a numel() method is treated as a plain sequence of dimensions
+            if _has_callable(shape, 'numel'):
+                unpartitioned_numel = shape.numel()
+            else:
+                unpartitioned_numel = math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = flat_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        avail_numel = flat_vector.numel()
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Build the state dict of a zero stage 1/2 checkpoint.
+
+    Entries are added in this order: rank 0's buffers, frozen params (unless
+    ``exclude_frozen_parameters``), trainable params, then the shared param aliases.
+    """
+    rank0_state = zero_model_states[0]
+
+    # buffers
+    state_dict = OrderedDict()
+    state_dict.update(rank0_state.buffers)
+    if debug:
+        print(f"added {len(rank0_state.buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters: an alias is only added when its source made it into the state dict
+    for alias, source in rank0_state.shared_params:
+        if source in state_dict:
+            state_dict[alias] = state_dict[source]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """
+    Return ``(partitioned_numel, padding_numel)`` for a param split over ``world_size`` ranks:
+    the per-rank element count (rounded up) and the number of elements missing to make
+    ``unpartitioned_numel`` a multiple of ``world_size``.
+    """
+    remainder = unpartitioned_numel % world_size
+    if remainder:
+        padding_numel = world_size - remainder
+    else:
+        padding_numel = 0
+    return math.ceil(unpartitioned_numel / world_size), padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild every frozen param of a zero stage 3 checkpoint and store it in ``state_dict``.
+
+    The fragments of all ranks are concatenated, cut down to the param's element count and viewed
+    with the shape recorded by rank 0. Does nothing when rank 0 has no frozen param shapes.
+    """
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for rank in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[rank].frozen_param_fragments.values())
+            print(f'rank {rank}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        rank0_numel = sum(p.numel() for p in zero_model_states[0].frozen_param_fragments.values())
+        avail_numel = rank0_numel * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for total_params, (name, shape) in enumerate(frozen_param_shapes.items(), start=1):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        # narrow() drops whatever trails the real elements once the fragments are concatenated
+        gathered = torch.cat(tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states), 0)
+        state_dict[name] = gathered.narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+
+    Only references are stored at construction time; ``contiguous()`` builds the real tensor.
+
+    Args:
+        - ``flat_groups``: per rank, the list of flat tensors (one per group)
+        - ``flat_groups_offset``: cumulative element counts of one rank's groups, starting with 0
+        - ``offset``: start of this param's partition within the concatenation of one rank's groups
+        - ``partitioned_numel``: number of elements each rank holds for this param
+        - ``shape``: full shape of the param; it must provide ``numel()``
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+
+        Returns a contiguous tensor of shape ``self.shape``; elements past ``self.shape.numel()``
+        in the gathered data are dropped.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                group_start = self.flat_groups_offset[group_id]
+                # clamp to the group's own start: in every group after the first one the partition
+                # continues from element 0, and an unclamped (negative) start would slice nothing
+                start_offset = max(self.offset, group_start) - group_start
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - group_start
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Register every trainable param of a zero stage 3 checkpoint in ``state_dict`` as a ``GatheredTensor``.
+
+    Args:
+        - ``state_dict``: mapping updated in place, param name -> ``GatheredTensor``
+        - ``world_size``: number of ranks the params were partitioned over
+        - ``fp32_flat_groups``: per rank, the list of flat fp32 tensors (one per group)
+        - ``zero_model_states``: parsed model states; only rank 0's ``param_shapes`` is read
+
+    Raises:
+        - ``ValueError``: the partitions consumed by the params don't add up to the available elements
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        # every fp32_flat_groups[i] is a list of flat tensors, so shapes are reported per group
+        for i in range(world_size):
+            for j, flat_group in enumerate(fp32_flat_groups[i]):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={flat_group.shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding; avail_numel was already
+        # summed over all groups above
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    # offset counted one rank's share so far; scale it to compare with the total over all ranks
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Build the consolidated state_dict for a ZeRO-3 checkpoint.
+
+    Args:
+        - ``world_size``: forwarded to the frozen/trainable merge helpers
+        - ``fp32_flat_groups``: forwarded to ``_zero3_merge_trainable_params``
+        - ``zero_model_states``: per-rank model states; buffers and shared params are read from index 0
+        - ``exclude_frozen_parameters``: when true, frozen parameters are not merged
+
+    Returns:
+        - an ``OrderedDict`` holding buffers, (optionally) frozen params, trainable params and
+          aliases for shared parameters
+    """
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    # each pair is (alias name, source name); the alias is only added when the source was reconstructed
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+
+    Args:
+        - ``state_dict``: mapping of name to an object exposing ``.contiguous()``, ``.shape`` and ``.dtype``
+        - ``return_empty_tensor``: when true, allocate ``torch.empty`` placeholders with the same shape
+          and dtype instead of calling ``.contiguous()`` on each entry
+
+    Returns:
+        - a new dict with the same keys; entries that were the same object in ``state_dict``
+          map to the same converted object in the result
+    """
+    torch_state_dict = {}
+    # maps id(original object) -> first name under which it was converted
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            # reuse the already converted result so sharing is preserved
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor by calling ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Raises:
+        - ``ValueError``: if ``tag`` is not given and no ``latest`` file exists in ``checkpoint_dir``
+        - ``FileNotFoundError``: if the ``checkpoint_dir/tag`` directory does not exist
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        # fall back to the tag recorded in the 'latest' file inside the checkpoint folder
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    When the result is split into several shards, an index JSON file mapping each weight to its
+    shard file is written next to them.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. Pass ``None`` to write a single file
+        - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        - ``ImportError``: re-raised when ``safetensors`` or ``huggingface_hub`` is required but not installed
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: plan the split on empty placeholder tensors
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # no sharding requested: minimal stand-in exposing only the two attributes used below
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        # convert only the entries of the current shard
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    The weights are loaded with ``strict=False``.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/README.md b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6be0c96a909959c265a0ab602cd067e0d3fbbde6
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/README.md
@@ -0,0 +1,202 @@
+---
+base_model: /home/wangruotong/LLM_test/Models/Marco-o1
+library_name: peft
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.14.0
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/adapter_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea3100564f8f73f51fc508a1408e494e912544c8
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/adapter_config.json
@@ -0,0 +1,37 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": [],
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c6a9e141468c2e7aa30d50198bf0fc3c3baabd9d
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:606674b8361c60bba8260ac0b95666f47368fb0cb1f386f6b25f1610a5ad7c1b
+size 40422208
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/additional_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/additional_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/additional_config.json
@@ -0,0 +1 @@
+{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/args.json
new file mode 100644
index 0000000000000000000000000000000000000000..efc7069ad1fbeb7ba8b7d21f876c17672c7df0a2
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/args.json
@@ -0,0 +1,371 @@
+{
+  "model": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "model_type": "marco_o1",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "num_labels": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "local_repo_path": null,
+  "template": "marco_o1",
+  "system": "You are a helpful assistant.",
+  "max_length": 4200,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "tools_prompt": "react_en",
+  "padding_side": "right",
+  "loss_scale": "last_round",
+  "sequence_parallel_size": 1,
+  "use_chat_template": true,
+  "template_backend": "swift",
+  "dataset": [
+    "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_what_system.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.01,
+  "data_seed": 42,
+  "dataset_num_proc": 1,
+  "streaming": false,
+  "enable_cache": false,
+  "download_mode": "reuse_dataset_if_exists",
+  "strict": false,
+  "model_name": [
+    null,
+    null
+  ],
+  "model_author": [
+    null,
+    null
+  ],
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.7,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": null,
+  "load_dataset_config": null,
+  "lora_modules": [],
+  "tuner_backend": "peft",
+  "train_type": "lora",
+  "adapters": [],
+  "seed": 42,
+  "model_kwargs": {},
+  "load_args": true,
+  "load_data_args": false,
+  "use_hf": false,
+  "hub_token": null,
+  "custom_register_path": [],
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "steps",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 1,
+  "per_device_eval_batch_size": 1,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 4,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 0.0001,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.999,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 3.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.05,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 5,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "steps",
+  "save_steps": 20.0,
+  "save_total_limit": 200,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": false,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "jit_mode_eval": false,
+  "use_ipex": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 20.0,
+  "dataloader_num_workers": 4,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": null,
+  "disable_tqdm": null,
+  "remove_unused_columns": false,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": "",
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "deepspeed": {
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "overlap_comm": true,
+      "contiguous_gradients": true,
+      "sub_group_size": 1000000000.0,
+      "reduce_bucket_size": "auto",
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1000000000.0,
+      "stage3_max_reuse_distance": 1000000000.0,
+      "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+  },
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "evaluation_strategy": "steps",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 1800,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "dispatch_batches": null,
+  "split_batches": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": false,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "freeze_parameters": [],
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": true,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "use_liger": false,
+  "model_layer_cls_name": null,
+  "metric_warmup_step": 0,
+  "fsdp_num": 1,
+  "acc_steps": 1,
+  "add_version": true,
+  "resume_only_model": false,
+  "check_model": true,
+  "packing": false,
+  "lazy_tokenize": false,
+  "loss_type": "sigmoid",
+  "optimizer": null,
+  "metric": null,
+  "acc_strategy": "token",
+  "reward_model": null,
+  "reward_adapters": [],
+  "reward_model_type": null,
+  "reward_model_revision": null,
+  "num_ppo_epochs": 4,
+  "whiten_rewards": false,
+  "kl_coef": 0.05,
+  "cliprange": 0.2,
+  "vf_coef": 0.1,
+  "cliprange_value": 0.2,
+  "gamma": 1.0,
+  "lam": 0.95,
+  "num_mini_batches": 1,
+  "local_rollout_forward_batch_size": 64,
+  "num_sample_generations": 10,
+  "response_length": 512,
+  "missing_eos_penalty": null,
+  "rlhf_type": "dpo",
+  "ref_model": null,
+  "ref_model_type": null,
+  "ref_model_revision": null,
+  "beta": 0.1,
+  "label_smoothing": 0,
+  "rpo_alpha": 1.0,
+  "cpo_alpha": 1.0,
+  "simpo_gamma": 1,
+  "desirable_weight": 1.0,
+  "undesirable_weight": 1.0,
+  "rank": 0,
+  "global_world_size": 4,
+  "local_world_size": 4,
+  "model_suffix": "Marco-o1",
+  "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fac738a0a60>, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])",
+  "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, 
metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, 
include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=<FDivergenceType.REVERSE_KL: 'reverse_kl'>, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)"
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4bb96e8a686b1c62d928e4d48061a040ea5d87ec
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1dc17147f84b9dfc398ff1d1f46c8a8aa7003211e1925d4d06ad06c7c2dd3558
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..15e7cea4c2332d19f9c8fefad50ceb6fc0b1dc11
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2825587ca9746368793c0c0852399766eab3f1c29ae7d6742583f7b30ae81ca1
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0d13fe824c6a9eaa2dc149dde07d891f913e81f7
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:656d9f585026bb51286c480d43ba0bd5fc46ead30f916682f7c7c2b5e129c6b0
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..aa07aef4759012465b54e6e79c200e277c108544
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9981c1d4f779336e387c48f93caf59757a8856bde2431fabbdae899d23d8a9db
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_0_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..521cff52824a521f9600db9459948a11f52abf09
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_0_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2caf3158f1f6f446801f4d26836267d7f8eace7459ddd0add1a9abd1b83631aa
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_1_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_1_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5a726173f4adb98fe2e2b46df9a3a7a5386be326
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_1_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4dde91841e38e32dbb367a21a6ab2d644ee359442a82e929982af7d81f1339b
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_2_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ded5c487b0af4c917cf833cdafa2261290cfc53a
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_2_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42bebe291bfd2d5039060b95d8080361a7981aa9abfcb39bce2d72a9c2ebef8e
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_3_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c6cc4d81a0b906cb49404ddc23dce222d0007551
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/global_step39/zero_pp_rank_3_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d27803d4e916264476fdba88d87a2e03b998cd73b3ea77e32f59ac3bde61a55
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/latest
new file mode 100644
index 0000000000000000000000000000000000000000..67f1c55b2b0a3119f2287d39e40e22b4f158741b
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/latest
@@ -0,0 +1 @@
+global_step39
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f8799407442db08820f995bcf1b9158f696af19f
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70cc56408014c410353d4dd58ae9b03f4be043f5f800324f66fd8e20e99b840e
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..aa0c3c6aeaabc038c714a3fcc9b78d186a4cab59
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49d1438e98cc9c53a6852464635ce62e9788e61eb3646b73e33813f487c4b6ae
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0f39416636e7990907141a415603582d33812fc9
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4388add9cec90932f8ff0100d27a0574d98e1bad52ff89d44e31967d2b4fbfde
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d3775bcd497f8ad74ece6675e0bbda89fb7ee6f4
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a705d6dfaae4f2c1b4b2be6b25a6eb521ffae6fcba21cc1531e97b60037ed079
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/scheduler.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3f8e5c420bc296502c335bcadd512d01972f28a0
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2ab72c0a7472f98efb1865889d6039f3ae7d12fc3c8e7bfeea52279fc333219
+size 1064
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/trainer_state.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c1b14557eea9691f7f69da1096d677e374b950af
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/trainer_state.json
@@ -0,0 +1,229 @@
+{
+  "best_metric": 0.43310547,
+  "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40",
+  "epoch": 1.606060606060606,
+  "eval_steps": 20,
+  "global_step": 40,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.04040404040404041,
+      "grad_norm": 2.3671343726657543,
+      "learning_rate": 2.5e-05,
+      "logits/chosen": -0.46875,
+      "logits/rejected": 0.228515625,
+      "logps/chosen": -286.0,
+      "logps/rejected": -272.0,
+      "loss": 1.8359375,
+      "memory(GiB)": 13.63,
+      "nll_loss": 1.7109375,
+      "rewards/accuracies": 0.0,
+      "rewards/chosen": 0.0,
+      "rewards/margins": 0.0,
+      "rewards/rejected": 0.0,
+      "step": 1,
+      "train_speed(iter/s)": 0.067542
+    },
+    {
+      "epoch": 0.20202020202020202,
+      "grad_norm": 2.798293390214536,
+      "learning_rate": 9.994664874011863e-05,
+      "logits/chosen": -0.7421875,
+      "logits/rejected": -0.185546875,
+      "logps/chosen": -362.0,
+      "logps/rejected": -512.0,
+      "loss": 1.83404541015625,
+      "memory(GiB)": 30.5,
+      "nll_loss": 1.1015625,
+      "rewards/accuracies": 0.375,
+      "rewards/chosen": 0.228515625,
+      "rewards/margins": 0.1005859375,
+      "rewards/rejected": 0.1279296875,
+      "step": 5,
+      "train_speed(iter/s)": 0.088059
+    },
+    {
+      "epoch": 0.40404040404040403,
+      "grad_norm": 2.205296809705217,
+      "learning_rate": 9.809128215864097e-05,
+      "logits/chosen": -0.421875,
+      "logits/rejected": -0.012451171875,
+      "logps/chosen": -350.0,
+      "logps/rejected": -548.0,
+      "loss": 1.5821044921875,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.8515625,
+      "rewards/accuracies": 0.699999988079071,
+      "rewards/chosen": 1.765625,
+      "rewards/margins": 0.63671875,
+      "rewards/rejected": 1.1328125,
+      "step": 10,
+      "train_speed(iter/s)": 0.090254
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 1.4006731478550383,
+      "learning_rate": 9.368111953231848e-05,
+      "logits/chosen": -0.14453125,
+      "logits/rejected": -0.609375,
+      "logps/chosen": -366.0,
+      "logps/rejected": -260.0,
+      "loss": 1.147705078125,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.75,
+      "rewards/accuracies": 0.949999988079071,
+      "rewards/chosen": 3.53125,
+      "rewards/margins": 1.625,
+      "rewards/rejected": 1.90625,
+      "step": 15,
+      "train_speed(iter/s)": 0.09299
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "grad_norm": 0.5239399131286955,
+      "learning_rate": 8.695044586103296e-05,
+      "logits/chosen": -0.033203125,
+      "logits/rejected": -0.53515625,
+      "logps/chosen": -452.0,
+      "logps/rejected": -280.0,
+      "loss": 0.72830810546875,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.609375,
+      "rewards/accuracies": 0.8999999761581421,
+      "rewards/chosen": 4.125,
+      "rewards/margins": 3.0625,
+      "rewards/rejected": 1.0703125,
+      "step": 20,
+      "train_speed(iter/s)": 0.093763
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "eval_logits/chosen": -1.8671875,
+      "eval_logits/rejected": 0.349609375,
+      "eval_logps/chosen": -17.5,
+      "eval_logps/rejected": -172.0,
+      "eval_loss": 0.58154296875,
+      "eval_nll_loss": 0.76171875,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 5.5,
+      "eval_rewards/margins": 6.3125,
+      "eval_rewards/rejected": -0.80078125,
+      "eval_runtime": 1.2034,
+      "eval_samples_per_second": 3.324,
+      "eval_steps_per_second": 0.831,
+      "step": 20
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.5826068375172234,
+      "learning_rate": 7.82568207211296e-05,
+      "logits/chosen": 0.08447265625,
+      "logits/rejected": -0.142578125,
+      "logps/chosen": -434.0,
+      "logps/rejected": -496.0,
+      "loss": 0.532879638671875,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.5625,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 5.0625,
+      "rewards/margins": 5.1875,
+      "rewards/rejected": -0.1357421875,
+      "step": 25,
+      "train_speed(iter/s)": 0.094482
+    },
+    {
+      "epoch": 1.202020202020202,
+      "grad_norm": 0.1173239600840837,
+      "learning_rate": 6.806208330935766e-05,
+      "logits/chosen": -0.158203125,
+      "logits/rejected": -0.0703125,
+      "logps/chosen": -282.0,
+      "logps/rejected": -492.0,
+      "loss": 0.484796142578125,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.40625,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 6.71875,
+      "rewards/margins": 9.8125,
+      "rewards/rejected": -3.078125,
+      "step": 30,
+      "train_speed(iter/s)": 0.09487
+    },
+    {
+      "epoch": 1.404040404040404,
+      "grad_norm": 0.10486166807457631,
+      "learning_rate": 5.6907817747594116e-05,
+      "logits/chosen": -0.47265625,
+      "logits/rejected": 0.05126953125,
+      "logps/chosen": -217.0,
+      "logps/rejected": -524.0,
+      "loss": 0.477923583984375,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.482421875,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.03125,
+      "rewards/margins": 10.5625,
+      "rewards/rejected": -3.53125,
+      "step": 35,
+      "train_speed(iter/s)": 0.095273
+    },
+    {
+      "epoch": 1.606060606060606,
+      "grad_norm": 0.13908151012153538,
+      "learning_rate": 4.5386582026834906e-05,
+      "logits/chosen": -0.005706787109375,
+      "logits/rejected": -0.498046875,
+      "logps/chosen": -344.0,
+      "logps/rejected": -276.0,
+      "loss": 0.45271148681640627,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.408203125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.0,
+      "rewards/margins": 10.5,
+      "rewards/rejected": -3.515625,
+      "step": 40,
+      "train_speed(iter/s)": 0.095656
+    },
+    {
+      "epoch": 1.606060606060606,
+      "eval_logits/chosen": -1.921875,
+      "eval_logits/rejected": 0.62109375,
+      "eval_logps/chosen": -5.34375,
+      "eval_logps/rejected": -175.0,
+      "eval_loss": 0.43310546875,
+      "eval_nll_loss": 0.232421875,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.6875,
+      "eval_rewards/margins": 7.78125,
+      "eval_rewards/rejected": -1.1015625,
+      "eval_runtime": 1.3521,
+      "eval_samples_per_second": 2.958,
+      "eval_steps_per_second": 0.74,
+      "step": 40
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 72,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 17048500207616.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3bdbd933eb77fb414a188444c0c44e522ae588c3
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65114710056e86d2565b3845f7913b58e4fc16f367cc03ffb9f3a9d09187f96d
+size 8888
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/zero_to_fp32.py
new file mode 100755
index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-40/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    """Summary of one ``*_model_states.pt`` file: the pieces needed to rebuild an fp32 state dict."""
+    # buffer name -> buffer tensor
+    buffers: dict
+    # presumably one name -> shape mapping per parameter group; verify against the checkpoint writer
+    param_shapes: list
+    # [key, value] pairs copied from the checkpoint's "shared_params" mapping
+    shared_params: list
+    # NOTE(review): annotated as int, but the value is read with .get(DS_VERSION, None), so it may be
+    # None; the concrete type stored by DeepSpeed is not visible here - confirm
+    ds_version: int
+    # may be None when the checkpoint carries no frozen-parameter entries
+    frozen_param_shapes: dict
+    frozen_param_fragments: dict
+
+
+# Set to a truthy value to enable the extra diagnostic prints guarded by `if debug:`.
+debug = 0
+
+# load to cpu: used as `map_location` when checkpoint files are read with torch.load
+device = torch.device('cpu')
+
+
+def atoi(text):
+    """
+    Convert ``text`` to ``int`` when it consists solely of decimal digits; otherwise return it unchanged.
+
+    ``str.isdecimal`` is used instead of ``str.isdigit`` because ``isdigit`` is also true for characters
+    such as U+00B2 SUPERSCRIPT TWO, which ``int()`` rejects with ``ValueError``.
+    """
+    return int(text) if text.isdecimal() else text
+
+
+def natural_keys(text):
+    """
+    Sort key that orders strings the way a human would ("rank_2" before "rank_10").
+
+    Usage: ``alist.sort(key=natural_keys)``.
+    Based on http://nedbatchelder.com/blog/200712/human_sorting.html
+    (see Toothy's implementation in the comments).
+    """
+    # the capturing group keeps the digit runs in the split result so they can be converted by atoi
+    pieces = re.split(r'(\d+)', text)
+    return list(map(atoi, pieces))
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """
+    Return the path of the model-states file inside ``checkpoint_dir`` for the given ZeRO stage.
+
+    Args:
+        - ``checkpoint_dir``: directory that holds the checkpoint files
+        - ``zero_stage``: ZeRO stage number; stages <= 2 use ``mp_rank_00_model_states.pt``,
+          stage 3 uses ``zero_pp_rank_0_mp_rank_00_model_states.pt``
+
+    Raises:
+        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory, or the expected file is missing
+        - ``ValueError``: ``zero_stage`` is greater than 3
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        # without this branch `file` would be unbound below and surface as an UnboundLocalError
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """
+    Return the files in ``checkpoint_dir`` matching ``glob_pattern``, in natural (human) sort order.
+
+    Raises:
+        - ``FileNotFoundError``: nothing matched the pattern
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    search_path = os.path.join(checkpoint_dir, glob_pattern)
+    matches = glob.glob(search_path)
+    matches.sort(key=natural_keys)
+
+    if not matches:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return matches
+
+
+def get_optim_files(checkpoint_dir):
+    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir``, as listed by ``get_checkpoint_files``."""
+    optim_glob = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_glob)
+
+
+def get_model_state_files(checkpoint_dir):
+    """Return the ``*_model_states.pt`` files found in ``checkpoint_dir``, as listed by ``get_checkpoint_files``."""
+    model_glob = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_glob)
+
+
+def parse_model_states(files):
+    """
+    Load each model-states file and reduce it to a ``zero_model_state`` record.
+
+    Args:
+        - ``files``: paths of the ``*_model_states.pt`` files to read
+
+    Returns:
+        list with one ``zero_model_state`` per input file, in the same order as ``files``
+
+    Raises:
+        - ``ValueError``: a file does not contain the ``BUFFER_NAMES`` entry
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE: weights_only=False unpickles arbitrary objects - only load checkpoints from a trusted source
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # frozen parameters are optional; both entries stay None when the checkpoint has none
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if debug and frozen_param_shapes is not None:
+            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load the optimizer-state shards and pull out the flat fp32 weight groups.
+
+    Args:
+        - ``files``: paths of the ``*_optim_states.pt`` shards
+        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message
+
+    Returns:
+        tuple ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry per shard
+
+    Raises:
+        - ``ValueError``: the first shard has no ZeRO stage entry, the shard count differs from the
+          recorded partition count, or the stage is greater than 3
+    """
+    total_files = len(files)
+    shards = []
+    for path in tqdm(files, desc='Loading checkpoint shards'):
+        shard = torch.load(path, map_location=device, mmap=True, weights_only=False)
+        # only the fp32 master weights are needed, so drop the (potentially huge) nested optimizer
+        # states right away; pop with a default also covers shards where they were already removed
+        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        shards.append(shard)
+
+    first_optim_state = shards[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim_state[ZERO_STAGE]
+    world_size = first_optim_state[PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][fp32_groups_key] for shard in shards]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # an memory-efficient approach for sharding
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/README.md b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6be0c96a909959c265a0ab602cd067e0d3fbbde6
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/README.md
@@ -0,0 +1,202 @@
+---
+base_model: /home/wangruotong/LLM_test/Models/Marco-o1
+library_name: peft
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.14.0
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/adapter_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea3100564f8f73f51fc508a1408e494e912544c8
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/adapter_config.json
@@ -0,0 +1,37 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": [],
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3ed77d7a0f1181182cf7693e023a2eb1e120e450
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a684115bfa3fb108fb2545f221efd333a9896e688740202490c8c2a6be1f27ec
+size 40422208
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/additional_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/additional_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/additional_config.json
@@ -0,0 +1 @@
+{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/args.json
new file mode 100644
index 0000000000000000000000000000000000000000..efc7069ad1fbeb7ba8b7d21f876c17672c7df0a2
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/args.json
@@ -0,0 +1,371 @@
+{
+  "model": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "model_type": "marco_o1",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "num_labels": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "local_repo_path": null,
+  "template": "marco_o1",
+  "system": "You are a helpful assistant.",
+  "max_length": 4200,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "tools_prompt": "react_en",
+  "padding_side": "right",
+  "loss_scale": "last_round",
+  "sequence_parallel_size": 1,
+  "use_chat_template": true,
+  "template_backend": "swift",
+  "dataset": [
+    "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_what_system.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.01,
+  "data_seed": 42,
+  "dataset_num_proc": 1,
+  "streaming": false,
+  "enable_cache": false,
+  "download_mode": "reuse_dataset_if_exists",
+  "strict": false,
+  "model_name": [
+    null,
+    null
+  ],
+  "model_author": [
+    null,
+    null
+  ],
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.7,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": null,
+  "load_dataset_config": null,
+  "lora_modules": [],
+  "tuner_backend": "peft",
+  "train_type": "lora",
+  "adapters": [],
+  "seed": 42,
+  "model_kwargs": {},
+  "load_args": true,
+  "load_data_args": false,
+  "use_hf": false,
+  "hub_token": null,
+  "custom_register_path": [],
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "steps",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 1,
+  "per_device_eval_batch_size": 1,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 4,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 0.0001,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.999,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 3.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.05,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 5,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "steps",
+  "save_steps": 20.0,
+  "save_total_limit": 200,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": false,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "jit_mode_eval": false,
+  "use_ipex": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 20.0,
+  "dataloader_num_workers": 4,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": null,
+  "disable_tqdm": null,
+  "remove_unused_columns": false,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": "",
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "deepspeed": {
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "overlap_comm": true,
+      "contiguous_gradients": true,
+      "sub_group_size": 1000000000.0,
+      "reduce_bucket_size": "auto",
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1000000000.0,
+      "stage3_max_reuse_distance": 1000000000.0,
+      "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+  },
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "evaluation_strategy": "steps",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 1800,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "dispatch_batches": null,
+  "split_batches": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": false,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "freeze_parameters": [],
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": true,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "use_liger": false,
+  "model_layer_cls_name": null,
+  "metric_warmup_step": 0,
+  "fsdp_num": 1,
+  "acc_steps": 1,
+  "add_version": true,
+  "resume_only_model": false,
+  "check_model": true,
+  "packing": false,
+  "lazy_tokenize": false,
+  "loss_type": "sigmoid",
+  "optimizer": null,
+  "metric": null,
+  "acc_strategy": "token",
+  "reward_model": null,
+  "reward_adapters": [],
+  "reward_model_type": null,
+  "reward_model_revision": null,
+  "num_ppo_epochs": 4,
+  "whiten_rewards": false,
+  "kl_coef": 0.05,
+  "cliprange": 0.2,
+  "vf_coef": 0.1,
+  "cliprange_value": 0.2,
+  "gamma": 1.0,
+  "lam": 0.95,
+  "num_mini_batches": 1,
+  "local_rollout_forward_batch_size": 64,
+  "num_sample_generations": 10,
+  "response_length": 512,
+  "missing_eos_penalty": null,
+  "rlhf_type": "dpo",
+  "ref_model": null,
+  "ref_model_type": null,
+  "ref_model_revision": null,
+  "beta": 0.1,
+  "label_smoothing": 0,
+  "rpo_alpha": 1.0,
+  "cpo_alpha": 1.0,
+  "simpo_gamma": 1,
+  "desirable_weight": 1.0,
+  "undesirable_weight": 1.0,
+  "rank": 0,
+  "global_world_size": 4,
+  "local_world_size": 4,
+  "model_suffix": "Marco-o1",
+  "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fac738a0a60>, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])",
+  "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, 
metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, 
include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=<FDivergenceType.REVERSE_KL: 'reverse_kl'>, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)"
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ffb91e28c7385342a16c39d19bc6686dd65a8688
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0f28849a47c3709b61ee79f2a597a2f0919cb0f7e6af861d5380fbc3a0c4ca3
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cac9f6bb648587d2a3b570fd4ee90f907390e5fd
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b1d7090b00b7d986b8c044d7ee440dd7f1e07c5651b76a8bb23c8e7b0f09356
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1de3ca3f3be68e0b799dc048d14af9189d25534a
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f11cbe419db26dd81a200168126e634587ac942367bba71edfb14cc1b9764c37
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..91408596191e7ed067d633143da5d47bfa826148
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f50e18fc41b88cad068f7e66295e39525d6739667e3563ed550d31533e19e9b1
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_0_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c5011349fbf7bc231576172746d34f19d8be3a03
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_0_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ae87355a9241c0ea4edf21abd0fb8d38012f4aa9723febcef39bbc7901c36db
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_1_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_1_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7f27a9e89ff30e1b7e5fc2b97fce8cf811b4d1cb
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_1_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84abe8df1ad89f1801c346cbe87bf7fead94c9cb08b42f184d0dfb8f1c359ccc
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_2_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c6a5432b84d6286d0b574866aaab3d0e6352ed4f
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_2_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:860eafdd10ecf7c0eaad3c6aca57b888ad1a5f906134d493e84eec89940369c2
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_3_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..db55b37d6a437c84db4a22626123c43786e59401
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/global_step59/zero_pp_rank_3_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:124f015af78bbf627b57d97070f950f843b2b5ef2f1886ddbeae34a9ce2a2d10
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/latest
new file mode 100644
index 0000000000000000000000000000000000000000..099fa08342218cca7c00fb7043635561ebda9695
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/latest
@@ -0,0 +1 @@
+global_step59
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c54ea122b283c04f6b60c1eedefeb301763a8f9f
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:418a5f105ae834c3075024076916b2a9475918fe034c12d0dd5b6d91f1aba467
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ea57ead2533e587fe50f62107d7cb32945fe1354
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e07ace389d24bc1307b74f42a1e7b8f0117b0db853e2df64ff3f15cb92916a2
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..4689a9445d07528dc4fd91011a7f034c11773a68
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da6a990f346d7014dffb28fa2bc7d3b890bd3c53712503fce3656da48d3d6e50
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..919b5e43a96a9afdeb196f402142bc3aab67f247
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e95f356ca38179b05993f55daece0223e96fa10b9a1b9ea2102a739211333f63
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/scheduler.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..95126866042baa544d6bc4555d944440b37fdb21
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3e1521c1c8dfc88bc6566a95cc91f42709693a765076997f6318af86035c445
+size 1064
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/trainer_state.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..cab50a66dcea9a9cdb64373c69130e11f283d2e3
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/trainer_state.json
@@ -0,0 +1,318 @@
+{
+  "best_metric": 0.42553711,
+  "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60",
+  "epoch": 2.404040404040404,
+  "eval_steps": 20,
+  "global_step": 60,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.04040404040404041,
+      "grad_norm": 2.3671343726657543,
+      "learning_rate": 2.5e-05,
+      "logits/chosen": -0.46875,
+      "logits/rejected": 0.228515625,
+      "logps/chosen": -286.0,
+      "logps/rejected": -272.0,
+      "loss": 1.8359375,
+      "memory(GiB)": 13.63,
+      "nll_loss": 1.7109375,
+      "rewards/accuracies": 0.0,
+      "rewards/chosen": 0.0,
+      "rewards/margins": 0.0,
+      "rewards/rejected": 0.0,
+      "step": 1,
+      "train_speed(iter/s)": 0.067542
+    },
+    {
+      "epoch": 0.20202020202020202,
+      "grad_norm": 2.798293390214536,
+      "learning_rate": 9.994664874011863e-05,
+      "logits/chosen": -0.7421875,
+      "logits/rejected": -0.185546875,
+      "logps/chosen": -362.0,
+      "logps/rejected": -512.0,
+      "loss": 1.83404541015625,
+      "memory(GiB)": 30.5,
+      "nll_loss": 1.1015625,
+      "rewards/accuracies": 0.375,
+      "rewards/chosen": 0.228515625,
+      "rewards/margins": 0.1005859375,
+      "rewards/rejected": 0.1279296875,
+      "step": 5,
+      "train_speed(iter/s)": 0.088059
+    },
+    {
+      "epoch": 0.40404040404040403,
+      "grad_norm": 2.205296809705217,
+      "learning_rate": 9.809128215864097e-05,
+      "logits/chosen": -0.421875,
+      "logits/rejected": -0.012451171875,
+      "logps/chosen": -350.0,
+      "logps/rejected": -548.0,
+      "loss": 1.5821044921875,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.8515625,
+      "rewards/accuracies": 0.699999988079071,
+      "rewards/chosen": 1.765625,
+      "rewards/margins": 0.63671875,
+      "rewards/rejected": 1.1328125,
+      "step": 10,
+      "train_speed(iter/s)": 0.090254
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 1.4006731478550383,
+      "learning_rate": 9.368111953231848e-05,
+      "logits/chosen": -0.14453125,
+      "logits/rejected": -0.609375,
+      "logps/chosen": -366.0,
+      "logps/rejected": -260.0,
+      "loss": 1.147705078125,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.75,
+      "rewards/accuracies": 0.949999988079071,
+      "rewards/chosen": 3.53125,
+      "rewards/margins": 1.625,
+      "rewards/rejected": 1.90625,
+      "step": 15,
+      "train_speed(iter/s)": 0.09299
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "grad_norm": 0.5239399131286955,
+      "learning_rate": 8.695044586103296e-05,
+      "logits/chosen": -0.033203125,
+      "logits/rejected": -0.53515625,
+      "logps/chosen": -452.0,
+      "logps/rejected": -280.0,
+      "loss": 0.72830810546875,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.609375,
+      "rewards/accuracies": 0.8999999761581421,
+      "rewards/chosen": 4.125,
+      "rewards/margins": 3.0625,
+      "rewards/rejected": 1.0703125,
+      "step": 20,
+      "train_speed(iter/s)": 0.093763
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "eval_logits/chosen": -1.8671875,
+      "eval_logits/rejected": 0.349609375,
+      "eval_logps/chosen": -17.5,
+      "eval_logps/rejected": -172.0,
+      "eval_loss": 0.58154296875,
+      "eval_nll_loss": 0.76171875,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 5.5,
+      "eval_rewards/margins": 6.3125,
+      "eval_rewards/rejected": -0.80078125,
+      "eval_runtime": 1.2034,
+      "eval_samples_per_second": 3.324,
+      "eval_steps_per_second": 0.831,
+      "step": 20
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.5826068375172234,
+      "learning_rate": 7.82568207211296e-05,
+      "logits/chosen": 0.08447265625,
+      "logits/rejected": -0.142578125,
+      "logps/chosen": -434.0,
+      "logps/rejected": -496.0,
+      "loss": 0.532879638671875,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.5625,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 5.0625,
+      "rewards/margins": 5.1875,
+      "rewards/rejected": -0.1357421875,
+      "step": 25,
+      "train_speed(iter/s)": 0.094482
+    },
+    {
+      "epoch": 1.202020202020202,
+      "grad_norm": 0.1173239600840837,
+      "learning_rate": 6.806208330935766e-05,
+      "logits/chosen": -0.158203125,
+      "logits/rejected": -0.0703125,
+      "logps/chosen": -282.0,
+      "logps/rejected": -492.0,
+      "loss": 0.484796142578125,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.40625,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 6.71875,
+      "rewards/margins": 9.8125,
+      "rewards/rejected": -3.078125,
+      "step": 30,
+      "train_speed(iter/s)": 0.09487
+    },
+    {
+      "epoch": 1.404040404040404,
+      "grad_norm": 0.10486166807457631,
+      "learning_rate": 5.6907817747594116e-05,
+      "logits/chosen": -0.47265625,
+      "logits/rejected": 0.05126953125,
+      "logps/chosen": -217.0,
+      "logps/rejected": -524.0,
+      "loss": 0.477923583984375,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.482421875,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.03125,
+      "rewards/margins": 10.5625,
+      "rewards/rejected": -3.53125,
+      "step": 35,
+      "train_speed(iter/s)": 0.095273
+    },
+    {
+      "epoch": 1.606060606060606,
+      "grad_norm": 0.13908151012153538,
+      "learning_rate": 4.5386582026834906e-05,
+      "logits/chosen": -0.005706787109375,
+      "logits/rejected": -0.498046875,
+      "logps/chosen": -344.0,
+      "logps/rejected": -276.0,
+      "loss": 0.45271148681640627,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.408203125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.0,
+      "rewards/margins": 10.5,
+      "rewards/rejected": -3.515625,
+      "step": 40,
+      "train_speed(iter/s)": 0.095656
+    },
+    {
+      "epoch": 1.606060606060606,
+      "eval_logits/chosen": -1.921875,
+      "eval_logits/rejected": 0.62109375,
+      "eval_logps/chosen": -5.34375,
+      "eval_logps/rejected": -175.0,
+      "eval_loss": 0.43310546875,
+      "eval_nll_loss": 0.232421875,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.6875,
+      "eval_rewards/margins": 7.78125,
+      "eval_rewards/rejected": -1.1015625,
+      "eval_runtime": 1.3521,
+      "eval_samples_per_second": 2.958,
+      "eval_steps_per_second": 0.74,
+      "step": 40
+    },
+    {
+      "epoch": 1.808080808080808,
+      "grad_norm": 0.15003750533951385,
+      "learning_rate": 3.411042902090492e-05,
+      "logits/chosen": -0.1572265625,
+      "logits/rejected": 0.1650390625,
+      "logps/chosen": -314.0,
+      "logps/rejected": -496.0,
+      "loss": 0.546685791015625,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.5234375,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.65625,
+      "rewards/margins": 10.875,
+      "rewards/rejected": -3.234375,
+      "step": 45,
+      "train_speed(iter/s)": 0.095055
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.17349498363870808,
+      "learning_rate": 2.3678391856132204e-05,
+      "logits/chosen": -0.0074462890625,
+      "logits/rejected": -0.140625,
+      "logps/chosen": -304.0,
+      "logps/rejected": -274.0,
+      "loss": 0.44422264099121095,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.41015625,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.59375,
+      "rewards/margins": 10.4375,
+      "rewards/rejected": -2.859375,
+      "step": 50,
+      "train_speed(iter/s)": 0.095326
+    },
+    {
+      "epoch": 2.202020202020202,
+      "grad_norm": 0.21138809828743063,
+      "learning_rate": 1.4644660940672627e-05,
+      "logits/chosen": -0.09521484375,
+      "logits/rejected": 0.1259765625,
+      "logps/chosen": -300.0,
+      "logps/rejected": -616.0,
+      "loss": 0.49451904296875,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.474609375,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.25,
+      "rewards/margins": 9.3125,
+      "rewards/rejected": -2.078125,
+      "step": 55,
+      "train_speed(iter/s)": 0.095115
+    },
+    {
+      "epoch": 2.404040404040404,
+      "grad_norm": 0.06468135061973618,
+      "learning_rate": 7.489143213519301e-06,
+      "logits/chosen": -0.12353515625,
+      "logits/rejected": -0.482421875,
+      "logps/chosen": -420.0,
+      "logps/rejected": -470.0,
+      "loss": 0.460784912109375,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.51171875,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 8.8125,
+      "rewards/margins": 12.125,
+      "rewards/rejected": -3.328125,
+      "step": 60,
+      "train_speed(iter/s)": 0.095377
+    },
+    {
+      "epoch": 2.404040404040404,
+      "eval_logits/chosen": -1.9921875,
+      "eval_logits/rejected": 0.87890625,
+      "eval_logps/chosen": -5.1875,
+      "eval_logps/rejected": -179.0,
+      "eval_loss": 0.425537109375,
+      "eval_nll_loss": 0.2255859375,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.75,
+      "eval_rewards/margins": 8.25,
+      "eval_rewards/rejected": -1.5,
+      "eval_runtime": 1.3466,
+      "eval_samples_per_second": 2.97,
+      "eval_steps_per_second": 0.743,
+      "step": 60
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 72,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 26025733423104.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3bdbd933eb77fb414a188444c0c44e522ae588c3
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65114710056e86d2565b3845f7913b58e4fc16f367cc03ffb9f3a9d09187f96d
+size 8888
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/zero_to_fp32.py
new file mode 100755
index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It
+# gets copied into the top-level checkpoint dir, so the user can easily do the conversion at any point
+# in the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    """Per-rank contents of a ``*_model_states.pt`` file that weight reconstruction needs.
+
+    Instances are built by ``parse_model_states``. The annotations are real types
+    (the previous ``dict()`` annotations were dict *instances*, which type checkers
+    and ``dataclasses.fields`` consumers cannot interpret).
+    """
+    # buffer name -> tensor, cast to fp32 when parsed
+    buffers: dict
+    # one ``{param_name: shape}`` mapping per parameter group
+    param_shapes: list
+    # ``[alias_name, source_name]`` pairs of tied parameters
+    shared_params: list
+    # DeepSpeed version recorded in the checkpoint; ``None`` when the checkpoint has no such entry
+    ds_version: int
+    # frozen param name -> shape; ``None`` when the checkpoint has no such entry
+    frozen_param_shapes: dict
+    # frozen param name -> this rank's tensor fragment; ``None`` when the checkpoint has no such entry
+    frozen_param_fragments: dict
+
+
+# Set to a truthy value to make the helpers in this script print extra diagnostics.
+debug = 0
+
+# load to cpu: used as ``map_location`` for every ``torch.load`` call in this script
+device = torch.device('cpu')
+
+
+def atoi(text):
+    """Return ``text`` as an ``int`` when it consists solely of digits, otherwise return it unchanged."""
+    if text.isdigit():
+        return int(text)
+    return text
+
+
+def natural_keys(text):
+    '''
+    Sort key that orders strings in human order: runs of digits compare numerically.
+
+    Use as ``alist.sort(key=natural_keys)``. The text is split on runs of digits; every
+    all-digit piece becomes an ``int`` and every other piece stays a string.
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    key = []
+    for piece in re.split(r'(\d+)', text):
+        key.append(int(piece) if piece.isdigit() else piece)
+    return key
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """Return the path of the rank-0 model states file for the given ZeRO stage.
+
+    Args:
+        checkpoint_dir: directory that holds the DeepSpeed checkpoint files.
+        zero_stage: ZeRO stage of the checkpoint; stages up to 2 and stage 3 use different file names.
+
+    Returns:
+        Path of the existing model states file.
+
+    Raises:
+        FileNotFoundError: if ``checkpoint_dir`` is not a directory or the expected file is missing.
+        ValueError: if ``zero_stage`` is greater than 3 (previously this surfaced as an
+            ``UnboundLocalError`` because no file name was selected).
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """Return the files in ``checkpoint_dir`` matching ``glob_pattern``, in natural (human) order.
+
+    Raises:
+        FileNotFoundError: if nothing matches the pattern.
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    search_path = os.path.join(checkpoint_dir, glob_pattern)
+    matches = glob.glob(search_path)
+    matches.sort(key=natural_keys)
+
+    if not matches:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return matches
+
+
+def get_optim_files(checkpoint_dir):
+    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir``."""
+    optim_pattern = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_pattern)
+
+
+def get_model_state_files(checkpoint_dir):
+    """Return the ``*_model_states.pt`` files found in ``checkpoint_dir``."""
+    model_pattern = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_pattern)
+
+
+def parse_model_states(files):
+    """Load every model states file and keep only what weight reconstruction needs.
+
+    Args:
+        files: paths of ``*_model_states.pt`` files.
+
+    Returns:
+        A list with one ``zero_model_state`` per input file, in input order.
+
+    Raises:
+        ValueError: if a file lacks the buffer-names entry, i.e. it is not a model state checkpoint.
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE(security): weights_only=False unpickles arbitrary objects - only load trusted checkpoints.
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # frozen parameters are optional in the checkpoint
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if debug and frozen_param_shapes is not None:
+            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    """Load the optimizer state shards and extract the fp32 master-weight partitions.
+
+    Args:
+        files: paths of the ``*_optim_states.pt`` shards.
+        ds_checkpoint_dir: checkpoint directory, used only in error messages.
+
+    Returns:
+        ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` holds one entry
+        per shard, in the order of ``files``.
+
+    Raises:
+        ValueError: if ``files`` is empty, the first shard is not a ZeRO checkpoint, the number of
+            shards differs from the recorded partition count, or the ZeRO stage is unknown.
+    """
+    total_files = len(files)
+    if total_files == 0:
+        # fail with a clear message instead of an IndexError further down
+        raise ValueError(f"no '*_optim_states.pt' files were given for '{ds_checkpoint_dir}'")
+
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        # NOTE(security): weights_only=False unpickles arbitrary objects - only load trusted checkpoints.
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim_state[ZERO_STAGE]
+    world_size = first_optim_state[PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][fp32_groups_key] for sd in state_dicts]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Rebuild the fp32 state_dict stored in a DeepSpeed ZeRO checkpoint folder.
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: when true, frozen parameters are left out of the result
+
+    Returns the state_dict produced by the stage-specific merge routine.
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    # optimizer shards carry the fp32 master weights plus the stage / world-size metadata
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    # model state files carry shapes, buffers and shared-parameter information
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        merge = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        merge = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+    return merge(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """Copy the frozen parameters recorded by rank 0 into ``state_dict``.
+
+    Does nothing when rank 0 recorded no frozen parameter shapes. Each frozen parameter is
+    taken as-is from rank 0's fragments.
+    """
+    rank0_state = zero_model_states[0]
+    shapes = rank0_state.frozen_param_shapes
+    if shapes is None or len(shapes) == 0:
+        return
+
+    fragments = rank0_state.frozen_param_fragments
+
+    if debug:
+        wanted_numel = sum(s.numel() for s in shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {wanted_numel}')
+
+        wanted_params = len(shapes)
+        avail_numel = sum(p.numel() for p in fragments.values())
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in shapes.items():
+        unpartitioned_numel = shape.numel()
+        state_dict[name] = fragments[name]
+        total_params += 1
+        total_numel += unpartitioned_numel
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    """Return True when ``obj`` has an attribute named ``fn`` and that attribute is callable."""
+    return callable(getattr(obj, fn, None))
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict``.
+
+    For every parameter group the per-rank flat partitions are concatenated into one flat
+    vector, which is then sliced parameter by parameter using the shapes recorded by rank 0.
+
+    Args:
+        state_dict: mapping that receives ``name -> tensor`` entries (modified in place).
+        world_size: number of partitions; also used to compute the padding alignment.
+        fp32_flat_groups: per-rank lists of flat tensors, one tensor per parameter group.
+        zero_model_states: parsed model states; only the first entry's ``param_shapes`` is read.
+
+    Raises:
+        ValueError: if, after alignment, the consumed element count of a group does not match
+            the number of elements available in that group.
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # 1. concatenate the partitions of each param group across ranks into one flat vector
+    # 2. walk the group's shapes in order, carving each param out of the vector at a running offset
+    # 3. verify that the (aligned) consumed size equals the (aligned) available size
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # group i as held by every rank, in rank order
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset restarts for every group because each group has its own flat vector
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # shapes exposing a callable ``numel`` are asked directly; anything else is treated
+            # as an iterable of dimensions
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # a view into the merged vector - no copy is made here
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """Assemble the state_dict of a ZeRO stage <= 2 checkpoint.
+
+    Entries are added in this order: rank 0's buffers, frozen parameters (unless
+    ``exclude_frozen_parameters`` is set), trainable parameters, then shared-parameter aliases.
+    """
+    state_dict = OrderedDict()
+    rank0_state = zero_model_states[0]
+
+    # buffers
+    buffers = rank0_state.buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters: an alias is added only when its source was reconstructed
+    for pair in rank0_state.shared_params:
+        alias, source = pair[0], pair[1]
+        if source in state_dict:
+            state_dict[alias] = state_dict[source]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """Return ``(partitioned_numel, padding_numel)`` for a parameter split across ``world_size`` ranks.
+
+    ``partitioned_numel`` is the per-rank element count (rounded up); ``padding_numel`` is how many
+    elements are missing to make the total a multiple of ``world_size``.
+    """
+    leftover = unpartitioned_numel % world_size
+    if leftover:
+        padding_numel = world_size - leftover
+    else:
+        padding_numel = 0
+    per_rank_numel = math.ceil(unpartitioned_numel / world_size)
+    return per_rank_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """Rebuild the frozen parameters of a ZeRO-3 checkpoint into ``state_dict``.
+
+    Does nothing when rank 0 recorded no frozen parameter shapes. Otherwise each parameter's
+    fragments from all model states are concatenated, trimmed to the unpartitioned element
+    count and viewed with the recorded shape.
+    """
+    rank0_shapes = zero_model_states[0].frozen_param_shapes
+    if rank0_shapes is None or len(rank0_shapes) == 0:
+        return
+
+    if debug:
+        for rank in range(world_size):
+            rank_numel = sum(frag.numel() for frag in zero_model_states[rank].frozen_param_fragments.values())
+            print(f'rank {rank}: {FROZEN_PARAM_SHAPES}.numel = {rank_numel}')
+
+        wanted_params = len(rank0_shapes)
+        wanted_numel = sum(s.numel() for s in rank0_shapes.values())
+        rank0_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()])
+        avail_numel = rank0_numel * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in rank0_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        # one fragment per model state, in order; narrow() drops anything past the real element count
+        fragments = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        gathered = torch.cat(fragments, 0)
+        state_dict[name] = gathered.narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    Lazy stand-in for a tensor whose data is spread over per-rank flat groups.
+
+    Only references to the flat groups and the slice coordinates are stored; the actual
+    tensor is materialised on demand by ``contiguous()``, which is more memory efficient
+    when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        # flat_groups[rank][group] is a flat tensor; flat_groups_offset holds the cumulative
+        # start offset of every group followed by the total size
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Gather this parameter's slice from every rank and return it as one tensor of ``self.shape``.
+        """
+        begin = self.offset
+        end = begin + self.partitioned_numel
+        bounds = self.flat_groups_offset
+        chunks = []
+
+        for rank_groups in self.flat_groups:
+            # locate the group(s) of this rank that contain [begin, end)
+            first_group = None
+            last_group = None
+            for gid in range(len(bounds)):
+                if bounds[gid] <= begin < bounds[gid + 1]:
+                    first_group = gid
+                if bounds[gid] < end <= bounds[gid + 1]:
+                    last_group = gid
+                    break
+            # slice the relevant part out of each of those groups
+            for gid in range(first_group, last_group + 1):
+                flat_tensor = rank_groups[gid]
+                group_begin = bounds[gid]
+                lo = begin - group_begin
+                hi = min(end, bounds[gid + 1]) - group_begin
+                chunks.append(flat_tensor[lo:hi])
+
+        # stitch the per-rank pieces together, then drop trailing padding and restore the shape
+        padded = torch.cat(chunks, dim=0)
+        return padded[:self.shape.numel()].view(self.shape).contiguous()
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """Register every trainable ZeRO-3 parameter in ``state_dict`` as a lazy ``GatheredTensor``.
+
+    Args:
+        state_dict: mapping that receives ``name -> GatheredTensor`` entries (modified in place).
+        world_size: number of ranks the parameters were partitioned across.
+        fp32_flat_groups: per-rank lists of flat fp32 tensors, one tensor per parameter group.
+        zero_model_states: parsed model states; only the first entry's ``param_shapes`` is read.
+
+    Raises:
+        ValueError: if the element count consumed by all parameters (including per-rank rounding)
+            differs from the element count available in the flat groups.
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        # every rank holds a *list* of flat tensors (one per param group), so report each tensor;
+        # calling .shape / .numel() on the list itself would raise AttributeError
+        for i in range(world_size):
+            for j, flat_group in enumerate(fp32_flat_groups[i]):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={flat_group.shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    # cumulative start offset of every group within one rank's data, followed by the total size
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        # offset counts per-rank elements, so advance by the partitioned (rounded-up) size
+        offset += partitioned_numel
+
+    # scale the per-rank offset up to the total across all ranks before comparing
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """Assemble the state_dict of a ZeRO-3 checkpoint.
+
+    Entries are added in this order: rank 0's buffers, frozen parameters (unless
+    ``exclude_frozen_parameters`` is set), trainable parameters, then shared-parameter aliases.
+    """
+    state_dict = OrderedDict()
+    rank0_state = zero_model_states[0]
+
+    # buffers
+    buffers = rank0_state.buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters: an alias is added only when its source was reconstructed
+    for pair in rank0_state.shared_params:
+        alias, source = pair[0], pair[1]
+        if source in state_dict:
+            state_dict[alias] = state_dict[source]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert a state_dict of GatheredTensor into a dict of torch tensors; an object that appears under several names is converted once and shared, and ``return_empty_tensor=True`` yields ``torch.empty`` placeholders of the same shape and dtype instead of calling ``.contiguous()``.
+    """
+    torch_state_dict = {}
+    converted_tensors = {}  # id(source object) -> first name it was converted under
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # same object seen before: reuse its converted tensor
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor by calling ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:  # caller converts entries on demand via ``.contiguous()``
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check: fail before any conversion work if an optional package is missing
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict (lazy mode: entries are converted shard by shard below)
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: plan the split on empty placeholder tensors
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard (both the lazy entries and the converted tensors)
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded (maps every tensor name to the shard file that holds it)
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/README.md b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6be0c96a909959c265a0ab602cd067e0d3fbbde6
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/README.md
@@ -0,0 +1,202 @@
+---
+base_model: /home/wangruotong/LLM_test/Models/Marco-o1
+library_name: peft
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.14.0
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/adapter_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea3100564f8f73f51fc508a1408e494e912544c8
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/adapter_config.json
@@ -0,0 +1,37 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": [],
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/adapter_model.safetensors b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7d83da27a2c8d998634dacb835a64eb00d3dd2e8
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75002a00d611e4875b86199e228d1535b69dae7bedd01ae668f2824526694ce8
+size 40422208
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/additional_config.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/additional_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/additional_config.json
@@ -0,0 +1 @@
+{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/args.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/args.json
new file mode 100644
index 0000000000000000000000000000000000000000..efc7069ad1fbeb7ba8b7d21f876c17672c7df0a2
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/args.json
@@ -0,0 +1,371 @@
+{
+  "model": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "model_type": "marco_o1",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "num_labels": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "local_repo_path": null,
+  "template": "marco_o1",
+  "system": "You are a helpful assistant.",
+  "max_length": 4200,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "tools_prompt": "react_en",
+  "padding_side": "right",
+  "loss_scale": "last_round",
+  "sequence_parallel_size": 1,
+  "use_chat_template": true,
+  "template_backend": "swift",
+  "dataset": [
+    "/home/wangruotong/LLM_test/data/train_400_0.5_dpo_what_system.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.01,
+  "data_seed": 42,
+  "dataset_num_proc": 1,
+  "streaming": false,
+  "enable_cache": false,
+  "download_mode": "reuse_dataset_if_exists",
+  "strict": false,
+  "model_name": [
+    null,
+    null
+  ],
+  "model_author": [
+    null,
+    null
+  ],
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.7,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": null,
+  "load_dataset_config": null,
+  "lora_modules": [],
+  "tuner_backend": "peft",
+  "train_type": "lora",
+  "adapters": [],
+  "seed": 42,
+  "model_kwargs": {},
+  "load_args": true,
+  "load_data_args": false,
+  "use_hf": false,
+  "hub_token": null,
+  "custom_register_path": [],
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "output_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "steps",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 1,
+  "per_device_eval_batch_size": 1,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 4,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 0.0001,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.999,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 3.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.05,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 5,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "steps",
+  "save_steps": 20.0,
+  "save_total_limit": 200,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": false,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "jit_mode_eval": false,
+  "use_ipex": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 20.0,
+  "dataloader_num_workers": 4,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": null,
+  "disable_tqdm": null,
+  "remove_unused_columns": false,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": "",
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "deepspeed": {
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "overlap_comm": true,
+      "contiguous_gradients": true,
+      "sub_group_size": 1000000000.0,
+      "reduce_bucket_size": "auto",
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1000000000.0,
+      "stage3_max_reuse_distance": 1000000000.0,
+      "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+  },
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "evaluation_strategy": "steps",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 1800,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "dispatch_batches": null,
+  "split_batches": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": false,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "freeze_parameters": [],
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": true,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "use_liger": false,
+  "model_layer_cls_name": null,
+  "metric_warmup_step": 0,
+  "fsdp_num": 1,
+  "acc_steps": 1,
+  "add_version": true,
+  "resume_only_model": false,
+  "check_model": true,
+  "packing": false,
+  "lazy_tokenize": false,
+  "loss_type": "sigmoid",
+  "optimizer": null,
+  "metric": null,
+  "acc_strategy": "token",
+  "reward_model": null,
+  "reward_adapters": [],
+  "reward_model_type": null,
+  "reward_model_revision": null,
+  "num_ppo_epochs": 4,
+  "whiten_rewards": false,
+  "kl_coef": 0.05,
+  "cliprange": 0.2,
+  "vf_coef": 0.1,
+  "cliprange_value": 0.2,
+  "gamma": 1.0,
+  "lam": 0.95,
+  "num_mini_batches": 1,
+  "local_rollout_forward_batch_size": 64,
+  "num_sample_generations": 10,
+  "response_length": 512,
+  "missing_eos_penalty": null,
+  "rlhf_type": "dpo",
+  "ref_model": null,
+  "ref_model_type": null,
+  "ref_model_revision": null,
+  "beta": 0.1,
+  "label_smoothing": 0,
+  "rpo_alpha": 1.0,
+  "cpo_alpha": 1.0,
+  "simpo_gamma": 1,
+  "desirable_weight": 1.0,
+  "undesirable_weight": 1.0,
+  "rank": 0,
+  "global_world_size": 4,
+  "local_world_size": 4,
+  "model_suffix": "Marco-o1",
+  "model_info": "ModelInfo(model_type='marco_o1', model_dir='/home/wangruotong/LLM_test/Models/Marco-o1', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='marco_o1', model_groups=[ModelGroup(models=[Model(ms_model_id='AIDC-AI/Marco-o1', hf_model_id='AIDC-AI/Marco-o1', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='marco_o1', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fac738a0a60>, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx', '*.ot', '*.h5', '*.bin', '*.safetensors'], requires=['transformers>=4.37'], tags=[])",
+  "model_dir": "/home/wangruotong/LLM_test/Models/Marco-o1",
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "training_args": "DPOConfig(output_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=20, save_total_limit=200, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, 
metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, 
include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, beta=0.1, label_smoothing=0, loss_type='sigmoid', label_pad_token_id=None, padding_value=None, truncation_mode='keep_end', max_length=4200, max_prompt_length=None, max_target_length=None, max_completion_length=None, is_encoder_decoder=False, disable_dropout=True, generate_during_eval=False, precompute_ref_log_probs=False, dataset_num_proc=1, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, reference_free=False, force_use_ref_model=False, f_divergence_type=<FDivergenceType.REVERSE_KL: 'reverse_kl'>, f_alpha_divergence_coef=1.0, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, rpo_alpha=1.0, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)"
+}
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cd1302acc77b849e3573039184c4a969c41784ea
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4870218bc5905167b4fdf693995be469a3f88fde5feec15eeac27c87a38117ff
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..33fe548d8b646d5e438f35e2d5273e6273b28f81
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d423e087ba616b7cfbe7c4a687dea5c1165f732ee64fc0d2d912a8477c5f548
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e927fa6822aebf010559d7eb083bba70ade6a5d3
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b64199c7f51ee6f7d6a3ee0e21de586030d7d0dac6100b8371b92ac6dc3fcd4
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6cc0219d7b61da1d8baa80bf8f492b3740f01c1d
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d009cc82d34e2036e23f4a4a8791bb1f50b240ccacf4e168e8f57671ead9ca59
+size 60559280
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_0_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_0_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a524f27b92645e5645dfcc9b75110e322a57a6b2
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_0_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89f7e868315deb2e0bb0064d1d57817ce3429b318a03d3a3dd62a9dfce05bbef
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_1_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_1_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e47b1e355f370a9f30c7c77ba856b6efddbb168f
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_1_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b64cb77e899835cb8a51cfb333ad499ca2901e784ea087a7c21b702d596df647
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_2_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_2_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7726800cef0fc2d62c65ee7dd3f9e0cb17c01440
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_2_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3afc72b3e2aaabd252874436aacbe2ce7bff38be7845ea0f1ffb58987003f91e
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_3_mp_rank_00_model_states.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_3_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b02e5338327c0d40effa89d5444bb462c2d1d89c
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/global_step71/zero_pp_rank_3_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c3089aac8b20a3c6b4d889ebbe8a52e9c331dd4d49e66893559428bee931945
+size 388374
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/latest b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/latest
new file mode 100644
index 0000000000000000000000000000000000000000..bbeadc7466d2728e3046120a012ebc37c29267cb
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/latest
@@ -0,0 +1 @@
+global_step71
\ No newline at end of file
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_0.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..be2e24cc9d9ef8857272cec1451c810e205ec4e9
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef002048764051a71fb00f8f978e9ec32b780dc850bdb059af362cc56494234b
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_1.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..efcf4dd2e74596ac28af81f9f8bd0be9a807deb3
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37194a6d48612e1a46a2d5d317ead97c70d9fc4569b0118fcd5f84c3dc9daa5a
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_2.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..4c9222e37d4e9d1745c0e126e0fe0c4a348e298d
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17c179483659a784aa1ace2427daff48c556a6bcc3c330e6f3274e4dc95e4b49
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_3.pth b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..7821bf0f5f0621fd0159152432f0a7bc66aa6823
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b56857c9b117629f35af2c3d64f522d33a9d8aa94faa81ec6956380a895118c4
+size 15024
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/scheduler.pt b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3d0d6f336655bdacf5eb53294b71e20f2d0edb17
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2ba16a2cd6668009497101c7aa1ee348685f1df2d9a2a20c23be3737c813063
+size 1064
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/trainer_state.json b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ccd49325958fc683960562dc1cf8d3bd5ef38687
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/trainer_state.json
@@ -0,0 +1,371 @@
+{
+  "best_metric": 0.42553711,
+  "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60",
+  "epoch": 2.888888888888889,
+  "eval_steps": 20,
+  "global_step": 72,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.04040404040404041,
+      "grad_norm": 2.3671343726657543,
+      "learning_rate": 2.5e-05,
+      "logits/chosen": -0.46875,
+      "logits/rejected": 0.228515625,
+      "logps/chosen": -286.0,
+      "logps/rejected": -272.0,
+      "loss": 1.8359375,
+      "memory(GiB)": 13.63,
+      "nll_loss": 1.7109375,
+      "rewards/accuracies": 0.0,
+      "rewards/chosen": 0.0,
+      "rewards/margins": 0.0,
+      "rewards/rejected": 0.0,
+      "step": 1,
+      "train_speed(iter/s)": 0.067542
+    },
+    {
+      "epoch": 0.20202020202020202,
+      "grad_norm": 2.798293390214536,
+      "learning_rate": 9.994664874011863e-05,
+      "logits/chosen": -0.7421875,
+      "logits/rejected": -0.185546875,
+      "logps/chosen": -362.0,
+      "logps/rejected": -512.0,
+      "loss": 1.83404541015625,
+      "memory(GiB)": 30.5,
+      "nll_loss": 1.1015625,
+      "rewards/accuracies": 0.375,
+      "rewards/chosen": 0.228515625,
+      "rewards/margins": 0.1005859375,
+      "rewards/rejected": 0.1279296875,
+      "step": 5,
+      "train_speed(iter/s)": 0.088059
+    },
+    {
+      "epoch": 0.40404040404040403,
+      "grad_norm": 2.205296809705217,
+      "learning_rate": 9.809128215864097e-05,
+      "logits/chosen": -0.421875,
+      "logits/rejected": -0.012451171875,
+      "logps/chosen": -350.0,
+      "logps/rejected": -548.0,
+      "loss": 1.5821044921875,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.8515625,
+      "rewards/accuracies": 0.699999988079071,
+      "rewards/chosen": 1.765625,
+      "rewards/margins": 0.63671875,
+      "rewards/rejected": 1.1328125,
+      "step": 10,
+      "train_speed(iter/s)": 0.090254
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 1.4006731478550383,
+      "learning_rate": 9.368111953231848e-05,
+      "logits/chosen": -0.14453125,
+      "logits/rejected": -0.609375,
+      "logps/chosen": -366.0,
+      "logps/rejected": -260.0,
+      "loss": 1.147705078125,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.75,
+      "rewards/accuracies": 0.949999988079071,
+      "rewards/chosen": 3.53125,
+      "rewards/margins": 1.625,
+      "rewards/rejected": 1.90625,
+      "step": 15,
+      "train_speed(iter/s)": 0.09299
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "grad_norm": 0.5239399131286955,
+      "learning_rate": 8.695044586103296e-05,
+      "logits/chosen": -0.033203125,
+      "logits/rejected": -0.53515625,
+      "logps/chosen": -452.0,
+      "logps/rejected": -280.0,
+      "loss": 0.72830810546875,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.609375,
+      "rewards/accuracies": 0.8999999761581421,
+      "rewards/chosen": 4.125,
+      "rewards/margins": 3.0625,
+      "rewards/rejected": 1.0703125,
+      "step": 20,
+      "train_speed(iter/s)": 0.093763
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "eval_logits/chosen": -1.8671875,
+      "eval_logits/rejected": 0.349609375,
+      "eval_logps/chosen": -17.5,
+      "eval_logps/rejected": -172.0,
+      "eval_loss": 0.58154296875,
+      "eval_nll_loss": 0.76171875,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 5.5,
+      "eval_rewards/margins": 6.3125,
+      "eval_rewards/rejected": -0.80078125,
+      "eval_runtime": 1.2034,
+      "eval_samples_per_second": 3.324,
+      "eval_steps_per_second": 0.831,
+      "step": 20
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.5826068375172234,
+      "learning_rate": 7.82568207211296e-05,
+      "logits/chosen": 0.08447265625,
+      "logits/rejected": -0.142578125,
+      "logps/chosen": -434.0,
+      "logps/rejected": -496.0,
+      "loss": 0.532879638671875,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.5625,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 5.0625,
+      "rewards/margins": 5.1875,
+      "rewards/rejected": -0.1357421875,
+      "step": 25,
+      "train_speed(iter/s)": 0.094482
+    },
+    {
+      "epoch": 1.202020202020202,
+      "grad_norm": 0.1173239600840837,
+      "learning_rate": 6.806208330935766e-05,
+      "logits/chosen": -0.158203125,
+      "logits/rejected": -0.0703125,
+      "logps/chosen": -282.0,
+      "logps/rejected": -492.0,
+      "loss": 0.484796142578125,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.40625,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 6.71875,
+      "rewards/margins": 9.8125,
+      "rewards/rejected": -3.078125,
+      "step": 30,
+      "train_speed(iter/s)": 0.09487
+    },
+    {
+      "epoch": 1.404040404040404,
+      "grad_norm": 0.10486166807457631,
+      "learning_rate": 5.6907817747594116e-05,
+      "logits/chosen": -0.47265625,
+      "logits/rejected": 0.05126953125,
+      "logps/chosen": -217.0,
+      "logps/rejected": -524.0,
+      "loss": 0.477923583984375,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.482421875,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.03125,
+      "rewards/margins": 10.5625,
+      "rewards/rejected": -3.53125,
+      "step": 35,
+      "train_speed(iter/s)": 0.095273
+    },
+    {
+      "epoch": 1.606060606060606,
+      "grad_norm": 0.13908151012153538,
+      "learning_rate": 4.5386582026834906e-05,
+      "logits/chosen": -0.005706787109375,
+      "logits/rejected": -0.498046875,
+      "logps/chosen": -344.0,
+      "logps/rejected": -276.0,
+      "loss": 0.45271148681640627,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.408203125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.0,
+      "rewards/margins": 10.5,
+      "rewards/rejected": -3.515625,
+      "step": 40,
+      "train_speed(iter/s)": 0.095656
+    },
+    {
+      "epoch": 1.606060606060606,
+      "eval_logits/chosen": -1.921875,
+      "eval_logits/rejected": 0.62109375,
+      "eval_logps/chosen": -5.34375,
+      "eval_logps/rejected": -175.0,
+      "eval_loss": 0.43310546875,
+      "eval_nll_loss": 0.232421875,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.6875,
+      "eval_rewards/margins": 7.78125,
+      "eval_rewards/rejected": -1.1015625,
+      "eval_runtime": 1.3521,
+      "eval_samples_per_second": 2.958,
+      "eval_steps_per_second": 0.74,
+      "step": 40
+    },
+    {
+      "epoch": 1.808080808080808,
+      "grad_norm": 0.15003750533951385,
+      "learning_rate": 3.411042902090492e-05,
+      "logits/chosen": -0.1572265625,
+      "logits/rejected": 0.1650390625,
+      "logps/chosen": -314.0,
+      "logps/rejected": -496.0,
+      "loss": 0.546685791015625,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.5234375,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.65625,
+      "rewards/margins": 10.875,
+      "rewards/rejected": -3.234375,
+      "step": 45,
+      "train_speed(iter/s)": 0.095055
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.17349498363870808,
+      "learning_rate": 2.3678391856132204e-05,
+      "logits/chosen": -0.0074462890625,
+      "logits/rejected": -0.140625,
+      "logps/chosen": -304.0,
+      "logps/rejected": -274.0,
+      "loss": 0.44422264099121095,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.41015625,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.59375,
+      "rewards/margins": 10.4375,
+      "rewards/rejected": -2.859375,
+      "step": 50,
+      "train_speed(iter/s)": 0.095326
+    },
+    {
+      "epoch": 2.202020202020202,
+      "grad_norm": 0.21138809828743063,
+      "learning_rate": 1.4644660940672627e-05,
+      "logits/chosen": -0.09521484375,
+      "logits/rejected": 0.1259765625,
+      "logps/chosen": -300.0,
+      "logps/rejected": -616.0,
+      "loss": 0.49451904296875,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.474609375,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.25,
+      "rewards/margins": 9.3125,
+      "rewards/rejected": -2.078125,
+      "step": 55,
+      "train_speed(iter/s)": 0.095115
+    },
+    {
+      "epoch": 2.404040404040404,
+      "grad_norm": 0.06468135061973618,
+      "learning_rate": 7.489143213519301e-06,
+      "logits/chosen": -0.12353515625,
+      "logits/rejected": -0.482421875,
+      "logps/chosen": -420.0,
+      "logps/rejected": -470.0,
+      "loss": 0.460784912109375,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.51171875,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 8.8125,
+      "rewards/margins": 12.125,
+      "rewards/rejected": -3.328125,
+      "step": 60,
+      "train_speed(iter/s)": 0.095377
+    },
+    {
+      "epoch": 2.404040404040404,
+      "eval_logits/chosen": -1.9921875,
+      "eval_logits/rejected": 0.87890625,
+      "eval_logps/chosen": -5.1875,
+      "eval_logps/rejected": -179.0,
+      "eval_loss": 0.425537109375,
+      "eval_nll_loss": 0.2255859375,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.75,
+      "eval_rewards/margins": 8.25,
+      "eval_rewards/rejected": -1.5,
+      "eval_runtime": 1.3466,
+      "eval_samples_per_second": 2.97,
+      "eval_steps_per_second": 0.743,
+      "step": 60
+    },
+    {
+      "epoch": 2.606060606060606,
+      "grad_norm": 0.1291076581823192,
+      "learning_rate": 2.591967620451707e-06,
+      "logits/chosen": -0.55078125,
+      "logits/rejected": 0.322265625,
+      "logps/chosen": -207.0,
+      "logps/rejected": -480.0,
+      "loss": 0.4864105224609375,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.5703125,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.875,
+      "rewards/margins": 11.3125,
+      "rewards/rejected": -3.4375,
+      "step": 65,
+      "train_speed(iter/s)": 0.095248
+    },
+    {
+      "epoch": 2.808080808080808,
+      "grad_norm": 0.1432822314318524,
+      "learning_rate": 2.1329118524827662e-07,
+      "logits/chosen": -0.1259765625,
+      "logits/rejected": -0.10009765625,
+      "logps/chosen": -278.0,
+      "logps/rejected": -436.0,
+      "loss": 0.4423820495605469,
+      "memory(GiB)": 42.9,
+      "nll_loss": 0.482421875,
+      "rewards/accuracies": 1.0,
+      "rewards/chosen": 7.5,
+      "rewards/margins": 10.5625,
+      "rewards/rejected": -3.0625,
+      "step": 70,
+      "train_speed(iter/s)": 0.095483
+    },
+    {
+      "epoch": 2.888888888888889,
+      "eval_logits/chosen": -1.9765625,
+      "eval_logits/rejected": 0.89453125,
+      "eval_logps/chosen": -5.15625,
+      "eval_logps/rejected": -179.0,
+      "eval_loss": 0.42578125,
+      "eval_nll_loss": 0.224609375,
+      "eval_rewards/accuracies": 1.0,
+      "eval_rewards/chosen": 6.75,
+      "eval_rewards/margins": 8.25,
+      "eval_rewards/rejected": -1.5,
+      "eval_runtime": 1.3679,
+      "eval_samples_per_second": 2.924,
+      "eval_steps_per_second": 0.731,
+      "step": 72
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 72,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 20,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 31145554509824.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/training_args.bin b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3bdbd933eb77fb414a188444c0c44e522ae588c3
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65114710056e86d2565b3845f7913b58e4fc16f367cc03ffb9f3a9d09187f96d
+size 8888
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/zero_to_fp32.py b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/zero_to_fp32.py
new file mode 100755
index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# Verbosity switch read by the helpers below; the command-line entry point overwrites it from ``--debug``.
+debug = 0
+
+# load to cpu
+# Both ``torch.load`` calls in this file pass this as ``map_location``.
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # an memory-efficient approach for sharding
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logits_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logits_chosen.png
new file mode 100644
index 0000000000000000000000000000000000000000..e6b7ee5d152a6b8f27bec936454c2927c132300e
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logits_chosen.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logits_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logits_rejected.png
new file mode 100644
index 0000000000000000000000000000000000000000..30e50c872219c436cafa190c9bea9519772f3fb0
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logits_rejected.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logps_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logps_chosen.png
new file mode 100644
index 0000000000000000000000000000000000000000..b67211038c92be8ff3a68ad63f189c77e1e385de
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logps_chosen.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logps_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logps_rejected.png
new file mode 100644
index 0000000000000000000000000000000000000000..be3a1a9506fc8be15fab37bf1465ddcffd5a6772
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_logps_rejected.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..fb7503df5d30e918d16383ae6d4af436dc0b7d48
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_loss.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_nll_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_nll_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..ac85650e3f453a38eecfd153751619d7dca8bd49
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_nll_loss.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_accuracies.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_accuracies.png
new file mode 100644
index 0000000000000000000000000000000000000000..2b88ed37a3c651cbf54899f712bcd1515e2be8fc
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_accuracies.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_chosen.png
new file mode 100644
index 0000000000000000000000000000000000000000..867b3d823fb6ce1bdf230b62ef5608afbfa0d085
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_chosen.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_margins.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_margins.png
new file mode 100644
index 0000000000000000000000000000000000000000..0da5b0c4f1d6a8e6d2a30974ead7ebce86963cfe
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_margins.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_rejected.png
new file mode 100644
index 0000000000000000000000000000000000000000..ff513fa6bbb162af8abfa4f8dd4af6b4dfd77b78
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_rewards_rejected.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_runtime.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_runtime.png
new file mode 100644
index 0000000000000000000000000000000000000000..5442b175cf9f1c352ee5bfbeb1b68309ed1aad82
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_runtime.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_samples_per_second.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_samples_per_second.png
new file mode 100644
index 0000000000000000000000000000000000000000..61f05ca5dfb66c19ff266e9f472579591473854a
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_samples_per_second.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_steps_per_second.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_steps_per_second.png
new file mode 100644
index 0000000000000000000000000000000000000000..2b40d6b9d5eed2e2aecaff1c0fce6fd5abd6fc04
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/eval_steps_per_second.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_epoch.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_epoch.png
new file mode 100644
index 0000000000000000000000000000000000000000..bf6c6bacafca8d256f07d43a8851a9463ac6e566
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_epoch.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_grad_norm.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_grad_norm.png
new file mode 100644
index 0000000000000000000000000000000000000000..108e9a91b60e0304f3fdb5997cd151380a4a483d
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_grad_norm.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_learning_rate.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_learning_rate.png
new file mode 100644
index 0000000000000000000000000000000000000000..9d80e686cd9522b9efdfd0b12d052a155efc0d3b
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_learning_rate.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logits_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logits_chosen.png
new file mode 100644
index 0000000000000000000000000000000000000000..213255979dfcf020eabb97f9110732f04a40bdbe
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logits_chosen.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logits_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logits_rejected.png
new file mode 100644
index 0000000000000000000000000000000000000000..183420c06679bc7968c3d1887878fbea5a5709ef
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logits_rejected.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logps_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logps_chosen.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a5e422d51e863b1562a64416a382b5209f0c737
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logps_chosen.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logps_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logps_rejected.png
new file mode 100644
index 0000000000000000000000000000000000000000..d67d5806aeb5c0bd381d3fed296143c27788d63e
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_logps_rejected.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..73f143e15bf3137e1943190229c724fe8d22d82a
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_loss.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_memory(GiB).png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_memory(GiB).png
new file mode 100644
index 0000000000000000000000000000000000000000..73b48531ac749afe5ca124bde45012ffd7a89fdb
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_memory(GiB).png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_nll_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_nll_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..2c05168f74b1a3dfa55a4969a3c5dbdbcf7961ac
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_nll_loss.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_accuracies.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_accuracies.png
new file mode 100644
index 0000000000000000000000000000000000000000..2b871edb45164987d9e330c19a86545b33c5d9c0
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_accuracies.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_chosen.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_chosen.png
new file mode 100644
index 0000000000000000000000000000000000000000..a4031d46f34606f43b779ac4561ef170a0538f01
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_chosen.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_margins.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_margins.png
new file mode 100644
index 0000000000000000000000000000000000000000..f236c5c70dad1d44d3852b1c0305a434ec8f4b3d
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_margins.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_rejected.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_rejected.png
new file mode 100644
index 0000000000000000000000000000000000000000..0e27f8a3f6fde8cabd190fa72e00571e681d468e
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_rewards_rejected.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_total_flos.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_total_flos.png
new file mode 100644
index 0000000000000000000000000000000000000000..9f7941da6a30938add08e508ee1df498e25cb4c6
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_total_flos.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_loss.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..c43bac51918a40dc45b17d8ae2d125ea99c17db5
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_loss.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_runtime.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_runtime.png
new file mode 100644
index 0000000000000000000000000000000000000000..983293655c86a195af96e12cdd729b7922bba799
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_runtime.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_samples_per_second.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_samples_per_second.png
new file mode 100644
index 0000000000000000000000000000000000000000..5b747e7565dee73bb5420f4b686c8167da741906
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_samples_per_second.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_speed(iter_s).png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_speed(iter_s).png
new file mode 100644
index 0000000000000000000000000000000000000000..5590ed64260db5923b03ca4de7f2de69fb7764c6
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_speed(iter_s).png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_steps_per_second.png b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_steps_per_second.png
new file mode 100644
index 0000000000000000000000000000000000000000..2bb26c7406b8e93d563d928d553468bb97cbf0df
Binary files /dev/null and b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/images/train_train_steps_per_second.png differ
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/logging.jsonl b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/logging.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..e0bfe643dfea6871d3a0582158e029089ceb273e
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/logging.jsonl
@@ -0,0 +1,21 @@
+{"loss": 1.8359375, "grad_norm": 2.36713437, "learning_rate": 2.5e-05, "memory(GiB)": 13.63, "train_speed(iter/s)": 0.067542, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -272.0, "logps/chosen": -286.0, "logits/rejected": 0.22851562, "logits/chosen": -0.46875, "nll_loss": 1.7109375, "epoch": 0.04040404, "global_step/max_steps": "1/72", "percentage": "1.39%", "elapsed_time": "12s", "remaining_time": "14m 13s"}
+{"loss": 1.83404541, "grad_norm": 2.79829339, "learning_rate": 9.995e-05, "memory(GiB)": 30.5, "train_speed(iter/s)": 0.088059, "rewards/chosen": 0.22851562, "rewards/rejected": 0.12792969, "rewards/accuracies": 0.375, "rewards/margins": 0.10058594, "logps/rejected": -512.0, "logps/chosen": -362.0, "logits/rejected": -0.18554688, "logits/chosen": -0.7421875, "nll_loss": 1.1015625, "epoch": 0.2020202, "global_step/max_steps": "5/72", "percentage": "6.94%", "elapsed_time": "54s", "remaining_time": "12m 3s"}
+{"loss": 1.58210449, "grad_norm": 2.20529681, "learning_rate": 9.809e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.090254, "rewards/chosen": 1.765625, "rewards/rejected": 1.1328125, "rewards/accuracies": 0.69999999, "rewards/margins": 0.63671875, "logps/rejected": -548.0, "logps/chosen": -350.0, "logits/rejected": -0.01245117, "logits/chosen": -0.421875, "nll_loss": 0.8515625, "epoch": 0.4040404, "global_step/max_steps": "10/72", "percentage": "13.89%", "elapsed_time": "1m 48s", "remaining_time": "11m 9s"}
+{"loss": 1.14770508, "grad_norm": 1.40067315, "learning_rate": 9.368e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.09299, "rewards/chosen": 3.53125, "rewards/rejected": 1.90625, "rewards/accuracies": 0.94999999, "rewards/margins": 1.625, "logps/rejected": -260.0, "logps/chosen": -366.0, "logits/rejected": -0.609375, "logits/chosen": -0.14453125, "nll_loss": 0.75, "epoch": 0.60606061, "global_step/max_steps": "15/72", "percentage": "20.83%", "elapsed_time": "2m 38s", "remaining_time": "10m 2s"}
+{"loss": 0.72830811, "grad_norm": 0.52393991, "learning_rate": 8.695e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.093763, "rewards/chosen": 4.125, "rewards/rejected": 1.0703125, "rewards/accuracies": 0.89999998, "rewards/margins": 3.0625, "logps/rejected": -280.0, "logps/chosen": -452.0, "logits/rejected": -0.53515625, "logits/chosen": -0.03320312, "nll_loss": 0.609375, "epoch": 0.80808081, "global_step/max_steps": "20/72", "percentage": "27.78%", "elapsed_time": "3m 30s", "remaining_time": "9m 7s"}
+{"eval_loss": 0.58154297, "eval_runtime": 1.2034, "eval_samples_per_second": 3.324, "eval_steps_per_second": 0.831, "eval_rewards/chosen": 5.5, "eval_rewards/rejected": -0.80078125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.3125, "eval_logps/rejected": -172.0, "eval_logps/chosen": -17.5, "eval_logits/rejected": 0.34960938, "eval_logits/chosen": -1.8671875, "eval_nll_loss": 0.76171875, "epoch": 0.80808081, "global_step/max_steps": "20/72", "percentage": "27.78%", "elapsed_time": "3m 31s", "remaining_time": "9m 10s"}
+{"loss": 0.53287964, "grad_norm": 0.58260684, "learning_rate": 7.826e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.094482, "rewards/chosen": 5.0625, "rewards/rejected": -0.13574219, "rewards/accuracies": 1.0, "rewards/margins": 5.1875, "logps/rejected": -496.0, "logps/chosen": -434.0, "logits/rejected": -0.14257812, "logits/chosen": 0.08447266, "nll_loss": 0.5625, "epoch": 1.0, "global_step/max_steps": "25/72", "percentage": "34.72%", "elapsed_time": "4m 21s", "remaining_time": "8m 12s"}
+{"loss": 0.48479614, "grad_norm": 0.11732396, "learning_rate": 6.806e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.09487, "rewards/chosen": 6.71875, "rewards/rejected": -3.078125, "rewards/accuracies": 1.0, "rewards/margins": 9.8125, "logps/rejected": -492.0, "logps/chosen": -282.0, "logits/rejected": -0.0703125, "logits/chosen": -0.15820312, "nll_loss": 0.40625, "epoch": 1.2020202, "global_step/max_steps": "30/72", "percentage": "41.67%", "elapsed_time": "5m 13s", "remaining_time": "7m 18s"}
+{"loss": 0.47792358, "grad_norm": 0.10486167, "learning_rate": 5.691e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095273, "rewards/chosen": 7.03125, "rewards/rejected": -3.53125, "rewards/accuracies": 1.0, "rewards/margins": 10.5625, "logps/rejected": -524.0, "logps/chosen": -217.0, "logits/rejected": 0.05126953, "logits/chosen": -0.47265625, "nll_loss": 0.48242188, "epoch": 1.4040404, "global_step/max_steps": "35/72", "percentage": "48.61%", "elapsed_time": "6m 4s", "remaining_time": "6m 25s"}
+{"loss": 0.45271149, "grad_norm": 0.13908151, "learning_rate": 4.539e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095656, "rewards/chosen": 7.0, "rewards/rejected": -3.515625, "rewards/accuracies": 1.0, "rewards/margins": 10.5, "logps/rejected": -276.0, "logps/chosen": -344.0, "logits/rejected": -0.49804688, "logits/chosen": -0.00570679, "nll_loss": 0.40820312, "epoch": 1.60606061, "global_step/max_steps": "40/72", "percentage": "55.56%", "elapsed_time": "6m 55s", "remaining_time": "5m 32s"}
+{"eval_loss": 0.43310547, "eval_runtime": 1.3521, "eval_samples_per_second": 2.958, "eval_steps_per_second": 0.74, "eval_rewards/chosen": 6.6875, "eval_rewards/rejected": -1.1015625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.78125, "eval_logps/rejected": -175.0, "eval_logps/chosen": -5.34375, "eval_logits/rejected": 0.62109375, "eval_logits/chosen": -1.921875, "eval_nll_loss": 0.23242188, "epoch": 1.60606061, "global_step/max_steps": "40/72", "percentage": "55.56%", "elapsed_time": "6m 56s", "remaining_time": "5m 33s"}
+{"loss": 0.54668579, "grad_norm": 0.15003751, "learning_rate": 3.411e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095055, "rewards/chosen": 7.65625, "rewards/rejected": -3.234375, "rewards/accuracies": 1.0, "rewards/margins": 10.875, "logps/rejected": -496.0, "logps/chosen": -314.0, "logits/rejected": 0.16503906, "logits/chosen": -0.15722656, "nll_loss": 0.5234375, "epoch": 1.80808081, "global_step/max_steps": "45/72", "percentage": "62.50%", "elapsed_time": "7m 50s", "remaining_time": "4m 42s"}
+{"loss": 0.44422264, "grad_norm": 0.17349498, "learning_rate": 2.368e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095326, "rewards/chosen": 7.59375, "rewards/rejected": -2.859375, "rewards/accuracies": 1.0, "rewards/margins": 10.4375, "logps/rejected": -274.0, "logps/chosen": -304.0, "logits/rejected": -0.140625, "logits/chosen": -0.00744629, "nll_loss": 0.41015625, "epoch": 2.0, "global_step/max_steps": "50/72", "percentage": "69.44%", "elapsed_time": "8m 41s", "remaining_time": "3m 49s"}
+{"loss": 0.49451904, "grad_norm": 0.2113881, "learning_rate": 1.464e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095115, "rewards/chosen": 7.25, "rewards/rejected": -2.078125, "rewards/accuracies": 1.0, "rewards/margins": 9.3125, "logps/rejected": -616.0, "logps/chosen": -300.0, "logits/rejected": 0.12597656, "logits/chosen": -0.09521484, "nll_loss": 0.47460938, "epoch": 2.2020202, "global_step/max_steps": "55/72", "percentage": "76.39%", "elapsed_time": "9m 35s", "remaining_time": "2m 57s"}
+{"loss": 0.46078491, "grad_norm": 0.06468135, "learning_rate": 7.49e-06, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095377, "rewards/chosen": 8.8125, "rewards/rejected": -3.328125, "rewards/accuracies": 1.0, "rewards/margins": 12.125, "logps/rejected": -470.0, "logps/chosen": -420.0, "logits/rejected": -0.48242188, "logits/chosen": -0.12353516, "nll_loss": 0.51171875, "epoch": 2.4040404, "global_step/max_steps": "60/72", "percentage": "83.33%", "elapsed_time": "10m 26s", "remaining_time": "2m 5s"}
+{"eval_loss": 0.42553711, "eval_runtime": 1.3466, "eval_samples_per_second": 2.97, "eval_steps_per_second": 0.743, "eval_rewards/chosen": 6.75, "eval_rewards/rejected": -1.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.25, "eval_logps/rejected": -179.0, "eval_logps/chosen": -5.1875, "eval_logits/rejected": 0.87890625, "eval_logits/chosen": -1.9921875, "eval_nll_loss": 0.22558594, "epoch": 2.4040404, "global_step/max_steps": "60/72", "percentage": "83.33%", "elapsed_time": "10m 27s", "remaining_time": "2m 5s"}
+{"loss": 0.48641052, "grad_norm": 0.12910766, "learning_rate": 2.59e-06, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095248, "rewards/chosen": 7.875, "rewards/rejected": -3.4375, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -480.0, "logps/chosen": -207.0, "logits/rejected": 0.32226562, "logits/chosen": -0.55078125, "nll_loss": 0.5703125, "epoch": 2.60606061, "global_step/max_steps": "65/72", "percentage": "90.28%", "elapsed_time": "11m 19s", "remaining_time": "1m 13s"}
+{"loss": 0.44238205, "grad_norm": 0.14328223, "learning_rate": 2.1e-07, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095483, "rewards/chosen": 7.5, "rewards/rejected": -3.0625, "rewards/accuracies": 1.0, "rewards/margins": 10.5625, "logps/rejected": -436.0, "logps/chosen": -278.0, "logits/rejected": -0.10009766, "logits/chosen": -0.12597656, "nll_loss": 0.48242188, "epoch": 2.80808081, "global_step/max_steps": "70/72", "percentage": "97.22%", "elapsed_time": "12m 10s", "remaining_time": "20s"}
+{"eval_loss": 0.42578125, "eval_runtime": 1.3679, "eval_samples_per_second": 2.924, "eval_steps_per_second": 0.731, "eval_rewards/chosen": 6.75, "eval_rewards/rejected": -1.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.25, "eval_logps/rejected": -179.0, "eval_logps/chosen": -5.15625, "eval_logits/rejected": 0.89453125, "eval_logits/chosen": -1.9765625, "eval_nll_loss": 0.22460938, "epoch": 2.88888889, "global_step/max_steps": "72/72", "percentage": "100.00%", "elapsed_time": "12m 32s", "remaining_time": "0s"}
+{"train_runtime": 753.8745, "train_samples_per_second": 1.572, "train_steps_per_second": 0.096, "total_flos": 31145554509824.0, "train_loss": 0.71831854, "epoch": 2.88888889, "global_step/max_steps": "72/72", "percentage": "100.00%", "elapsed_time": "12m 33s", "remaining_time": "0s"}
+{"train_dataset": "1180.088608±494.952093, min=317.000000, max=4171.000000, size=395", "val_dataset": "1196.000000±512.550973, min=715.000000, max=2041.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 7635.8016M Params (20.1851M Trainable [0.2643%]), 0.0001M Buffers.", "last_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-72", "best_model_checkpoint": "/home/wangruotong/LLM_test/output_0127/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/checkpoint-60", "best_metric": 0.42553711, "global_step": 72, "log_history": [{"loss": 1.8359375, "grad_norm": 2.3671343726657543, "learning_rate": 2.5e-05, "memory(GiB)": 13.63, "train_speed(iter/s)": 0.067542, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/rejected": -272.0, "logps/chosen": -286.0, "logits/rejected": 0.228515625, "logits/chosen": -0.46875, "nll_loss": 1.7109375, "epoch": 0.04040404040404041, "step": 1}, {"loss": 1.83404541015625, "grad_norm": 2.798293390214536, "learning_rate": 9.994664874011863e-05, "memory(GiB)": 30.5, "train_speed(iter/s)": 0.088059, "rewards/chosen": 0.228515625, "rewards/rejected": 0.1279296875, "rewards/accuracies": 0.375, "rewards/margins": 0.1005859375, "logps/rejected": -512.0, "logps/chosen": -362.0, "logits/rejected": -0.185546875, "logits/chosen": -0.7421875, "nll_loss": 1.1015625, "epoch": 0.20202020202020202, "step": 5}, {"loss": 1.5821044921875, "grad_norm": 2.205296809705217, "learning_rate": 9.809128215864097e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.090254, "rewards/chosen": 1.765625, "rewards/rejected": 1.1328125, "rewards/accuracies": 0.699999988079071, "rewards/margins": 0.63671875, "logps/rejected": -548.0, "logps/chosen": -350.0, "logits/rejected": -0.012451171875, "logits/chosen": -0.421875, "nll_loss": 0.8515625, "epoch": 0.40404040404040403, "step": 10}, {"loss": 
1.147705078125, "grad_norm": 1.4006731478550383, "learning_rate": 9.368111953231848e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.09299, "rewards/chosen": 3.53125, "rewards/rejected": 1.90625, "rewards/accuracies": 0.949999988079071, "rewards/margins": 1.625, "logps/rejected": -260.0, "logps/chosen": -366.0, "logits/rejected": -0.609375, "logits/chosen": -0.14453125, "nll_loss": 0.75, "epoch": 0.6060606060606061, "step": 15}, {"loss": 0.72830810546875, "grad_norm": 0.5239399131286955, "learning_rate": 8.695044586103296e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.093763, "rewards/chosen": 4.125, "rewards/rejected": 1.0703125, "rewards/accuracies": 0.8999999761581421, "rewards/margins": 3.0625, "logps/rejected": -280.0, "logps/chosen": -452.0, "logits/rejected": -0.53515625, "logits/chosen": -0.033203125, "nll_loss": 0.609375, "epoch": 0.8080808080808081, "step": 20}, {"eval_loss": 0.58154296875, "eval_runtime": 1.2034, "eval_samples_per_second": 3.324, "eval_steps_per_second": 0.831, "eval_rewards/chosen": 5.5, "eval_rewards/rejected": -0.80078125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.3125, "eval_logps/rejected": -172.0, "eval_logps/chosen": -17.5, "eval_logits/rejected": 0.349609375, "eval_logits/chosen": -1.8671875, "eval_nll_loss": 0.76171875, "epoch": 0.8080808080808081, "step": 20}, {"loss": 0.532879638671875, "grad_norm": 0.5826068375172234, "learning_rate": 7.82568207211296e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.094482, "rewards/chosen": 5.0625, "rewards/rejected": -0.1357421875, "rewards/accuracies": 1.0, "rewards/margins": 5.1875, "logps/rejected": -496.0, "logps/chosen": -434.0, "logits/rejected": -0.142578125, "logits/chosen": 0.08447265625, "nll_loss": 0.5625, "epoch": 1.0, "step": 25}, {"loss": 0.484796142578125, "grad_norm": 0.1173239600840837, "learning_rate": 6.806208330935766e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.09487, "rewards/chosen": 6.71875, "rewards/rejected": -3.078125, 
"rewards/accuracies": 1.0, "rewards/margins": 9.8125, "logps/rejected": -492.0, "logps/chosen": -282.0, "logits/rejected": -0.0703125, "logits/chosen": -0.158203125, "nll_loss": 0.40625, "epoch": 1.202020202020202, "step": 30}, {"loss": 0.477923583984375, "grad_norm": 0.10486166807457631, "learning_rate": 5.6907817747594116e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095273, "rewards/chosen": 7.03125, "rewards/rejected": -3.53125, "rewards/accuracies": 1.0, "rewards/margins": 10.5625, "logps/rejected": -524.0, "logps/chosen": -217.0, "logits/rejected": 0.05126953125, "logits/chosen": -0.47265625, "nll_loss": 0.482421875, "epoch": 1.404040404040404, "step": 35}, {"loss": 0.45271148681640627, "grad_norm": 0.13908151012153538, "learning_rate": 4.5386582026834906e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095656, "rewards/chosen": 7.0, "rewards/rejected": -3.515625, "rewards/accuracies": 1.0, "rewards/margins": 10.5, "logps/rejected": -276.0, "logps/chosen": -344.0, "logits/rejected": -0.498046875, "logits/chosen": -0.005706787109375, "nll_loss": 0.408203125, "epoch": 1.606060606060606, "step": 40}, {"eval_loss": 0.43310546875, "eval_runtime": 1.3521, "eval_samples_per_second": 2.958, "eval_steps_per_second": 0.74, "eval_rewards/chosen": 6.6875, "eval_rewards/rejected": -1.1015625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 7.78125, "eval_logps/rejected": -175.0, "eval_logps/chosen": -5.34375, "eval_logits/rejected": 0.62109375, "eval_logits/chosen": -1.921875, "eval_nll_loss": 0.232421875, "epoch": 1.606060606060606, "step": 40}, {"loss": 0.546685791015625, "grad_norm": 0.15003750533951385, "learning_rate": 3.411042902090492e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095055, "rewards/chosen": 7.65625, "rewards/rejected": -3.234375, "rewards/accuracies": 1.0, "rewards/margins": 10.875, "logps/rejected": -496.0, "logps/chosen": -314.0, "logits/rejected": 0.1650390625, "logits/chosen": -0.1572265625, "nll_loss": 0.5234375, "epoch": 
1.808080808080808, "step": 45}, {"loss": 0.44422264099121095, "grad_norm": 0.17349498363870808, "learning_rate": 2.3678391856132204e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095326, "rewards/chosen": 7.59375, "rewards/rejected": -2.859375, "rewards/accuracies": 1.0, "rewards/margins": 10.4375, "logps/rejected": -274.0, "logps/chosen": -304.0, "logits/rejected": -0.140625, "logits/chosen": -0.0074462890625, "nll_loss": 0.41015625, "epoch": 2.0, "step": 50}, {"loss": 0.49451904296875, "grad_norm": 0.21138809828743063, "learning_rate": 1.4644660940672627e-05, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095115, "rewards/chosen": 7.25, "rewards/rejected": -2.078125, "rewards/accuracies": 1.0, "rewards/margins": 9.3125, "logps/rejected": -616.0, "logps/chosen": -300.0, "logits/rejected": 0.1259765625, "logits/chosen": -0.09521484375, "nll_loss": 0.474609375, "epoch": 2.202020202020202, "step": 55}, {"loss": 0.460784912109375, "grad_norm": 0.06468135061973618, "learning_rate": 7.489143213519301e-06, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095377, "rewards/chosen": 8.8125, "rewards/rejected": -3.328125, "rewards/accuracies": 1.0, "rewards/margins": 12.125, "logps/rejected": -470.0, "logps/chosen": -420.0, "logits/rejected": -0.482421875, "logits/chosen": -0.12353515625, "nll_loss": 0.51171875, "epoch": 2.404040404040404, "step": 60}, {"eval_loss": 0.425537109375, "eval_runtime": 1.3466, "eval_samples_per_second": 2.97, "eval_steps_per_second": 0.743, "eval_rewards/chosen": 6.75, "eval_rewards/rejected": -1.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.25, "eval_logps/rejected": -179.0, "eval_logps/chosen": -5.1875, "eval_logits/rejected": 0.87890625, "eval_logits/chosen": -1.9921875, "eval_nll_loss": 0.2255859375, "epoch": 2.404040404040404, "step": 60}, {"loss": 0.4864105224609375, "grad_norm": 0.1291076581823192, "learning_rate": 2.591967620451707e-06, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095248, "rewards/chosen": 7.875, 
"rewards/rejected": -3.4375, "rewards/accuracies": 1.0, "rewards/margins": 11.3125, "logps/rejected": -480.0, "logps/chosen": -207.0, "logits/rejected": 0.322265625, "logits/chosen": -0.55078125, "nll_loss": 0.5703125, "epoch": 2.606060606060606, "step": 65}, {"loss": 0.4423820495605469, "grad_norm": 0.1432822314318524, "learning_rate": 2.1329118524827662e-07, "memory(GiB)": 42.9, "train_speed(iter/s)": 0.095483, "rewards/chosen": 7.5, "rewards/rejected": -3.0625, "rewards/accuracies": 1.0, "rewards/margins": 10.5625, "logps/rejected": -436.0, "logps/chosen": -278.0, "logits/rejected": -0.10009765625, "logits/chosen": -0.1259765625, "nll_loss": 0.482421875, "epoch": 2.808080808080808, "step": 70}, {"eval_loss": 0.42578125, "eval_runtime": 1.3679, "eval_samples_per_second": 2.924, "eval_steps_per_second": 0.731, "eval_rewards/chosen": 6.75, "eval_rewards/rejected": -1.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 8.25, "eval_logps/rejected": -179.0, "eval_logps/chosen": -5.15625, "eval_logits/rejected": 0.89453125, "eval_logits/chosen": -1.9765625, "eval_nll_loss": 0.224609375, "epoch": 2.888888888888889, "step": 72}, {"train_runtime": 753.8745, "train_samples_per_second": 1.572, "train_steps_per_second": 0.096, "total_flos": 31145554509824.0, "train_loss": 0.7183185418446859, "epoch": 2.888888888888889, "step": 72}], "memory": 42.8984375}
diff --git a/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs/events.out.tfevents.1737961881.kml-task-547024-record-9975763-prod-worker-0.21092.0 b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs/events.out.tfevents.1737961881.kml-task-547024-record-9975763-prod-worker-0.21092.0
new file mode 100644
index 0000000000000000000000000000000000000000..36efed6dc11e1ef3d65fd6b82dc17ce715401753
--- /dev/null
+++ b/Marco-o1_400_0.5_dpo_4200_rank8_epoch3_what_system/v0-20250127-071018/runs/events.out.tfevents.1737961881.kml-task-547024-record-9975763-prod-worker-0.21092.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f24324580e4cc61a421b16bf7736b01a10b1584ed690487134d262bf24f7ecb
+size 23693