diff --git a/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/README.md b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fca4e142c186eed8cc62de799f497379c37cf395 --- /dev/null +++ b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./output_dir/fix_codeLlama +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/adapter_config.json b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..575a448f63fb53fbd72f81c498d8a392a21028f0 --- /dev/null +++ b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./output_dir/fix_codeLlama", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "fc_out", + "q_proj", + "k_proj", + "fc_in", + "wte", + "v_proj", + "out_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/adapter_model.safetensors b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c2d3a426dd48435a4235416bda666af1cc3fc1ae --- /dev/null +++ b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3bc6527e61727868deb335a388d96da4e34312a12e2813a5de86145429eae49 +size 25191360 diff --git a/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/added_tokens.json b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c09203d8a52151e0d3b5f3c8e6daedc5b60832b5 --- /dev/null +++ b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/added_tokens.json @@ -0,0 +1,3 @@ +{ + "": 32016 +} diff --git a/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/optimizer.pt b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3050b773e6fb2b6a3c6a32256e096ddaee9fdae4 --- /dev/null +++ b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10ccd2e39e796e699e4a18d72274aa953b12ab9ff05892840247e7f21689b079 +size 50445242 diff --git a/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/rng_state_0.pth b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..821db9d32dc925919011ebfc6eaa9775b9c28453 --- /dev/null +++ b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d70f4816d9856643f7986b709d87aa477584114329562b62b70a8ee6bedbc11b +size 14960 diff --git a/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/rng_state_1.pth b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..49e1e337a584b0935e42549c91b360c37b575900 --- /dev/null +++ b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:861efdf0dceef25cf08029e4518f52f53768678173fcd28906388233c3fb3b02 +size 14960 diff --git a/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/rng_state_2.pth b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..19c08616c8adbf113b6c07188c76e959b541c970 --- /dev/null +++ b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebdb27a64a2feccfa8a360d15c28e460fbe78b54e9d7e847e02a63fc8fa3eb33 +size 14960 diff --git a/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/rng_state_3.pth b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..25edf7d4a6204cc520c57c2eda32dddccd4be297 --- /dev/null +++ b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca8dd3637e6e034724517d6f63740144aba4bf2af43d797bbf16033e48567a69 +size 14960 diff --git a/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/scheduler.pt b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..265180afd994aa0d0089f5d8f496a23cf7feb4ed --- /dev/null +++ b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0355f6d2acfbddd46776507259ab052a22b6ee6d2b38dce536d622af0a82d05 +size 1064 diff --git a/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/special_tokens_map.json b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..2ece23e993f7e1c7063cb51148b6fa5c6c224775 --- /dev/null +++ b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/tokenizer.model b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4 --- /dev/null +++ b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6 +size 500058 diff --git a/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/tokenizer_config.json b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dc4d9a55d56b17600905bfc69a0e45e2fae7a1c4 --- /dev/null +++ b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/tokenizer_config.json @@ -0,0 +1,50 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32016": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/trainer_state.json b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a5114f6b35fbdb36d91275c19358e6dedccb3585 --- /dev/null +++ b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/trainer_state.json @@ -0,0 +1,2031 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 16.883116883116884, + "eval_steps": 400, + "global_step": 1300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.12987012987012986, + "grad_norm": 106.52960205078125, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": -2.8220086097717285, + "logits/rejected": -2.813664674758911, + "logps/chosen": -6.851963043212891, + "logps/rejected": -22.329076766967773, + "loss": 1.7802, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.0019315744284540415, + "rewards/margins": -0.00016089165001176298, + "rewards/rejected": 0.0020924662239849567, + "step": 10 + }, + { + "epoch": 0.2597402597402597, + "grad_norm": 249.00958251953125, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -2.8003315925598145, + "logits/rejected": -2.791141986846924, + "logps/chosen": -7.967398166656494, + "logps/rejected": -21.916549682617188, + "loss": 3.02, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0019492346327751875, + "rewards/margins": -0.0002492312341928482, + "rewards/rejected": -0.0017000030493363738, + "step": 20 + }, + { + "epoch": 0.38961038961038963, + "grad_norm": 231.99557495117188, + "learning_rate": 1.5e-06, + "logits/chosen": -2.801818609237671, + "logits/rejected": -2.795116424560547, + "logps/chosen": -8.219635009765625, + "logps/rejected": -22.34463882446289, + "loss": 2.6654, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.000893857330083847, + "rewards/margins": -0.003195409430190921, + "rewards/rejected": 0.004089267458766699, + "step": 30 + }, + { + "epoch": 0.5194805194805194, + "grad_norm": 94.2333755493164, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -2.8080127239227295, + "logits/rejected": -2.804051637649536, + "logps/chosen": -9.42918586730957, + "logps/rejected": -22.583698272705078, + "loss": 2.3196, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0048586102202534676, + "rewards/margins": 0.002601353684440255, + "rewards/rejected": 0.0022572563029825687, + "step": 40 + }, + { + "epoch": 0.6493506493506493, + "grad_norm": 213.9645538330078, + "learning_rate": 2.5e-06, + "logits/chosen": -2.805826187133789, + "logits/rejected": -2.7987563610076904, + "logps/chosen": -7.2015862464904785, + "logps/rejected": -23.5020694732666, + "loss": 2.6731, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0019127244595438242, + "rewards/margins": 0.002104248385876417, + "rewards/rejected": -0.000191524246474728, + "step": 50 + }, + { + "epoch": 0.7792207792207793, + "grad_norm": 110.76453399658203, + "learning_rate": 3e-06, + "logits/chosen": -2.801133394241333, + "logits/rejected": -2.7940406799316406, + "logps/chosen": -9.063034057617188, + "logps/rejected": -20.352069854736328, + "loss": 2.5189, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.003256978467106819, + "rewards/margins": -0.0020958157256245613, + "rewards/rejected": -0.0011611627414822578, + "step": 60 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 174.06761169433594, + "learning_rate": 3.5e-06, + "logits/chosen": -2.815706968307495, + "logits/rejected": -2.8055267333984375, + "logps/chosen": -7.804632663726807, + "logps/rejected": -22.440189361572266, + "loss": 1.7947, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": 0.002698649885132909, + "rewards/margins": -0.0054735904559493065, + "rewards/rejected": 0.008172241039574146, + "step": 70 + }, + { + "epoch": 1.0389610389610389, + "grad_norm": 87.83943939208984, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -2.8100368976593018, + "logits/rejected": -2.798546075820923, + "logps/chosen": -7.676709175109863, + "logps/rejected": -21.805500030517578, + "loss": 1.678, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.00427329121157527, + "rewards/margins": -0.006891848053783178, + "rewards/rejected": 0.011165140196681023, + "step": 80 + }, + { + "epoch": 1.1688311688311688, + "grad_norm": 116.58035278320312, + "learning_rate": 4.5e-06, + "logits/chosen": -2.8018991947174072, + "logits/rejected": -2.7894458770751953, + "logps/chosen": -8.77466869354248, + "logps/rejected": -22.66843032836914, + "loss": 1.872, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": 0.0032441741786897182, + "rewards/margins": -0.009967333637177944, + "rewards/rejected": 0.01321150828152895, + "step": 90 + }, + { + "epoch": 1.2987012987012987, + "grad_norm": 94.74404907226562, + "learning_rate": 5e-06, + "logits/chosen": -2.8042826652526855, + "logits/rejected": -2.7987048625946045, + "logps/chosen": -7.8375115394592285, + "logps/rejected": -21.124469757080078, + "loss": 1.7389, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.00530009251087904, + "rewards/margins": -0.010122200474143028, + "rewards/rejected": 0.015422293916344643, + "step": 100 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 36.4909782409668, + "learning_rate": 4.997252228714279e-06, + "logits/chosen": -2.8379225730895996, + "logits/rejected": -2.831869602203369, + "logps/chosen": -8.803790092468262, + "logps/rejected": -20.375246047973633, + "loss": 1.3703, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.01235434040427208, + "rewards/margins": -0.006987647153437138, + "rewards/rejected": 0.019341984763741493, + "step": 110 + }, + { + "epoch": 1.5584415584415585, + "grad_norm": 39.18604278564453, + "learning_rate": 4.989014955054746e-06, + "logits/chosen": -2.817542552947998, + "logits/rejected": -2.8139848709106445, + "logps/chosen": -7.656688690185547, + "logps/rejected": -20.828929901123047, + "loss": 1.1774, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": 0.01700626313686371, + "rewards/margins": -0.011658851057291031, + "rewards/rejected": 0.02866511419415474, + "step": 120 + }, + { + "epoch": 1.6883116883116882, + "grad_norm": 65.63705444335938, + "learning_rate": 4.975306286336628e-06, + "logits/chosen": -2.8285748958587646, + "logits/rejected": -2.823660373687744, + "logps/chosen": -7.743475914001465, + "logps/rejected": -20.759389877319336, + "loss": 1.3562, + "rewards/accuracies": 0.2750000059604645, + "rewards/chosen": 0.020265722647309303, + "rewards/margins": -0.01016728300601244, + "rewards/rejected": 0.03043300285935402, + "step": 130 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 170.55665588378906, + "learning_rate": 4.95615635718894e-06, + "logits/chosen": -2.821213722229004, + "logits/rejected": -2.8140625953674316, + "logps/chosen": -8.116856575012207, + "logps/rejected": -19.581222534179688, + "loss": 1.7842, + "rewards/accuracies": 0.2750000059604645, + "rewards/chosen": 0.017684193328022957, + "rewards/margins": -0.010469591245055199, + "rewards/rejected": 0.028153782710433006, + "step": 140 + }, + { + "epoch": 1.948051948051948, + "grad_norm": 71.12332916259766, + "learning_rate": 4.931607263312033e-06, + "logits/chosen": -2.823256015777588, + "logits/rejected": -2.812701463699341, + "logps/chosen": -6.964502811431885, + "logps/rejected": -21.734210968017578, + "loss": 1.5393, + "rewards/accuracies": 0.22499999403953552, + "rewards/chosen": 0.010581249371170998, + "rewards/margins": -0.01839107647538185, + "rewards/rejected": 0.0289723239839077, + "step": 150 + }, + { + "epoch": 2.0779220779220777, + "grad_norm": 173.57005310058594, + "learning_rate": 4.901712968942101e-06, + "logits/chosen": -2.808175563812256, + "logits/rejected": -2.798382520675659, + "logps/chosen": -7.128758907318115, + "logps/rejected": -22.715810775756836, + "loss": 1.4333, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": 0.012118353508412838, + "rewards/margins": -0.01597173698246479, + "rewards/rejected": 0.028090089559555054, + "step": 160 + }, + { + "epoch": 2.207792207792208, + "grad_norm": 119.67935180664062, + "learning_rate": 4.866539188226086e-06, + "logits/chosen": -2.788814067840576, + "logits/rejected": -2.781687021255493, + "logps/chosen": -9.231825828552246, + "logps/rejected": -23.992216110229492, + "loss": 1.2406, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.01427288819104433, + "rewards/margins": -0.011877929791808128, + "rewards/rejected": 0.026150818914175034, + "step": 170 + }, + { + "epoch": 2.3376623376623376, + "grad_norm": 78.3140869140625, + "learning_rate": 4.826163240767717e-06, + "logits/chosen": -2.819803476333618, + "logits/rejected": -2.812873363494873, + "logps/chosen": -8.85770320892334, + "logps/rejected": -21.352455139160156, + "loss": 1.6327, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.01637933775782585, + "rewards/margins": -0.017695654183626175, + "rewards/rejected": 0.034074995666742325, + "step": 180 + }, + { + "epoch": 2.4675324675324677, + "grad_norm": 34.724788665771484, + "learning_rate": 4.780673881662242e-06, + "logits/chosen": -2.802525043487549, + "logits/rejected": -2.7943992614746094, + "logps/chosen": -9.141613006591797, + "logps/rejected": -21.16362762451172, + "loss": 1.3604, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": 0.019131088629364967, + "rewards/margins": -0.012987020425498486, + "rewards/rejected": 0.03211811184883118, + "step": 190 + }, + { + "epoch": 2.5974025974025974, + "grad_norm": 155.53683471679688, + "learning_rate": 4.730171106393466e-06, + "logits/chosen": -2.8240277767181396, + "logits/rejected": -2.8137762546539307, + "logps/chosen": -6.279611587524414, + "logps/rejected": -22.48813247680664, + "loss": 1.3818, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": 0.018246710300445557, + "rewards/margins": -0.011857626028358936, + "rewards/rejected": 0.03010433353483677, + "step": 200 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 64.57522583007812, + "learning_rate": 4.674765931021976e-06, + "logits/chosen": -2.8272862434387207, + "logits/rejected": -2.8214235305786133, + "logps/chosen": -8.027566909790039, + "logps/rejected": -20.517822265625, + "loss": 1.2211, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.01772388443350792, + "rewards/margins": -0.015222163870930672, + "rewards/rejected": 0.03294604271650314, + "step": 210 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 138.96739196777344, + "learning_rate": 4.614580148147744e-06, + "logits/chosen": -2.8115546703338623, + "logits/rejected": -2.804287910461426, + "logps/chosen": -6.3580474853515625, + "logps/rejected": -20.281545639038086, + "loss": 1.3669, + "rewards/accuracies": 0.32499998807907104, + "rewards/chosen": 0.0185546912252903, + "rewards/margins": -0.011832709424197674, + "rewards/rejected": 0.03038739785552025, + "step": 220 + }, + { + "epoch": 2.987012987012987, + "grad_norm": 55.348880767822266, + "learning_rate": 4.5497460591835615e-06, + "logits/chosen": -2.8048064708709717, + "logits/rejected": -2.799748420715332, + "logps/chosen": -8.203588485717773, + "logps/rejected": -20.907344818115234, + "loss": 1.1073, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.02281924895942211, + "rewards/margins": -0.005632441025227308, + "rewards/rejected": 0.02845168672502041, + "step": 230 + }, + { + "epoch": 3.116883116883117, + "grad_norm": 114.07877349853516, + "learning_rate": 4.480406183527823e-06, + "logits/chosen": -2.820413589477539, + "logits/rejected": -2.810023307800293, + "logps/chosen": -7.28661584854126, + "logps/rejected": -22.23759651184082, + "loss": 1.2872, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02074371837079525, + "rewards/margins": -0.00726896058768034, + "rewards/rejected": 0.028012678027153015, + "step": 240 + }, + { + "epoch": 3.2467532467532467, + "grad_norm": 114.84168243408203, + "learning_rate": 4.406712945275955e-06, + "logits/chosen": -2.8098714351654053, + "logits/rejected": -2.8057053089141846, + "logps/chosen": -8.777189254760742, + "logps/rejected": -23.82990074157715, + "loss": 1.2292, + "rewards/accuracies": 0.32499998807907104, + "rewards/chosen": 0.020751068368554115, + "rewards/margins": -0.011817799881100655, + "rewards/rejected": 0.03256886824965477, + "step": 250 + }, + { + "epoch": 3.3766233766233764, + "grad_norm": 125.23298645019531, + "learning_rate": 4.328828338159173e-06, + "logits/chosen": -2.8179001808166504, + "logits/rejected": -2.809523820877075, + "logps/chosen": -7.79488468170166, + "logps/rejected": -21.827838897705078, + "loss": 1.0056, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.018612362444400787, + "rewards/margins": -0.005726366303861141, + "rewards/rejected": 0.024338727816939354, + "step": 260 + }, + { + "epoch": 3.5064935064935066, + "grad_norm": 118.24935150146484, + "learning_rate": 4.246923569447105e-06, + "logits/chosen": -2.827606201171875, + "logits/rejected": -2.8264949321746826, + "logps/chosen": -9.784859657287598, + "logps/rejected": -19.85146141052246, + "loss": 1.2282, + "rewards/accuracies": 0.22499999403953552, + "rewards/chosen": 0.01882827654480934, + "rewards/margins": -0.013741184957325459, + "rewards/rejected": 0.032569460570812225, + "step": 270 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 231.2564697265625, + "learning_rate": 4.161178683597055e-06, + "logits/chosen": -2.8154025077819824, + "logits/rejected": -2.808724880218506, + "logps/chosen": -10.802755355834961, + "logps/rejected": -20.200098037719727, + "loss": 1.2421, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.016843315213918686, + "rewards/margins": -0.009146241471171379, + "rewards/rejected": 0.025989552959799767, + "step": 280 + }, + { + "epoch": 3.7662337662337664, + "grad_norm": 178.36927795410156, + "learning_rate": 4.071782166477213e-06, + "logits/chosen": -2.827821969985962, + "logits/rejected": -2.817737102508545, + "logps/chosen": -7.973962306976318, + "logps/rejected": -20.839879989624023, + "loss": 0.9562, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.02151758037507534, + "rewards/margins": 0.0009062625467777252, + "rewards/rejected": 0.020611315965652466, + "step": 290 + }, + { + "epoch": 3.896103896103896, + "grad_norm": 177.05746459960938, + "learning_rate": 3.978930531033807e-06, + "logits/chosen": -2.81160306930542, + "logits/rejected": -2.807624340057373, + "logps/chosen": -7.694178581237793, + "logps/rejected": -20.271114349365234, + "loss": 1.2155, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": 0.02011878788471222, + "rewards/margins": -0.011032785288989544, + "rewards/rejected": 0.03115157224237919, + "step": 300 + }, + { + "epoch": 4.025974025974026, + "grad_norm": 85.1644287109375, + "learning_rate": 3.882827885312999e-06, + "logits/chosen": -2.826159954071045, + "logits/rejected": -2.8190231323242188, + "logps/chosen": -7.839417934417725, + "logps/rejected": -20.949127197265625, + "loss": 1.1714, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.018428083509206772, + "rewards/margins": -0.0033446471206843853, + "rewards/rejected": 0.021772734820842743, + "step": 310 + }, + { + "epoch": 4.1558441558441555, + "grad_norm": 96.2293472290039, + "learning_rate": 3.783685483787105e-06, + "logits/chosen": -2.7987990379333496, + "logits/rejected": -2.7919256687164307, + "logps/chosen": -8.851567268371582, + "logps/rejected": -21.075613021850586, + "loss": 1.0528, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.027806680649518967, + "rewards/margins": 0.007172185927629471, + "rewards/rejected": 0.020634492859244347, + "step": 320 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 64.6633071899414, + "learning_rate": 3.6817212629714135e-06, + "logits/chosen": -2.8131086826324463, + "logits/rejected": -2.8076319694519043, + "logps/chosen": -8.776361465454102, + "logps/rejected": -21.563058853149414, + "loss": 0.9623, + "rewards/accuracies": 0.32499998807907104, + "rewards/chosen": 0.02237451635301113, + "rewards/margins": -0.005439778324216604, + "rewards/rejected": 0.027814293280243874, + "step": 330 + }, + { + "epoch": 4.415584415584416, + "grad_norm": 61.17387771606445, + "learning_rate": 3.5771593623524263e-06, + "logits/chosen": -2.8211700916290283, + "logits/rejected": -2.8142738342285156, + "logps/chosen": -8.312505722045898, + "logps/rejected": -22.2635440826416, + "loss": 0.8663, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.022781318053603172, + "rewards/margins": -0.0001775051496224478, + "rewards/rejected": 0.022958822548389435, + "step": 340 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 244.74371337890625, + "learning_rate": 3.4702296316806243e-06, + "logits/chosen": -2.8051793575286865, + "logits/rejected": -2.8000447750091553, + "logps/chosen": -8.497454643249512, + "logps/rejected": -21.671079635620117, + "loss": 0.9413, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.018070612102746964, + "rewards/margins": -0.0006186591344885528, + "rewards/rejected": 0.018689271062612534, + "step": 350 + }, + { + "epoch": 4.675324675324675, + "grad_norm": 180.42735290527344, + "learning_rate": 3.3611671257108323e-06, + "logits/chosen": -2.8005387783050537, + "logits/rejected": -2.7906429767608643, + "logps/chosen": -8.441190719604492, + "logps/rejected": -22.472454071044922, + "loss": 1.0874, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02211115136742592, + "rewards/margins": 0.009135196916759014, + "rewards/rejected": 0.012975958175957203, + "step": 360 + }, + { + "epoch": 4.805194805194805, + "grad_norm": 37.385963439941406, + "learning_rate": 3.2502115875008523e-06, + "logits/chosen": -2.822923183441162, + "logits/rejected": -2.8172943592071533, + "logps/chosen": -6.2603559494018555, + "logps/rejected": -19.10385513305664, + "loss": 1.1011, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.018987987190485, + "rewards/margins": 0.00487549277022481, + "rewards/rejected": 0.014112496748566628, + "step": 370 + }, + { + "epoch": 4.935064935064935, + "grad_norm": 251.71954345703125, + "learning_rate": 3.1376069214041917e-06, + "logits/chosen": -2.826533079147339, + "logits/rejected": -2.8127052783966064, + "logps/chosen": -8.482178688049316, + "logps/rejected": -22.023988723754883, + "loss": 1.0408, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02052697166800499, + "rewards/margins": 0.00905714649707079, + "rewards/rejected": 0.011469824239611626, + "step": 380 + }, + { + "epoch": 5.064935064935065, + "grad_norm": 133.91131591796875, + "learning_rate": 3.023600656915362e-06, + "logits/chosen": -2.822097063064575, + "logits/rejected": -2.810638189315796, + "logps/chosen": -6.029572486877441, + "logps/rejected": -21.33598518371582, + "loss": 0.9734, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.023712964728474617, + "rewards/margins": 0.003511254210025072, + "rewards/rejected": 0.020201710984110832, + "step": 390 + }, + { + "epoch": 5.194805194805195, + "grad_norm": 133.3660125732422, + "learning_rate": 2.9084434045463255e-06, + "logits/chosen": -2.80812668800354, + "logits/rejected": -2.7997758388519287, + "logps/chosen": -7.040734767913818, + "logps/rejected": -20.19479751586914, + "loss": 0.9652, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.020205114036798477, + "rewards/margins": 0.010116524063050747, + "rewards/rejected": 0.01008858997374773, + "step": 400 + }, + { + "epoch": 5.194805194805195, + "eval_logits/chosen": -2.801586627960205, + "eval_logits/rejected": -2.8042006492614746, + "eval_logps/chosen": -13.702254295349121, + "eval_logps/rejected": -15.930770874023438, + "eval_loss": 0.7836798429489136, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": 0.04482783377170563, + "eval_rewards/margins": 0.010822022333741188, + "eval_rewards/rejected": 0.03400580957531929, + "eval_runtime": 1.1767, + "eval_samples_per_second": 11.898, + "eval_steps_per_second": 1.7, + "step": 400 + }, + { + "epoch": 5.324675324675325, + "grad_norm": 103.42053985595703, + "learning_rate": 2.792388304930207e-06, + "logits/chosen": -2.8027472496032715, + "logits/rejected": -2.7954494953155518, + "logps/chosen": -8.696739196777344, + "logps/rejected": -21.20206642150879, + "loss": 0.8734, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.025552362203598022, + "rewards/margins": 0.018355753272771835, + "rewards/rejected": 0.007196612656116486, + "step": 410 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 236.04249572753906, + "learning_rate": 2.6756904723632325e-06, + "logits/chosen": -2.820538282394409, + "logits/rejected": -2.8109259605407715, + "logps/chosen": -8.277128219604492, + "logps/rejected": -21.056652069091797, + "loss": 1.0417, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02177886664867401, + "rewards/margins": 0.017902854830026627, + "rewards/rejected": 0.0038760111201554537, + "step": 420 + }, + { + "epoch": 5.584415584415584, + "grad_norm": 94.26475524902344, + "learning_rate": 2.5586064340081516e-06, + "logits/chosen": -2.8236119747161865, + "logits/rejected": -2.8140316009521484, + "logps/chosen": -7.59194803237915, + "logps/rejected": -21.763874053955078, + "loss": 1.1263, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.017852844670414925, + "rewards/margins": 0.01606573723256588, + "rewards/rejected": 0.0017871044110506773, + "step": 430 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 195.20712280273438, + "learning_rate": 2.441393565991849e-06, + "logits/chosen": -2.807347297668457, + "logits/rejected": -2.7958061695098877, + "logps/chosen": -8.326894760131836, + "logps/rejected": -21.803070068359375, + "loss": 0.9669, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.019576936960220337, + "rewards/margins": 0.02373579703271389, + "rewards/rejected": -0.0041588591411709785, + "step": 440 + }, + { + "epoch": 5.8441558441558445, + "grad_norm": 175.50625610351562, + "learning_rate": 2.3243095276367687e-06, + "logits/chosen": -2.8178904056549072, + "logits/rejected": -2.8124308586120605, + "logps/chosen": -8.626742362976074, + "logps/rejected": -21.10752296447754, + "loss": 1.4178, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.01709837280213833, + "rewards/margins": 0.018973568454384804, + "rewards/rejected": -0.001875193091109395, + "step": 450 + }, + { + "epoch": 5.974025974025974, + "grad_norm": 171.52513122558594, + "learning_rate": 2.207611695069794e-06, + "logits/chosen": -2.8153228759765625, + "logits/rejected": -2.8087317943573, + "logps/chosen": -8.533574104309082, + "logps/rejected": -22.59617805480957, + "loss": 1.1735, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.019586745649576187, + "rewards/margins": 0.03454780578613281, + "rewards/rejected": -0.014961063861846924, + "step": 460 + }, + { + "epoch": 6.103896103896104, + "grad_norm": 1.1001964807510376, + "learning_rate": 2.0915565954536745e-06, + "logits/chosen": -2.8197810649871826, + "logits/rejected": -2.8137242794036865, + "logps/chosen": -6.954007148742676, + "logps/rejected": -20.310867309570312, + "loss": 0.9806, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02565738931298256, + "rewards/margins": 0.02424320951104164, + "rewards/rejected": 0.001414179103448987, + "step": 470 + }, + { + "epoch": 6.233766233766234, + "grad_norm": 129.70095825195312, + "learning_rate": 1.9763993430846394e-06, + "logits/chosen": -2.8019607067108154, + "logits/rejected": -2.798675537109375, + "logps/chosen": -8.585060119628906, + "logps/rejected": -22.695144653320312, + "loss": 1.1706, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.017484549432992935, + "rewards/margins": 0.01860320381820202, + "rewards/rejected": -0.0011186569463461637, + "step": 480 + }, + { + "epoch": 6.363636363636363, + "grad_norm": 194.80984497070312, + "learning_rate": 1.8623930785958092e-06, + "logits/chosen": -2.8055920600891113, + "logits/rejected": -2.7976577281951904, + "logps/chosen": -8.328341484069824, + "logps/rejected": -21.830015182495117, + "loss": 0.9653, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.023755336180329323, + "rewards/margins": 0.03609599173069, + "rewards/rejected": -0.012340660206973553, + "step": 490 + }, + { + "epoch": 6.4935064935064934, + "grad_norm": 1.1828835010528564, + "learning_rate": 1.7497884124991487e-06, + "logits/chosen": -2.8199102878570557, + "logits/rejected": -2.8055901527404785, + "logps/chosen": -6.756843566894531, + "logps/rejected": -20.936315536499023, + "loss": 0.749, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.026989247649908066, + "rewards/margins": 0.038383591920137405, + "rewards/rejected": -0.011394346132874489, + "step": 500 + }, + { + "epoch": 6.623376623376624, + "grad_norm": 38.752262115478516, + "learning_rate": 1.6388328742891679e-06, + "logits/chosen": -2.82857084274292, + "logits/rejected": -2.8192813396453857, + "logps/chosen": -5.166688919067383, + "logps/rejected": -21.360883712768555, + "loss": 0.9419, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02238740213215351, + "rewards/margins": 0.04248126968741417, + "rewards/rejected": -0.020093869417905807, + "step": 510 + }, + { + "epoch": 6.753246753246753, + "grad_norm": 160.1698455810547, + "learning_rate": 1.5297703683193755e-06, + "logits/chosen": -2.813995599746704, + "logits/rejected": -2.8053503036499023, + "logps/chosen": -8.157999038696289, + "logps/rejected": -21.715497970581055, + "loss": 0.9404, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.025560980662703514, + "rewards/margins": 0.04841204732656479, + "rewards/rejected": -0.022851066663861275, + "step": 520 + }, + { + "epoch": 6.883116883116883, + "grad_norm": 94.66495513916016, + "learning_rate": 1.4228406376475741e-06, + "logits/chosen": -2.8161110877990723, + "logits/rejected": -2.806417942047119, + "logps/chosen": -9.14362907409668, + "logps/rejected": -21.98373794555664, + "loss": 1.3017, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.022670337930321693, + "rewards/margins": 0.04901245981454849, + "rewards/rejected": -0.02634212002158165, + "step": 530 + }, + { + "epoch": 7.012987012987013, + "grad_norm": 89.24470520019531, + "learning_rate": 1.3182787370285865e-06, + "logits/chosen": -2.8100571632385254, + "logits/rejected": -2.802196502685547, + "logps/chosen": -8.180788040161133, + "logps/rejected": -22.61843490600586, + "loss": 0.9971, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02741624414920807, + "rewards/margins": 0.052984196692705154, + "rewards/rejected": -0.025567958131432533, + "step": 540 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 46.27058410644531, + "learning_rate": 1.2163145162128948e-06, + "logits/chosen": -2.811455249786377, + "logits/rejected": -2.801848888397217, + "logps/chosen": -8.542337417602539, + "logps/rejected": -21.399410247802734, + "loss": 0.9509, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.02186908759176731, + "rewards/margins": 0.0516178198158741, + "rewards/rejected": -0.029748734086751938, + "step": 550 + }, + { + "epoch": 7.2727272727272725, + "grad_norm": 69.24237823486328, + "learning_rate": 1.1171721146870015e-06, + "logits/chosen": -2.8135528564453125, + "logits/rejected": -2.803555965423584, + "logps/chosen": -8.623903274536133, + "logps/rejected": -22.410194396972656, + "loss": 0.9666, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03136039525270462, + "rewards/margins": 0.06211704760789871, + "rewards/rejected": -0.030756641179323196, + "step": 560 + }, + { + "epoch": 7.402597402597403, + "grad_norm": 1.2535158395767212, + "learning_rate": 1.021069468966194e-06, + "logits/chosen": -2.8163511753082275, + "logits/rejected": -2.808074474334717, + "logps/chosen": -7.339944362640381, + "logps/rejected": -22.135278701782227, + "loss": 0.8673, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.025004085153341293, + "rewards/margins": 0.048856452107429504, + "rewards/rejected": -0.023852365091443062, + "step": 570 + }, + { + "epoch": 7.532467532467533, + "grad_norm": 1.1791399717330933, + "learning_rate": 9.282178335227885e-07, + "logits/chosen": -2.7872376441955566, + "logits/rejected": -2.7806403636932373, + "logps/chosen": -8.17931079864502, + "logps/rejected": -23.90004539489746, + "loss": 1.4371, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.025283193215727806, + "rewards/margins": 0.06855350732803345, + "rewards/rejected": -0.04327031224966049, + "step": 580 + }, + { + "epoch": 7.662337662337662, + "grad_norm": 1.237028956413269, + "learning_rate": 8.38821316402946e-07, + "logits/chosen": -2.828946352005005, + "logits/rejected": -2.82279634475708, + "logps/chosen": -7.834005832672119, + "logps/rejected": -23.059555053710938, + "loss": 0.872, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.023193147033452988, + "rewards/margins": 0.051843591034412384, + "rewards/rejected": -0.0286504365503788, + "step": 590 + }, + { + "epoch": 7.792207792207792, + "grad_norm": 1.1478286981582642, + "learning_rate": 7.530764305528959e-07, + "logits/chosen": -2.8227028846740723, + "logits/rejected": -2.815317153930664, + "logps/chosen": -7.287829399108887, + "logps/rejected": -21.66531753540039, + "loss": 0.9423, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.02044813148677349, + "rewards/margins": 0.050043750554323196, + "rewards/rejected": -0.029595619067549706, + "step": 600 + }, + { + "epoch": 7.922077922077922, + "grad_norm": 0.9912703037261963, + "learning_rate": 6.711716618408282e-07, + "logits/chosen": -2.838146686553955, + "logits/rejected": -2.8304896354675293, + "logps/chosen": -6.6907477378845215, + "logps/rejected": -19.75206756591797, + "loss": 1.0092, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03283644840121269, + "rewards/margins": 0.04647618532180786, + "rewards/rejected": -0.01363973505795002, + "step": 610 + }, + { + "epoch": 8.051948051948052, + "grad_norm": 205.0751953125, + "learning_rate": 5.932870547240455e-07, + "logits/chosen": -2.814697504043579, + "logits/rejected": -2.803783893585205, + "logps/chosen": -7.918333530426025, + "logps/rejected": -22.431245803833008, + "loss": 1.1134, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.026031214743852615, + "rewards/margins": 0.05806659907102585, + "rewards/rejected": -0.03203538805246353, + "step": 620 + }, + { + "epoch": 8.181818181818182, + "grad_norm": 181.82443237304688, + "learning_rate": 5.195938164721767e-07, + "logits/chosen": -2.825852394104004, + "logits/rejected": -2.8194408416748047, + "logps/chosen": -7.493607997894287, + "logps/rejected": -22.741878509521484, + "loss": 0.9249, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.031044036149978638, + "rewards/margins": 0.054368507117033005, + "rewards/rejected": -0.023324472829699516, + "step": 630 + }, + { + "epoch": 8.311688311688311, + "grad_norm": 1.184661626815796, + "learning_rate": 4.502539408164386e-07, + "logits/chosen": -2.8212850093841553, + "logits/rejected": -2.8142733573913574, + "logps/chosen": -8.27627944946289, + "logps/rejected": -20.00022315979004, + "loss": 0.9228, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.025791505351662636, + "rewards/margins": 0.04540405049920082, + "rewards/rejected": -0.019612547010183334, + "step": 640 + }, + { + "epoch": 8.441558441558442, + "grad_norm": 128.95013427734375, + "learning_rate": 3.8541985185225645e-07, + "logits/chosen": -2.824355363845825, + "logits/rejected": -2.8176026344299316, + "logps/chosen": -7.515383243560791, + "logps/rejected": -21.355083465576172, + "loss": 1.0536, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02296631410717964, + "rewards/margins": 0.056810516864061356, + "rewards/rejected": -0.03384420648217201, + "step": 650 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 276.641357421875, + "learning_rate": 3.252340689780245e-07, + "logits/chosen": -2.82700777053833, + "logits/rejected": -2.822455883026123, + "logps/chosen": -7.617165565490723, + "logps/rejected": -23.340801239013672, + "loss": 1.2275, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02447410859167576, + "rewards/margins": 0.07831484079360962, + "rewards/rejected": -0.05384073406457901, + "step": 660 + }, + { + "epoch": 8.7012987012987, + "grad_norm": 241.392333984375, + "learning_rate": 2.698288936065338e-07, + "logits/chosen": -2.7932610511779785, + "logits/rejected": -2.7827224731445312, + "logps/chosen": -7.8216118812561035, + "logps/rejected": -20.492076873779297, + "loss": 1.0025, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.024464499205350876, + "rewards/margins": 0.055416546761989594, + "rewards/rejected": -0.030952051281929016, + "step": 670 + }, + { + "epoch": 8.831168831168831, + "grad_norm": 1.2769261598587036, + "learning_rate": 2.1932611833775846e-07, + "logits/chosen": -2.8169243335723877, + "logits/rejected": -2.8119499683380127, + "logps/chosen": -8.214499473571777, + "logps/rejected": -22.425582885742188, + "loss": 1.2029, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02633916214108467, + "rewards/margins": 0.05255778878927231, + "rewards/rejected": -0.026218628510832787, + "step": 680 + }, + { + "epoch": 8.96103896103896, + "grad_norm": 198.49639892578125, + "learning_rate": 1.7383675923228372e-07, + "logits/chosen": -2.7998576164245605, + "logits/rejected": -2.7877182960510254, + "logps/chosen": -10.261676788330078, + "logps/rejected": -22.23526954650879, + "loss": 1.2189, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.023275194689631462, + "rewards/margins": 0.05330119654536247, + "rewards/rejected": -0.030026007443666458, + "step": 690 + }, + { + "epoch": 9.090909090909092, + "grad_norm": 1.1314146518707275, + "learning_rate": 1.3346081177391474e-07, + "logits/chosen": -2.795860767364502, + "logits/rejected": -2.784841299057007, + "logps/chosen": -10.019143104553223, + "logps/rejected": -22.597219467163086, + "loss": 0.8856, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.026558348909020424, + "rewards/margins": 0.05729994922876358, + "rewards/rejected": -0.030741602182388306, + "step": 700 + }, + { + "epoch": 9.220779220779221, + "grad_norm": 179.00376892089844, + "learning_rate": 9.828703105789983e-08, + "logits/chosen": -2.8192873001098633, + "logits/rejected": -2.8140900135040283, + "logps/chosen": -9.172324180603027, + "logps/rejected": -22.661869049072266, + "loss": 1.0784, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.023906812071800232, + "rewards/margins": 0.06065645068883896, + "rewards/rejected": -0.036749642342329025, + "step": 710 + }, + { + "epoch": 9.35064935064935, + "grad_norm": 69.8018798828125, + "learning_rate": 6.839273668796747e-08, + "logits/chosen": -2.8201510906219482, + "logits/rejected": -2.8087105751037598, + "logps/chosen": -8.500712394714355, + "logps/rejected": -22.419025421142578, + "loss": 0.9798, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.023749757558107376, + "rewards/margins": 0.05873861163854599, + "rewards/rejected": -0.034988854080438614, + "step": 720 + }, + { + "epoch": 9.480519480519481, + "grad_norm": 70.18726348876953, + "learning_rate": 4.384364281105974e-08, + "logits/chosen": -2.83191180229187, + "logits/rejected": -2.824836015701294, + "logps/chosen": -9.021523475646973, + "logps/rejected": -23.541057586669922, + "loss": 0.8762, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.024020517244935036, + "rewards/margins": 0.06196972727775574, + "rewards/rejected": -0.03794920817017555, + "step": 730 + }, + { + "epoch": 9.61038961038961, + "grad_norm": 308.2706298828125, + "learning_rate": 2.4693713663372643e-08, + "logits/chosen": -2.799358367919922, + "logits/rejected": -2.7913246154785156, + "logps/chosen": -7.899996757507324, + "logps/rejected": -21.6778564453125, + "loss": 1.3393, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.01837027817964554, + "rewards/margins": 0.05194631963968277, + "rewards/rejected": -0.03357603773474693, + "step": 740 + }, + { + "epoch": 9.74025974025974, + "grad_norm": 42.1520881652832, + "learning_rate": 1.0985044945254763e-08, + "logits/chosen": -2.8439416885375977, + "logits/rejected": -2.839962959289551, + "logps/chosen": -9.0369291305542, + "logps/rejected": -19.963855743408203, + "loss": 0.917, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.02196243405342102, + "rewards/margins": 0.04523957893252373, + "rewards/rejected": -0.023277146741747856, + "step": 750 + }, + { + "epoch": 9.87012987012987, + "grad_norm": 237.67958068847656, + "learning_rate": 3.97807508777894e-06, + "logits/chosen": -2.8095672130584717, + "logits/rejected": -2.8042306900024414, + "logps/chosen": -8.958699226379395, + "logps/rejected": -20.755828857421875, + "loss": 1.4632, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.021344048902392387, + "rewards/margins": 0.05193667858839035, + "rewards/rejected": -0.030592631548643112, + "step": 760 + }, + { + "epoch": 10.0, + "grad_norm": 415.3313293457031, + "learning_rate": 3.949264905820697e-06, + "logits/chosen": -2.8104586601257324, + "logits/rejected": -2.8043599128723145, + "logps/chosen": -8.099912643432617, + "logps/rejected": -22.543119430541992, + "loss": 1.0218, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.029804136604070663, + "rewards/margins": 0.06551304459571838, + "rewards/rejected": -0.03570891544222832, + "step": 770 + }, + { + "epoch": 10.12987012987013, + "grad_norm": 1.33208429813385, + "learning_rate": 3.92016186682789e-06, + "logits/chosen": -2.8259775638580322, + "logits/rejected": -2.8215713500976562, + "logps/chosen": -9.199613571166992, + "logps/rejected": -21.45523452758789, + "loss": 0.8068, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02821979485452175, + "rewards/margins": 0.05966518074274063, + "rewards/rejected": -0.03144538030028343, + "step": 780 + }, + { + "epoch": 10.25974025974026, + "grad_norm": 226.0507354736328, + "learning_rate": 3.8907718517334405e-06, + "logits/chosen": -2.8059029579162598, + "logits/rejected": -2.7993156909942627, + "logps/chosen": -8.171636581420898, + "logps/rejected": -22.19893455505371, + "loss": 0.9794, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.022764720022678375, + "rewards/margins": 0.069257453083992, + "rewards/rejected": -0.04649273306131363, + "step": 790 + }, + { + "epoch": 10.38961038961039, + "grad_norm": 1.1935486793518066, + "learning_rate": 3.861100799460336e-06, + "logits/chosen": -2.816051959991455, + "logits/rejected": -2.81080961227417, + "logps/chosen": -7.233284950256348, + "logps/rejected": -22.618621826171875, + "loss": 1.1384, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.030704358592629433, + "rewards/margins": 0.08574527502059937, + "rewards/rejected": -0.05504090338945389, + "step": 800 + }, + { + "epoch": 10.38961038961039, + "eval_logits/chosen": -2.8031392097473145, + "eval_logits/rejected": -2.8060073852539062, + "eval_logps/chosen": -13.673626899719238, + "eval_logps/rejected": -16.335693359375, + "eval_loss": 0.6743522882461548, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 0.047690678387880325, + "eval_rewards/margins": 0.05417700111865997, + "eval_rewards/rejected": -0.006486321333795786, + "eval_runtime": 1.1696, + "eval_samples_per_second": 11.97, + "eval_steps_per_second": 1.71, + "step": 800 + }, + { + "epoch": 10.519480519480519, + "grad_norm": 148.8427734375, + "learning_rate": 3.831154705721542e-06, + "logits/chosen": -2.7960445880889893, + "logits/rejected": -2.791477680206299, + "logps/chosen": -9.04710578918457, + "logps/rejected": -22.726436614990234, + "loss": 0.8774, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03348678722977638, + "rewards/margins": 0.08231980353593826, + "rewards/rejected": -0.04883301258087158, + "step": 810 + }, + { + "epoch": 10.64935064935065, + "grad_norm": 75.88983154296875, + "learning_rate": 3.800939621808419e-06, + "logits/chosen": -2.8040685653686523, + "logits/rejected": -2.7940433025360107, + "logps/chosen": -7.935153007507324, + "logps/rejected": -22.834848403930664, + "loss": 1.2887, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02770247682929039, + "rewards/margins": 0.09637979418039322, + "rewards/rejected": -0.06867731362581253, + "step": 820 + }, + { + "epoch": 10.779220779220779, + "grad_norm": 94.64012145996094, + "learning_rate": 3.770461653367934e-06, + "logits/chosen": -2.8252198696136475, + "logits/rejected": -2.8190674781799316, + "logps/chosen": -7.881258964538574, + "logps/rejected": -22.468507766723633, + "loss": 0.8938, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.03068472072482109, + "rewards/margins": 0.0954260528087616, + "rewards/rejected": -0.06474132835865021, + "step": 830 + }, + { + "epoch": 10.909090909090908, + "grad_norm": 126.55635070800781, + "learning_rate": 3.7397269591688666e-06, + "logits/chosen": -2.818159580230713, + "logits/rejected": -2.8095242977142334, + "logps/chosen": -7.300267219543457, + "logps/rejected": -21.571325302124023, + "loss": 1.1946, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04063314571976662, + "rewards/margins": 0.09662587940692902, + "rewards/rejected": -0.055992741137742996, + "step": 840 + }, + { + "epoch": 11.03896103896104, + "grad_norm": 89.22303771972656, + "learning_rate": 3.7087417498572946e-06, + "logits/chosen": -2.8335649967193604, + "logits/rejected": -2.8254220485687256, + "logps/chosen": -7.027431488037109, + "logps/rejected": -24.15895652770996, + "loss": 1.6269, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.021446553990244865, + "rewards/margins": 0.0891275405883789, + "rewards/rejected": -0.06768098473548889, + "step": 850 + }, + { + "epoch": 11.168831168831169, + "grad_norm": 113.87052154541016, + "learning_rate": 3.677512286701587e-06, + "logits/chosen": -2.822108030319214, + "logits/rejected": -2.8177809715270996, + "logps/chosen": -7.26416540145874, + "logps/rejected": -22.180326461791992, + "loss": 0.9047, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03205590695142746, + "rewards/margins": 0.09417016804218292, + "rewards/rejected": -0.06211426109075546, + "step": 860 + }, + { + "epoch": 11.2987012987013, + "grad_norm": 1.4642364978790283, + "learning_rate": 3.646044880327176e-06, + "logits/chosen": -2.8277363777160645, + "logits/rejected": -2.8215818405151367, + "logps/chosen": -8.22972297668457, + "logps/rejected": -20.709514617919922, + "loss": 0.9662, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.027242153882980347, + "rewards/margins": 0.07722898572683334, + "rewards/rejected": -0.04998684674501419, + "step": 870 + }, + { + "epoch": 11.428571428571429, + "grad_norm": 227.3729248046875, + "learning_rate": 3.6143458894413463e-06, + "logits/chosen": -2.815782308578491, + "logits/rejected": -2.809896230697632, + "logps/chosen": -8.15450382232666, + "logps/rejected": -23.144954681396484, + "loss": 1.009, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04248720407485962, + "rewards/margins": 0.11930873245000839, + "rewards/rejected": -0.07682152092456818, + "step": 880 + }, + { + "epoch": 11.558441558441558, + "grad_norm": 321.1402893066406, + "learning_rate": 3.5824217195483178e-06, + "logits/chosen": -2.8049838542938232, + "logits/rejected": -2.7966904640197754, + "logps/chosen": -7.33872127532959, + "logps/rejected": -21.830829620361328, + "loss": 1.1877, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.031283579766750336, + "rewards/margins": 0.09999363124370575, + "rewards/rejected": -0.06871005892753601, + "step": 890 + }, + { + "epoch": 11.688311688311689, + "grad_norm": 148.70529174804688, + "learning_rate": 3.550278821654866e-06, + "logits/chosen": -2.804623603820801, + "logits/rejected": -2.7956459522247314, + "logps/chosen": -7.542906761169434, + "logps/rejected": -21.376934051513672, + "loss": 0.899, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03491160273551941, + "rewards/margins": 0.10313661396503448, + "rewards/rejected": -0.06822501122951508, + "step": 900 + }, + { + "epoch": 11.818181818181818, + "grad_norm": 131.51914978027344, + "learning_rate": 3.517923690966747e-06, + "logits/chosen": -2.81646990776062, + "logits/rejected": -2.803173065185547, + "logps/chosen": -7.835005760192871, + "logps/rejected": -24.5490665435791, + "loss": 1.2002, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03581169620156288, + "rewards/margins": 0.12795209884643555, + "rewards/rejected": -0.09214041382074356, + "step": 910 + }, + { + "epoch": 11.948051948051948, + "grad_norm": 1.8541675806045532, + "learning_rate": 3.4853628655761946e-06, + "logits/chosen": -2.8321688175201416, + "logits/rejected": -2.8234333992004395, + "logps/chosen": -8.250754356384277, + "logps/rejected": -21.983013153076172, + "loss": 1.2209, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.02724345028400421, + "rewards/margins": 0.11046279966831207, + "rewards/rejected": -0.08321934938430786, + "step": 920 + }, + { + "epoch": 12.077922077922079, + "grad_norm": 1.8971673250198364, + "learning_rate": 3.452602925140751e-06, + "logits/chosen": -2.8104381561279297, + "logits/rejected": -2.805176258087158, + "logps/chosen": -10.041051864624023, + "logps/rejected": -22.36850929260254, + "loss": 0.8314, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04497869685292244, + "rewards/margins": 0.1275285929441452, + "rewards/rejected": -0.08254990726709366, + "step": 930 + }, + { + "epoch": 12.207792207792208, + "grad_norm": 78.92677307128906, + "learning_rate": 3.4196504895536948e-06, + "logits/chosen": -2.821592330932617, + "logits/rejected": -2.812694549560547, + "logps/chosen": -8.034502029418945, + "logps/rejected": -21.927505493164062, + "loss": 1.4864, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03274466469883919, + "rewards/margins": 0.14650548994541168, + "rewards/rejected": -0.11376082897186279, + "step": 940 + }, + { + "epoch": 12.337662337662337, + "grad_norm": 151.75515747070312, + "learning_rate": 3.386512217606339e-06, + "logits/chosen": -2.815185070037842, + "logits/rejected": -2.798635721206665, + "logps/chosen": -7.672248840332031, + "logps/rejected": -24.917593002319336, + "loss": 0.7744, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03489464521408081, + "rewards/margins": 0.16525618731975555, + "rewards/rejected": -0.13036152720451355, + "step": 950 + }, + { + "epoch": 12.467532467532468, + "grad_norm": 287.6182556152344, + "learning_rate": 3.3531948056424766e-06, + "logits/chosen": -2.8206896781921387, + "logits/rejected": -2.808096170425415, + "logps/chosen": -8.610795021057129, + "logps/rejected": -20.904489517211914, + "loss": 1.5907, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04022675007581711, + "rewards/margins": 0.12955889105796814, + "rewards/rejected": -0.08933213353157043, + "step": 960 + }, + { + "epoch": 12.597402597402597, + "grad_norm": 102.08235168457031, + "learning_rate": 3.319704986205223e-06, + "logits/chosen": -2.8291194438934326, + "logits/rejected": -2.821276903152466, + "logps/chosen": -7.160222053527832, + "logps/rejected": -22.413909912109375, + "loss": 2.002, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.029990587383508682, + "rewards/margins": 0.1707821935415268, + "rewards/rejected": -0.1407916247844696, + "step": 970 + }, + { + "epoch": 12.727272727272727, + "grad_norm": 293.0462646484375, + "learning_rate": 3.28604952667656e-06, + "logits/chosen": -2.811882734298706, + "logits/rejected": -2.8050527572631836, + "logps/chosen": -8.143568992614746, + "logps/rejected": -20.997821807861328, + "loss": 1.7693, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03135104477405548, + "rewards/margins": 0.13260604441165924, + "rewards/rejected": -0.10125498473644257, + "step": 980 + }, + { + "epoch": 12.857142857142858, + "grad_norm": 270.28076171875, + "learning_rate": 3.2522352279098256e-06, + "logits/chosen": -2.8084704875946045, + "logits/rejected": -2.8023064136505127, + "logps/chosen": -7.580096244812012, + "logps/rejected": -23.871841430664062, + "loss": 1.0787, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03514163941144943, + "rewards/margins": 0.19183096289634705, + "rewards/rejected": -0.15668931603431702, + "step": 990 + }, + { + "epoch": 12.987012987012987, + "grad_norm": 147.3173065185547, + "learning_rate": 3.218268922855452e-06, + "logits/chosen": -2.8337278366088867, + "logits/rejected": -2.822435140609741, + "logps/chosen": -7.920820713043213, + "logps/rejected": -23.01383399963379, + "loss": 1.3496, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.044772375375032425, + "rewards/margins": 0.16832272708415985, + "rewards/rejected": -0.12355033308267593, + "step": 1000 + }, + { + "epoch": 13.116883116883116, + "grad_norm": 1.7505887746810913, + "learning_rate": 3.184157475180208e-06, + "logits/chosen": -2.8243801593780518, + "logits/rejected": -2.8188118934631348, + "logps/chosen": -6.851785182952881, + "logps/rejected": -22.512981414794922, + "loss": 0.7081, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.046257633715867996, + "rewards/margins": 0.17940297722816467, + "rewards/rejected": -0.13314534723758698, + "step": 1010 + }, + { + "epoch": 13.246753246753247, + "grad_norm": 226.2203826904297, + "learning_rate": 3.149907777880239e-06, + "logits/chosen": -2.8116049766540527, + "logits/rejected": -2.802031993865967, + "logps/chosen": -9.796255111694336, + "logps/rejected": -23.3902587890625, + "loss": 1.5793, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.034330084919929504, + "rewards/margins": 0.17266739904880524, + "rewards/rejected": -0.13833732903003693, + "step": 1020 + }, + { + "epoch": 13.376623376623376, + "grad_norm": 103.82469940185547, + "learning_rate": 3.1155267518881816e-06, + "logits/chosen": -2.8263697624206543, + "logits/rejected": -2.8141000270843506, + "logps/chosen": -6.771908760070801, + "logps/rejected": -24.233366012573242, + "loss": 1.0598, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04042463377118111, + "rewards/margins": 0.23938068747520447, + "rewards/rejected": -0.19895607233047485, + "step": 1030 + }, + { + "epoch": 13.506493506493506, + "grad_norm": 283.45172119140625, + "learning_rate": 3.0810213446746323e-06, + "logits/chosen": -2.8335254192352295, + "logits/rejected": -2.8218533992767334, + "logps/chosen": -8.464487075805664, + "logps/rejected": -21.802452087402344, + "loss": 2.5313, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0437016561627388, + "rewards/margins": 0.1905747503042221, + "rewards/rejected": -0.1468731015920639, + "step": 1040 + }, + { + "epoch": 13.636363636363637, + "grad_norm": 309.3778076171875, + "learning_rate": 3.046398528844248e-06, + "logits/chosen": -2.8289742469787598, + "logits/rejected": -2.8251280784606934, + "logps/chosen": -7.5258283615112305, + "logps/rejected": -20.029939651489258, + "loss": 1.7409, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.05584300309419632, + "rewards/margins": 0.15009096264839172, + "rewards/rejected": -0.0942479595541954, + "step": 1050 + }, + { + "epoch": 13.766233766233766, + "grad_norm": 229.9144744873047, + "learning_rate": 3.0116653007267753e-06, + "logits/chosen": -2.8172080516815186, + "logits/rejected": -2.810072183609009, + "logps/chosen": -7.817892551422119, + "logps/rejected": -21.542694091796875, + "loss": 1.3013, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05069408565759659, + "rewards/margins": 0.1945628523826599, + "rewards/rejected": -0.14386877417564392, + "step": 1060 + }, + { + "epoch": 13.896103896103895, + "grad_norm": 74.20500946044922, + "learning_rate": 2.9768286789632845e-06, + "logits/chosen": -2.8088202476501465, + "logits/rejected": -2.7986044883728027, + "logps/chosen": -9.273926734924316, + "logps/rejected": -22.66620445251465, + "loss": 0.8765, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.058945827186107635, + "rewards/margins": 0.23349115252494812, + "rewards/rejected": -0.1745453178882599, + "step": 1070 + }, + { + "epoch": 14.025974025974026, + "grad_norm": 569.8877563476562, + "learning_rate": 2.9418957030878876e-06, + "logits/chosen": -2.825758457183838, + "logits/rejected": -2.819606304168701, + "logps/chosen": -7.458970546722412, + "logps/rejected": -22.59342384338379, + "loss": 1.9946, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04193786159157753, + "rewards/margins": 0.21687336266040802, + "rewards/rejected": -0.1749354898929596, + "step": 1080 + }, + { + "epoch": 14.155844155844155, + "grad_norm": 1.655894160270691, + "learning_rate": 2.9068734321052445e-06, + "logits/chosen": -2.829101800918579, + "logits/rejected": -2.821521282196045, + "logps/chosen": -8.147821426391602, + "logps/rejected": -21.687742233276367, + "loss": 1.6034, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.058260608464479446, + "rewards/margins": 0.22970101237297058, + "rewards/rejected": -0.17144039273262024, + "step": 1090 + }, + { + "epoch": 14.285714285714286, + "grad_norm": 226.83702087402344, + "learning_rate": 2.871768943064129e-06, + "logits/chosen": -2.8222107887268066, + "logits/rejected": -2.81400990486145, + "logps/chosen": -6.330902576446533, + "logps/rejected": -25.222558975219727, + "loss": 0.9428, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.043078385293483734, + "rewards/margins": 0.31059008836746216, + "rewards/rejected": -0.26751166582107544, + "step": 1100 + }, + { + "epoch": 14.415584415584416, + "grad_norm": 542.2830200195312, + "learning_rate": 2.836589329627349e-06, + "logits/chosen": -2.829596996307373, + "logits/rejected": -2.8249754905700684, + "logps/chosen": -7.770198822021484, + "logps/rejected": -23.904422760009766, + "loss": 1.6431, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05304824188351631, + "rewards/margins": 0.2782563269138336, + "rewards/rejected": -0.2252080738544464, + "step": 1110 + }, + { + "epoch": 14.545454545454545, + "grad_norm": 566.6854248046875, + "learning_rate": 2.8013417006383078e-06, + "logits/chosen": -2.814929962158203, + "logits/rejected": -2.807624340057373, + "logps/chosen": -8.236068725585938, + "logps/rejected": -23.396425247192383, + "loss": 1.9727, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04831843078136444, + "rewards/margins": 0.24669453501701355, + "rewards/rejected": -0.19837608933448792, + "step": 1120 + }, + { + "epoch": 14.675324675324676, + "grad_norm": 144.7385711669922, + "learning_rate": 2.766033178684506e-06, + "logits/chosen": -2.812544107437134, + "logits/rejected": -2.8035783767700195, + "logps/chosen": -7.520627498626709, + "logps/rejected": -23.25860023498535, + "loss": 1.074, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04993298649787903, + "rewards/margins": 0.31407758593559265, + "rewards/rejected": -0.2641445994377136, + "step": 1130 + }, + { + "epoch": 14.805194805194805, + "grad_norm": 1.8980822563171387, + "learning_rate": 2.730670898658255e-06, + "logits/chosen": -2.816342830657959, + "logits/rejected": -2.805983066558838, + "logps/chosen": -8.278943061828613, + "logps/rejected": -24.814987182617188, + "loss": 1.2085, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.056690942496061325, + "rewards/margins": 0.3225446939468384, + "rewards/rejected": -0.26585373282432556, + "step": 1140 + }, + { + "epoch": 14.935064935064934, + "grad_norm": 323.37408447265625, + "learning_rate": 2.695262006314912e-06, + "logits/chosen": -2.8120298385620117, + "logits/rejected": -2.800677537918091, + "logps/chosen": -8.421112060546875, + "logps/rejected": -23.05984115600586, + "loss": 2.2793, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.057875752449035645, + "rewards/margins": 0.2153436243534088, + "rewards/rejected": -0.15746784210205078, + "step": 1150 + }, + { + "epoch": 15.064935064935066, + "grad_norm": 192.21383666992188, + "learning_rate": 2.6598136568289144e-06, + "logits/chosen": -2.8322479724884033, + "logits/rejected": -2.827678918838501, + "logps/chosen": -6.597090244293213, + "logps/rejected": -23.71731948852539, + "loss": 1.8258, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.050219256430864334, + "rewards/margins": 0.29276466369628906, + "rewards/rejected": -0.24254541099071503, + "step": 1160 + }, + { + "epoch": 15.194805194805195, + "grad_norm": 1.7898062467575073, + "learning_rate": 2.6243330133479173e-06, + "logits/chosen": -2.8162477016448975, + "logits/rejected": -2.810333251953125, + "logps/chosen": -7.49225378036499, + "logps/rejected": -24.20693588256836, + "loss": 2.364, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.05513688176870346, + "rewards/margins": 0.2901547849178314, + "rewards/rejected": -0.23501792550086975, + "step": 1170 + }, + { + "epoch": 15.324675324675324, + "grad_norm": 1.774614691734314, + "learning_rate": 2.5888272455453136e-06, + "logits/chosen": -2.841599225997925, + "logits/rejected": -2.830899715423584, + "logps/chosen": -7.024038791656494, + "logps/rejected": -24.798709869384766, + "loss": 1.3787, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.05138504505157471, + "rewards/margins": 0.30870115756988525, + "rewards/rejected": -0.25731611251831055, + "step": 1180 + }, + { + "epoch": 15.454545454545455, + "grad_norm": 168.11526489257812, + "learning_rate": 2.5533035281714368e-06, + "logits/chosen": -2.8249008655548096, + "logits/rejected": -2.819967031478882, + "logps/chosen": -7.072245121002197, + "logps/rejected": -24.478059768676758, + "loss": 1.5998, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.05316594988107681, + "rewards/margins": 0.33188968896865845, + "rewards/rejected": -0.2787237763404846, + "step": 1190 + }, + { + "epoch": 15.584415584415584, + "grad_norm": 1.7886942625045776, + "learning_rate": 2.517769039603744e-06, + "logits/chosen": -2.824219226837158, + "logits/rejected": -2.8117549419403076, + "logps/chosen": -7.654397487640381, + "logps/rejected": -26.292510986328125, + "loss": 1.1387, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.06867384910583496, + "rewards/margins": 0.3651869297027588, + "rewards/rejected": -0.29651308059692383, + "step": 1200 + }, + { + "epoch": 15.584415584415584, + "eval_logits/chosen": -2.8089284896850586, + "eval_logits/rejected": -2.8120827674865723, + "eval_logps/chosen": -13.108169555664062, + "eval_logps/rejected": -16.744564056396484, + "eval_loss": 0.6140245795249939, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": 0.10423628985881805, + "eval_rewards/margins": 0.15160974860191345, + "eval_rewards/rejected": -0.0473734587430954, + "eval_runtime": 1.1713, + "eval_samples_per_second": 11.952, + "eval_steps_per_second": 1.707, + "step": 1200 + }, + { + "epoch": 15.714285714285714, + "grad_norm": 127.52613830566406, + "learning_rate": 2.482230960396256e-06, + "logits/chosen": -2.817732334136963, + "logits/rejected": -2.805518865585327, + "logps/chosen": -7.076968193054199, + "logps/rejected": -23.43122673034668, + "loss": 1.1089, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.057178787887096405, + "rewards/margins": 0.35119739174842834, + "rewards/rejected": -0.29401862621307373, + "step": 1210 + }, + { + "epoch": 15.844155844155845, + "grad_norm": 418.4096984863281, + "learning_rate": 2.4466964718285636e-06, + "logits/chosen": -2.824800729751587, + "logits/rejected": -2.817626714706421, + "logps/chosen": -8.242257118225098, + "logps/rejected": -24.571439743041992, + "loss": 1.7635, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0650298148393631, + "rewards/margins": 0.3450847566127777, + "rewards/rejected": -0.2800549864768982, + "step": 1220 + }, + { + "epoch": 15.974025974025974, + "grad_norm": 1.7215478420257568, + "learning_rate": 2.411172754454688e-06, + "logits/chosen": -2.83309006690979, + "logits/rejected": -2.8249144554138184, + "logps/chosen": -9.375545501708984, + "logps/rejected": -23.499141693115234, + "loss": 1.3375, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.06410713493824005, + "rewards/margins": 0.33921390771865845, + "rewards/rejected": -0.275106817483902, + "step": 1230 + }, + { + "epoch": 16.103896103896105, + "grad_norm": 342.5730895996094, + "learning_rate": 2.375666986652083e-06, + "logits/chosen": -2.83019757270813, + "logits/rejected": -2.8198351860046387, + "logps/chosen": -9.095269203186035, + "logps/rejected": -22.90569305419922, + "loss": 1.4406, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04684508591890335, + "rewards/margins": 0.3012949824333191, + "rewards/rejected": -0.25444987416267395, + "step": 1240 + }, + { + "epoch": 16.233766233766232, + "grad_norm": 1.895964503288269, + "learning_rate": 2.3401863431710864e-06, + "logits/chosen": -2.8324759006500244, + "logits/rejected": -2.823629856109619, + "logps/chosen": -6.349139213562012, + "logps/rejected": -22.462413787841797, + "loss": 0.7482, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.06610914319753647, + "rewards/margins": 0.3332839906215668, + "rewards/rejected": -0.2671748101711273, + "step": 1250 + }, + { + "epoch": 16.363636363636363, + "grad_norm": 246.20245361328125, + "learning_rate": 2.3047379936850885e-06, + "logits/chosen": -2.8041343688964844, + "logits/rejected": -2.794372320175171, + "logps/chosen": -8.607588768005371, + "logps/rejected": -21.551204681396484, + "loss": 2.8934, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.05655663087964058, + "rewards/margins": 0.2559330463409424, + "rewards/rejected": -0.1993764191865921, + "step": 1260 + }, + { + "epoch": 16.493506493506494, + "grad_norm": 180.12371826171875, + "learning_rate": 2.269329101341745e-06, + "logits/chosen": -2.825704336166382, + "logits/rejected": -2.813232898712158, + "logps/chosen": -8.257078170776367, + "logps/rejected": -24.829893112182617, + "loss": 1.095, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.056694962084293365, + "rewards/margins": 0.37473762035369873, + "rewards/rejected": -0.3180426359176636, + "step": 1270 + }, + { + "epoch": 16.623376623376622, + "grad_norm": 266.23565673828125, + "learning_rate": 2.2339668213154943e-06, + "logits/chosen": -2.8129374980926514, + "logits/rejected": -2.806753158569336, + "logps/chosen": -9.956947326660156, + "logps/rejected": -27.90561294555664, + "loss": 1.3426, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06363432109355927, + "rewards/margins": 0.42332082986831665, + "rewards/rejected": -0.3596864640712738, + "step": 1280 + }, + { + "epoch": 16.753246753246753, + "grad_norm": 268.4725036621094, + "learning_rate": 2.1986582993616926e-06, + "logits/chosen": -2.8167190551757812, + "logits/rejected": -2.8063576221466064, + "logps/chosen": -7.815527439117432, + "logps/rejected": -25.4665584564209, + "loss": 2.4032, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0594390332698822, + "rewards/margins": 0.34777015447616577, + "rewards/rejected": -0.28833115100860596, + "step": 1290 + }, + { + "epoch": 16.883116883116884, + "grad_norm": 2.1606481075286865, + "learning_rate": 2.163410670372652e-06, + "logits/chosen": -2.8243236541748047, + "logits/rejected": -2.814610719680786, + "logps/chosen": -7.246170997619629, + "logps/rejected": -25.429073333740234, + "loss": 1.2455, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07013025879859924, + "rewards/margins": 0.4054412841796875, + "rewards/rejected": -0.33531102538108826, + "step": 1300 + } + ], + "logging_steps": 10, + "max_steps": 2310, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/training_args.bin b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..35883251f645e16fe9f020fb00125462447a8620 --- /dev/null +++ b/weight_dir/DpoWeight/DPOP_Fix_ND3V1/checkpoint-1300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcfbbe20fe71ddf66bf6192d5e1ac58297b542012aef9e192577497ecf0bcf70 +size 5944 diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/README.md b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..105007ea041cb936ba3417e0221e47fec447213d --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./CodeLlama-7b-Instruct-hf +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/adapter_config.json b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8faaa17fff7e4931b18e07a687c5b35532df41ba --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./CodeLlama-7b-Instruct-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/adapter_model.bin b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..4c599266290a655636b42f9e42939f3d0c73c9b7 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b282a5cf93953598ceac2f52df00bf903105e30948b8eed7d963192ddaa2f1d +size 8433034 diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/added_tokens.json b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c09203d8a52151e0d3b5f3c8e6daedc5b60832b5 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/added_tokens.json @@ -0,0 +1,3 @@ +{ + "": 32016 +} diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4fadfb38b9e43ce2369b534a65a7ddcd145d48de --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c549fd6e8d5b46fcb98fa5ffd3b8d4f98982318b8abbf744a59a5f6ec07540b6 +size 12589776 diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9ee00d74029ec1ce61999f94813a3c989a38545 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:155a0ff6ca2c8b215e2de05a2046e4af0f44dfbf36ddc1e5540840f63b6566c1 +size 12589840 diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e469cb43c418d88f5fde9579800b5cf5c14c6a6 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef4e8acc41a38ca5bae3ca53f95ce12cfd17f6a3c6f3d1314c44ab63d8eff2fb +size 12589840 diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e42934764a8919f59284ca1b5c98bf8f755b1e43 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eb2ca7d094608660a1769f9e8795610fdcd849fd4767a35767f3bd988030590 +size 12589840 diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/global_step8000/mp_rank_00_model_states.pt b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/global_step8000/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a482a79edb954e465c4b52f27016b16a5a2933cc --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/global_step8000/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b0370a3b4b12f58d3ede5c81eb8710c88abb7eb5d7c909f3c900de0c20606fa +size 8507372 diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/latest b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/latest new file mode 100644 index 0000000000000000000000000000000000000000..6c558c76ac9e7515267f79d24d9fde4a7f8688f1 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/latest @@ -0,0 +1 @@ +global_step8000 \ No newline at end of file diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/rng_state_0.pth b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8330735db6838f0e2e9dbc45bed2e58e8bb363a4 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db5c221a5a18ee088159763149e88b1f6677df21c4df4a48421f3a71d3c952fb +size 15024 diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/rng_state_1.pth b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..e0d65b2a26a21db37de3fb368ebd0be093d6828a --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da57800cf6a9e3b0749c2b2264d6781ea412ede7c7a3eff1a5481b49fb4a948f +size 15024 diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/rng_state_2.pth b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..eec3c430e30e16aaf8764e3a8fb1b12e52e7172d --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a38c281b64adeb9cd597bdf895b41a68c82730b2db11abc5bf968012fbc4842 +size 15024 diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/rng_state_3.pth b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3f8eb7c65e8f2257f6cf925b85932df861a436d2 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d72aeb50b87ec962a05c4fd8f40284d218377094be833b3547db1bc4e15c91b9 +size 15024 diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/special_tokens_map.json b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..2ece23e993f7e1c7063cb51148b6fa5c6c224775 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/tokenizer.model b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6 +size 500058 diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/tokenizer_config.json b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dc4d9a55d56b17600905bfc69a0e45e2fae7a1c4 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/tokenizer_config.json @@ -0,0 +1,50 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32016": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/trainer_state.json b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7e5d3e0f5203c78d7f8372ec2d0dd92b05048f64 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/trainer_state.json @@ -0,0 +1,1153 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.548582351067219, + "eval_steps": 10000, + "global_step": 8000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01592863969417012, + "grad_norm": 0.2921621799468994, + "learning_rate": 4.992513062316809e-05, + "loss": 4.2405, + "step": 50 + }, + { + "epoch": 0.03185727938834024, + "grad_norm": 0.10916037112474442, + "learning_rate": 4.9845482349942656e-05, + "loss": 0.0569, + "step": 100 + }, + { + "epoch": 0.047785919082510356, + "grad_norm": 0.11249323934316635, + "learning_rate": 4.976583407671722e-05, + "loss": 0.0494, + "step": 150 + }, + { + "epoch": 0.06371455877668047, + "grad_norm": 0.21314933896064758, + "learning_rate": 4.968618580349178e-05, + "loss": 0.0506, + "step": 200 + }, + { + "epoch": 0.07964319847085059, + "grad_norm": 0.122999407351017, + "learning_rate": 4.960653753026634e-05, + "loss": 0.0465, + "step": 250 + }, + { + "epoch": 0.09557183816502071, + "grad_norm": 0.09448697417974472, + "learning_rate": 4.952688925704091e-05, + "loss": 0.0481, + "step": 300 + }, + { + "epoch": 0.11150047785919083, + "grad_norm": 0.09307976812124252, + "learning_rate": 4.944724098381547e-05, + "loss": 0.0472, + "step": 350 + }, + { + "epoch": 0.12742911755336095, + "grad_norm": 0.14354585111141205, + "learning_rate": 4.936759271059004e-05, + "loss": 0.043, + "step": 400 + }, + { + "epoch": 0.14335775724753105, + "grad_norm": 0.10726131498813629, + "learning_rate": 4.92879444373646e-05, + "loss": 0.0415, + "step": 450 + }, + { + "epoch": 0.15928639694170119, + "grad_norm": 0.12725014984607697, + "learning_rate": 4.920829616413916e-05, + "loss": 0.0468, + "step": 500 + }, + { + "epoch": 0.1752150366358713, + "grad_norm": 0.12311285734176636, + "learning_rate": 4.9128647890913724e-05, + "loss": 0.0443, + "step": 550 + }, + { + "epoch": 0.19114367633004142, + "grad_norm": 0.1121256873011589, + "learning_rate": 4.904899961768829e-05, + "loss": 0.0397, + "step": 600 + }, + { + "epoch": 0.20707231602421153, + "grad_norm": 0.08886804431676865, + "learning_rate": 4.896935134446285e-05, + "loss": 0.0407, + "step": 650 + }, + { + "epoch": 0.22300095571838166, + "grad_norm": 0.09782330691814423, + "learning_rate": 4.888970307123742e-05, + "loss": 0.0445, + "step": 700 + }, + { + "epoch": 0.23892959541255177, + "grad_norm": 0.08494489639997482, + "learning_rate": 4.881005479801198e-05, + "loss": 0.0414, + "step": 750 + }, + { + "epoch": 0.2548582351067219, + "grad_norm": 0.0991586446762085, + "learning_rate": 4.873040652478654e-05, + "loss": 0.0432, + "step": 800 + }, + { + "epoch": 0.27078687480089203, + "grad_norm": 0.10855372995138168, + "learning_rate": 4.8650758251561105e-05, + "loss": 0.0394, + "step": 850 + }, + { + "epoch": 0.2867155144950621, + "grad_norm": 0.08806545287370682, + "learning_rate": 4.857110997833567e-05, + "loss": 0.0402, + "step": 900 + }, + { + "epoch": 0.30264415418923224, + "grad_norm": 0.13318926095962524, + "learning_rate": 4.8491461705110234e-05, + "loss": 0.0374, + "step": 950 + }, + { + "epoch": 0.31857279388340237, + "grad_norm": 0.1285628378391266, + "learning_rate": 4.84118134318848e-05, + "loss": 0.0426, + "step": 1000 + }, + { + "epoch": 0.33450143357757245, + "grad_norm": 0.07570258527994156, + "learning_rate": 4.833216515865936e-05, + "loss": 0.0406, + "step": 1050 + }, + { + "epoch": 0.3504300732717426, + "grad_norm": 0.14918510615825653, + "learning_rate": 4.825251688543392e-05, + "loss": 0.041, + "step": 1100 + }, + { + "epoch": 0.3663587129659127, + "grad_norm": 0.14587858319282532, + "learning_rate": 4.8172868612208485e-05, + "loss": 0.0387, + "step": 1150 + }, + { + "epoch": 0.38228735266008285, + "grad_norm": 0.15989379584789276, + "learning_rate": 4.809322033898305e-05, + "loss": 0.0429, + "step": 1200 + }, + { + "epoch": 0.3982159923542529, + "grad_norm": 0.09545516967773438, + "learning_rate": 4.8013572065757615e-05, + "loss": 0.0369, + "step": 1250 + }, + { + "epoch": 0.41414463204842306, + "grad_norm": 0.10800140351057053, + "learning_rate": 4.793392379253218e-05, + "loss": 0.042, + "step": 1300 + }, + { + "epoch": 0.4300732717425932, + "grad_norm": 0.10968200862407684, + "learning_rate": 4.7854275519306744e-05, + "loss": 0.0368, + "step": 1350 + }, + { + "epoch": 0.4460019114367633, + "grad_norm": 0.1094348132610321, + "learning_rate": 4.777462724608131e-05, + "loss": 0.038, + "step": 1400 + }, + { + "epoch": 0.4619305511309334, + "grad_norm": 0.09409014135599136, + "learning_rate": 4.7694978972855866e-05, + "loss": 0.0399, + "step": 1450 + }, + { + "epoch": 0.47785919082510353, + "grad_norm": 0.09957227855920792, + "learning_rate": 4.761533069963043e-05, + "loss": 0.0431, + "step": 1500 + }, + { + "epoch": 0.49378783051927366, + "grad_norm": 0.10479158908128738, + "learning_rate": 4.7535682426404995e-05, + "loss": 0.0396, + "step": 1550 + }, + { + "epoch": 0.5097164702134438, + "grad_norm": 0.0960434228181839, + "learning_rate": 4.745603415317956e-05, + "loss": 0.0404, + "step": 1600 + }, + { + "epoch": 0.5256451099076139, + "grad_norm": 0.14295724034309387, + "learning_rate": 4.7376385879954124e-05, + "loss": 0.0401, + "step": 1650 + }, + { + "epoch": 0.5415737496017841, + "grad_norm": 0.0905664786696434, + "learning_rate": 4.729673760672869e-05, + "loss": 0.0378, + "step": 1700 + }, + { + "epoch": 0.5575023892959541, + "grad_norm": 0.13186714053153992, + "learning_rate": 4.7217089333503254e-05, + "loss": 0.0391, + "step": 1750 + }, + { + "epoch": 0.5734310289901242, + "grad_norm": 0.07855305820703506, + "learning_rate": 4.713744106027782e-05, + "loss": 0.0386, + "step": 1800 + }, + { + "epoch": 0.5893596686842943, + "grad_norm": 0.10626554489135742, + "learning_rate": 4.705779278705238e-05, + "loss": 0.038, + "step": 1850 + }, + { + "epoch": 0.6052883083784645, + "grad_norm": 0.09309270232915878, + "learning_rate": 4.697814451382694e-05, + "loss": 0.04, + "step": 1900 + }, + { + "epoch": 0.6212169480726346, + "grad_norm": 0.10274514555931091, + "learning_rate": 4.6898496240601505e-05, + "loss": 0.0399, + "step": 1950 + }, + { + "epoch": 0.6371455877668047, + "grad_norm": 0.08140850067138672, + "learning_rate": 4.681884796737607e-05, + "loss": 0.0395, + "step": 2000 + }, + { + "epoch": 0.6530742274609749, + "grad_norm": 0.09800681471824646, + "learning_rate": 4.6739199694150634e-05, + "loss": 0.0396, + "step": 2050 + }, + { + "epoch": 0.6690028671551449, + "grad_norm": 0.12131080776453018, + "learning_rate": 4.66595514209252e-05, + "loss": 0.0404, + "step": 2100 + }, + { + "epoch": 0.684931506849315, + "grad_norm": 0.1389102041721344, + "learning_rate": 4.6579903147699763e-05, + "loss": 0.0378, + "step": 2150 + }, + { + "epoch": 0.7008601465434852, + "grad_norm": 0.12080667912960052, + "learning_rate": 4.650025487447433e-05, + "loss": 0.0366, + "step": 2200 + }, + { + "epoch": 0.7167887862376553, + "grad_norm": 0.09532010555267334, + "learning_rate": 4.642060660124889e-05, + "loss": 0.038, + "step": 2250 + }, + { + "epoch": 0.7327174259318254, + "grad_norm": 0.09826835989952087, + "learning_rate": 4.634095832802345e-05, + "loss": 0.0358, + "step": 2300 + }, + { + "epoch": 0.7486460656259956, + "grad_norm": 0.11227104812860489, + "learning_rate": 4.6261310054798015e-05, + "loss": 0.0362, + "step": 2350 + }, + { + "epoch": 0.7645747053201657, + "grad_norm": 0.10029356181621552, + "learning_rate": 4.618166178157258e-05, + "loss": 0.0396, + "step": 2400 + }, + { + "epoch": 0.7805033450143358, + "grad_norm": 0.10683028399944305, + "learning_rate": 4.6102013508347144e-05, + "loss": 0.0389, + "step": 2450 + }, + { + "epoch": 0.7964319847085058, + "grad_norm": 0.08704536408185959, + "learning_rate": 4.602236523512171e-05, + "loss": 0.0364, + "step": 2500 + }, + { + "epoch": 0.812360624402676, + "grad_norm": 0.13864809274673462, + "learning_rate": 4.594271696189627e-05, + "loss": 0.0401, + "step": 2550 + }, + { + "epoch": 0.8282892640968461, + "grad_norm": 0.08906254917383194, + "learning_rate": 4.586306868867083e-05, + "loss": 0.0362, + "step": 2600 + }, + { + "epoch": 0.8442179037910162, + "grad_norm": 0.11775562167167664, + "learning_rate": 4.5783420415445396e-05, + "loss": 0.0404, + "step": 2650 + }, + { + "epoch": 0.8601465434851864, + "grad_norm": 0.12016937136650085, + "learning_rate": 4.570377214221996e-05, + "loss": 0.0398, + "step": 2700 + }, + { + "epoch": 0.8760751831793565, + "grad_norm": 0.0807521790266037, + "learning_rate": 4.5624123868994525e-05, + "loss": 0.0443, + "step": 2750 + }, + { + "epoch": 0.8920038228735266, + "grad_norm": 0.16572950780391693, + "learning_rate": 4.554447559576909e-05, + "loss": 0.0397, + "step": 2800 + }, + { + "epoch": 0.9079324625676968, + "grad_norm": 0.16349689662456512, + "learning_rate": 4.5464827322543654e-05, + "loss": 0.0363, + "step": 2850 + }, + { + "epoch": 0.9238611022618668, + "grad_norm": 0.09836030751466751, + "learning_rate": 4.538517904931821e-05, + "loss": 0.0355, + "step": 2900 + }, + { + "epoch": 0.9397897419560369, + "grad_norm": 0.09340459853410721, + "learning_rate": 4.5305530776092776e-05, + "loss": 0.0372, + "step": 2950 + }, + { + "epoch": 0.9557183816502071, + "grad_norm": 0.11303897202014923, + "learning_rate": 4.522588250286734e-05, + "loss": 0.0359, + "step": 3000 + }, + { + "epoch": 0.9716470213443772, + "grad_norm": 0.134646475315094, + "learning_rate": 4.5146234229641906e-05, + "loss": 0.0354, + "step": 3050 + }, + { + "epoch": 0.9875756610385473, + "grad_norm": 0.13911692798137665, + "learning_rate": 4.506658595641647e-05, + "loss": 0.0364, + "step": 3100 + }, + { + "epoch": 1.0035043007327173, + "grad_norm": 0.09443005919456482, + "learning_rate": 4.4986937683191035e-05, + "loss": 0.0338, + "step": 3150 + }, + { + "epoch": 1.0194329404268876, + "grad_norm": 0.12161055952310562, + "learning_rate": 4.490728940996559e-05, + "loss": 0.0378, + "step": 3200 + }, + { + "epoch": 1.0353615801210576, + "grad_norm": 0.09010250866413116, + "learning_rate": 4.482764113674016e-05, + "loss": 0.0358, + "step": 3250 + }, + { + "epoch": 1.0512902198152279, + "grad_norm": 0.0787167027592659, + "learning_rate": 4.474799286351472e-05, + "loss": 0.0375, + "step": 3300 + }, + { + "epoch": 1.0672188595093979, + "grad_norm": 0.10341610759496689, + "learning_rate": 4.4668344590289286e-05, + "loss": 0.037, + "step": 3350 + }, + { + "epoch": 1.0831474992035681, + "grad_norm": 0.09436332434415817, + "learning_rate": 4.458869631706385e-05, + "loss": 0.035, + "step": 3400 + }, + { + "epoch": 1.0990761388977381, + "grad_norm": 0.11110551655292511, + "learning_rate": 4.4509048043838415e-05, + "loss": 0.0394, + "step": 3450 + }, + { + "epoch": 1.1150047785919082, + "grad_norm": 0.11926066130399704, + "learning_rate": 4.442939977061297e-05, + "loss": 0.0364, + "step": 3500 + }, + { + "epoch": 1.1309334182860784, + "grad_norm": 0.08972738683223724, + "learning_rate": 4.434975149738754e-05, + "loss": 0.0353, + "step": 3550 + }, + { + "epoch": 1.1468620579802484, + "grad_norm": 0.1294146478176117, + "learning_rate": 4.42701032241621e-05, + "loss": 0.0347, + "step": 3600 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 0.18671779334545135, + "learning_rate": 4.419045495093667e-05, + "loss": 0.0393, + "step": 3650 + }, + { + "epoch": 1.1787193373685887, + "grad_norm": 0.15697024762630463, + "learning_rate": 4.411080667771123e-05, + "loss": 0.0368, + "step": 3700 + }, + { + "epoch": 1.194647977062759, + "grad_norm": 0.17401783168315887, + "learning_rate": 4.4031158404485796e-05, + "loss": 0.0346, + "step": 3750 + }, + { + "epoch": 1.210576616756929, + "grad_norm": 0.13288350403308868, + "learning_rate": 4.3951510131260354e-05, + "loss": 0.0343, + "step": 3800 + }, + { + "epoch": 1.226505256451099, + "grad_norm": 0.13558614253997803, + "learning_rate": 4.387186185803492e-05, + "loss": 0.0385, + "step": 3850 + }, + { + "epoch": 1.2424338961452692, + "grad_norm": 0.08078035712242126, + "learning_rate": 4.379221358480948e-05, + "loss": 0.033, + "step": 3900 + }, + { + "epoch": 1.2583625358394392, + "grad_norm": 0.08991101384162903, + "learning_rate": 4.371256531158405e-05, + "loss": 0.0371, + "step": 3950 + }, + { + "epoch": 1.2742911755336095, + "grad_norm": 0.09473446011543274, + "learning_rate": 4.363291703835861e-05, + "loss": 0.0334, + "step": 4000 + }, + { + "epoch": 1.2902198152277795, + "grad_norm": 0.09711175411939621, + "learning_rate": 4.355326876513318e-05, + "loss": 0.0361, + "step": 4050 + }, + { + "epoch": 1.3061484549219498, + "grad_norm": 0.11206386983394623, + "learning_rate": 4.3473620491907735e-05, + "loss": 0.0332, + "step": 4100 + }, + { + "epoch": 1.3220770946161198, + "grad_norm": 0.10798367857933044, + "learning_rate": 4.33939722186823e-05, + "loss": 0.0367, + "step": 4150 + }, + { + "epoch": 1.3380057343102898, + "grad_norm": 0.13645893335342407, + "learning_rate": 4.3314323945456864e-05, + "loss": 0.0353, + "step": 4200 + }, + { + "epoch": 1.35393437400446, + "grad_norm": 0.10864555090665817, + "learning_rate": 4.323467567223143e-05, + "loss": 0.0361, + "step": 4250 + }, + { + "epoch": 1.36986301369863, + "grad_norm": 0.12079176306724548, + "learning_rate": 4.315502739900599e-05, + "loss": 0.0329, + "step": 4300 + }, + { + "epoch": 1.3857916533928003, + "grad_norm": 0.1080985963344574, + "learning_rate": 4.307537912578056e-05, + "loss": 0.0368, + "step": 4350 + }, + { + "epoch": 1.4017202930869703, + "grad_norm": 0.1214875727891922, + "learning_rate": 4.2995730852555115e-05, + "loss": 0.033, + "step": 4400 + }, + { + "epoch": 1.4176489327811406, + "grad_norm": 0.0906578078866005, + "learning_rate": 4.291608257932968e-05, + "loss": 0.0361, + "step": 4450 + }, + { + "epoch": 1.4335775724753106, + "grad_norm": 0.11563856154680252, + "learning_rate": 4.2836434306104244e-05, + "loss": 0.0335, + "step": 4500 + }, + { + "epoch": 1.4495062121694806, + "grad_norm": 0.1468946486711502, + "learning_rate": 4.275678603287881e-05, + "loss": 0.0335, + "step": 4550 + }, + { + "epoch": 1.4654348518636509, + "grad_norm": 0.16663908958435059, + "learning_rate": 4.2677137759653374e-05, + "loss": 0.0354, + "step": 4600 + }, + { + "epoch": 1.481363491557821, + "grad_norm": 0.18749141693115234, + "learning_rate": 4.259748948642794e-05, + "loss": 0.0377, + "step": 4650 + }, + { + "epoch": 1.4972921312519911, + "grad_norm": 0.12299200892448425, + "learning_rate": 4.2517841213202496e-05, + "loss": 0.0338, + "step": 4700 + }, + { + "epoch": 1.5132207709461611, + "grad_norm": 0.10765209048986435, + "learning_rate": 4.243819293997706e-05, + "loss": 0.0368, + "step": 4750 + }, + { + "epoch": 1.5291494106403314, + "grad_norm": 0.09512902796268463, + "learning_rate": 4.2358544666751625e-05, + "loss": 0.034, + "step": 4800 + }, + { + "epoch": 1.5450780503345014, + "grad_norm": 0.13502831757068634, + "learning_rate": 4.227889639352619e-05, + "loss": 0.0354, + "step": 4850 + }, + { + "epoch": 1.5610066900286714, + "grad_norm": 0.12296276539564133, + "learning_rate": 4.2199248120300754e-05, + "loss": 0.0344, + "step": 4900 + }, + { + "epoch": 1.5769353297228417, + "grad_norm": 0.11571130156517029, + "learning_rate": 4.211959984707532e-05, + "loss": 0.034, + "step": 4950 + }, + { + "epoch": 1.592863969417012, + "grad_norm": 0.08514443039894104, + "learning_rate": 4.203995157384988e-05, + "loss": 0.0339, + "step": 5000 + }, + { + "epoch": 1.608792609111182, + "grad_norm": 0.10442246496677399, + "learning_rate": 4.196030330062444e-05, + "loss": 0.0354, + "step": 5050 + }, + { + "epoch": 1.624721248805352, + "grad_norm": 0.0917409136891365, + "learning_rate": 4.1880655027399006e-05, + "loss": 0.035, + "step": 5100 + }, + { + "epoch": 1.6406498884995222, + "grad_norm": 0.1276286244392395, + "learning_rate": 4.180100675417357e-05, + "loss": 0.0356, + "step": 5150 + }, + { + "epoch": 1.6565785281936922, + "grad_norm": 0.12348821014165878, + "learning_rate": 4.1721358480948135e-05, + "loss": 0.0354, + "step": 5200 + }, + { + "epoch": 1.6725071678878622, + "grad_norm": 0.0923234224319458, + "learning_rate": 4.16417102077227e-05, + "loss": 0.0353, + "step": 5250 + }, + { + "epoch": 1.6884358075820325, + "grad_norm": 0.08973834663629532, + "learning_rate": 4.156206193449726e-05, + "loss": 0.0333, + "step": 5300 + }, + { + "epoch": 1.7043644472762027, + "grad_norm": 0.10418592393398285, + "learning_rate": 4.148241366127182e-05, + "loss": 0.0343, + "step": 5350 + }, + { + "epoch": 1.7202930869703728, + "grad_norm": 0.1491956114768982, + "learning_rate": 4.1402765388046387e-05, + "loss": 0.0347, + "step": 5400 + }, + { + "epoch": 1.7362217266645428, + "grad_norm": 0.1951647698879242, + "learning_rate": 4.132311711482095e-05, + "loss": 0.0294, + "step": 5450 + }, + { + "epoch": 1.752150366358713, + "grad_norm": 0.20774711668491364, + "learning_rate": 4.1243468841595516e-05, + "loss": 0.0336, + "step": 5500 + }, + { + "epoch": 1.768079006052883, + "grad_norm": 0.0813850536942482, + "learning_rate": 4.116382056837008e-05, + "loss": 0.0359, + "step": 5550 + }, + { + "epoch": 1.784007645747053, + "grad_norm": 0.15596628189086914, + "learning_rate": 4.108417229514464e-05, + "loss": 0.0332, + "step": 5600 + }, + { + "epoch": 1.7999362854412233, + "grad_norm": 0.14951321482658386, + "learning_rate": 4.10045240219192e-05, + "loss": 0.0337, + "step": 5650 + }, + { + "epoch": 1.8158649251353935, + "grad_norm": 0.13903406262397766, + "learning_rate": 4.092487574869377e-05, + "loss": 0.035, + "step": 5700 + }, + { + "epoch": 1.8317935648295636, + "grad_norm": 0.14913810789585114, + "learning_rate": 4.084522747546833e-05, + "loss": 0.0336, + "step": 5750 + }, + { + "epoch": 1.8477222045237336, + "grad_norm": 0.16890230774879456, + "learning_rate": 4.0765579202242896e-05, + "loss": 0.0372, + "step": 5800 + }, + { + "epoch": 1.8636508442179038, + "grad_norm": 0.12355700880289078, + "learning_rate": 4.068593092901746e-05, + "loss": 0.0347, + "step": 5850 + }, + { + "epoch": 1.8795794839120739, + "grad_norm": 0.12095997482538223, + "learning_rate": 4.060628265579202e-05, + "loss": 0.0391, + "step": 5900 + }, + { + "epoch": 1.8955081236062439, + "grad_norm": 0.12925802171230316, + "learning_rate": 4.052663438256658e-05, + "loss": 0.036, + "step": 5950 + }, + { + "epoch": 1.9114367633004141, + "grad_norm": 0.11391396820545197, + "learning_rate": 4.044698610934115e-05, + "loss": 0.0325, + "step": 6000 + }, + { + "epoch": 1.9273654029945844, + "grad_norm": 0.129618838429451, + "learning_rate": 4.036733783611571e-05, + "loss": 0.0327, + "step": 6050 + }, + { + "epoch": 1.9432940426887544, + "grad_norm": 0.13725541532039642, + "learning_rate": 4.028768956289028e-05, + "loss": 0.0336, + "step": 6100 + }, + { + "epoch": 1.9592226823829244, + "grad_norm": 0.14412935078144073, + "learning_rate": 4.020804128966484e-05, + "loss": 0.0325, + "step": 6150 + }, + { + "epoch": 1.9751513220770947, + "grad_norm": 0.10575806349515915, + "learning_rate": 4.01283930164394e-05, + "loss": 0.0315, + "step": 6200 + }, + { + "epoch": 1.991079961771265, + "grad_norm": 0.11367379128932953, + "learning_rate": 4.0048744743213964e-05, + "loss": 0.0324, + "step": 6250 + }, + { + "epoch": 2.0070086014654347, + "grad_norm": 0.12420395016670227, + "learning_rate": 3.996909646998853e-05, + "loss": 0.0305, + "step": 6300 + }, + { + "epoch": 2.022937241159605, + "grad_norm": 0.2189149558544159, + "learning_rate": 3.988944819676309e-05, + "loss": 0.0343, + "step": 6350 + }, + { + "epoch": 2.038865880853775, + "grad_norm": 0.08044280856847763, + "learning_rate": 3.980979992353766e-05, + "loss": 0.0314, + "step": 6400 + }, + { + "epoch": 2.0547945205479454, + "grad_norm": 0.09585762768983841, + "learning_rate": 3.973015165031222e-05, + "loss": 0.0343, + "step": 6450 + }, + { + "epoch": 2.0707231602421152, + "grad_norm": 0.1755801886320114, + "learning_rate": 3.965050337708679e-05, + "loss": 0.0333, + "step": 6500 + }, + { + "epoch": 2.0866517999362855, + "grad_norm": 0.14164239168167114, + "learning_rate": 3.957085510386135e-05, + "loss": 0.0331, + "step": 6550 + }, + { + "epoch": 2.1025804396304557, + "grad_norm": 0.12496601790189743, + "learning_rate": 3.9491206830635916e-05, + "loss": 0.035, + "step": 6600 + }, + { + "epoch": 2.1185090793246255, + "grad_norm": 0.11803654581308365, + "learning_rate": 3.941155855741048e-05, + "loss": 0.0312, + "step": 6650 + }, + { + "epoch": 2.1344377190187958, + "grad_norm": 0.09984956681728363, + "learning_rate": 3.933191028418504e-05, + "loss": 0.0322, + "step": 6700 + }, + { + "epoch": 2.150366358712966, + "grad_norm": 0.11662815511226654, + "learning_rate": 3.92522620109596e-05, + "loss": 0.0332, + "step": 6750 + }, + { + "epoch": 2.1662949984071362, + "grad_norm": 0.12902189791202545, + "learning_rate": 3.917261373773417e-05, + "loss": 0.0343, + "step": 6800 + }, + { + "epoch": 2.182223638101306, + "grad_norm": 0.1841822862625122, + "learning_rate": 3.909296546450873e-05, + "loss": 0.0339, + "step": 6850 + }, + { + "epoch": 2.1981522777954763, + "grad_norm": 0.09873718023300171, + "learning_rate": 3.90133171912833e-05, + "loss": 0.0303, + "step": 6900 + }, + { + "epoch": 2.2140809174896465, + "grad_norm": 0.1674479842185974, + "learning_rate": 3.893366891805786e-05, + "loss": 0.032, + "step": 6950 + }, + { + "epoch": 2.2300095571838163, + "grad_norm": 0.13210225105285645, + "learning_rate": 3.8854020644832426e-05, + "loss": 0.0352, + "step": 7000 + }, + { + "epoch": 2.2459381968779866, + "grad_norm": 0.20769694447517395, + "learning_rate": 3.877437237160699e-05, + "loss": 0.0294, + "step": 7050 + }, + { + "epoch": 2.261866836572157, + "grad_norm": 0.13857823610305786, + "learning_rate": 3.869472409838155e-05, + "loss": 0.0343, + "step": 7100 + }, + { + "epoch": 2.277795476266327, + "grad_norm": 0.14322370290756226, + "learning_rate": 3.861507582515611e-05, + "loss": 0.0296, + "step": 7150 + }, + { + "epoch": 2.293724115960497, + "grad_norm": 0.11245788633823395, + "learning_rate": 3.853542755193068e-05, + "loss": 0.0323, + "step": 7200 + }, + { + "epoch": 2.309652755654667, + "grad_norm": 0.12236214429140091, + "learning_rate": 3.845577927870524e-05, + "loss": 0.0311, + "step": 7250 + }, + { + "epoch": 2.3255813953488373, + "grad_norm": 0.08712544292211533, + "learning_rate": 3.837613100547981e-05, + "loss": 0.0341, + "step": 7300 + }, + { + "epoch": 2.341510035043007, + "grad_norm": 0.11802078783512115, + "learning_rate": 3.829648273225437e-05, + "loss": 0.0317, + "step": 7350 + }, + { + "epoch": 2.3574386747371774, + "grad_norm": 0.1141052171587944, + "learning_rate": 3.821683445902893e-05, + "loss": 0.031, + "step": 7400 + }, + { + "epoch": 2.3733673144313476, + "grad_norm": 0.15947668254375458, + "learning_rate": 3.8137186185803494e-05, + "loss": 0.0313, + "step": 7450 + }, + { + "epoch": 2.389295954125518, + "grad_norm": 0.11814655363559723, + "learning_rate": 3.805753791257806e-05, + "loss": 0.0334, + "step": 7500 + }, + { + "epoch": 2.4052245938196877, + "grad_norm": 0.18867388367652893, + "learning_rate": 3.797788963935262e-05, + "loss": 0.0316, + "step": 7550 + }, + { + "epoch": 2.421153233513858, + "grad_norm": 0.13499616086483002, + "learning_rate": 3.789824136612719e-05, + "loss": 0.0304, + "step": 7600 + }, + { + "epoch": 2.437081873208028, + "grad_norm": 0.15890513360500336, + "learning_rate": 3.781859309290175e-05, + "loss": 0.0309, + "step": 7650 + }, + { + "epoch": 2.453010512902198, + "grad_norm": 0.10094068199396133, + "learning_rate": 3.773894481967631e-05, + "loss": 0.0311, + "step": 7700 + }, + { + "epoch": 2.468939152596368, + "grad_norm": 0.19545088708400726, + "learning_rate": 3.7659296546450874e-05, + "loss": 0.033, + "step": 7750 + }, + { + "epoch": 2.4848677922905384, + "grad_norm": 0.12802977859973907, + "learning_rate": 3.757964827322544e-05, + "loss": 0.0332, + "step": 7800 + }, + { + "epoch": 2.5007964319847087, + "grad_norm": 0.08226735889911652, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.0318, + "step": 7850 + }, + { + "epoch": 2.5167250716788785, + "grad_norm": 0.11623780429363251, + "learning_rate": 3.742035172677457e-05, + "loss": 0.0336, + "step": 7900 + }, + { + "epoch": 2.5326537113730487, + "grad_norm": 0.16703219711780548, + "learning_rate": 3.734070345354913e-05, + "loss": 0.0307, + "step": 7950 + }, + { + "epoch": 2.548582351067219, + "grad_norm": 0.10822132229804993, + "learning_rate": 3.72610551803237e-05, + "loss": 0.033, + "step": 8000 + } + ], + "logging_steps": 50, + "max_steps": 31390, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.922138329882821e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/training_args.bin b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4b45b2e6f313435b1d70860929c99bab2fc5943f --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adde3152d6d22eecd1475fa998c1854302f8f17da34801350df8aca4fd5ec59b +size 7736 diff --git a/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/zero_to_fp32.py b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp/checkpoint-8000/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/README.md b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..105007ea041cb936ba3417e0221e47fec447213d --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./CodeLlama-7b-Instruct-hf +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/adapter_config.json b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3a66bfa8abef7891a1ea8ba7eff2760c79356f01 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./CodeLlama-7b-Instruct-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/adapter_model.bin b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..a89a472c90533e666c86cc0e026d009a4f8246e2 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da70a17fee47afe232392efb41959db2fb39a710b36e51755cba3f0651173b81 +size 8433034 diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/added_tokens.json b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c09203d8a52151e0d3b5f3c8e6daedc5b60832b5 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/added_tokens.json @@ -0,0 +1,3 @@ +{ + "": 32016 +} diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1b38fb862e4f2e035ec66ddcae00d814d5b6307 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f59fb4348dce482deebf38ece351831ee84d9151b63e6685b43f381c24f89e3 +size 12589776 diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8665cad2c9c5f23c1f4d1a3e76f519cb6949910 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b156529ae1a1c4708ace9ee3152b50a0a2e3c107efc9998a1a4c63fecb73be +size 12589840 diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea684bebe7b170048e06faaa1a82765913d0b237 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece314cfdcbb4ed6c8655dc845821f529b58a7a64e9295e0e03d6ce4c3f27971 +size 12589840 diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f57123a1ba8447d2c3373c5b3d5cc5373983da55 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb9a4c17bbfcbaa0c166f62ad74a2fcbdc8d5ced1df818b16caf2f375c6c6bad +size 12589840 diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/global_step12000/mp_rank_00_model_states.pt b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/global_step12000/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5611ccce688a65f2b77d5a681fefb67905f04dc7 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/global_step12000/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba6c65fd295bb30b20fbf30e97d37e724733e14759a6459b074a218d014dd6dd +size 8507372 diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/latest b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/latest new file mode 100644 index 0000000000000000000000000000000000000000..b8f4d8b87537b41ea73a42841f5fb1704b16b234 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/latest @@ -0,0 +1 @@ +global_step12000 \ No newline at end of file diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/rng_state_0.pth b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..94597257604064fe9ad3c44c3f413114a5470ae7 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6144186c9e0ee485931eb4d99dedd714a16e9deb2967f0b44a941fe715daea34 +size 15024 diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/rng_state_1.pth b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..70a37f4fc5c1d1ceeeddedb50fca0855a1b07859 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b52dc2350899c1747d6f03e5ad15ad8eff3a3da9e789354be34fa58fb06557b +size 15024 diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/rng_state_2.pth b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..7ab0e0a6407124e0aa449beae3cf6262f06cd40c --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f148060e8336b10f2da5da6047431bdc25d3a563bd1498b6f14a9d6e0d272b24 +size 15024 diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/rng_state_3.pth b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6e828530adcc7499d88d659c3561db4627106f55 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a8cb83b3ada39d011d872f34af1c1648119ffb242ca9c0f615a40d193a66eb7 +size 15024 diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/special_tokens_map.json b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..2ece23e993f7e1c7063cb51148b6fa5c6c224775 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/tokenizer.model b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6 +size 500058 diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/tokenizer_config.json b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dc4d9a55d56b17600905bfc69a0e45e2fae7a1c4 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/tokenizer_config.json @@ -0,0 +1,50 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32016": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/trainer_state.json b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..682a71d842a362601bfa4537262c840f54a27d86 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/trainer_state.json @@ -0,0 +1,1729 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.734107997265892, + "eval_steps": 6000, + "global_step": 12000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01139211665527455, + "grad_norm": 0.24318678677082062, + "learning_rate": 4.989290434307069e-05, + "loss": 4.1869, + "step": 50 + }, + { + "epoch": 0.0227842333105491, + "grad_norm": 0.10898148268461227, + "learning_rate": 4.977897279314588e-05, + "loss": 0.0602, + "step": 100 + }, + { + "epoch": 0.03417634996582365, + "grad_norm": 0.07570989429950714, + "learning_rate": 4.966504124322108e-05, + "loss": 0.0489, + "step": 150 + }, + { + "epoch": 0.0455684666210982, + "grad_norm": 0.15807728469371796, + "learning_rate": 4.955110969329627e-05, + "loss": 0.0475, + "step": 200 + }, + { + "epoch": 0.05696058327637275, + "grad_norm": 0.09411854296922684, + "learning_rate": 4.943717814337147e-05, + "loss": 0.0477, + "step": 250 + }, + { + "epoch": 0.0683526999316473, + "grad_norm": 0.0637858584523201, + "learning_rate": 4.932324659344666e-05, + "loss": 0.0464, + "step": 300 + }, + { + "epoch": 0.07974481658692185, + "grad_norm": 0.08095966279506683, + "learning_rate": 4.920931504352185e-05, + "loss": 0.0419, + "step": 350 + }, + { + "epoch": 0.0911369332421964, + "grad_norm": 0.12281342595815659, + "learning_rate": 4.909538349359705e-05, + "loss": 0.0454, + "step": 400 + }, + { + "epoch": 0.10252904989747096, + "grad_norm": 0.08293534070253372, + "learning_rate": 4.898145194367225e-05, + "loss": 0.0466, + "step": 450 + }, + { + "epoch": 0.1139211665527455, + "grad_norm": 0.0917474702000618, + "learning_rate": 4.886752039374744e-05, + "loss": 0.043, + "step": 500 + }, + { + "epoch": 0.12531328320802004, + "grad_norm": 0.1135578453540802, + "learning_rate": 4.875358884382263e-05, + "loss": 0.046, + "step": 550 + }, + { + "epoch": 0.1367053998632946, + "grad_norm": 0.14516398310661316, + "learning_rate": 4.863965729389783e-05, + "loss": 0.0399, + "step": 600 + }, + { + "epoch": 0.14809751651856914, + "grad_norm": 0.090349480509758, + "learning_rate": 4.852572574397302e-05, + "loss": 0.0425, + "step": 650 + }, + { + "epoch": 0.1594896331738437, + "grad_norm": 0.10766596347093582, + "learning_rate": 4.8411794194048214e-05, + "loss": 0.0405, + "step": 700 + }, + { + "epoch": 0.17088174982911825, + "grad_norm": 0.09330946952104568, + "learning_rate": 4.829786264412341e-05, + "loss": 0.0414, + "step": 750 + }, + { + "epoch": 0.1822738664843928, + "grad_norm": 0.06139269098639488, + "learning_rate": 4.818393109419861e-05, + "loss": 0.0422, + "step": 800 + }, + { + "epoch": 0.19366598313966735, + "grad_norm": 0.10222186148166656, + "learning_rate": 4.80699995442738e-05, + "loss": 0.0457, + "step": 850 + }, + { + "epoch": 0.2050580997949419, + "grad_norm": 0.1095157265663147, + "learning_rate": 4.7956067994348995e-05, + "loss": 0.0393, + "step": 900 + }, + { + "epoch": 0.21645021645021645, + "grad_norm": 0.10699036717414856, + "learning_rate": 4.7842136444424193e-05, + "loss": 0.0448, + "step": 950 + }, + { + "epoch": 0.227842333105491, + "grad_norm": 0.08551250398159027, + "learning_rate": 4.772820489449939e-05, + "loss": 0.0394, + "step": 1000 + }, + { + "epoch": 0.23923444976076555, + "grad_norm": 0.12492193281650543, + "learning_rate": 4.7614273344574584e-05, + "loss": 0.0434, + "step": 1050 + }, + { + "epoch": 0.2506265664160401, + "grad_norm": 0.14445355534553528, + "learning_rate": 4.7500341794649776e-05, + "loss": 0.0443, + "step": 1100 + }, + { + "epoch": 0.26201868307131465, + "grad_norm": 0.09206435084342957, + "learning_rate": 4.7386410244724974e-05, + "loss": 0.0447, + "step": 1150 + }, + { + "epoch": 0.2734107997265892, + "grad_norm": 0.09216534346342087, + "learning_rate": 4.7272478694800166e-05, + "loss": 0.0411, + "step": 1200 + }, + { + "epoch": 0.2848029163818637, + "grad_norm": 0.09902399778366089, + "learning_rate": 4.715854714487536e-05, + "loss": 0.0385, + "step": 1250 + }, + { + "epoch": 0.2961950330371383, + "grad_norm": 0.06204482167959213, + "learning_rate": 4.7044615594950556e-05, + "loss": 0.0433, + "step": 1300 + }, + { + "epoch": 0.30758714969241285, + "grad_norm": 0.08369041979312897, + "learning_rate": 4.6930684045025755e-05, + "loss": 0.0416, + "step": 1350 + }, + { + "epoch": 0.3189792663476874, + "grad_norm": 0.07282637059688568, + "learning_rate": 4.6816752495100947e-05, + "loss": 0.036, + "step": 1400 + }, + { + "epoch": 0.3303713830029619, + "grad_norm": 0.09879472851753235, + "learning_rate": 4.670282094517614e-05, + "loss": 0.0385, + "step": 1450 + }, + { + "epoch": 0.3417634996582365, + "grad_norm": 0.06464097648859024, + "learning_rate": 4.658888939525134e-05, + "loss": 0.0413, + "step": 1500 + }, + { + "epoch": 0.35315561631351106, + "grad_norm": 0.09587877243757248, + "learning_rate": 4.647495784532653e-05, + "loss": 0.0359, + "step": 1550 + }, + { + "epoch": 0.3645477329687856, + "grad_norm": 0.09360693395137787, + "learning_rate": 4.636102629540173e-05, + "loss": 0.0392, + "step": 1600 + }, + { + "epoch": 0.37593984962406013, + "grad_norm": 0.09336020797491074, + "learning_rate": 4.624709474547692e-05, + "loss": 0.0372, + "step": 1650 + }, + { + "epoch": 0.3873319662793347, + "grad_norm": 0.06831030547618866, + "learning_rate": 4.613316319555212e-05, + "loss": 0.0394, + "step": 1700 + }, + { + "epoch": 0.39872408293460926, + "grad_norm": 0.0803089365363121, + "learning_rate": 4.601923164562731e-05, + "loss": 0.0373, + "step": 1750 + }, + { + "epoch": 0.4101161995898838, + "grad_norm": 0.12224093824625015, + "learning_rate": 4.59053000957025e-05, + "loss": 0.0386, + "step": 1800 + }, + { + "epoch": 0.42150831624515833, + "grad_norm": 0.10336671024560928, + "learning_rate": 4.57913685457777e-05, + "loss": 0.0381, + "step": 1850 + }, + { + "epoch": 0.4329004329004329, + "grad_norm": 0.08528792858123779, + "learning_rate": 4.56774369958529e-05, + "loss": 0.0371, + "step": 1900 + }, + { + "epoch": 0.44429254955570746, + "grad_norm": 0.11260873079299927, + "learning_rate": 4.556350544592809e-05, + "loss": 0.0406, + "step": 1950 + }, + { + "epoch": 0.455684666210982, + "grad_norm": 0.07501766085624695, + "learning_rate": 4.544957389600328e-05, + "loss": 0.0376, + "step": 2000 + }, + { + "epoch": 0.46707678286625653, + "grad_norm": 0.061855901032686234, + "learning_rate": 4.533564234607848e-05, + "loss": 0.0404, + "step": 2050 + }, + { + "epoch": 0.4784688995215311, + "grad_norm": 0.09459809958934784, + "learning_rate": 4.522171079615367e-05, + "loss": 0.0414, + "step": 2100 + }, + { + "epoch": 0.48986101617680566, + "grad_norm": 0.08048627525568008, + "learning_rate": 4.5107779246228864e-05, + "loss": 0.037, + "step": 2150 + }, + { + "epoch": 0.5012531328320802, + "grad_norm": 0.07276533544063568, + "learning_rate": 4.499384769630406e-05, + "loss": 0.0388, + "step": 2200 + }, + { + "epoch": 0.5126452494873548, + "grad_norm": 0.16834820806980133, + "learning_rate": 4.487991614637926e-05, + "loss": 0.0387, + "step": 2250 + }, + { + "epoch": 0.5240373661426293, + "grad_norm": 0.09600085765123367, + "learning_rate": 4.476598459645445e-05, + "loss": 0.0376, + "step": 2300 + }, + { + "epoch": 0.5354294827979038, + "grad_norm": 0.10438230633735657, + "learning_rate": 4.4652053046529645e-05, + "loss": 0.0396, + "step": 2350 + }, + { + "epoch": 0.5468215994531784, + "grad_norm": 0.06975755095481873, + "learning_rate": 4.453812149660484e-05, + "loss": 0.0395, + "step": 2400 + }, + { + "epoch": 0.5582137161084529, + "grad_norm": 0.11652950942516327, + "learning_rate": 4.442418994668004e-05, + "loss": 0.0387, + "step": 2450 + }, + { + "epoch": 0.5696058327637274, + "grad_norm": 0.07443119585514069, + "learning_rate": 4.4310258396755233e-05, + "loss": 0.0376, + "step": 2500 + }, + { + "epoch": 0.5809979494190021, + "grad_norm": 0.12101172655820847, + "learning_rate": 4.4196326846830425e-05, + "loss": 0.0368, + "step": 2550 + }, + { + "epoch": 0.5923900660742766, + "grad_norm": 0.1421501785516739, + "learning_rate": 4.4082395296905624e-05, + "loss": 0.0378, + "step": 2600 + }, + { + "epoch": 0.6037821827295512, + "grad_norm": 0.19242361187934875, + "learning_rate": 4.3968463746980816e-05, + "loss": 0.0353, + "step": 2650 + }, + { + "epoch": 0.6151742993848257, + "grad_norm": 0.09530607610940933, + "learning_rate": 4.385453219705601e-05, + "loss": 0.04, + "step": 2700 + }, + { + "epoch": 0.6265664160401002, + "grad_norm": 0.1451895534992218, + "learning_rate": 4.3740600647131206e-05, + "loss": 0.0347, + "step": 2750 + }, + { + "epoch": 0.6379585326953748, + "grad_norm": 0.11034991592168808, + "learning_rate": 4.3626669097206404e-05, + "loss": 0.0338, + "step": 2800 + }, + { + "epoch": 0.6493506493506493, + "grad_norm": 0.11478397250175476, + "learning_rate": 4.3512737547281596e-05, + "loss": 0.0412, + "step": 2850 + }, + { + "epoch": 0.6607427660059239, + "grad_norm": 0.12228801101446152, + "learning_rate": 4.339880599735679e-05, + "loss": 0.0362, + "step": 2900 + }, + { + "epoch": 0.6721348826611985, + "grad_norm": 0.10513991862535477, + "learning_rate": 4.3284874447431987e-05, + "loss": 0.0352, + "step": 2950 + }, + { + "epoch": 0.683526999316473, + "grad_norm": 0.1606149673461914, + "learning_rate": 4.317094289750718e-05, + "loss": 0.0404, + "step": 3000 + }, + { + "epoch": 0.6949191159717476, + "grad_norm": 0.12138944864273071, + "learning_rate": 4.305701134758238e-05, + "loss": 0.0366, + "step": 3050 + }, + { + "epoch": 0.7063112326270221, + "grad_norm": 0.16719172894954681, + "learning_rate": 4.294307979765757e-05, + "loss": 0.0348, + "step": 3100 + }, + { + "epoch": 0.7177033492822966, + "grad_norm": 0.07648681104183197, + "learning_rate": 4.282914824773277e-05, + "loss": 0.0356, + "step": 3150 + }, + { + "epoch": 0.7290954659375712, + "grad_norm": 0.11438747495412827, + "learning_rate": 4.271521669780796e-05, + "loss": 0.0417, + "step": 3200 + }, + { + "epoch": 0.7404875825928458, + "grad_norm": 0.10588862746953964, + "learning_rate": 4.260128514788315e-05, + "loss": 0.0369, + "step": 3250 + }, + { + "epoch": 0.7518796992481203, + "grad_norm": 0.0825788825750351, + "learning_rate": 4.248735359795835e-05, + "loss": 0.0369, + "step": 3300 + }, + { + "epoch": 0.7632718159033949, + "grad_norm": 0.08810622990131378, + "learning_rate": 4.237342204803355e-05, + "loss": 0.0352, + "step": 3350 + }, + { + "epoch": 0.7746639325586694, + "grad_norm": 0.08460413664579391, + "learning_rate": 4.225949049810874e-05, + "loss": 0.0384, + "step": 3400 + }, + { + "epoch": 0.7860560492139439, + "grad_norm": 0.09517823904752731, + "learning_rate": 4.214555894818393e-05, + "loss": 0.0335, + "step": 3450 + }, + { + "epoch": 0.7974481658692185, + "grad_norm": 0.1491287350654602, + "learning_rate": 4.203162739825913e-05, + "loss": 0.043, + "step": 3500 + }, + { + "epoch": 0.808840282524493, + "grad_norm": 0.0725693553686142, + "learning_rate": 4.191769584833432e-05, + "loss": 0.0348, + "step": 3550 + }, + { + "epoch": 0.8202323991797676, + "grad_norm": 0.10975562781095505, + "learning_rate": 4.180376429840952e-05, + "loss": 0.0354, + "step": 3600 + }, + { + "epoch": 0.8316245158350422, + "grad_norm": 0.08988745510578156, + "learning_rate": 4.168983274848471e-05, + "loss": 0.0376, + "step": 3650 + }, + { + "epoch": 0.8430166324903167, + "grad_norm": 0.09260562807321548, + "learning_rate": 4.157590119855991e-05, + "loss": 0.0381, + "step": 3700 + }, + { + "epoch": 0.8544087491455913, + "grad_norm": 0.10777267068624496, + "learning_rate": 4.14619696486351e-05, + "loss": 0.0355, + "step": 3750 + }, + { + "epoch": 0.8658008658008658, + "grad_norm": 0.10656338185071945, + "learning_rate": 4.1348038098710294e-05, + "loss": 0.0362, + "step": 3800 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 0.07918556034564972, + "learning_rate": 4.123410654878549e-05, + "loss": 0.0328, + "step": 3850 + }, + { + "epoch": 0.8885850991114149, + "grad_norm": 0.12000677734613419, + "learning_rate": 4.112017499886069e-05, + "loss": 0.0361, + "step": 3900 + }, + { + "epoch": 0.8999772157666894, + "grad_norm": 0.16487479209899902, + "learning_rate": 4.100624344893588e-05, + "loss": 0.0372, + "step": 3950 + }, + { + "epoch": 0.911369332421964, + "grad_norm": 0.1201501414179802, + "learning_rate": 4.0892311899011075e-05, + "loss": 0.0382, + "step": 4000 + }, + { + "epoch": 0.9227614490772386, + "grad_norm": 0.0880098044872284, + "learning_rate": 4.077838034908627e-05, + "loss": 0.0371, + "step": 4050 + }, + { + "epoch": 0.9341535657325131, + "grad_norm": 0.11153139173984528, + "learning_rate": 4.0664448799161465e-05, + "loss": 0.0379, + "step": 4100 + }, + { + "epoch": 0.9455456823877877, + "grad_norm": 0.1396927535533905, + "learning_rate": 4.055051724923666e-05, + "loss": 0.0344, + "step": 4150 + }, + { + "epoch": 0.9569377990430622, + "grad_norm": 0.1013760045170784, + "learning_rate": 4.0436585699311855e-05, + "loss": 0.036, + "step": 4200 + }, + { + "epoch": 0.9683299156983367, + "grad_norm": 0.08559609204530716, + "learning_rate": 4.0322654149387054e-05, + "loss": 0.036, + "step": 4250 + }, + { + "epoch": 0.9797220323536113, + "grad_norm": 0.09353000670671463, + "learning_rate": 4.0208722599462246e-05, + "loss": 0.0371, + "step": 4300 + }, + { + "epoch": 0.9911141490088858, + "grad_norm": 0.1802058070898056, + "learning_rate": 4.009479104953744e-05, + "loss": 0.0381, + "step": 4350 + }, + { + "epoch": 1.0025062656641603, + "grad_norm": 0.1477678269147873, + "learning_rate": 3.9980859499612636e-05, + "loss": 0.0391, + "step": 4400 + }, + { + "epoch": 1.013898382319435, + "grad_norm": 0.1183403730392456, + "learning_rate": 3.986692794968783e-05, + "loss": 0.0358, + "step": 4450 + }, + { + "epoch": 1.0252904989747096, + "grad_norm": 0.10020051896572113, + "learning_rate": 3.9752996399763026e-05, + "loss": 0.0386, + "step": 4500 + }, + { + "epoch": 1.036682615629984, + "grad_norm": 0.10497792065143585, + "learning_rate": 3.963906484983822e-05, + "loss": 0.0352, + "step": 4550 + }, + { + "epoch": 1.0480747322852586, + "grad_norm": 0.11814913153648376, + "learning_rate": 3.952513329991342e-05, + "loss": 0.0353, + "step": 4600 + }, + { + "epoch": 1.0594668489405332, + "grad_norm": 0.11891067773103714, + "learning_rate": 3.941120174998861e-05, + "loss": 0.0356, + "step": 4650 + }, + { + "epoch": 1.0708589655958076, + "grad_norm": 0.07517009973526001, + "learning_rate": 3.92972702000638e-05, + "loss": 0.0338, + "step": 4700 + }, + { + "epoch": 1.0822510822510822, + "grad_norm": 0.12747719883918762, + "learning_rate": 3.9183338650139e-05, + "loss": 0.0354, + "step": 4750 + }, + { + "epoch": 1.0936431989063569, + "grad_norm": 0.11763947457075119, + "learning_rate": 3.90694071002142e-05, + "loss": 0.0349, + "step": 4800 + }, + { + "epoch": 1.1050353155616313, + "grad_norm": 0.09053009003400803, + "learning_rate": 3.895547555028939e-05, + "loss": 0.0362, + "step": 4850 + }, + { + "epoch": 1.1164274322169059, + "grad_norm": 0.16646841168403625, + "learning_rate": 3.884154400036458e-05, + "loss": 0.037, + "step": 4900 + }, + { + "epoch": 1.1278195488721805, + "grad_norm": 0.08759460598230362, + "learning_rate": 3.872761245043978e-05, + "loss": 0.0353, + "step": 4950 + }, + { + "epoch": 1.139211665527455, + "grad_norm": 0.09641291946172714, + "learning_rate": 3.861368090051497e-05, + "loss": 0.0317, + "step": 5000 + }, + { + "epoch": 1.1506037821827295, + "grad_norm": 0.11179310083389282, + "learning_rate": 3.849974935059017e-05, + "loss": 0.0349, + "step": 5050 + }, + { + "epoch": 1.1619958988380041, + "grad_norm": 0.0900956317782402, + "learning_rate": 3.838581780066536e-05, + "loss": 0.034, + "step": 5100 + }, + { + "epoch": 1.1733880154932788, + "grad_norm": 0.11659402400255203, + "learning_rate": 3.827188625074056e-05, + "loss": 0.0341, + "step": 5150 + }, + { + "epoch": 1.1847801321485532, + "grad_norm": 0.19316904246807098, + "learning_rate": 3.815795470081575e-05, + "loss": 0.0357, + "step": 5200 + }, + { + "epoch": 1.1961722488038278, + "grad_norm": 0.14138014614582062, + "learning_rate": 3.8044023150890944e-05, + "loss": 0.0362, + "step": 5250 + }, + { + "epoch": 1.2075643654591024, + "grad_norm": 0.08280321955680847, + "learning_rate": 3.7930091600966136e-05, + "loss": 0.0336, + "step": 5300 + }, + { + "epoch": 1.2189564821143768, + "grad_norm": 0.0848747193813324, + "learning_rate": 3.781616005104134e-05, + "loss": 0.0375, + "step": 5350 + }, + { + "epoch": 1.2303485987696514, + "grad_norm": 0.09686152637004852, + "learning_rate": 3.770222850111653e-05, + "loss": 0.0327, + "step": 5400 + }, + { + "epoch": 1.241740715424926, + "grad_norm": 0.10783885419368744, + "learning_rate": 3.7588296951191724e-05, + "loss": 0.0364, + "step": 5450 + }, + { + "epoch": 1.2531328320802004, + "grad_norm": 0.14501520991325378, + "learning_rate": 3.747436540126692e-05, + "loss": 0.0395, + "step": 5500 + }, + { + "epoch": 1.264524948735475, + "grad_norm": 0.16979029774665833, + "learning_rate": 3.7360433851342115e-05, + "loss": 0.0352, + "step": 5550 + }, + { + "epoch": 1.2759170653907497, + "grad_norm": 0.16063962876796722, + "learning_rate": 3.7246502301417307e-05, + "loss": 0.0357, + "step": 5600 + }, + { + "epoch": 1.287309182046024, + "grad_norm": 0.1108214482665062, + "learning_rate": 3.7132570751492505e-05, + "loss": 0.0341, + "step": 5650 + }, + { + "epoch": 1.2987012987012987, + "grad_norm": 0.17493686079978943, + "learning_rate": 3.7018639201567704e-05, + "loss": 0.0373, + "step": 5700 + }, + { + "epoch": 1.3100934153565733, + "grad_norm": 0.08701369911432266, + "learning_rate": 3.6904707651642895e-05, + "loss": 0.0345, + "step": 5750 + }, + { + "epoch": 1.321485532011848, + "grad_norm": 0.11528841406106949, + "learning_rate": 3.679077610171809e-05, + "loss": 0.0309, + "step": 5800 + }, + { + "epoch": 1.3328776486671223, + "grad_norm": 0.14678539335727692, + "learning_rate": 3.6676844551793286e-05, + "loss": 0.0346, + "step": 5850 + }, + { + "epoch": 1.344269765322397, + "grad_norm": 0.13064803183078766, + "learning_rate": 3.6562913001868484e-05, + "loss": 0.0344, + "step": 5900 + }, + { + "epoch": 1.3556618819776713, + "grad_norm": 0.11759040504693985, + "learning_rate": 3.6448981451943676e-05, + "loss": 0.0319, + "step": 5950 + }, + { + "epoch": 1.367053998632946, + "grad_norm": 0.10772958397865295, + "learning_rate": 3.633504990201887e-05, + "loss": 0.0314, + "step": 6000 + }, + { + "epoch": 1.367053998632946, + "eval_loss": 0.05980786308646202, + "eval_runtime": 76.4079, + "eval_samples_per_second": 23.257, + "eval_steps_per_second": 1.466, + "step": 6000 + }, + { + "epoch": 1.3784461152882206, + "grad_norm": 0.15903785824775696, + "learning_rate": 3.6221118352094066e-05, + "loss": 0.0337, + "step": 6050 + }, + { + "epoch": 1.3898382319434952, + "grad_norm": 0.15311047434806824, + "learning_rate": 3.610718680216926e-05, + "loss": 0.034, + "step": 6100 + }, + { + "epoch": 1.4012303485987696, + "grad_norm": 0.08784125000238419, + "learning_rate": 3.599325525224445e-05, + "loss": 0.032, + "step": 6150 + }, + { + "epoch": 1.4126224652540442, + "grad_norm": 0.10080964863300323, + "learning_rate": 3.587932370231965e-05, + "loss": 0.0344, + "step": 6200 + }, + { + "epoch": 1.4240145819093186, + "grad_norm": 0.13672150671482086, + "learning_rate": 3.576539215239485e-05, + "loss": 0.0314, + "step": 6250 + }, + { + "epoch": 1.4354066985645932, + "grad_norm": 0.10255203396081924, + "learning_rate": 3.565146060247004e-05, + "loss": 0.0343, + "step": 6300 + }, + { + "epoch": 1.4467988152198679, + "grad_norm": 0.1191258653998375, + "learning_rate": 3.553752905254523e-05, + "loss": 0.0337, + "step": 6350 + }, + { + "epoch": 1.4581909318751425, + "grad_norm": 0.13031437993049622, + "learning_rate": 3.542359750262043e-05, + "loss": 0.0329, + "step": 6400 + }, + { + "epoch": 1.4695830485304169, + "grad_norm": 0.12329483032226562, + "learning_rate": 3.530966595269562e-05, + "loss": 0.0374, + "step": 6450 + }, + { + "epoch": 1.4809751651856915, + "grad_norm": 0.15432628989219666, + "learning_rate": 3.519573440277082e-05, + "loss": 0.0344, + "step": 6500 + }, + { + "epoch": 1.4923672818409661, + "grad_norm": 0.11702828109264374, + "learning_rate": 3.508180285284601e-05, + "loss": 0.0325, + "step": 6550 + }, + { + "epoch": 1.5037593984962405, + "grad_norm": 0.07083294540643692, + "learning_rate": 3.496787130292121e-05, + "loss": 0.0324, + "step": 6600 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 0.09021273255348206, + "learning_rate": 3.48539397529964e-05, + "loss": 0.0337, + "step": 6650 + }, + { + "epoch": 1.5265436318067898, + "grad_norm": 0.1727963089942932, + "learning_rate": 3.4740008203071593e-05, + "loss": 0.0344, + "step": 6700 + }, + { + "epoch": 1.5379357484620644, + "grad_norm": 0.17168262600898743, + "learning_rate": 3.4626076653146785e-05, + "loss": 0.0344, + "step": 6750 + }, + { + "epoch": 1.5493278651173388, + "grad_norm": 0.08169051259756088, + "learning_rate": 3.451214510322199e-05, + "loss": 0.0357, + "step": 6800 + }, + { + "epoch": 1.5607199817726134, + "grad_norm": 0.14371289312839508, + "learning_rate": 3.439821355329718e-05, + "loss": 0.0327, + "step": 6850 + }, + { + "epoch": 1.5721120984278878, + "grad_norm": 0.10983046889305115, + "learning_rate": 3.4284282003372374e-05, + "loss": 0.0341, + "step": 6900 + }, + { + "epoch": 1.5835042150831624, + "grad_norm": 0.1277296096086502, + "learning_rate": 3.417035045344757e-05, + "loss": 0.0308, + "step": 6950 + }, + { + "epoch": 1.594896331738437, + "grad_norm": 0.12647396326065063, + "learning_rate": 3.4056418903522764e-05, + "loss": 0.0334, + "step": 7000 + }, + { + "epoch": 1.6062884483937117, + "grad_norm": 0.09901976585388184, + "learning_rate": 3.394248735359796e-05, + "loss": 0.0322, + "step": 7050 + }, + { + "epoch": 1.617680565048986, + "grad_norm": 0.1494535207748413, + "learning_rate": 3.3828555803673155e-05, + "loss": 0.0347, + "step": 7100 + }, + { + "epoch": 1.6290726817042607, + "grad_norm": 0.14202262461185455, + "learning_rate": 3.371462425374835e-05, + "loss": 0.029, + "step": 7150 + }, + { + "epoch": 1.640464798359535, + "grad_norm": 0.13513007760047913, + "learning_rate": 3.3600692703823545e-05, + "loss": 0.031, + "step": 7200 + }, + { + "epoch": 1.6518569150148097, + "grad_norm": 0.1467670053243637, + "learning_rate": 3.348676115389874e-05, + "loss": 0.0362, + "step": 7250 + }, + { + "epoch": 1.6632490316700843, + "grad_norm": 0.0936419665813446, + "learning_rate": 3.337282960397393e-05, + "loss": 0.0311, + "step": 7300 + }, + { + "epoch": 1.674641148325359, + "grad_norm": 0.10516014695167542, + "learning_rate": 3.3258898054049134e-05, + "loss": 0.0322, + "step": 7350 + }, + { + "epoch": 1.6860332649806336, + "grad_norm": 0.166508287191391, + "learning_rate": 3.3144966504124326e-05, + "loss": 0.0342, + "step": 7400 + }, + { + "epoch": 1.697425381635908, + "grad_norm": 0.13199631869792938, + "learning_rate": 3.303103495419952e-05, + "loss": 0.0311, + "step": 7450 + }, + { + "epoch": 1.7088174982911823, + "grad_norm": 0.10680291801691055, + "learning_rate": 3.2917103404274716e-05, + "loss": 0.0302, + "step": 7500 + }, + { + "epoch": 1.720209614946457, + "grad_norm": 0.14647521078586578, + "learning_rate": 3.280317185434991e-05, + "loss": 0.0339, + "step": 7550 + }, + { + "epoch": 1.7316017316017316, + "grad_norm": 0.08651016652584076, + "learning_rate": 3.26892403044251e-05, + "loss": 0.0357, + "step": 7600 + }, + { + "epoch": 1.7429938482570062, + "grad_norm": 0.07976220548152924, + "learning_rate": 3.25753087545003e-05, + "loss": 0.0331, + "step": 7650 + }, + { + "epoch": 1.7543859649122808, + "grad_norm": 0.13063737750053406, + "learning_rate": 3.24613772045755e-05, + "loss": 0.0334, + "step": 7700 + }, + { + "epoch": 1.7657780815675552, + "grad_norm": 0.10824882984161377, + "learning_rate": 3.234744565465069e-05, + "loss": 0.0304, + "step": 7750 + }, + { + "epoch": 1.7771701982228298, + "grad_norm": 0.08849220722913742, + "learning_rate": 3.223351410472588e-05, + "loss": 0.0345, + "step": 7800 + }, + { + "epoch": 1.7885623148781042, + "grad_norm": 0.1834285408258438, + "learning_rate": 3.211958255480107e-05, + "loss": 0.0312, + "step": 7850 + }, + { + "epoch": 1.7999544315333789, + "grad_norm": 0.17661932110786438, + "learning_rate": 3.200565100487627e-05, + "loss": 0.0359, + "step": 7900 + }, + { + "epoch": 1.8113465481886535, + "grad_norm": 0.08962208777666092, + "learning_rate": 3.189171945495147e-05, + "loss": 0.0316, + "step": 7950 + }, + { + "epoch": 1.822738664843928, + "grad_norm": 0.15646621584892273, + "learning_rate": 3.177778790502666e-05, + "loss": 0.0312, + "step": 8000 + }, + { + "epoch": 1.8341307814992025, + "grad_norm": 0.1474676877260208, + "learning_rate": 3.166385635510186e-05, + "loss": 0.0344, + "step": 8050 + }, + { + "epoch": 1.8455228981544771, + "grad_norm": 0.1147208958864212, + "learning_rate": 3.154992480517705e-05, + "loss": 0.0329, + "step": 8100 + }, + { + "epoch": 1.8569150148097515, + "grad_norm": 0.183086097240448, + "learning_rate": 3.143599325525224e-05, + "loss": 0.0321, + "step": 8150 + }, + { + "epoch": 1.8683071314650261, + "grad_norm": 0.1263459175825119, + "learning_rate": 3.132206170532744e-05, + "loss": 0.0316, + "step": 8200 + }, + { + "epoch": 1.8796992481203008, + "grad_norm": 0.1425926238298416, + "learning_rate": 3.120813015540264e-05, + "loss": 0.0295, + "step": 8250 + }, + { + "epoch": 1.8910913647755754, + "grad_norm": 0.23871935904026031, + "learning_rate": 3.109419860547783e-05, + "loss": 0.033, + "step": 8300 + }, + { + "epoch": 1.90248348143085, + "grad_norm": 0.1078375056385994, + "learning_rate": 3.0980267055553024e-05, + "loss": 0.0321, + "step": 8350 + }, + { + "epoch": 1.9138755980861244, + "grad_norm": 0.06418582051992416, + "learning_rate": 3.0866335505628215e-05, + "loss": 0.0335, + "step": 8400 + }, + { + "epoch": 1.9252677147413988, + "grad_norm": 0.10235786437988281, + "learning_rate": 3.0752403955703414e-05, + "loss": 0.0341, + "step": 8450 + }, + { + "epoch": 1.9366598313966734, + "grad_norm": 0.12743011116981506, + "learning_rate": 3.063847240577861e-05, + "loss": 0.0349, + "step": 8500 + }, + { + "epoch": 1.948051948051948, + "grad_norm": 0.13372161984443665, + "learning_rate": 3.0524540855853804e-05, + "loss": 0.0309, + "step": 8550 + }, + { + "epoch": 1.9594440647072227, + "grad_norm": 0.14914734661579132, + "learning_rate": 3.0410609305929e-05, + "loss": 0.0307, + "step": 8600 + }, + { + "epoch": 1.9708361813624973, + "grad_norm": 0.08178412914276123, + "learning_rate": 3.0296677756004195e-05, + "loss": 0.0329, + "step": 8650 + }, + { + "epoch": 1.9822282980177717, + "grad_norm": 0.12782682478427887, + "learning_rate": 3.0182746206079386e-05, + "loss": 0.0343, + "step": 8700 + }, + { + "epoch": 1.9936204146730463, + "grad_norm": 0.13054175674915314, + "learning_rate": 3.006881465615458e-05, + "loss": 0.0348, + "step": 8750 + }, + { + "epoch": 2.0050125313283207, + "grad_norm": 0.15654687583446503, + "learning_rate": 2.995488310622978e-05, + "loss": 0.0341, + "step": 8800 + }, + { + "epoch": 2.0164046479835953, + "grad_norm": 0.15794731676578522, + "learning_rate": 2.9840951556304975e-05, + "loss": 0.0311, + "step": 8850 + }, + { + "epoch": 2.02779676463887, + "grad_norm": 0.10117436945438385, + "learning_rate": 2.9727020006380167e-05, + "loss": 0.035, + "step": 8900 + }, + { + "epoch": 2.0391888812941446, + "grad_norm": 0.15975680947303772, + "learning_rate": 2.9613088456455362e-05, + "loss": 0.0322, + "step": 8950 + }, + { + "epoch": 2.050580997949419, + "grad_norm": 0.09420093894004822, + "learning_rate": 2.9499156906530557e-05, + "loss": 0.0303, + "step": 9000 + }, + { + "epoch": 2.0619731146046933, + "grad_norm": 0.1872691512107849, + "learning_rate": 2.938522535660575e-05, + "loss": 0.0318, + "step": 9050 + }, + { + "epoch": 2.073365231259968, + "grad_norm": 0.1728203147649765, + "learning_rate": 2.9271293806680948e-05, + "loss": 0.0303, + "step": 9100 + }, + { + "epoch": 2.0847573479152426, + "grad_norm": 0.12621662020683289, + "learning_rate": 2.9157362256756143e-05, + "loss": 0.0328, + "step": 9150 + }, + { + "epoch": 2.096149464570517, + "grad_norm": 0.15386846661567688, + "learning_rate": 2.9043430706831338e-05, + "loss": 0.0315, + "step": 9200 + }, + { + "epoch": 2.107541581225792, + "grad_norm": 0.1342351734638214, + "learning_rate": 2.892949915690653e-05, + "loss": 0.0313, + "step": 9250 + }, + { + "epoch": 2.1189336978810664, + "grad_norm": 0.14698518812656403, + "learning_rate": 2.8815567606981725e-05, + "loss": 0.0338, + "step": 9300 + }, + { + "epoch": 2.1303258145363406, + "grad_norm": 0.12786546349525452, + "learning_rate": 2.8701636057056924e-05, + "loss": 0.0315, + "step": 9350 + }, + { + "epoch": 2.1417179311916152, + "grad_norm": 0.17615847289562225, + "learning_rate": 2.858770450713212e-05, + "loss": 0.0288, + "step": 9400 + }, + { + "epoch": 2.15311004784689, + "grad_norm": 0.10935161262750626, + "learning_rate": 2.847377295720731e-05, + "loss": 0.0306, + "step": 9450 + }, + { + "epoch": 2.1645021645021645, + "grad_norm": 0.16573001444339752, + "learning_rate": 2.8359841407282506e-05, + "loss": 0.0309, + "step": 9500 + }, + { + "epoch": 2.175894281157439, + "grad_norm": 0.1991117298603058, + "learning_rate": 2.82459098573577e-05, + "loss": 0.0301, + "step": 9550 + }, + { + "epoch": 2.1872863978127137, + "grad_norm": 0.1383821666240692, + "learning_rate": 2.8131978307432893e-05, + "loss": 0.0332, + "step": 9600 + }, + { + "epoch": 2.1986785144679883, + "grad_norm": 0.2356170415878296, + "learning_rate": 2.8018046757508095e-05, + "loss": 0.032, + "step": 9650 + }, + { + "epoch": 2.2100706311232625, + "grad_norm": 0.2645617723464966, + "learning_rate": 2.7904115207583286e-05, + "loss": 0.0307, + "step": 9700 + }, + { + "epoch": 2.221462747778537, + "grad_norm": 0.147581547498703, + "learning_rate": 2.779018365765848e-05, + "loss": 0.0324, + "step": 9750 + }, + { + "epoch": 2.2328548644338118, + "grad_norm": 0.16367791593074799, + "learning_rate": 2.7676252107733673e-05, + "loss": 0.031, + "step": 9800 + }, + { + "epoch": 2.2442469810890864, + "grad_norm": 0.13787373900413513, + "learning_rate": 2.756232055780887e-05, + "loss": 0.0326, + "step": 9850 + }, + { + "epoch": 2.255639097744361, + "grad_norm": 0.1088612899184227, + "learning_rate": 2.7448389007884064e-05, + "loss": 0.0346, + "step": 9900 + }, + { + "epoch": 2.2670312143996356, + "grad_norm": 0.16382579505443573, + "learning_rate": 2.7334457457959262e-05, + "loss": 0.0328, + "step": 9950 + }, + { + "epoch": 2.27842333105491, + "grad_norm": 0.1262141764163971, + "learning_rate": 2.7220525908034454e-05, + "loss": 0.0319, + "step": 10000 + }, + { + "epoch": 2.2898154477101844, + "grad_norm": 0.1008811965584755, + "learning_rate": 2.710659435810965e-05, + "loss": 0.0311, + "step": 10050 + }, + { + "epoch": 2.301207564365459, + "grad_norm": 0.10501789301633835, + "learning_rate": 2.6992662808184844e-05, + "loss": 0.0336, + "step": 10100 + }, + { + "epoch": 2.3125996810207337, + "grad_norm": 0.12613259255886078, + "learning_rate": 2.6878731258260036e-05, + "loss": 0.0308, + "step": 10150 + }, + { + "epoch": 2.3239917976760083, + "grad_norm": 0.12407036870718002, + "learning_rate": 2.676479970833523e-05, + "loss": 0.028, + "step": 10200 + }, + { + "epoch": 2.335383914331283, + "grad_norm": 0.10909141600131989, + "learning_rate": 2.665086815841043e-05, + "loss": 0.0328, + "step": 10250 + }, + { + "epoch": 2.3467760309865575, + "grad_norm": 0.21803608536720276, + "learning_rate": 2.6536936608485625e-05, + "loss": 0.0293, + "step": 10300 + }, + { + "epoch": 2.3581681476418317, + "grad_norm": 0.13282905519008636, + "learning_rate": 2.6423005058560817e-05, + "loss": 0.0301, + "step": 10350 + }, + { + "epoch": 2.3695602642971063, + "grad_norm": 0.144027441740036, + "learning_rate": 2.6309073508636012e-05, + "loss": 0.027, + "step": 10400 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 0.16259746253490448, + "learning_rate": 2.6195141958711207e-05, + "loss": 0.0301, + "step": 10450 + }, + { + "epoch": 2.3923444976076556, + "grad_norm": 0.14099255204200745, + "learning_rate": 2.6081210408786406e-05, + "loss": 0.0305, + "step": 10500 + }, + { + "epoch": 2.40373661426293, + "grad_norm": 0.11188742518424988, + "learning_rate": 2.5967278858861597e-05, + "loss": 0.0294, + "step": 10550 + }, + { + "epoch": 2.415128730918205, + "grad_norm": 0.11726520955562592, + "learning_rate": 2.5853347308936793e-05, + "loss": 0.0322, + "step": 10600 + }, + { + "epoch": 2.426520847573479, + "grad_norm": 0.12376642227172852, + "learning_rate": 2.5739415759011988e-05, + "loss": 0.0267, + "step": 10650 + }, + { + "epoch": 2.4379129642287536, + "grad_norm": 0.1598552167415619, + "learning_rate": 2.562548420908718e-05, + "loss": 0.0326, + "step": 10700 + }, + { + "epoch": 2.449305080884028, + "grad_norm": 0.14880846440792084, + "learning_rate": 2.5511552659162375e-05, + "loss": 0.0296, + "step": 10750 + }, + { + "epoch": 2.460697197539303, + "grad_norm": 0.22758153080940247, + "learning_rate": 2.5397621109237573e-05, + "loss": 0.0293, + "step": 10800 + }, + { + "epoch": 2.4720893141945774, + "grad_norm": 0.17673678696155548, + "learning_rate": 2.528368955931277e-05, + "loss": 0.0338, + "step": 10850 + }, + { + "epoch": 2.483481430849852, + "grad_norm": 0.2402111291885376, + "learning_rate": 2.516975800938796e-05, + "loss": 0.0317, + "step": 10900 + }, + { + "epoch": 2.4948735475051267, + "grad_norm": 0.16588878631591797, + "learning_rate": 2.5055826459463155e-05, + "loss": 0.0289, + "step": 10950 + }, + { + "epoch": 2.506265664160401, + "grad_norm": 0.1362510472536087, + "learning_rate": 2.494189490953835e-05, + "loss": 0.0295, + "step": 11000 + }, + { + "epoch": 2.5176577808156755, + "grad_norm": 0.14621783792972565, + "learning_rate": 2.4827963359613546e-05, + "loss": 0.0303, + "step": 11050 + }, + { + "epoch": 2.52904989747095, + "grad_norm": 0.4146839380264282, + "learning_rate": 2.471403180968874e-05, + "loss": 0.0307, + "step": 11100 + }, + { + "epoch": 2.5404420141262247, + "grad_norm": 0.16631941497325897, + "learning_rate": 2.4600100259763936e-05, + "loss": 0.0315, + "step": 11150 + }, + { + "epoch": 2.5518341307814993, + "grad_norm": 0.1467452049255371, + "learning_rate": 2.448616870983913e-05, + "loss": 0.0321, + "step": 11200 + }, + { + "epoch": 2.5632262474367735, + "grad_norm": 0.18437771499156952, + "learning_rate": 2.4372237159914323e-05, + "loss": 0.0299, + "step": 11250 + }, + { + "epoch": 2.574618364092048, + "grad_norm": 0.21040193736553192, + "learning_rate": 2.425830560998952e-05, + "loss": 0.0311, + "step": 11300 + }, + { + "epoch": 2.5860104807473228, + "grad_norm": 0.18259912729263306, + "learning_rate": 2.4144374060064713e-05, + "loss": 0.0279, + "step": 11350 + }, + { + "epoch": 2.5974025974025974, + "grad_norm": 0.13136200606822968, + "learning_rate": 2.403044251013991e-05, + "loss": 0.0292, + "step": 11400 + }, + { + "epoch": 2.608794714057872, + "grad_norm": 0.12732388079166412, + "learning_rate": 2.3916510960215104e-05, + "loss": 0.0291, + "step": 11450 + }, + { + "epoch": 2.6201868307131466, + "grad_norm": 0.10226253420114517, + "learning_rate": 2.38025794102903e-05, + "loss": 0.0313, + "step": 11500 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.11919239908456802, + "learning_rate": 2.3688647860365494e-05, + "loss": 0.0267, + "step": 11550 + }, + { + "epoch": 2.642971064023696, + "grad_norm": 0.16108646988868713, + "learning_rate": 2.357471631044069e-05, + "loss": 0.0292, + "step": 11600 + }, + { + "epoch": 2.65436318067897, + "grad_norm": 0.23356276750564575, + "learning_rate": 2.3460784760515884e-05, + "loss": 0.0319, + "step": 11650 + }, + { + "epoch": 2.6657552973342447, + "grad_norm": 0.1463542878627777, + "learning_rate": 2.3346853210591076e-05, + "loss": 0.0282, + "step": 11700 + }, + { + "epoch": 2.6771474139895193, + "grad_norm": 0.12290333956480026, + "learning_rate": 2.3232921660666275e-05, + "loss": 0.0304, + "step": 11750 + }, + { + "epoch": 2.688539530644794, + "grad_norm": 0.1767304688692093, + "learning_rate": 2.3118990110741466e-05, + "loss": 0.0301, + "step": 11800 + }, + { + "epoch": 2.6999316473000685, + "grad_norm": 0.12297718971967697, + "learning_rate": 2.300505856081666e-05, + "loss": 0.0281, + "step": 11850 + }, + { + "epoch": 2.7113237639553427, + "grad_norm": 0.14692600071430206, + "learning_rate": 2.2891127010891857e-05, + "loss": 0.0288, + "step": 11900 + }, + { + "epoch": 2.7227158806106173, + "grad_norm": 0.15868423879146576, + "learning_rate": 2.2777195460967052e-05, + "loss": 0.0311, + "step": 11950 + }, + { + "epoch": 2.734107997265892, + "grad_norm": 0.22269558906555176, + "learning_rate": 2.2663263911042247e-05, + "loss": 0.0319, + "step": 12000 + }, + { + "epoch": 2.734107997265892, + "eval_loss": 0.05642309412360191, + "eval_runtime": 76.2989, + "eval_samples_per_second": 23.29, + "eval_steps_per_second": 1.468, + "step": 12000 + } + ], + "logging_steps": 50, + "max_steps": 21945, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.459165119956648e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/training_args.bin b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9db308ac2f9011d86beca301acc47bd83d423f0f --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8def4cbff2092aebe0edc0db45038825f342bd99987945f3aa74431dfc4b747e +size 7800 diff --git a/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/zero_to_fp32.py b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/weight_dir/loraWeight/fixbycrflp2/checkpoint-12000/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/README.md b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..105007ea041cb936ba3417e0221e47fec447213d --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./CodeLlama-7b-Instruct-hf +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/adapter_config.json b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8faaa17fff7e4931b18e07a687c5b35532df41ba --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./CodeLlama-7b-Instruct-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/adapter_model.bin b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..857c1c105ac4f72f96854ec22c36aafff1b45a68 --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92b61741be33f127dc45b0835fc9487a3d9fd4d81c973f690ae754f9f5970358 +size 8433034 diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/added_tokens.json b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c09203d8a52151e0d3b5f3c8e6daedc5b60832b5 --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/added_tokens.json @@ -0,0 +1,3 @@ +{ + "": 32016 +} diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef6bcb8be556637534dcd5e8efa3f45bb7d3ca8c --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32828f5ce5f8cae6fcaf1c5172803670e5383b0f2d0f1dce46aaa10ee675d6f3 +size 12589776 diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8170bab8b8392111e5c9ee4f0cfd0d9165c4b530 --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ca2a0efc82886db414fdbe10d8c18670579afc50c90890c7f4cdb96a75bbaa9 +size 12589840 diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eff3db739771be39b601e02258c877472d5d225e --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdd925adacf2b79ffe5ce595f3ea49a122305d633c40582d3227f718bf6f7379 +size 12589840 diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32a669388715c1702d1ca98d33212db00abb1d47 --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:769c31e8d696b8e65487981efa5fb40772302293395bc9fe924dd3c381e0617e +size 12589840 diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/global_step14000/mp_rank_00_model_states.pt b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/global_step14000/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7c00ab989b447e9bc3ee447e709da270a8a503a --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/global_step14000/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c2dbdc41e44c7d5bd97e9d4b89d311ee8cfe727757664b9c21ce6575ebde25e +size 8507372 diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/latest b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/latest new file mode 100644 index 0000000000000000000000000000000000000000..6c2e99ab39c9aa1d68dfdcb53dc9234015ced784 --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/latest @@ -0,0 +1 @@ +global_step14000 \ No newline at end of file diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/rng_state_0.pth b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..599100e3930d98f23dcee3039f49fc7e796eada5 --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90b3271c0444170435af26350591421c00cc6bd21b5ec3df060d909eef6cc051 +size 15024 diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/rng_state_1.pth b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..8ba5a14a6ecfa06625e1e847d40abbf26048166f --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96bf15fe1750e738a04077747729607b44e6e60417bf57c69b48eb7a1ef1b1ce +size 15024 diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/rng_state_2.pth b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d55afb1e036f6f54b56376581552855227b0ca7 --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f80ea4001e5074a6810f4f44445e34aa07b8939bbd17d68e4038293816eaf9b7 +size 15024 diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/rng_state_3.pth b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..21fb04c997c4e59c72b1a4fa0a79fcae061cbc24 --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5373b0aabd34513e5bbf7fa26b5cee66964ccb17e486af249c2762b2762f9870 +size 15024 diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/special_tokens_map.json b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..2ece23e993f7e1c7063cb51148b6fa5c6c224775 --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/tokenizer.model b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4 --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6 +size 500058 diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/tokenizer_config.json b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dc4d9a55d56b17600905bfc69a0e45e2fae7a1c4 --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/tokenizer_config.json @@ -0,0 +1,50 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32016": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/trainer_state.json b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..464d40731e3f30cd88619085d0d10d4f75ca60c8 --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/trainer_state.json @@ -0,0 +1,2009 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.477134633834346, + "eval_steps": 6000, + "global_step": 14000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01598976654940838, + "grad_norm": 0.4070083498954773, + "learning_rate": 4.992484329026481e-05, + "loss": 4.273, + "step": 50 + }, + { + "epoch": 0.03197953309881676, + "grad_norm": 0.057464830577373505, + "learning_rate": 4.984488934373801e-05, + "loss": 0.0247, + "step": 100 + }, + { + "epoch": 0.04796929964822513, + "grad_norm": 0.06848371773958206, + "learning_rate": 4.976493539721121e-05, + "loss": 0.0152, + "step": 150 + }, + { + "epoch": 0.06395906619763352, + "grad_norm": 0.042949117720127106, + "learning_rate": 4.968498145068441e-05, + "loss": 0.0142, + "step": 200 + }, + { + "epoch": 0.0799488327470419, + "grad_norm": 0.05076887458562851, + "learning_rate": 4.9605027504157606e-05, + "loss": 0.0138, + "step": 250 + }, + { + "epoch": 0.09593859929645027, + "grad_norm": 0.04695209115743637, + "learning_rate": 4.952507355763081e-05, + "loss": 0.014, + "step": 300 + }, + { + "epoch": 0.11192836584585865, + "grad_norm": 0.05572609230875969, + "learning_rate": 4.9445119611104006e-05, + "loss": 0.0126, + "step": 350 + }, + { + "epoch": 0.12791813239526703, + "grad_norm": 0.07735384255647659, + "learning_rate": 4.936516566457721e-05, + "loss": 0.0126, + "step": 400 + }, + { + "epoch": 0.1439078989446754, + "grad_norm": 0.04669607803225517, + "learning_rate": 4.9285211718050406e-05, + "loss": 0.0123, + "step": 450 + }, + { + "epoch": 0.1598976654940838, + "grad_norm": 0.041628964245319366, + "learning_rate": 4.92052577715236e-05, + "loss": 0.0116, + "step": 500 + }, + { + "epoch": 0.17588743204349216, + "grad_norm": 0.051499415189027786, + "learning_rate": 4.9125303824996806e-05, + "loss": 0.0111, + "step": 550 + }, + { + "epoch": 0.19187719859290053, + "grad_norm": 0.055776361376047134, + "learning_rate": 4.904534987847e-05, + "loss": 0.0113, + "step": 600 + }, + { + "epoch": 0.20786696514230893, + "grad_norm": 0.04950210452079773, + "learning_rate": 4.8965395931943206e-05, + "loss": 0.0111, + "step": 650 + }, + { + "epoch": 0.2238567316917173, + "grad_norm": 0.04448186606168747, + "learning_rate": 4.88854419854164e-05, + "loss": 0.0109, + "step": 700 + }, + { + "epoch": 0.23984649824112567, + "grad_norm": 0.04934769123792648, + "learning_rate": 4.88054880388896e-05, + "loss": 0.012, + "step": 750 + }, + { + "epoch": 0.25583626479053406, + "grad_norm": 0.05481697991490364, + "learning_rate": 4.87255340923628e-05, + "loss": 0.0113, + "step": 800 + }, + { + "epoch": 0.27182603133994243, + "grad_norm": 0.07512317597866058, + "learning_rate": 4.8645580145836e-05, + "loss": 0.0108, + "step": 850 + }, + { + "epoch": 0.2878157978893508, + "grad_norm": 0.0495820976793766, + "learning_rate": 4.85656261993092e-05, + "loss": 0.0106, + "step": 900 + }, + { + "epoch": 0.30380556443875917, + "grad_norm": 0.0660017803311348, + "learning_rate": 4.84856722527824e-05, + "loss": 0.0107, + "step": 950 + }, + { + "epoch": 0.3197953309881676, + "grad_norm": 0.043546389788389206, + "learning_rate": 4.8405718306255596e-05, + "loss": 0.0101, + "step": 1000 + }, + { + "epoch": 0.33578509753757596, + "grad_norm": 0.046335190534591675, + "learning_rate": 4.83257643597288e-05, + "loss": 0.0105, + "step": 1050 + }, + { + "epoch": 0.35177486408698433, + "grad_norm": 0.12547409534454346, + "learning_rate": 4.8245810413201996e-05, + "loss": 0.0107, + "step": 1100 + }, + { + "epoch": 0.3677646306363927, + "grad_norm": 0.04085518419742584, + "learning_rate": 4.81658564666752e-05, + "loss": 0.0102, + "step": 1150 + }, + { + "epoch": 0.38375439718580107, + "grad_norm": 0.05395630747079849, + "learning_rate": 4.8085902520148396e-05, + "loss": 0.0102, + "step": 1200 + }, + { + "epoch": 0.3997441637352095, + "grad_norm": 0.11172062903642654, + "learning_rate": 4.800594857362159e-05, + "loss": 0.0104, + "step": 1250 + }, + { + "epoch": 0.41573393028461786, + "grad_norm": 0.059676576405763626, + "learning_rate": 4.7925994627094797e-05, + "loss": 0.0096, + "step": 1300 + }, + { + "epoch": 0.4317236968340262, + "grad_norm": 0.06800328195095062, + "learning_rate": 4.784604068056799e-05, + "loss": 0.0099, + "step": 1350 + }, + { + "epoch": 0.4477134633834346, + "grad_norm": 0.031088583171367645, + "learning_rate": 4.77660867340412e-05, + "loss": 0.01, + "step": 1400 + }, + { + "epoch": 0.46370322993284296, + "grad_norm": 0.05977204814553261, + "learning_rate": 4.768613278751439e-05, + "loss": 0.0098, + "step": 1450 + }, + { + "epoch": 0.47969299648225133, + "grad_norm": 0.0633067712187767, + "learning_rate": 4.760617884098759e-05, + "loss": 0.011, + "step": 1500 + }, + { + "epoch": 0.49568276303165976, + "grad_norm": 0.10753436386585236, + "learning_rate": 4.752622489446079e-05, + "loss": 0.0102, + "step": 1550 + }, + { + "epoch": 0.5116725295810681, + "grad_norm": 0.04346701502799988, + "learning_rate": 4.744627094793399e-05, + "loss": 0.0099, + "step": 1600 + }, + { + "epoch": 0.5276622961304765, + "grad_norm": 0.08080250024795532, + "learning_rate": 4.736631700140719e-05, + "loss": 0.0098, + "step": 1650 + }, + { + "epoch": 0.5436520626798849, + "grad_norm": 0.04791934788227081, + "learning_rate": 4.728636305488039e-05, + "loss": 0.0094, + "step": 1700 + }, + { + "epoch": 0.5596418292292933, + "grad_norm": 0.07470395416021347, + "learning_rate": 4.720640910835359e-05, + "loss": 0.0097, + "step": 1750 + }, + { + "epoch": 0.5756315957787016, + "grad_norm": 0.030010297894477844, + "learning_rate": 4.712645516182679e-05, + "loss": 0.0103, + "step": 1800 + }, + { + "epoch": 0.59162136232811, + "grad_norm": 0.06268253177404404, + "learning_rate": 4.704650121529999e-05, + "loss": 0.0102, + "step": 1850 + }, + { + "epoch": 0.6076111288775183, + "grad_norm": 0.04115760698914528, + "learning_rate": 4.6966547268773183e-05, + "loss": 0.01, + "step": 1900 + }, + { + "epoch": 0.6236008954269268, + "grad_norm": 0.0384821742773056, + "learning_rate": 4.688659332224639e-05, + "loss": 0.0096, + "step": 1950 + }, + { + "epoch": 0.6395906619763352, + "grad_norm": 0.05536011606454849, + "learning_rate": 4.6806639375719584e-05, + "loss": 0.0099, + "step": 2000 + }, + { + "epoch": 0.6555804285257435, + "grad_norm": 0.042221494019031525, + "learning_rate": 4.672668542919279e-05, + "loss": 0.0098, + "step": 2050 + }, + { + "epoch": 0.6715701950751519, + "grad_norm": 0.05251794680953026, + "learning_rate": 4.6646731482665984e-05, + "loss": 0.0093, + "step": 2100 + }, + { + "epoch": 0.6875599616245602, + "grad_norm": 0.057784657925367355, + "learning_rate": 4.656677753613918e-05, + "loss": 0.0103, + "step": 2150 + }, + { + "epoch": 0.7035497281739687, + "grad_norm": 0.056001678109169006, + "learning_rate": 4.648682358961239e-05, + "loss": 0.0099, + "step": 2200 + }, + { + "epoch": 0.7195394947233771, + "grad_norm": 0.0560077428817749, + "learning_rate": 4.640686964308559e-05, + "loss": 0.0096, + "step": 2250 + }, + { + "epoch": 0.7355292612727854, + "grad_norm": 0.04319921135902405, + "learning_rate": 4.6326915696558784e-05, + "loss": 0.0093, + "step": 2300 + }, + { + "epoch": 0.7515190278221938, + "grad_norm": 0.059632595628499985, + "learning_rate": 4.624696175003199e-05, + "loss": 0.01, + "step": 2350 + }, + { + "epoch": 0.7675087943716021, + "grad_norm": 0.0601465068757534, + "learning_rate": 4.6167007803505184e-05, + "loss": 0.0098, + "step": 2400 + }, + { + "epoch": 0.7834985609210106, + "grad_norm": 0.03820424899458885, + "learning_rate": 4.608705385697839e-05, + "loss": 0.0101, + "step": 2450 + }, + { + "epoch": 0.799488327470419, + "grad_norm": 0.04104379937052727, + "learning_rate": 4.6007099910451584e-05, + "loss": 0.0092, + "step": 2500 + }, + { + "epoch": 0.8154780940198273, + "grad_norm": 0.033697500824928284, + "learning_rate": 4.592714596392478e-05, + "loss": 0.0097, + "step": 2550 + }, + { + "epoch": 0.8314678605692357, + "grad_norm": 0.10835737735033035, + "learning_rate": 4.5847192017397984e-05, + "loss": 0.0094, + "step": 2600 + }, + { + "epoch": 0.847457627118644, + "grad_norm": 0.07756606489419937, + "learning_rate": 4.576723807087118e-05, + "loss": 0.01, + "step": 2650 + }, + { + "epoch": 0.8634473936680525, + "grad_norm": 0.036015670746564865, + "learning_rate": 4.5687284124344384e-05, + "loss": 0.0097, + "step": 2700 + }, + { + "epoch": 0.8794371602174609, + "grad_norm": 0.033497828990221024, + "learning_rate": 4.560733017781758e-05, + "loss": 0.0094, + "step": 2750 + }, + { + "epoch": 0.8954269267668692, + "grad_norm": 0.046858739107847214, + "learning_rate": 4.552737623129078e-05, + "loss": 0.0096, + "step": 2800 + }, + { + "epoch": 0.9114166933162776, + "grad_norm": 0.038639482110738754, + "learning_rate": 4.544742228476398e-05, + "loss": 0.0093, + "step": 2850 + }, + { + "epoch": 0.9274064598656859, + "grad_norm": 0.046789348125457764, + "learning_rate": 4.536746833823718e-05, + "loss": 0.0092, + "step": 2900 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 0.05411810427904129, + "learning_rate": 4.528751439171038e-05, + "loss": 0.0095, + "step": 2950 + }, + { + "epoch": 0.9593859929645027, + "grad_norm": 0.04174494370818138, + "learning_rate": 4.520756044518358e-05, + "loss": 0.0097, + "step": 3000 + }, + { + "epoch": 0.9753757595139111, + "grad_norm": 0.02797970548272133, + "learning_rate": 4.5127606498656774e-05, + "loss": 0.0089, + "step": 3050 + }, + { + "epoch": 0.9913655260633195, + "grad_norm": 0.050330642610788345, + "learning_rate": 4.504765255212998e-05, + "loss": 0.0097, + "step": 3100 + }, + { + "epoch": 1.0073552926127278, + "grad_norm": 0.03967903554439545, + "learning_rate": 4.4967698605603174e-05, + "loss": 0.0091, + "step": 3150 + }, + { + "epoch": 1.0233450591621363, + "grad_norm": 0.052495893090963364, + "learning_rate": 4.488774465907638e-05, + "loss": 0.0095, + "step": 3200 + }, + { + "epoch": 1.0393348257115447, + "grad_norm": 0.05642736330628395, + "learning_rate": 4.4807790712549574e-05, + "loss": 0.0085, + "step": 3250 + }, + { + "epoch": 1.0553245922609529, + "grad_norm": 0.04831695184111595, + "learning_rate": 4.472783676602277e-05, + "loss": 0.0094, + "step": 3300 + }, + { + "epoch": 1.0713143588103613, + "grad_norm": 0.13986164331436157, + "learning_rate": 4.4647882819495975e-05, + "loss": 0.009, + "step": 3350 + }, + { + "epoch": 1.0873041253597697, + "grad_norm": 0.04970436543226242, + "learning_rate": 4.456792887296917e-05, + "loss": 0.0097, + "step": 3400 + }, + { + "epoch": 1.1032938919091781, + "grad_norm": 0.054993029683828354, + "learning_rate": 4.4487974926442375e-05, + "loss": 0.01, + "step": 3450 + }, + { + "epoch": 1.1192836584585866, + "grad_norm": 0.0306412186473608, + "learning_rate": 4.440802097991557e-05, + "loss": 0.0093, + "step": 3500 + }, + { + "epoch": 1.1352734250079948, + "grad_norm": 0.048830822110176086, + "learning_rate": 4.432806703338877e-05, + "loss": 0.0097, + "step": 3550 + }, + { + "epoch": 1.1512631915574032, + "grad_norm": 0.060777079313993454, + "learning_rate": 4.424811308686197e-05, + "loss": 0.0092, + "step": 3600 + }, + { + "epoch": 1.1672529581068116, + "grad_norm": 0.03263658285140991, + "learning_rate": 4.416815914033517e-05, + "loss": 0.0092, + "step": 3650 + }, + { + "epoch": 1.18324272465622, + "grad_norm": 0.04247328266501427, + "learning_rate": 4.408820519380837e-05, + "loss": 0.0086, + "step": 3700 + }, + { + "epoch": 1.1992324912056285, + "grad_norm": 0.057229358702898026, + "learning_rate": 4.400825124728157e-05, + "loss": 0.0096, + "step": 3750 + }, + { + "epoch": 1.2152222577550367, + "grad_norm": 0.043552979826927185, + "learning_rate": 4.3928297300754765e-05, + "loss": 0.0087, + "step": 3800 + }, + { + "epoch": 1.231212024304445, + "grad_norm": 0.0451231375336647, + "learning_rate": 4.384834335422797e-05, + "loss": 0.0095, + "step": 3850 + }, + { + "epoch": 1.2472017908538535, + "grad_norm": 0.03589295223355293, + "learning_rate": 4.3768389407701165e-05, + "loss": 0.0097, + "step": 3900 + }, + { + "epoch": 1.263191557403262, + "grad_norm": 0.03593279793858528, + "learning_rate": 4.368843546117437e-05, + "loss": 0.0094, + "step": 3950 + }, + { + "epoch": 1.2791813239526704, + "grad_norm": 0.055643677711486816, + "learning_rate": 4.3608481514647565e-05, + "loss": 0.0086, + "step": 4000 + }, + { + "epoch": 1.2951710905020786, + "grad_norm": 0.050604771822690964, + "learning_rate": 4.352852756812076e-05, + "loss": 0.0095, + "step": 4050 + }, + { + "epoch": 1.311160857051487, + "grad_norm": 0.04043160006403923, + "learning_rate": 4.3448573621593965e-05, + "loss": 0.0088, + "step": 4100 + }, + { + "epoch": 1.3271506236008954, + "grad_norm": 0.03268027678132057, + "learning_rate": 4.336861967506716e-05, + "loss": 0.0088, + "step": 4150 + }, + { + "epoch": 1.3431403901503038, + "grad_norm": 0.0318896509706974, + "learning_rate": 4.3288665728540365e-05, + "loss": 0.0095, + "step": 4200 + }, + { + "epoch": 1.3591301566997123, + "grad_norm": 0.03341570496559143, + "learning_rate": 4.320871178201356e-05, + "loss": 0.0091, + "step": 4250 + }, + { + "epoch": 1.3751199232491205, + "grad_norm": 0.02508491836488247, + "learning_rate": 4.312875783548676e-05, + "loss": 0.0087, + "step": 4300 + }, + { + "epoch": 1.391109689798529, + "grad_norm": 0.04143594950437546, + "learning_rate": 4.304880388895996e-05, + "loss": 0.0092, + "step": 4350 + }, + { + "epoch": 1.4070994563479373, + "grad_norm": 0.03601376339793205, + "learning_rate": 4.296884994243316e-05, + "loss": 0.0089, + "step": 4400 + }, + { + "epoch": 1.4230892228973457, + "grad_norm": 0.027452582493424416, + "learning_rate": 4.2888895995906355e-05, + "loss": 0.0085, + "step": 4450 + }, + { + "epoch": 1.4390789894467542, + "grad_norm": 0.046533871442079544, + "learning_rate": 4.280894204937956e-05, + "loss": 0.0094, + "step": 4500 + }, + { + "epoch": 1.4550687559961624, + "grad_norm": 0.059284090995788574, + "learning_rate": 4.2728988102852755e-05, + "loss": 0.0088, + "step": 4550 + }, + { + "epoch": 1.4710585225455708, + "grad_norm": 0.04795876890420914, + "learning_rate": 4.264903415632596e-05, + "loss": 0.0092, + "step": 4600 + }, + { + "epoch": 1.4870482890949792, + "grad_norm": 0.04677587375044823, + "learning_rate": 4.2569080209799155e-05, + "loss": 0.0092, + "step": 4650 + }, + { + "epoch": 1.5030380556443876, + "grad_norm": 0.04200517758727074, + "learning_rate": 4.248912626327235e-05, + "loss": 0.0094, + "step": 4700 + }, + { + "epoch": 1.519027822193796, + "grad_norm": 0.05572056025266647, + "learning_rate": 4.2409172316745555e-05, + "loss": 0.0086, + "step": 4750 + }, + { + "epoch": 1.5350175887432043, + "grad_norm": 0.026984069496393204, + "learning_rate": 4.232921837021875e-05, + "loss": 0.0088, + "step": 4800 + }, + { + "epoch": 1.5510073552926127, + "grad_norm": 0.055002227425575256, + "learning_rate": 4.2249264423691956e-05, + "loss": 0.0085, + "step": 4850 + }, + { + "epoch": 1.5669971218420211, + "grad_norm": 0.03688422963023186, + "learning_rate": 4.216931047716515e-05, + "loss": 0.0093, + "step": 4900 + }, + { + "epoch": 1.5829868883914295, + "grad_norm": 0.051286958158016205, + "learning_rate": 4.208935653063835e-05, + "loss": 0.0094, + "step": 4950 + }, + { + "epoch": 1.598976654940838, + "grad_norm": 0.04032529890537262, + "learning_rate": 4.200940258411155e-05, + "loss": 0.009, + "step": 5000 + }, + { + "epoch": 1.6149664214902462, + "grad_norm": 0.04411604255437851, + "learning_rate": 4.192944863758475e-05, + "loss": 0.0087, + "step": 5050 + }, + { + "epoch": 1.6309561880396546, + "grad_norm": 0.03501143679022789, + "learning_rate": 4.184949469105795e-05, + "loss": 0.0087, + "step": 5100 + }, + { + "epoch": 1.646945954589063, + "grad_norm": 0.04125143960118294, + "learning_rate": 4.176954074453115e-05, + "loss": 0.0089, + "step": 5150 + }, + { + "epoch": 1.6629357211384712, + "grad_norm": 0.03687961399555206, + "learning_rate": 4.168958679800435e-05, + "loss": 0.009, + "step": 5200 + }, + { + "epoch": 1.6789254876878799, + "grad_norm": 0.08890601992607117, + "learning_rate": 4.1609632851477556e-05, + "loss": 0.0087, + "step": 5250 + }, + { + "epoch": 1.694915254237288, + "grad_norm": 0.03983084484934807, + "learning_rate": 4.152967890495075e-05, + "loss": 0.0093, + "step": 5300 + }, + { + "epoch": 1.7109050207866965, + "grad_norm": 0.02701224572956562, + "learning_rate": 4.144972495842395e-05, + "loss": 0.009, + "step": 5350 + }, + { + "epoch": 1.726894787336105, + "grad_norm": 0.023342708125710487, + "learning_rate": 4.136977101189715e-05, + "loss": 0.0085, + "step": 5400 + }, + { + "epoch": 1.742884553885513, + "grad_norm": 0.048117853701114655, + "learning_rate": 4.128981706537035e-05, + "loss": 0.0087, + "step": 5450 + }, + { + "epoch": 1.7588743204349218, + "grad_norm": 0.044627636671066284, + "learning_rate": 4.120986311884355e-05, + "loss": 0.009, + "step": 5500 + }, + { + "epoch": 1.77486408698433, + "grad_norm": 0.04622326046228409, + "learning_rate": 4.112990917231675e-05, + "loss": 0.0094, + "step": 5550 + }, + { + "epoch": 1.7908538535337384, + "grad_norm": 0.024695126339793205, + "learning_rate": 4.1049955225789946e-05, + "loss": 0.0088, + "step": 5600 + }, + { + "epoch": 1.8068436200831468, + "grad_norm": 0.07150325179100037, + "learning_rate": 4.097000127926315e-05, + "loss": 0.0089, + "step": 5650 + }, + { + "epoch": 1.822833386632555, + "grad_norm": 0.05561058968305588, + "learning_rate": 4.0890047332736346e-05, + "loss": 0.0089, + "step": 5700 + }, + { + "epoch": 1.8388231531819637, + "grad_norm": 0.04343913495540619, + "learning_rate": 4.081009338620955e-05, + "loss": 0.0084, + "step": 5750 + }, + { + "epoch": 1.8548129197313719, + "grad_norm": 0.022196004167199135, + "learning_rate": 4.0730139439682746e-05, + "loss": 0.0093, + "step": 5800 + }, + { + "epoch": 1.8708026862807803, + "grad_norm": 0.035860929638147354, + "learning_rate": 4.065018549315594e-05, + "loss": 0.0089, + "step": 5850 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 0.04030134156346321, + "learning_rate": 4.0570231546629146e-05, + "loss": 0.0087, + "step": 5900 + }, + { + "epoch": 1.902782219379597, + "grad_norm": 0.04269862174987793, + "learning_rate": 4.049027760010234e-05, + "loss": 0.0085, + "step": 5950 + }, + { + "epoch": 1.9187719859290056, + "grad_norm": 0.027169955894351006, + "learning_rate": 4.0410323653575546e-05, + "loss": 0.0088, + "step": 6000 + }, + { + "epoch": 1.9187719859290056, + "eval_loss": 0.015284636057913303, + "eval_runtime": 41.2462, + "eval_samples_per_second": 22.135, + "eval_steps_per_second": 1.406, + "step": 6000 + }, + { + "epoch": 1.9347617524784138, + "grad_norm": 0.028573885560035706, + "learning_rate": 4.033036970704874e-05, + "loss": 0.0085, + "step": 6050 + }, + { + "epoch": 1.9507515190278222, + "grad_norm": 0.04846006631851196, + "learning_rate": 4.025041576052194e-05, + "loss": 0.0086, + "step": 6100 + }, + { + "epoch": 1.9667412855772306, + "grad_norm": 0.04022432863712311, + "learning_rate": 4.017046181399514e-05, + "loss": 0.0087, + "step": 6150 + }, + { + "epoch": 1.9827310521266388, + "grad_norm": 0.052905626595020294, + "learning_rate": 4.009050786746834e-05, + "loss": 0.0087, + "step": 6200 + }, + { + "epoch": 1.9987208186760475, + "grad_norm": 0.047191545367240906, + "learning_rate": 4.001055392094154e-05, + "loss": 0.0088, + "step": 6250 + }, + { + "epoch": 2.0147105852254557, + "grad_norm": 0.03503846377134323, + "learning_rate": 3.993059997441474e-05, + "loss": 0.009, + "step": 6300 + }, + { + "epoch": 2.0307003517748643, + "grad_norm": 0.03783261030912399, + "learning_rate": 3.9850646027887936e-05, + "loss": 0.0083, + "step": 6350 + }, + { + "epoch": 2.0466901183242725, + "grad_norm": 0.07593075186014175, + "learning_rate": 3.977069208136114e-05, + "loss": 0.0082, + "step": 6400 + }, + { + "epoch": 2.0626798848736807, + "grad_norm": 0.04408886656165123, + "learning_rate": 3.9690738134834337e-05, + "loss": 0.0082, + "step": 6450 + }, + { + "epoch": 2.0786696514230893, + "grad_norm": 0.07408079504966736, + "learning_rate": 3.961078418830754e-05, + "loss": 0.0088, + "step": 6500 + }, + { + "epoch": 2.0946594179724976, + "grad_norm": 0.0387246273458004, + "learning_rate": 3.953083024178074e-05, + "loss": 0.0092, + "step": 6550 + }, + { + "epoch": 2.1106491845219058, + "grad_norm": 0.029871731996536255, + "learning_rate": 3.945087629525393e-05, + "loss": 0.0089, + "step": 6600 + }, + { + "epoch": 2.1266389510713144, + "grad_norm": 0.08507910370826721, + "learning_rate": 3.937092234872714e-05, + "loss": 0.0085, + "step": 6650 + }, + { + "epoch": 2.1426287176207226, + "grad_norm": 0.06320624053478241, + "learning_rate": 3.929096840220033e-05, + "loss": 0.0093, + "step": 6700 + }, + { + "epoch": 2.1586184841701312, + "grad_norm": 0.040876906365156174, + "learning_rate": 3.921101445567354e-05, + "loss": 0.0087, + "step": 6750 + }, + { + "epoch": 2.1746082507195394, + "grad_norm": 0.05744218826293945, + "learning_rate": 3.9131060509146733e-05, + "loss": 0.008, + "step": 6800 + }, + { + "epoch": 2.1905980172689477, + "grad_norm": 0.07130051404237747, + "learning_rate": 3.905110656261993e-05, + "loss": 0.0087, + "step": 6850 + }, + { + "epoch": 2.2065877838183563, + "grad_norm": 0.06204545497894287, + "learning_rate": 3.8971152616093134e-05, + "loss": 0.0085, + "step": 6900 + }, + { + "epoch": 2.2225775503677645, + "grad_norm": 0.04432045295834541, + "learning_rate": 3.889119866956633e-05, + "loss": 0.0083, + "step": 6950 + }, + { + "epoch": 2.238567316917173, + "grad_norm": 0.07190771400928497, + "learning_rate": 3.8811244723039534e-05, + "loss": 0.0094, + "step": 7000 + }, + { + "epoch": 2.2545570834665813, + "grad_norm": 0.06573159247636795, + "learning_rate": 3.873129077651273e-05, + "loss": 0.0087, + "step": 7050 + }, + { + "epoch": 2.2705468500159895, + "grad_norm": 0.03736311197280884, + "learning_rate": 3.865133682998593e-05, + "loss": 0.0084, + "step": 7100 + }, + { + "epoch": 2.286536616565398, + "grad_norm": 0.0285806842148304, + "learning_rate": 3.857138288345913e-05, + "loss": 0.0086, + "step": 7150 + }, + { + "epoch": 2.3025263831148064, + "grad_norm": 0.03527309000492096, + "learning_rate": 3.849142893693233e-05, + "loss": 0.0088, + "step": 7200 + }, + { + "epoch": 2.318516149664215, + "grad_norm": 0.0564735010266304, + "learning_rate": 3.841147499040553e-05, + "loss": 0.008, + "step": 7250 + }, + { + "epoch": 2.3345059162136232, + "grad_norm": 0.04310419037938118, + "learning_rate": 3.833152104387873e-05, + "loss": 0.0088, + "step": 7300 + }, + { + "epoch": 2.3504956827630314, + "grad_norm": 0.03855273872613907, + "learning_rate": 3.8251567097351924e-05, + "loss": 0.0084, + "step": 7350 + }, + { + "epoch": 2.36648544931244, + "grad_norm": 0.04269862547516823, + "learning_rate": 3.817161315082513e-05, + "loss": 0.0085, + "step": 7400 + }, + { + "epoch": 2.3824752158618483, + "grad_norm": 0.04821077361702919, + "learning_rate": 3.8091659204298324e-05, + "loss": 0.0084, + "step": 7450 + }, + { + "epoch": 2.398464982411257, + "grad_norm": 0.05434531718492508, + "learning_rate": 3.801170525777152e-05, + "loss": 0.0087, + "step": 7500 + }, + { + "epoch": 2.414454748960665, + "grad_norm": 0.05756519362330437, + "learning_rate": 3.7931751311244724e-05, + "loss": 0.0079, + "step": 7550 + }, + { + "epoch": 2.4304445155100733, + "grad_norm": 0.0482054241001606, + "learning_rate": 3.785179736471792e-05, + "loss": 0.0082, + "step": 7600 + }, + { + "epoch": 2.446434282059482, + "grad_norm": 0.03534189611673355, + "learning_rate": 3.7771843418191124e-05, + "loss": 0.0085, + "step": 7650 + }, + { + "epoch": 2.46242404860889, + "grad_norm": 0.04225745424628258, + "learning_rate": 3.769188947166432e-05, + "loss": 0.0082, + "step": 7700 + }, + { + "epoch": 2.478413815158299, + "grad_norm": 0.08791286498308182, + "learning_rate": 3.761193552513752e-05, + "loss": 0.009, + "step": 7750 + }, + { + "epoch": 2.494403581707707, + "grad_norm": 0.027702681720256805, + "learning_rate": 3.753198157861072e-05, + "loss": 0.0083, + "step": 7800 + }, + { + "epoch": 2.5103933482571152, + "grad_norm": 0.03311574086546898, + "learning_rate": 3.745202763208392e-05, + "loss": 0.0084, + "step": 7850 + }, + { + "epoch": 2.526383114806524, + "grad_norm": 0.07140166312456131, + "learning_rate": 3.737207368555712e-05, + "loss": 0.0083, + "step": 7900 + }, + { + "epoch": 2.542372881355932, + "grad_norm": 0.04849368333816528, + "learning_rate": 3.729211973903032e-05, + "loss": 0.0082, + "step": 7950 + }, + { + "epoch": 2.5583626479053407, + "grad_norm": 0.04977947846055031, + "learning_rate": 3.7212165792503514e-05, + "loss": 0.0084, + "step": 8000 + }, + { + "epoch": 2.574352414454749, + "grad_norm": 0.035205528140068054, + "learning_rate": 3.713221184597672e-05, + "loss": 0.0087, + "step": 8050 + }, + { + "epoch": 2.590342181004157, + "grad_norm": 0.05450873449444771, + "learning_rate": 3.7052257899449914e-05, + "loss": 0.0086, + "step": 8100 + }, + { + "epoch": 2.606331947553566, + "grad_norm": 0.04244232550263405, + "learning_rate": 3.697230395292312e-05, + "loss": 0.0087, + "step": 8150 + }, + { + "epoch": 2.622321714102974, + "grad_norm": 0.04528970643877983, + "learning_rate": 3.689235000639632e-05, + "loss": 0.0081, + "step": 8200 + }, + { + "epoch": 2.6383114806523826, + "grad_norm": 0.05795321986079216, + "learning_rate": 3.681239605986952e-05, + "loss": 0.0085, + "step": 8250 + }, + { + "epoch": 2.654301247201791, + "grad_norm": 0.04863649979233742, + "learning_rate": 3.673244211334272e-05, + "loss": 0.0083, + "step": 8300 + }, + { + "epoch": 2.670291013751199, + "grad_norm": 0.03658927232027054, + "learning_rate": 3.665248816681592e-05, + "loss": 0.008, + "step": 8350 + }, + { + "epoch": 2.6862807803006077, + "grad_norm": 0.039778467267751694, + "learning_rate": 3.6572534220289114e-05, + "loss": 0.0089, + "step": 8400 + }, + { + "epoch": 2.702270546850016, + "grad_norm": 0.04085814952850342, + "learning_rate": 3.649258027376232e-05, + "loss": 0.0085, + "step": 8450 + }, + { + "epoch": 2.7182603133994245, + "grad_norm": 0.030918624252080917, + "learning_rate": 3.6412626327235515e-05, + "loss": 0.0082, + "step": 8500 + }, + { + "epoch": 2.7342500799488327, + "grad_norm": 0.05798843502998352, + "learning_rate": 3.633267238070872e-05, + "loss": 0.0082, + "step": 8550 + }, + { + "epoch": 2.750239846498241, + "grad_norm": 0.03750443831086159, + "learning_rate": 3.6252718434181915e-05, + "loss": 0.0084, + "step": 8600 + }, + { + "epoch": 2.7662296130476496, + "grad_norm": 0.0728391632437706, + "learning_rate": 3.617276448765511e-05, + "loss": 0.0084, + "step": 8650 + }, + { + "epoch": 2.782219379597058, + "grad_norm": 0.09955340623855591, + "learning_rate": 3.6092810541128315e-05, + "loss": 0.0086, + "step": 8700 + }, + { + "epoch": 2.7982091461464664, + "grad_norm": 0.045326054096221924, + "learning_rate": 3.601285659460151e-05, + "loss": 0.0078, + "step": 8750 + }, + { + "epoch": 2.8141989126958746, + "grad_norm": 0.024823077023029327, + "learning_rate": 3.5932902648074715e-05, + "loss": 0.0085, + "step": 8800 + }, + { + "epoch": 2.830188679245283, + "grad_norm": 0.03785817325115204, + "learning_rate": 3.585294870154791e-05, + "loss": 0.0077, + "step": 8850 + }, + { + "epoch": 2.8461784457946915, + "grad_norm": 0.041541390120983124, + "learning_rate": 3.577299475502111e-05, + "loss": 0.0087, + "step": 8900 + }, + { + "epoch": 2.8621682123440997, + "grad_norm": 0.06973368674516678, + "learning_rate": 3.569304080849431e-05, + "loss": 0.0084, + "step": 8950 + }, + { + "epoch": 2.8781579788935083, + "grad_norm": 0.02823769487440586, + "learning_rate": 3.561308686196751e-05, + "loss": 0.0084, + "step": 9000 + }, + { + "epoch": 2.8941477454429165, + "grad_norm": 0.07545454800128937, + "learning_rate": 3.553313291544071e-05, + "loss": 0.0081, + "step": 9050 + }, + { + "epoch": 2.9101375119923247, + "grad_norm": 0.09222853183746338, + "learning_rate": 3.545317896891391e-05, + "loss": 0.0079, + "step": 9100 + }, + { + "epoch": 2.9261272785417334, + "grad_norm": 0.06112439185380936, + "learning_rate": 3.5373225022387105e-05, + "loss": 0.0081, + "step": 9150 + }, + { + "epoch": 2.9421170450911416, + "grad_norm": 0.03861480578780174, + "learning_rate": 3.529327107586031e-05, + "loss": 0.0082, + "step": 9200 + }, + { + "epoch": 2.9581068116405502, + "grad_norm": 0.08047035336494446, + "learning_rate": 3.5213317129333505e-05, + "loss": 0.0085, + "step": 9250 + }, + { + "epoch": 2.9740965781899584, + "grad_norm": 0.0482422411441803, + "learning_rate": 3.513336318280671e-05, + "loss": 0.0078, + "step": 9300 + }, + { + "epoch": 2.9900863447393666, + "grad_norm": 0.10705345124006271, + "learning_rate": 3.5053409236279905e-05, + "loss": 0.0084, + "step": 9350 + }, + { + "epoch": 3.0060761112887753, + "grad_norm": 0.05015913024544716, + "learning_rate": 3.49734552897531e-05, + "loss": 0.0079, + "step": 9400 + }, + { + "epoch": 3.0220658778381835, + "grad_norm": 0.03695971891283989, + "learning_rate": 3.4893501343226305e-05, + "loss": 0.0083, + "step": 9450 + }, + { + "epoch": 3.038055644387592, + "grad_norm": 0.04805247113108635, + "learning_rate": 3.48135473966995e-05, + "loss": 0.0076, + "step": 9500 + }, + { + "epoch": 3.0540454109370003, + "grad_norm": 0.049487996846437454, + "learning_rate": 3.4733593450172705e-05, + "loss": 0.0079, + "step": 9550 + }, + { + "epoch": 3.0700351774864085, + "grad_norm": 0.037209343165159225, + "learning_rate": 3.46536395036459e-05, + "loss": 0.0079, + "step": 9600 + }, + { + "epoch": 3.086024944035817, + "grad_norm": 0.0906648263335228, + "learning_rate": 3.45736855571191e-05, + "loss": 0.0086, + "step": 9650 + }, + { + "epoch": 3.1020147105852254, + "grad_norm": 0.06882278621196747, + "learning_rate": 3.44937316105923e-05, + "loss": 0.0085, + "step": 9700 + }, + { + "epoch": 3.118004477134634, + "grad_norm": 0.04665728658437729, + "learning_rate": 3.44137776640655e-05, + "loss": 0.0081, + "step": 9750 + }, + { + "epoch": 3.1339942436840422, + "grad_norm": 0.05663415044546127, + "learning_rate": 3.43338237175387e-05, + "loss": 0.0085, + "step": 9800 + }, + { + "epoch": 3.1499840102334504, + "grad_norm": 0.03854561224579811, + "learning_rate": 3.42538697710119e-05, + "loss": 0.0081, + "step": 9850 + }, + { + "epoch": 3.165973776782859, + "grad_norm": 0.04344400390982628, + "learning_rate": 3.4173915824485095e-05, + "loss": 0.0084, + "step": 9900 + }, + { + "epoch": 3.1819635433322673, + "grad_norm": 0.04235091805458069, + "learning_rate": 3.40939618779583e-05, + "loss": 0.0073, + "step": 9950 + }, + { + "epoch": 3.197953309881676, + "grad_norm": 0.03422466665506363, + "learning_rate": 3.4014007931431496e-05, + "loss": 0.0085, + "step": 10000 + }, + { + "epoch": 3.213943076431084, + "grad_norm": 0.04774649068713188, + "learning_rate": 3.39340539849047e-05, + "loss": 0.0079, + "step": 10050 + }, + { + "epoch": 3.2299328429804923, + "grad_norm": 0.03879115357995033, + "learning_rate": 3.3854100038377896e-05, + "loss": 0.0084, + "step": 10100 + }, + { + "epoch": 3.245922609529901, + "grad_norm": 0.08118848502635956, + "learning_rate": 3.377414609185109e-05, + "loss": 0.0085, + "step": 10150 + }, + { + "epoch": 3.261912376079309, + "grad_norm": 0.05128278210759163, + "learning_rate": 3.3694192145324296e-05, + "loss": 0.0082, + "step": 10200 + }, + { + "epoch": 3.277902142628718, + "grad_norm": 0.05539889633655548, + "learning_rate": 3.361423819879749e-05, + "loss": 0.0077, + "step": 10250 + }, + { + "epoch": 3.293891909178126, + "grad_norm": 0.06546144187450409, + "learning_rate": 3.353428425227069e-05, + "loss": 0.008, + "step": 10300 + }, + { + "epoch": 3.3098816757275342, + "grad_norm": 0.055574435740709305, + "learning_rate": 3.345433030574389e-05, + "loss": 0.0078, + "step": 10350 + }, + { + "epoch": 3.325871442276943, + "grad_norm": 0.07268719375133514, + "learning_rate": 3.337437635921709e-05, + "loss": 0.0077, + "step": 10400 + }, + { + "epoch": 3.341861208826351, + "grad_norm": 0.05133509263396263, + "learning_rate": 3.329442241269029e-05, + "loss": 0.0086, + "step": 10450 + }, + { + "epoch": 3.3578509753757597, + "grad_norm": 0.039566852152347565, + "learning_rate": 3.321446846616349e-05, + "loss": 0.0078, + "step": 10500 + }, + { + "epoch": 3.373840741925168, + "grad_norm": 0.03541762754321098, + "learning_rate": 3.3134514519636686e-05, + "loss": 0.0079, + "step": 10550 + }, + { + "epoch": 3.389830508474576, + "grad_norm": 0.0460277684032917, + "learning_rate": 3.305456057310989e-05, + "loss": 0.0079, + "step": 10600 + }, + { + "epoch": 3.4058202750239848, + "grad_norm": 0.08597122132778168, + "learning_rate": 3.2974606626583086e-05, + "loss": 0.0077, + "step": 10650 + }, + { + "epoch": 3.421810041573393, + "grad_norm": 0.07249324768781662, + "learning_rate": 3.289465268005629e-05, + "loss": 0.0074, + "step": 10700 + }, + { + "epoch": 3.4377998081228016, + "grad_norm": 0.04298989102244377, + "learning_rate": 3.2814698733529486e-05, + "loss": 0.0079, + "step": 10750 + }, + { + "epoch": 3.45378957467221, + "grad_norm": 0.029726523905992508, + "learning_rate": 3.273474478700268e-05, + "loss": 0.008, + "step": 10800 + }, + { + "epoch": 3.469779341221618, + "grad_norm": 0.04522155970335007, + "learning_rate": 3.2654790840475886e-05, + "loss": 0.0082, + "step": 10850 + }, + { + "epoch": 3.4857691077710267, + "grad_norm": 0.052699171006679535, + "learning_rate": 3.257483689394908e-05, + "loss": 0.0078, + "step": 10900 + }, + { + "epoch": 3.501758874320435, + "grad_norm": 0.06500669568777084, + "learning_rate": 3.2494882947422286e-05, + "loss": 0.0083, + "step": 10950 + }, + { + "epoch": 3.5177486408698435, + "grad_norm": 0.05794863402843475, + "learning_rate": 3.241492900089548e-05, + "loss": 0.0074, + "step": 11000 + }, + { + "epoch": 3.5337384074192517, + "grad_norm": 0.08507433533668518, + "learning_rate": 3.233497505436868e-05, + "loss": 0.0079, + "step": 11050 + }, + { + "epoch": 3.54972817396866, + "grad_norm": 0.07829931378364563, + "learning_rate": 3.225502110784189e-05, + "loss": 0.0075, + "step": 11100 + }, + { + "epoch": 3.5657179405180686, + "grad_norm": 0.07517477124929428, + "learning_rate": 3.2175067161315086e-05, + "loss": 0.0082, + "step": 11150 + }, + { + "epoch": 3.5817077070674768, + "grad_norm": 0.05099401995539665, + "learning_rate": 3.209511321478828e-05, + "loss": 0.0082, + "step": 11200 + }, + { + "epoch": 3.5976974736168854, + "grad_norm": 0.04134771227836609, + "learning_rate": 3.2015159268261486e-05, + "loss": 0.008, + "step": 11250 + }, + { + "epoch": 3.6136872401662936, + "grad_norm": 0.0595158226788044, + "learning_rate": 3.193520532173468e-05, + "loss": 0.0077, + "step": 11300 + }, + { + "epoch": 3.629677006715702, + "grad_norm": 0.08330442011356354, + "learning_rate": 3.1855251375207887e-05, + "loss": 0.0078, + "step": 11350 + }, + { + "epoch": 3.6456667732651105, + "grad_norm": 0.06455782800912857, + "learning_rate": 3.177529742868108e-05, + "loss": 0.0077, + "step": 11400 + }, + { + "epoch": 3.6616565398145187, + "grad_norm": 0.05490834638476372, + "learning_rate": 3.169534348215428e-05, + "loss": 0.0081, + "step": 11450 + }, + { + "epoch": 3.6776463063639273, + "grad_norm": 0.0402890108525753, + "learning_rate": 3.161538953562748e-05, + "loss": 0.0075, + "step": 11500 + }, + { + "epoch": 3.6936360729133355, + "grad_norm": 0.049679581075906754, + "learning_rate": 3.153543558910068e-05, + "loss": 0.0084, + "step": 11550 + }, + { + "epoch": 3.7096258394627437, + "grad_norm": 0.04728088527917862, + "learning_rate": 3.145548164257388e-05, + "loss": 0.0079, + "step": 11600 + }, + { + "epoch": 3.7256156060121524, + "grad_norm": 0.060690511018037796, + "learning_rate": 3.137552769604708e-05, + "loss": 0.0077, + "step": 11650 + }, + { + "epoch": 3.7416053725615606, + "grad_norm": 0.0751800686120987, + "learning_rate": 3.129557374952028e-05, + "loss": 0.0075, + "step": 11700 + }, + { + "epoch": 3.757595139110969, + "grad_norm": 0.056977272033691406, + "learning_rate": 3.121561980299348e-05, + "loss": 0.0079, + "step": 11750 + }, + { + "epoch": 3.7735849056603774, + "grad_norm": 0.06811773031949997, + "learning_rate": 3.113566585646668e-05, + "loss": 0.0081, + "step": 11800 + }, + { + "epoch": 3.7895746722097856, + "grad_norm": 0.040852271020412445, + "learning_rate": 3.105571190993988e-05, + "loss": 0.0075, + "step": 11850 + }, + { + "epoch": 3.8055644387591943, + "grad_norm": 0.0817752480506897, + "learning_rate": 3.097575796341308e-05, + "loss": 0.0078, + "step": 11900 + }, + { + "epoch": 3.8215542053086025, + "grad_norm": 0.045933738350868225, + "learning_rate": 3.0895804016886273e-05, + "loss": 0.0078, + "step": 11950 + }, + { + "epoch": 3.837543971858011, + "grad_norm": 0.04742230847477913, + "learning_rate": 3.081585007035948e-05, + "loss": 0.0074, + "step": 12000 + }, + { + "epoch": 3.837543971858011, + "eval_loss": 0.015844305977225304, + "eval_runtime": 41.1322, + "eval_samples_per_second": 22.197, + "eval_steps_per_second": 1.41, + "step": 12000 + }, + { + "epoch": 3.8535337384074193, + "grad_norm": 0.0642109289765358, + "learning_rate": 3.0735896123832674e-05, + "loss": 0.0082, + "step": 12050 + }, + { + "epoch": 3.8695235049568275, + "grad_norm": 0.08985566347837448, + "learning_rate": 3.065594217730588e-05, + "loss": 0.0079, + "step": 12100 + }, + { + "epoch": 3.885513271506236, + "grad_norm": 0.07430126518011093, + "learning_rate": 3.0575988230779074e-05, + "loss": 0.0076, + "step": 12150 + }, + { + "epoch": 3.9015030380556444, + "grad_norm": 0.05662678927183151, + "learning_rate": 3.0496034284252274e-05, + "loss": 0.0072, + "step": 12200 + }, + { + "epoch": 3.917492804605053, + "grad_norm": 0.08210637420415878, + "learning_rate": 3.0416080337725474e-05, + "loss": 0.0078, + "step": 12250 + }, + { + "epoch": 3.933482571154461, + "grad_norm": 0.09279511868953705, + "learning_rate": 3.033612639119867e-05, + "loss": 0.0075, + "step": 12300 + }, + { + "epoch": 3.9494723377038694, + "grad_norm": 0.03649890050292015, + "learning_rate": 3.025617244467187e-05, + "loss": 0.0075, + "step": 12350 + }, + { + "epoch": 3.965462104253278, + "grad_norm": 0.07408464699983597, + "learning_rate": 3.017621849814507e-05, + "loss": 0.0077, + "step": 12400 + }, + { + "epoch": 3.9814518708026863, + "grad_norm": 0.06778215616941452, + "learning_rate": 3.009626455161827e-05, + "loss": 0.0074, + "step": 12450 + }, + { + "epoch": 3.997441637352095, + "grad_norm": 0.07367826253175735, + "learning_rate": 3.0016310605091467e-05, + "loss": 0.008, + "step": 12500 + }, + { + "epoch": 4.013431403901503, + "grad_norm": 0.09045771509408951, + "learning_rate": 2.9936356658564667e-05, + "loss": 0.0076, + "step": 12550 + }, + { + "epoch": 4.029421170450911, + "grad_norm": 0.03875061869621277, + "learning_rate": 2.9856402712037867e-05, + "loss": 0.0075, + "step": 12600 + }, + { + "epoch": 4.0454109370003195, + "grad_norm": 0.0555877648293972, + "learning_rate": 2.9776448765511067e-05, + "loss": 0.007, + "step": 12650 + }, + { + "epoch": 4.061400703549729, + "grad_norm": 0.057610440999269485, + "learning_rate": 2.9696494818984267e-05, + "loss": 0.0075, + "step": 12700 + }, + { + "epoch": 4.077390470099137, + "grad_norm": 0.06584593653678894, + "learning_rate": 2.9616540872457464e-05, + "loss": 0.0075, + "step": 12750 + }, + { + "epoch": 4.093380236648545, + "grad_norm": 0.07038649171590805, + "learning_rate": 2.9536586925930664e-05, + "loss": 0.008, + "step": 12800 + }, + { + "epoch": 4.109370003197953, + "grad_norm": 0.05053670331835747, + "learning_rate": 2.9456632979403864e-05, + "loss": 0.008, + "step": 12850 + }, + { + "epoch": 4.125359769747361, + "grad_norm": 0.05693377926945686, + "learning_rate": 2.9376679032877064e-05, + "loss": 0.0073, + "step": 12900 + }, + { + "epoch": 4.1413495362967705, + "grad_norm": 0.07704003155231476, + "learning_rate": 2.9296725086350264e-05, + "loss": 0.0084, + "step": 12950 + }, + { + "epoch": 4.157339302846179, + "grad_norm": 0.05889516696333885, + "learning_rate": 2.921677113982346e-05, + "loss": 0.0078, + "step": 13000 + }, + { + "epoch": 4.173329069395587, + "grad_norm": 0.08338670432567596, + "learning_rate": 2.913681719329666e-05, + "loss": 0.007, + "step": 13050 + }, + { + "epoch": 4.189318835944995, + "grad_norm": 0.06243142858147621, + "learning_rate": 2.905686324676986e-05, + "loss": 0.0077, + "step": 13100 + }, + { + "epoch": 4.205308602494403, + "grad_norm": 0.04768793657422066, + "learning_rate": 2.897690930024306e-05, + "loss": 0.0076, + "step": 13150 + }, + { + "epoch": 4.2212983690438115, + "grad_norm": 0.04623186215758324, + "learning_rate": 2.889695535371626e-05, + "loss": 0.0072, + "step": 13200 + }, + { + "epoch": 4.237288135593221, + "grad_norm": 0.059952862560749054, + "learning_rate": 2.8817001407189458e-05, + "loss": 0.0081, + "step": 13250 + }, + { + "epoch": 4.253277902142629, + "grad_norm": 0.05281800776720047, + "learning_rate": 2.8737047460662658e-05, + "loss": 0.0078, + "step": 13300 + }, + { + "epoch": 4.269267668692037, + "grad_norm": 0.07640237361192703, + "learning_rate": 2.8657093514135858e-05, + "loss": 0.0074, + "step": 13350 + }, + { + "epoch": 4.285257435241445, + "grad_norm": 0.06750530749559402, + "learning_rate": 2.8577139567609058e-05, + "loss": 0.0075, + "step": 13400 + }, + { + "epoch": 4.301247201790854, + "grad_norm": 0.07524023205041885, + "learning_rate": 2.8497185621082258e-05, + "loss": 0.0075, + "step": 13450 + }, + { + "epoch": 4.3172369683402625, + "grad_norm": 0.07160656154155731, + "learning_rate": 2.8417231674555454e-05, + "loss": 0.0069, + "step": 13500 + }, + { + "epoch": 4.333226734889671, + "grad_norm": 0.06828583776950836, + "learning_rate": 2.8337277728028655e-05, + "loss": 0.0076, + "step": 13550 + }, + { + "epoch": 4.349216501439079, + "grad_norm": 0.061410386115312576, + "learning_rate": 2.8257323781501855e-05, + "loss": 0.0075, + "step": 13600 + }, + { + "epoch": 4.365206267988487, + "grad_norm": 0.06293186545372009, + "learning_rate": 2.8177369834975055e-05, + "loss": 0.0073, + "step": 13650 + }, + { + "epoch": 4.381196034537895, + "grad_norm": 0.062094636261463165, + "learning_rate": 2.8097415888448255e-05, + "loss": 0.0074, + "step": 13700 + }, + { + "epoch": 4.397185801087304, + "grad_norm": 0.10179385542869568, + "learning_rate": 2.801746194192145e-05, + "loss": 0.0077, + "step": 13750 + }, + { + "epoch": 4.413175567636713, + "grad_norm": 0.09233298152685165, + "learning_rate": 2.793750799539465e-05, + "loss": 0.0069, + "step": 13800 + }, + { + "epoch": 4.429165334186121, + "grad_norm": 0.08230157941579819, + "learning_rate": 2.785755404886785e-05, + "loss": 0.0069, + "step": 13850 + }, + { + "epoch": 4.445155100735529, + "grad_norm": 0.056463953107595444, + "learning_rate": 2.777760010234105e-05, + "loss": 0.0074, + "step": 13900 + }, + { + "epoch": 4.461144867284938, + "grad_norm": 0.06046191230416298, + "learning_rate": 2.7697646155814248e-05, + "loss": 0.0071, + "step": 13950 + }, + { + "epoch": 4.477134633834346, + "grad_norm": 0.06782487034797668, + "learning_rate": 2.7617692209287448e-05, + "loss": 0.0078, + "step": 14000 + } + ], + "logging_steps": 50, + "max_steps": 31270, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.095048815305556e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/training_args.bin b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6dbdff8b4afad36ed85290a57c3f67d54ed2560d --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2351918e4b9c08dd780eb0d2893ea88e9398fc16ba645474bf58d4db3583195 +size 7736 diff --git a/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/zero_to_fp32.py b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/weight_dir/loraWeight/trace_CRFLP/checkpoint-14000/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters)