diff --git a/.gitattributes b/.gitattributes index bb5573d9df23bee0bc4375bc59912821fe460cc1..22be5cdc7e8da72e3cc43d253b978c49b73f8411 100644 --- a/.gitattributes +++ b/.gitattributes @@ -43,3 +43,4 @@ math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/debug_training_examples.jso math/INP-PAR/unmask_tags_gold1_target1_ce0.0/debug_training_examples.jsonl filter=lfs diff=lfs merge=lfs -text math/INP-OH/unmask_tags_gold1_target1_ce0.0/debug_training_examples.jsonl filter=lfs diff=lfs merge=lfs -text math/INP/unmask_tags_gold1_target1_ce0.0/debug_training_examples.jsonl filter=lfs diff=lfs merge=lfs -text +math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/debug_training_examples.jsonl filter=lfs diff=lfs merge=lfs -text diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/README.md b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/adapter_config.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9d0f61db13b9b0a2854984efa3e7c726f3e0dbe1 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "up_proj", + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/adapter_model.safetensors b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0eb3d1aa081e0bfc2637a7740ff0f286a9deba17 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf9df9234dc653a13ebc37a58bc5247fc241e0d9c4f7f0d2e49203ba7a8b929c +size 2406624648 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/optimizer.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..df205b91276f7ffb6580e8c652c62f54271c4694 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a11a0febb706843c34d89db0943ed63a747c7a4b28b88d25900c72332a6aaf +size 671304442 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/rng_state_0.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..84dd31a0d9d10f6f97fd3fcaf9555113f9570551 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6603180b4f7b4f23eb2fefc470a07c3ec6223e2d309190662f43f30d96be9ce5 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/rng_state_1.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..bebd6f8aea042602cdbea7c81b9f67d21dc1bb50 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27f1021fb57112918a3a6091b09b0ccd50cb071a2324c12ae9afcc9851ee8bd3 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/scheduler.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e899126a25538ff85c74c1e363ffbd951d4dda1e --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d8cdfceac9f7917b978dca661a3b8e04187faea5d5f6bd7b462d61d8234d57f +size 1064 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/trainer_state.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2989d13db820938656f54404328fe2eb98bad623 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/trainer_state.json @@ -0,0 +1,283 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.21333333333333335, + "eval_steps": 500, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4931091487989761, + "avg_response_length": 225.975, + "avg_student_mask_ratio": 0.4931091487989761, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5570551689027979, + "epoch": 0.021333333333333333, + "grad_norm": 0.404296875, + "kd_loss": 0.5375588692116253, + "learning_rate": 3e-06, + "loss": 0.8247, + "masked_tokens": 111.95, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 111.95 + }, + { + "avg_mask_ratio": 0.42058031369233506, + "avg_response_length": 255.2625, + "avg_student_mask_ratio": 0.42058031369233506, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42826092825978324, + "epoch": 0.042666666666666665, + "grad_norm": 0.8046875, + "kd_loss": 0.4450965437417761, + "learning_rate": 3e-06, + "loss": 0.5724, + "masked_tokens": 97.35, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 97.35 + }, + { + "avg_mask_ratio": 0.4538542575784959, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4538542575784959, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4461815005188782, + "epoch": 0.064, + "grad_norm": 0.50390625, + "kd_loss": 0.5296064364436825, + "learning_rate": 3e-06, + "loss": 0.702, + "masked_tokens": 110.2, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 110.2 + }, + { + "avg_mask_ratio": 0.4207469140383182, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.4207469140383182, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38499989152683156, + "epoch": 0.08533333333333333, + "grad_norm": 1.671875, + "kd_loss": 0.33118802310931417, + "learning_rate": 3e-06, + "loss": 0.5529, + "masked_tokens": 98.1625, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.1625 + }, + { + "avg_mask_ratio": 0.43260439952719026, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.43260439952719026, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5240421466317912, + "epoch": 0.10666666666666667, + "grad_norm": 1.6875, + "kd_loss": 0.4270985169670894, + "learning_rate": 3e-06, + "loss": 0.671, + "masked_tokens": 85.05, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.05 + }, + { + "avg_mask_ratio": 0.46053453313652426, + "avg_response_length": 251.0875, + "avg_student_mask_ratio": 0.46053453313652426, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5027546818272185, + "epoch": 0.128, + "grad_norm": 0.17578125, + "kd_loss": 0.3904111967755945, + "learning_rate": 3e-06, + "loss": 0.6672, + "masked_tokens": 120.9, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 120.9 + }, + { + "avg_mask_ratio": 0.5112146578729153, + "avg_response_length": 202.5875, + "avg_student_mask_ratio": 0.5112146578729153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.7753003867959023, + "epoch": 0.14933333333333335, + "grad_norm": 0.953125, + "kd_loss": 0.4415664039527428, + "learning_rate": 3e-06, + "loss": 0.856, + "masked_tokens": 104.5875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 104.5875 + }, + { + "avg_mask_ratio": 0.37548826879356056, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.37548826879356056, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3791731233859082, + "epoch": 0.17066666666666666, + "grad_norm": 0.1552734375, + "kd_loss": 0.31052538527774515, + "learning_rate": 3e-06, + "loss": 0.4843, + "masked_tokens": 85.0625, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 85.0625 + }, + { + "avg_mask_ratio": 0.5001560213277116, + "avg_response_length": 229.75, + "avg_student_mask_ratio": 0.5001560213277116, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6899960007944174, + "epoch": 0.192, + "grad_norm": 1.25, + "kd_loss": 0.5995283465861896, + "learning_rate": 3e-06, + "loss": 0.9721, + "masked_tokens": 107.6625, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 107.6625 + }, + { + "avg_mask_ratio": 0.4576045103633078, + "avg_response_length": 208.0, + "avg_student_mask_ratio": 0.4576045103633078, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41132245859021166, + "epoch": 0.21333333333333335, + "grad_norm": 0.64453125, + "kd_loss": 0.3813956479015957, + "learning_rate": 3e-06, + "loss": 0.6635, + "masked_tokens": 104.1625, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 104.1625 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/training_args.bin b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6d9cde5cbc4fedb8fe690f2f561a165bf19ae18c --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ae704e41a4a62f6ca56789c45ce45887326cd2f8d1e97e398e5ada4a93398c +size 8312 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/README.md b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/adapter_config.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9d0f61db13b9b0a2854984efa3e7c726f3e0dbe1 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "up_proj", + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/adapter_model.safetensors b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9ced3860fca24e17d74a5481751bce5be613ac53 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb3060230d41dc9ae4e7608db56f3eea1306392dd1de03faebd841f30b160cc1 +size 2406624648 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/optimizer.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bda503f22c0375ee6c025a9111765fcb6adc7137 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57580597ffdb6d3dea49b345f1bc5a35357a29e1c2bb2dc9e19a132c229e1d66 +size 671304442 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/rng_state_0.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9bef0a4cfb05a0d1c7d93cdd76a62c34be65e408 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cbbccc533dc6035b9eb3e81ab0c37a3544ee2638528b1cb900a84d35f5b76b2 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/rng_state_1.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..efa56583b4b7d9817e369af4d3b7ec8e31b20fae --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69989ab343a15b03b088a3c9e2bb39b88ed718b675223b5e1d55890e63636453 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/scheduler.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f90e16891856bcfb31d679597efff574807cb3ce --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29d9aa99505fc60c0db1b9cdacaa08b06e8a85c8aaaab4e389667a719fafb9bf +size 1064 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/trainer_state.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..edc995bba816bcfee041d934a2813ce8dc784ada --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/trainer_state.json @@ -0,0 +1,2533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.1365333333333334, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4931091487989761, + "avg_response_length": 225.975, + "avg_student_mask_ratio": 0.4931091487989761, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5570551689027979, + "epoch": 0.021333333333333333, + "grad_norm": 0.404296875, + "kd_loss": 0.5375588692116253, + "learning_rate": 3e-06, + "loss": 0.8247, + "masked_tokens": 111.95, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 111.95 + }, + { + "avg_mask_ratio": 0.42058031369233506, + "avg_response_length": 255.2625, + "avg_student_mask_ratio": 0.42058031369233506, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42826092825978324, + "epoch": 0.042666666666666665, + "grad_norm": 0.8046875, + "kd_loss": 0.4450965437417761, + "learning_rate": 3e-06, + "loss": 0.5724, + "masked_tokens": 97.35, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 97.35 + }, + { + "avg_mask_ratio": 0.4538542575784959, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4538542575784959, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4461815005188782, + "epoch": 0.064, + "grad_norm": 0.50390625, + "kd_loss": 0.5296064364436825, + "learning_rate": 3e-06, + "loss": 0.702, + "masked_tokens": 110.2, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 110.2 + }, + { + "avg_mask_ratio": 0.4207469140383182, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.4207469140383182, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38499989152683156, + "epoch": 0.08533333333333333, + "grad_norm": 1.671875, + "kd_loss": 0.33118802310931417, + "learning_rate": 3e-06, + "loss": 0.5529, + "masked_tokens": 98.1625, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.1625 + }, + { + "avg_mask_ratio": 0.43260439952719026, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.43260439952719026, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5240421466317912, + "epoch": 0.10666666666666667, + "grad_norm": 1.6875, + "kd_loss": 0.4270985169670894, + "learning_rate": 3e-06, + "loss": 0.671, + "masked_tokens": 85.05, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.05 + }, + { + "avg_mask_ratio": 0.46053453313652426, + "avg_response_length": 251.0875, + "avg_student_mask_ratio": 0.46053453313652426, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5027546818272185, + "epoch": 0.128, + "grad_norm": 0.17578125, + "kd_loss": 0.3904111967755945, + "learning_rate": 3e-06, + "loss": 0.6672, + "masked_tokens": 120.9, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 120.9 + }, + { + "avg_mask_ratio": 0.5112146578729153, + "avg_response_length": 202.5875, + "avg_student_mask_ratio": 0.5112146578729153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.7753003867959023, + "epoch": 0.14933333333333335, + "grad_norm": 0.953125, + "kd_loss": 0.4415664039527428, + "learning_rate": 3e-06, + "loss": 0.856, + "masked_tokens": 104.5875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 104.5875 + }, + { + "avg_mask_ratio": 0.37548826879356056, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.37548826879356056, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3791731233859082, + "epoch": 0.17066666666666666, + "grad_norm": 0.1552734375, + "kd_loss": 0.31052538527774515, + "learning_rate": 3e-06, + "loss": 0.4843, + "masked_tokens": 85.0625, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 85.0625 + }, + { + "avg_mask_ratio": 0.5001560213277116, + "avg_response_length": 229.75, + "avg_student_mask_ratio": 0.5001560213277116, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6899960007944174, + "epoch": 0.192, + "grad_norm": 1.25, + "kd_loss": 0.5995283465861896, + "learning_rate": 3e-06, + "loss": 0.9721, + "masked_tokens": 107.6625, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 107.6625 + }, + { + "avg_mask_ratio": 0.4576045103633078, + "avg_response_length": 208.0, + "avg_student_mask_ratio": 0.4576045103633078, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41132245859021166, + "epoch": 0.21333333333333335, + "grad_norm": 0.64453125, + "kd_loss": 0.3813956479015957, + "learning_rate": 3e-06, + "loss": 0.6635, + "masked_tokens": 104.1625, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 104.1625 + }, + { + "avg_mask_ratio": 0.4877026333590038, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.4877026333590038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4612084587922368, + "epoch": 0.23466666666666666, + "grad_norm": 0.64453125, + "kd_loss": 0.5074845846289577, + "learning_rate": 3e-06, + "loss": 0.7993, + "masked_tokens": 102.075, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 102.075 + }, + { + "avg_mask_ratio": 0.45146879020612685, + "avg_response_length": 224.1875, + "avg_student_mask_ratio": 0.45146879020612685, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3276976759495483, + "epoch": 0.256, + "grad_norm": 0.30078125, + "kd_loss": 0.41461311469229256, + "learning_rate": 3e-06, + "loss": 0.6088, + "masked_tokens": 100.525, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 100.525 + }, + { + "avg_mask_ratio": 0.4356566035945434, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.4356566035945434, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.12710368948505674, + "epoch": 0.2773333333333333, + "grad_norm": 0.490234375, + "kd_loss": 0.23057804748218585, + "learning_rate": 3e-06, + "loss": 0.384, + "masked_tokens": 89.5625, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.5625 + }, + { + "avg_mask_ratio": 0.49419954856857656, + "avg_response_length": 255.625, + "avg_student_mask_ratio": 0.49419954856857656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48596099013025196, + "epoch": 0.2986666666666667, + "grad_norm": 0.859375, + "kd_loss": 0.5025483015746885, + "learning_rate": 3e-06, + "loss": 0.7892, + "masked_tokens": 136.575, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 136.575 + }, + { + "avg_mask_ratio": 0.4736677930341102, + "avg_response_length": 255.375, + "avg_student_mask_ratio": 0.4736677930341102, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5493089448234059, + "epoch": 0.32, + "grad_norm": 0.076171875, + "kd_loss": 0.4892602212316547, + "learning_rate": 3e-06, + "loss": 0.7918, + "masked_tokens": 126.575, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 126.575 + }, + { + "avg_mask_ratio": 0.5032523009285796, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.5032523009285796, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5929547422666019, + "epoch": 0.3413333333333333, + "grad_norm": 1.171875, + "kd_loss": 0.44711892502580214, + "learning_rate": 3e-06, + "loss": 0.7472, + "masked_tokens": 99.425, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 99.425 + }, + { + "avg_mask_ratio": 0.4806730231270194, + "avg_response_length": 190.325, + "avg_student_mask_ratio": 0.4806730231270194, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3583432949517601, + "epoch": 0.3626666666666667, + "grad_norm": 0.6171875, + "kd_loss": 0.4521343837219092, + "learning_rate": 3e-06, + "loss": 0.6358, + "masked_tokens": 89.825, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.825 + }, + { + "avg_mask_ratio": 0.45829249716189224, + "avg_response_length": 244.0, + "avg_student_mask_ratio": 0.45829249716189224, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3746713957985094, + "epoch": 0.384, + "grad_norm": 0.49609375, + "kd_loss": 0.34934306121722897, + "learning_rate": 3e-06, + "loss": 0.5747, + "masked_tokens": 110.075, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.075 + }, + { + "avg_mask_ratio": 0.4842760307248682, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.4842760307248682, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5090123614077584, + "epoch": 0.4053333333333333, + "grad_norm": 1.6171875, + "kd_loss": 0.43204482231294605, + "learning_rate": 3e-06, + "loss": 0.7055, + "masked_tokens": 109.5875, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.5875 + }, + { + "avg_mask_ratio": 0.4665210062637925, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4665210062637925, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27265903051802526, + "epoch": 0.4266666666666667, + "grad_norm": 0.318359375, + "kd_loss": 0.357759011555504, + "learning_rate": 3e-06, + "loss": 0.5013, + "masked_tokens": 97.0125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.0125 + }, + { + "avg_mask_ratio": 0.45183838629163803, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.45183838629163803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.21031314329709402, + "epoch": 0.448, + "grad_norm": 0.8359375, + "kd_loss": 0.38585986606940426, + "learning_rate": 3e-06, + "loss": 0.5103, + "masked_tokens": 96.0125, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.0125 + }, + { + "avg_mask_ratio": 0.5396152360364794, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5396152360364794, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6178499971098063, + "epoch": 0.4693333333333333, + "grad_norm": 0.96875, + "kd_loss": 0.46674597742967305, + "learning_rate": 3e-06, + "loss": 0.8495, + "masked_tokens": 125.35, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 125.35 + }, + { + "avg_mask_ratio": 0.4403991688624956, + "avg_response_length": 252.7, + "avg_student_mask_ratio": 0.4403991688624956, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25455478885055527, + "epoch": 0.49066666666666664, + "grad_norm": 0.5703125, + "kd_loss": 0.43305868929596725, + "learning_rate": 3e-06, + "loss": 0.6428, + "masked_tokens": 107.325, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.47419991258066146, + "avg_response_length": 212.85, + "avg_student_mask_ratio": 0.47419991258066146, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32057130943685763, + "epoch": 0.512, + "grad_norm": 0.43359375, + "kd_loss": 0.5083060303753086, + "learning_rate": 3e-06, + "loss": 0.6986, + "masked_tokens": 106.9, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 106.9 + }, + { + "avg_mask_ratio": 0.4464209079160355, + "avg_response_length": 243.475, + "avg_student_mask_ratio": 0.4464209079160355, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33636454603331456, + "epoch": 0.5333333333333333, + "grad_norm": 0.1142578125, + "kd_loss": 0.41649795620701296, + "learning_rate": 3e-06, + "loss": 0.5666, + "masked_tokens": 112.7375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.7375 + }, + { + "avg_mask_ratio": 0.4520751796895638, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.4520751796895638, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37478437887749577, + "epoch": 0.5546666666666666, + "grad_norm": 0.328125, + "kd_loss": 0.31532439299670545, + "learning_rate": 3e-06, + "loss": 0.5129, + "masked_tokens": 109.6375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 109.6375 + }, + { + "avg_mask_ratio": 0.5305180630879477, + "avg_response_length": 224.45, + "avg_student_mask_ratio": 0.5305180630879477, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42709534656005416, + "epoch": 0.576, + "grad_norm": 0.7578125, + "kd_loss": 0.5525495689224045, + "learning_rate": 3e-06, + "loss": 0.812, + "masked_tokens": 120.475, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 120.475 + }, + { + "avg_mask_ratio": 0.46451686368091033, + "avg_response_length": 254.825, + "avg_student_mask_ratio": 0.46451686368091033, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31382316479499084, + "epoch": 0.5973333333333334, + "grad_norm": 0.90234375, + "kd_loss": 0.3957495673693458, + "learning_rate": 3e-06, + "loss": 0.6028, + "masked_tokens": 129.225, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 129.225 + }, + { + "avg_mask_ratio": 0.389662017847877, + "avg_response_length": 245.9125, + "avg_student_mask_ratio": 0.389662017847877, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.23645576389110373, + "epoch": 0.6186666666666667, + "grad_norm": 0.302734375, + "kd_loss": 0.27728830450374853, + "learning_rate": 3e-06, + "loss": 0.4314, + "masked_tokens": 99.8625, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.44417000194080175, + "avg_response_length": 217.0375, + "avg_student_mask_ratio": 0.44417000194080175, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3240562055096575, + "epoch": 0.64, + "grad_norm": 1.09375, + "kd_loss": 0.31930388437995133, + "learning_rate": 3e-06, + "loss": 0.5264, + "masked_tokens": 104.625, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 104.625 + }, + { + "avg_mask_ratio": 0.4706685543409549, + "avg_response_length": 175.45, + "avg_student_mask_ratio": 0.4706685543409549, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34333510200582396, + "epoch": 0.6613333333333333, + "grad_norm": 1.234375, + "kd_loss": 0.5067149527083984, + "learning_rate": 3e-06, + "loss": 0.6534, + "masked_tokens": 84.875, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 84.875 + }, + { + "avg_mask_ratio": 0.4974605386145413, + "avg_response_length": 234.7875, + "avg_student_mask_ratio": 0.4974605386145413, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34462752127872137, + "epoch": 0.6826666666666666, + "grad_norm": 0.333984375, + "kd_loss": 0.3942846609736307, + "learning_rate": 3e-06, + "loss": 0.7133, + "masked_tokens": 119.6, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.5112370474264025, + "avg_response_length": 236.0625, + "avg_student_mask_ratio": 0.5112370474264025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2974585796398969, + "epoch": 0.704, + "grad_norm": 0.44140625, + "kd_loss": 0.4301003347501496, + "learning_rate": 3e-06, + "loss": 0.6754, + "masked_tokens": 129.425, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 129.425 + }, + { + "avg_mask_ratio": 0.44370225080056114, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44370225080056114, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3732590021626493, + "epoch": 0.7253333333333334, + "grad_norm": 0.98046875, + "kd_loss": 0.4610515360019235, + "learning_rate": 3e-06, + "loss": 0.6627, + "masked_tokens": 108.775, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 108.775 + }, + { + "avg_mask_ratio": 0.49959173843380994, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.49959173843380994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48515336151417615, + "epoch": 0.7466666666666667, + "grad_norm": 0.92578125, + "kd_loss": 0.5031771080357654, + "learning_rate": 3e-06, + "loss": 0.7668, + "masked_tokens": 125.625, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.625 + }, + { + "avg_mask_ratio": 0.4744729608530179, + "avg_response_length": 246.1625, + "avg_student_mask_ratio": 0.4744729608530179, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3005135279950082, + "epoch": 0.768, + "grad_norm": 0.169921875, + "kd_loss": 0.5216399239409879, + "learning_rate": 3e-06, + "loss": 0.6077, + "masked_tokens": 116.875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.875 + }, + { + "avg_mask_ratio": 0.4738045462174341, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.4738045462174341, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5349442186782426, + "epoch": 0.7893333333333333, + "grad_norm": 0.201171875, + "kd_loss": 0.6039233199480805, + "learning_rate": 3e-06, + "loss": 0.7196, + "masked_tokens": 127.4625, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 127.4625 + }, + { + "avg_mask_ratio": 0.4512475330149755, + "avg_response_length": 209.8, + "avg_student_mask_ratio": 0.4512475330149755, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.19145508916275275, + "epoch": 0.8106666666666666, + "grad_norm": 0.6875, + "kd_loss": 0.4029755606519984, + "learning_rate": 3e-06, + "loss": 0.5055, + "masked_tokens": 100.8375, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 100.8375 + }, + { + "avg_mask_ratio": 0.4752940105390735, + "avg_response_length": 219.5625, + "avg_student_mask_ratio": 0.4752940105390735, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4267096655552223, + "epoch": 0.832, + "grad_norm": 0.2578125, + "kd_loss": 0.4655849843487971, + "learning_rate": 3e-06, + "loss": 0.6749, + "masked_tokens": 112.375, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 112.375 + }, + { + "avg_mask_ratio": 0.47461870914557946, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.47461870914557946, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27868834779033025, + "epoch": 0.8533333333333334, + "grad_norm": 0.640625, + "kd_loss": 0.5299579592951205, + "learning_rate": 3e-06, + "loss": 0.6538, + "masked_tokens": 120.4125, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 120.4125 + }, + { + "avg_mask_ratio": 0.48321815438685006, + "avg_response_length": 228.15, + "avg_student_mask_ratio": 0.48321815438685006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.43057951200541994, + "epoch": 0.8746666666666667, + "grad_norm": 0.5390625, + "kd_loss": 0.504674318619719, + "learning_rate": 3e-06, + "loss": 0.7381, + "masked_tokens": 119.0, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 119.0 + }, + { + "avg_mask_ratio": 0.4379329536575824, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4379329536575824, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.132674143492045, + "epoch": 0.896, + "grad_norm": 1.09375, + "kd_loss": 0.27731474525324984, + "learning_rate": 3e-06, + "loss": 0.3953, + "masked_tokens": 85.525, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 85.525 + }, + { + "avg_mask_ratio": 0.4674084897618741, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4674084897618741, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37605725416574387, + "epoch": 0.9173333333333333, + "grad_norm": 0.43359375, + "kd_loss": 0.49442086774362226, + "learning_rate": 3e-06, + "loss": 0.6699, + "masked_tokens": 104.5625, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 104.5625 + }, + { + "avg_mask_ratio": 0.4415457699564286, + "avg_response_length": 241.0875, + "avg_student_mask_ratio": 0.4415457699564286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3754083825901603, + "epoch": 0.9386666666666666, + "grad_norm": 0.6328125, + "kd_loss": 0.45159815376919143, + "learning_rate": 3e-06, + "loss": 0.6585, + "masked_tokens": 113.0875, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 113.0875 + }, + { + "avg_mask_ratio": 0.42486972180195154, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42486972180195154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32457938515717616, + "epoch": 0.96, + "grad_norm": 0.6953125, + "kd_loss": 0.4011907008050457, + "learning_rate": 3e-06, + "loss": 0.5644, + "masked_tokens": 103.4, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.4 + }, + { + "avg_mask_ratio": 0.47578654896933587, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.47578654896933587, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32885359905767475, + "epoch": 0.9813333333333333, + "grad_norm": 0.16015625, + "kd_loss": 0.44463847501747294, + "learning_rate": 3e-06, + "loss": 0.635, + "masked_tokens": 105.3125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 105.3125 + }, + { + "avg_mask_ratio": 0.4782901787132557, + "avg_response_length": 224.0952380952381, + "avg_student_mask_ratio": 0.4782901787132557, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3393430382851702, + "epoch": 1.0042666666666666, + "grad_norm": 0.65625, + "kd_loss": 0.5178591865708675, + "learning_rate": 3e-06, + "loss": 0.7769, + "masked_tokens": 107.23809523809524, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 107.23809523809524 + }, + { + "avg_mask_ratio": 0.47575968883465974, + "avg_response_length": 249.4125, + "avg_student_mask_ratio": 0.47575968883465974, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.44613247805159517, + "epoch": 1.0256, + "grad_norm": 0.498046875, + "kd_loss": 0.5374264506522252, + "learning_rate": 3e-06, + "loss": 0.6772, + "masked_tokens": 118.35, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 118.35 + }, + { + "avg_mask_ratio": 0.4563717324635945, + "avg_response_length": 232.0375, + "avg_student_mask_ratio": 0.4563717324635945, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37626147485414096, + "epoch": 1.0469333333333333, + "grad_norm": 0.54296875, + "kd_loss": 0.392788901903657, + "learning_rate": 3e-06, + "loss": 0.6047, + "masked_tokens": 98.35, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.35 + }, + { + "avg_mask_ratio": 0.5079968665260821, + "avg_response_length": 253.7875, + "avg_student_mask_ratio": 0.5079968665260821, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.30954629559880686, + "epoch": 1.0682666666666667, + "grad_norm": 0.291015625, + "kd_loss": 0.4563873354276211, + "learning_rate": 3e-06, + "loss": 0.5996, + "masked_tokens": 128.225, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 128.225 + }, + { + "avg_mask_ratio": 0.5109448074479588, + "avg_response_length": 254.2, + "avg_student_mask_ratio": 0.5109448074479588, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2868076219221166, + "epoch": 1.0896, + "grad_norm": 2.515625, + "kd_loss": 0.5652106747879998, + "learning_rate": 3e-06, + "loss": 0.6398, + "masked_tokens": 137.5875, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 137.5875 + }, + { + "avg_mask_ratio": 0.45396183808334173, + "avg_response_length": 202.7625, + "avg_student_mask_ratio": 0.45396183808334173, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38311037250946356, + "epoch": 1.1109333333333333, + "grad_norm": 0.6171875, + "kd_loss": 0.423658079797778, + "learning_rate": 3e-06, + "loss": 0.6386, + "masked_tokens": 87.0625, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 87.0625 + }, + { + "avg_mask_ratio": 0.47015948037151245, + "avg_response_length": 214.275, + "avg_student_mask_ratio": 0.47015948037151245, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.47228433731506814, + "epoch": 1.1322666666666668, + "grad_norm": 0.609375, + "kd_loss": 0.45688082203427316, + "learning_rate": 3e-06, + "loss": 0.737, + "masked_tokens": 99.8625, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.4892866689246148, + "avg_response_length": 231.3125, + "avg_student_mask_ratio": 0.4892866689246148, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4080867745911064, + "epoch": 1.1536, + "grad_norm": 0.341796875, + "kd_loss": 0.5618651450654625, + "learning_rate": 3e-06, + "loss": 0.6922, + "masked_tokens": 107.375, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.4541942774085328, + "avg_response_length": 213.525, + "avg_student_mask_ratio": 0.4541942774085328, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22217674175137744, + "epoch": 1.1749333333333334, + "grad_norm": 0.2412109375, + "kd_loss": 0.3673438885498399, + "learning_rate": 3e-06, + "loss": 0.5008, + "masked_tokens": 97.8875, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 97.8875 + }, + { + "avg_mask_ratio": 0.39282396506750955, + "avg_response_length": 231.4125, + "avg_student_mask_ratio": 0.39282396506750955, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3512847523151777, + "epoch": 1.1962666666666666, + "grad_norm": 0.8828125, + "kd_loss": 0.48686740984790616, + "learning_rate": 3e-06, + "loss": 0.5823, + "masked_tokens": 99.2875, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 99.2875 + }, + { + "avg_mask_ratio": 0.4483634108910337, + "avg_response_length": 230.1625, + "avg_student_mask_ratio": 0.4483634108910337, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31345968546206676, + "epoch": 1.2176, + "grad_norm": 0.4453125, + "kd_loss": 0.41564053312727084, + "learning_rate": 3e-06, + "loss": 0.5898, + "masked_tokens": 108.9875, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 108.9875 + }, + { + "avg_mask_ratio": 0.465452536707744, + "avg_response_length": 267.4375, + "avg_student_mask_ratio": 0.465452536707744, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3618907347364768, + "epoch": 1.2389333333333332, + "grad_norm": 8.6875, + "kd_loss": 0.4481006292516895, + "learning_rate": 3e-06, + "loss": 0.6314, + "masked_tokens": 129.075, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 129.075 + }, + { + "avg_mask_ratio": 0.5225977989146486, + "avg_response_length": 228.45, + "avg_student_mask_ratio": 0.5225977989146486, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5639314363695348, + "epoch": 1.2602666666666666, + "grad_norm": 1.1328125, + "kd_loss": 0.5351108588445992, + "learning_rate": 3e-06, + "loss": 0.8274, + "masked_tokens": 121.675, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 121.675 + }, + { + "avg_mask_ratio": 0.44998724836623294, + "avg_response_length": 236.7, + "avg_student_mask_ratio": 0.44998724836623294, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3396833263838971, + "epoch": 1.2816, + "grad_norm": 0.365234375, + "kd_loss": 0.41761890975592914, + "learning_rate": 3e-06, + "loss": 0.5752, + "masked_tokens": 110.1625, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 110.1625 + }, + { + "avg_mask_ratio": 0.5042130865273066, + "avg_response_length": 230.3375, + "avg_student_mask_ratio": 0.5042130865273066, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.35890077192343595, + "epoch": 1.3029333333333333, + "grad_norm": 0.28515625, + "kd_loss": 0.5558427174539929, + "learning_rate": 3e-06, + "loss": 0.7657, + "masked_tokens": 112.625, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 112.625 + }, + { + "avg_mask_ratio": 0.49637898594373836, + "avg_response_length": 233.0625, + "avg_student_mask_ratio": 0.49637898594373836, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32318839170733327, + "epoch": 1.3242666666666667, + "grad_norm": 0.515625, + "kd_loss": 0.5518322235134179, + "learning_rate": 3e-06, + "loss": 0.6742, + "masked_tokens": 111.25, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 111.25 + }, + { + "avg_mask_ratio": 0.5177568581304512, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.5177568581304512, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5710563842050931, + "epoch": 1.3456000000000001, + "grad_norm": 1.3515625, + "kd_loss": 0.5316411310721378, + "learning_rate": 3e-06, + "loss": 0.8598, + "masked_tokens": 129.6125, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 129.6125 + }, + { + "avg_mask_ratio": 0.48226998368045315, + "avg_response_length": 237.7125, + "avg_student_mask_ratio": 0.48226998368045315, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2804489129174499, + "epoch": 1.3669333333333333, + "grad_norm": 0.2421875, + "kd_loss": 0.3663112932188085, + "learning_rate": 3e-06, + "loss": 0.4584, + "masked_tokens": 120.275, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 120.275 + }, + { + "avg_mask_ratio": 0.5306948523037136, + "avg_response_length": 238.0125, + "avg_student_mask_ratio": 0.5306948523037136, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.475157093159612, + "epoch": 1.3882666666666665, + "grad_norm": 1.8125, + "kd_loss": 0.5062341513834724, + "learning_rate": 3e-06, + "loss": 0.7115, + "masked_tokens": 133.25, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 133.25 + }, + { + "avg_mask_ratio": 0.4821273953886703, + "avg_response_length": 247.775, + "avg_student_mask_ratio": 0.4821273953886703, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41770620119971225, + "epoch": 1.4096, + "grad_norm": 0.9375, + "kd_loss": 0.425496905214095, + "learning_rate": 3e-06, + "loss": 0.6361, + "masked_tokens": 128.875, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 128.875 + }, + { + "avg_mask_ratio": 0.46056515555246735, + "avg_response_length": 240.4375, + "avg_student_mask_ratio": 0.46056515555246735, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.24846992658117414, + "epoch": 1.4309333333333334, + "grad_norm": 0.60546875, + "kd_loss": 0.34861083538812637, + "learning_rate": 3e-06, + "loss": 0.5112, + "masked_tokens": 119.85, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 119.85 + }, + { + "avg_mask_ratio": 0.4666106043441687, + "avg_response_length": 226.7375, + "avg_student_mask_ratio": 0.4666106043441687, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4541423492493323, + "epoch": 1.4522666666666666, + "grad_norm": 0.51953125, + "kd_loss": 0.4910934407485213, + "learning_rate": 3e-06, + "loss": 0.6946, + "masked_tokens": 107.4625, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4790851596510038, + "avg_response_length": 202.05, + "avg_student_mask_ratio": 0.4790851596510038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3711260147189023, + "epoch": 1.4736, + "grad_norm": 2.03125, + "kd_loss": 0.41718243765291446, + "learning_rate": 3e-06, + "loss": 0.6313, + "masked_tokens": 111.3125, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 111.3125 + }, + { + "avg_mask_ratio": 0.5250519359949977, + "avg_response_length": 228.125, + "avg_student_mask_ratio": 0.5250519359949977, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22230932631540554, + "epoch": 1.4949333333333334, + "grad_norm": 0.26171875, + "kd_loss": 0.6619142963969352, + "learning_rate": 3e-06, + "loss": 0.7717, + "masked_tokens": 132.55, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 132.55 + }, + { + "avg_mask_ratio": 0.4790433386107907, + "avg_response_length": 212.5, + "avg_student_mask_ratio": 0.4790433386107907, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.24621229091012536, + "epoch": 1.5162666666666667, + "grad_norm": 0.2099609375, + "kd_loss": 0.43454050603151584, + "learning_rate": 3e-06, + "loss": 0.5302, + "masked_tokens": 108.7375, + "mean_t": 0.5135623761918395, + "step": 710, + "student_masked_tokens": 108.7375 + }, + { + "avg_mask_ratio": 0.47950589570682495, + "avg_response_length": 227.075, + "avg_student_mask_ratio": 0.47950589570682495, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.36416104665024707, + "epoch": 1.5375999999999999, + "grad_norm": 0.75, + "kd_loss": 0.5665610315164941, + "learning_rate": 3e-06, + "loss": 0.7121, + "masked_tokens": 110.8, + "mean_t": 0.5117021896177902, + "step": 720, + "student_masked_tokens": 110.8 + }, + { + "avg_mask_ratio": 0.4604924251558259, + "avg_response_length": 232.925, + "avg_student_mask_ratio": 0.4604924251558259, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38923927966282007, + "epoch": 1.5589333333333333, + "grad_norm": 1.015625, + "kd_loss": 0.4302867329986782, + "learning_rate": 3e-06, + "loss": 0.639, + "masked_tokens": 104.9625, + "mean_t": 0.49050743713742123, + "step": 730, + "student_masked_tokens": 104.9625 + }, + { + "avg_mask_ratio": 0.5185885130194947, + "avg_response_length": 183.325, + "avg_student_mask_ratio": 0.5185885130194947, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3361817517367399, + "epoch": 1.5802666666666667, + "grad_norm": 0.40234375, + "kd_loss": 0.5340734164818514, + "learning_rate": 3e-06, + "loss": 0.7461, + "masked_tokens": 97.125, + "mean_t": 0.5505168779753149, + "step": 740, + "student_masked_tokens": 97.125 + }, + { + "avg_mask_ratio": 0.4191439319110941, + "avg_response_length": 223.65, + "avg_student_mask_ratio": 0.4191439319110941, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37429177601145514, + "epoch": 1.6016, + "grad_norm": 0.58203125, + "kd_loss": 0.5036597276406856, + "learning_rate": 3e-06, + "loss": 0.6491, + "masked_tokens": 95.3125, + "mean_t": 0.4437690361432033, + "step": 750, + "student_masked_tokens": 95.3125 + }, + { + "avg_mask_ratio": 0.46706983938929625, + "avg_response_length": 216.0625, + "avg_student_mask_ratio": 0.46706983938929625, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4449058656399984, + "epoch": 1.6229333333333333, + "grad_norm": 0.8203125, + "kd_loss": 0.5661326096985168, + "learning_rate": 3e-06, + "loss": 0.7233, + "masked_tokens": 107.7, + "mean_t": 0.49132869170280175, + "step": 760, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.44156218122225255, + "avg_response_length": 259.675, + "avg_student_mask_ratio": 0.44156218122225255, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25899335961771613, + "epoch": 1.6442666666666668, + "grad_norm": 0.396484375, + "kd_loss": 0.4095979654902003, + "learning_rate": 3e-06, + "loss": 0.5099, + "masked_tokens": 117.5, + "mean_t": 0.4667695587326307, + "step": 770, + "student_masked_tokens": 117.5 + }, + { + "avg_mask_ratio": 0.42836043585848527, + "avg_response_length": 258.5125, + "avg_student_mask_ratio": 0.42836043585848527, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2897560694203321, + "epoch": 1.6656, + "grad_norm": 0.2431640625, + "kd_loss": 0.34635278815572546, + "learning_rate": 3e-06, + "loss": 0.4802, + "masked_tokens": 119.0125, + "mean_t": 0.44942845597106496, + "step": 780, + "student_masked_tokens": 119.0125 + }, + { + "avg_mask_ratio": 0.46589430308085866, + "avg_response_length": 222.3125, + "avg_student_mask_ratio": 0.46589430308085866, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.21603642557238345, + "epoch": 1.6869333333333332, + "grad_norm": 0.140625, + "kd_loss": 0.33674514803767297, + "learning_rate": 3e-06, + "loss": 0.489, + "masked_tokens": 103.25, + "mean_t": 0.4993515375303105, + "step": 790, + "student_masked_tokens": 103.25 + }, + { + "avg_mask_ratio": 0.46366424662992356, + "avg_response_length": 219.6875, + "avg_student_mask_ratio": 0.46366424662992356, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2663005536277069, + "epoch": 1.7082666666666668, + "grad_norm": 0.23828125, + "kd_loss": 0.35138718315538425, + "learning_rate": 3e-06, + "loss": 0.5434, + "masked_tokens": 104.5, + "mean_t": 0.500370389316231, + "step": 800, + "student_masked_tokens": 104.5 + }, + { + "avg_mask_ratio": 0.503375941584818, + "avg_response_length": 237.85, + "avg_student_mask_ratio": 0.503375941584818, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4813590554784753, + "epoch": 1.7296, + "grad_norm": 1.6015625, + "kd_loss": 0.45312339970045057, + "learning_rate": 3e-06, + "loss": 0.706, + "masked_tokens": 118.2125, + "mean_t": 0.5317009104182944, + "step": 810, + "student_masked_tokens": 118.2125 + }, + { + "avg_mask_ratio": 0.5110091455746442, + "avg_response_length": 209.0875, + "avg_student_mask_ratio": 0.5110091455746442, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4535834654417954, + "epoch": 1.7509333333333332, + "grad_norm": 0.70703125, + "kd_loss": 0.5985253949772413, + "learning_rate": 3e-06, + "loss": 0.7794, + "masked_tokens": 120.95, + "mean_t": 0.5392061032878701, + "step": 820, + "student_masked_tokens": 120.95 + }, + { + "avg_mask_ratio": 0.49899387182667854, + "avg_response_length": 263.975, + "avg_student_mask_ratio": 0.49899387182667854, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.40083689643704473, + "epoch": 1.7722666666666667, + "grad_norm": 0.1708984375, + "kd_loss": 0.5644028104892641, + "learning_rate": 3e-06, + "loss": 0.7632, + "masked_tokens": 137.075, + "mean_t": 0.5238314627087675, + "step": 830, + "student_masked_tokens": 137.075 + }, + { + "avg_mask_ratio": 0.4997270987310912, + "avg_response_length": 221.9, + "avg_student_mask_ratio": 0.4997270987310912, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2870929398425915, + "epoch": 1.7936, + "grad_norm": 0.345703125, + "kd_loss": 0.4698917509396324, + "learning_rate": 3e-06, + "loss": 0.6327, + "masked_tokens": 114.525, + "mean_t": 0.5301066277665086, + "step": 840, + "student_masked_tokens": 114.525 + }, + { + "avg_mask_ratio": 0.4988076956477016, + "avg_response_length": 225.5, + "avg_student_mask_ratio": 0.4988076956477016, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3023421537889817, + "epoch": 1.8149333333333333, + "grad_norm": 0.443359375, + "kd_loss": 0.3271854338312551, + "learning_rate": 3e-06, + "loss": 0.5634, + "masked_tokens": 116.9125, + "mean_t": 0.5343429344706238, + "step": 850, + "student_masked_tokens": 116.9125 + }, + { + "avg_mask_ratio": 0.4635998342186213, + "avg_response_length": 229.125, + "avg_student_mask_ratio": 0.4635998342186213, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37467331880507115, + "epoch": 1.8362666666666667, + "grad_norm": 0.384765625, + "kd_loss": 0.4431717619034316, + "learning_rate": 3e-06, + "loss": 0.5956, + "masked_tokens": 109.675, + "mean_t": 0.4791536889737472, + "step": 860, + "student_masked_tokens": 109.675 + }, + { + "avg_mask_ratio": 0.49111039767740294, + "avg_response_length": 229.1, + "avg_student_mask_ratio": 0.49111039767740294, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3838037288314126, + "epoch": 1.8576000000000001, + "grad_norm": 0.333984375, + "kd_loss": 0.47523635068355363, + "learning_rate": 3e-06, + "loss": 0.6859, + "masked_tokens": 115.6625, + "mean_t": 0.5203817339061061, + "step": 870, + "student_masked_tokens": 115.6625 + }, + { + "avg_mask_ratio": 0.4427660425659269, + "avg_response_length": 198.5625, + "avg_student_mask_ratio": 0.4427660425659269, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33109274096627817, + "epoch": 1.8789333333333333, + "grad_norm": 1.0859375, + "kd_loss": 0.46695662873548827, + "learning_rate": 3e-06, + "loss": 0.6284, + "masked_tokens": 91.175, + "mean_t": 0.4875184997683391, + "step": 880, + "student_masked_tokens": 91.175 + }, + { + "avg_mask_ratio": 0.4464349385118112, + "avg_response_length": 225.8375, + "avg_student_mask_ratio": 0.4464349385118112, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22229116438190885, + "epoch": 1.9002666666666665, + "grad_norm": 0.12890625, + "kd_loss": 0.4006316699657759, + "learning_rate": 3e-06, + "loss": 0.4934, + "masked_tokens": 101.75, + "mean_t": 0.4766692223958671, + "step": 890, + "student_masked_tokens": 101.75 + }, + { + "avg_mask_ratio": 0.44976164362160487, + "avg_response_length": 227.7875, + "avg_student_mask_ratio": 0.44976164362160487, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38169105723031577, + "epoch": 1.9216, + "grad_norm": 1.765625, + "kd_loss": 0.47280531010078086, + "learning_rate": 3e-06, + "loss": 0.6337, + "masked_tokens": 103.475, + "mean_t": 0.487134758150205, + "step": 900, + "student_masked_tokens": 103.475 + }, + { + "avg_mask_ratio": 0.475579984736396, + "avg_response_length": 245.1625, + "avg_student_mask_ratio": 0.475579984736396, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27549623605577833, + "epoch": 1.9429333333333334, + "grad_norm": 0.451171875, + "kd_loss": 0.4638562942510987, + "learning_rate": 3e-06, + "loss": 0.5387, + "masked_tokens": 124.6375, + "mean_t": 0.5027793228859082, + "step": 910, + "student_masked_tokens": 124.6375 + }, + { + "avg_mask_ratio": 0.4688875659601763, + "avg_response_length": 226.2875, + "avg_student_mask_ratio": 0.4688875659601763, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2772836374151325, + "epoch": 1.9642666666666666, + "grad_norm": 0.416015625, + "kd_loss": 0.44530672791033793, + "learning_rate": 3e-06, + "loss": 0.6177, + "masked_tokens": 110.0125, + "mean_t": 0.49417946098838, + "step": 920, + "student_masked_tokens": 110.0125 + }, + { + "avg_mask_ratio": 0.47999348094454036, + "avg_response_length": 237.05, + "avg_student_mask_ratio": 0.47999348094454036, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2901802834984665, + "epoch": 1.9856, + "grad_norm": 0.37890625, + "kd_loss": 0.4553093938939094, + "learning_rate": 3e-06, + "loss": 0.5905, + "masked_tokens": 121.6, + "mean_t": 0.5045580042526125, + "step": 930, + "student_masked_tokens": 121.6 + }, + { + "avg_mask_ratio": 0.49413903727240505, + "avg_response_length": 224.79761904761904, + "avg_student_mask_ratio": 0.49413903727240505, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37941894131193166, + "epoch": 2.0085333333333333, + "grad_norm": 0.4921875, + "kd_loss": 0.4946319753903075, + "learning_rate": 3e-06, + "loss": 0.6668, + "masked_tokens": 120.5, + "mean_t": 0.5321138524893849, + "step": 940, + "student_masked_tokens": 120.5 + }, + { + "avg_mask_ratio": 0.4368605303927325, + "avg_response_length": 240.9125, + "avg_student_mask_ratio": 0.4368605303927325, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22575005246883392, + "epoch": 2.0298666666666665, + "grad_norm": 1.1875, + "kd_loss": 0.4342805288508771, + "learning_rate": 3e-06, + "loss": 0.5248, + "masked_tokens": 111.4125, + "mean_t": 0.4632946296595037, + "step": 950, + "student_masked_tokens": 111.4125 + }, + { + "avg_mask_ratio": 0.4988762516761199, + "avg_response_length": 275.3, + "avg_student_mask_ratio": 0.4988762516761199, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.49722497609602667, + "epoch": 2.0512, + "grad_norm": 0.40625, + "kd_loss": 0.5839257182941765, + "learning_rate": 3e-06, + "loss": 0.7523, + "masked_tokens": 143.825, + "mean_t": 0.5198000721400604, + "step": 960, + "student_masked_tokens": 143.825 + }, + { + "avg_mask_ratio": 0.437801384011982, + "avg_response_length": 236.2375, + "avg_student_mask_ratio": 0.437801384011982, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2855980422358698, + "epoch": 2.0725333333333333, + "grad_norm": 0.4765625, + "kd_loss": 0.35673561348757377, + "learning_rate": 3e-06, + "loss": 0.538, + "masked_tokens": 107.025, + "mean_t": 0.4703940597362816, + "step": 970, + "student_masked_tokens": 107.025 + }, + { + "avg_mask_ratio": 0.42220073882490394, + "avg_response_length": 230.8625, + "avg_student_mask_ratio": 0.42220073882490394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2906558813129777, + "epoch": 2.0938666666666665, + "grad_norm": 0.466796875, + "kd_loss": 0.36284122784349504, + "learning_rate": 3e-06, + "loss": 0.4889, + "masked_tokens": 97.7875, + "mean_t": 0.4511947895749472, + "step": 980, + "student_masked_tokens": 97.7875 + }, + { + "avg_mask_ratio": 0.4605769342277199, + "avg_response_length": 262.0375, + "avg_student_mask_ratio": 0.4605769342277199, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.18629460762591635, + "epoch": 2.1152, + "grad_norm": 0.625, + "kd_loss": 0.4187604939788798, + "learning_rate": 3e-06, + "loss": 0.5063, + "masked_tokens": 122.0, + "mean_t": 0.4923786667350214, + "step": 990, + "student_masked_tokens": 122.0 + }, + { + "avg_mask_ratio": 0.4547682981239632, + "avg_response_length": 215.3, + "avg_student_mask_ratio": 0.4547682981239632, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.26735156250199454, + "epoch": 2.1365333333333334, + "grad_norm": 0.26953125, + "kd_loss": 0.3440752963605235, + "learning_rate": 3e-06, + "loss": 0.5169, + "masked_tokens": 100.775, + "mean_t": 0.4773523230338469, + "step": 1000, + "student_masked_tokens": 100.775 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/training_args.bin b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6d9cde5cbc4fedb8fe690f2f561a165bf19ae18c --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ae704e41a4a62f6ca56789c45ce45887326cd2f8d1e97e398e5ada4a93398c +size 8312 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/README.md b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/adapter_config.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9d0f61db13b9b0a2854984efa3e7c726f3e0dbe1 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "up_proj", + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/adapter_model.safetensors b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..649b125662d245f69e53cbacfb0dce5160a9add8 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea0db96232a244b6cab173add792ca63b51cf8b25da144eaf8d1f4d360eb0131 +size 2406624648 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/optimizer.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d56b5b57b4f934a9349b42ae796aa9ecdeff0de2 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4152fa051690004fa25524b0e8f1171a8278945d174cb036cbb0a1b9fde3d01c +size 671304442 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/rng_state_0.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5a414893d506cea6e26edc9aee4315ab3b08e349 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:907e39dff0cf7ad1a1affaa1e7047653794ab16e25c6977ce7b5524769fdf799 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/rng_state_1.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..153bdb1075947f3d8b9332967b3c5eaada5ea686 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebf0b85ec66092ce471ce68f6fcda64182d50a012837c2e4c9f9690fe3e5f5c3 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/scheduler.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..125c51aef8c1558b284b7ffdb401f40b1199eb92 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25795e3b7374d0f6abdd7ab4b34fbf7ab0447ba73c04014500c2ab8b5acec5b4 +size 1064 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/trainer_state.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..118402c6c7f9831b624682e2f714b914b897977d --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/trainer_state.json @@ -0,0 +1,2783 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.3498666666666668, + "eval_steps": 500, + "global_step": 1100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4931091487989761, + "avg_response_length": 225.975, + "avg_student_mask_ratio": 0.4931091487989761, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5570551689027979, + "epoch": 0.021333333333333333, + "grad_norm": 0.404296875, + "kd_loss": 0.5375588692116253, + "learning_rate": 3e-06, + "loss": 0.8247, + "masked_tokens": 111.95, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 111.95 + }, + { + "avg_mask_ratio": 0.42058031369233506, + "avg_response_length": 255.2625, + "avg_student_mask_ratio": 0.42058031369233506, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42826092825978324, + "epoch": 0.042666666666666665, + "grad_norm": 0.8046875, + "kd_loss": 0.4450965437417761, + "learning_rate": 3e-06, + "loss": 0.5724, + "masked_tokens": 97.35, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 97.35 + }, + { + "avg_mask_ratio": 0.4538542575784959, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4538542575784959, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4461815005188782, + "epoch": 0.064, + "grad_norm": 0.50390625, + "kd_loss": 0.5296064364436825, + "learning_rate": 3e-06, + "loss": 0.702, + "masked_tokens": 110.2, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 110.2 + }, + { + "avg_mask_ratio": 0.4207469140383182, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.4207469140383182, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38499989152683156, + "epoch": 0.08533333333333333, + "grad_norm": 1.671875, + "kd_loss": 0.33118802310931417, + "learning_rate": 3e-06, + "loss": 0.5529, + "masked_tokens": 98.1625, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.1625 + }, + { + "avg_mask_ratio": 0.43260439952719026, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.43260439952719026, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5240421466317912, + "epoch": 0.10666666666666667, + "grad_norm": 1.6875, + "kd_loss": 0.4270985169670894, + "learning_rate": 3e-06, + "loss": 0.671, + "masked_tokens": 85.05, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.05 + }, + { + "avg_mask_ratio": 0.46053453313652426, + "avg_response_length": 251.0875, + "avg_student_mask_ratio": 0.46053453313652426, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5027546818272185, + "epoch": 0.128, + "grad_norm": 0.17578125, + "kd_loss": 0.3904111967755945, + "learning_rate": 3e-06, + "loss": 0.6672, + "masked_tokens": 120.9, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 120.9 + }, + { + "avg_mask_ratio": 0.5112146578729153, + "avg_response_length": 202.5875, + "avg_student_mask_ratio": 0.5112146578729153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.7753003867959023, + "epoch": 0.14933333333333335, + "grad_norm": 0.953125, + "kd_loss": 0.4415664039527428, + "learning_rate": 3e-06, + "loss": 0.856, + "masked_tokens": 104.5875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 104.5875 + }, + { + "avg_mask_ratio": 0.37548826879356056, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.37548826879356056, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3791731233859082, + "epoch": 0.17066666666666666, + "grad_norm": 0.1552734375, + "kd_loss": 0.31052538527774515, + "learning_rate": 3e-06, + "loss": 0.4843, + "masked_tokens": 85.0625, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 85.0625 + }, + { + "avg_mask_ratio": 0.5001560213277116, + "avg_response_length": 229.75, + "avg_student_mask_ratio": 0.5001560213277116, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6899960007944174, + "epoch": 0.192, + "grad_norm": 1.25, + "kd_loss": 0.5995283465861896, + "learning_rate": 3e-06, + "loss": 0.9721, + "masked_tokens": 107.6625, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 107.6625 + }, + { + "avg_mask_ratio": 0.4576045103633078, + "avg_response_length": 208.0, + "avg_student_mask_ratio": 0.4576045103633078, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41132245859021166, + "epoch": 0.21333333333333335, + "grad_norm": 0.64453125, + "kd_loss": 0.3813956479015957, + "learning_rate": 3e-06, + "loss": 0.6635, + "masked_tokens": 104.1625, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 104.1625 + }, + { + "avg_mask_ratio": 0.4877026333590038, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.4877026333590038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4612084587922368, + "epoch": 0.23466666666666666, + "grad_norm": 0.64453125, + "kd_loss": 0.5074845846289577, + "learning_rate": 3e-06, + "loss": 0.7993, + "masked_tokens": 102.075, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 102.075 + }, + { + "avg_mask_ratio": 0.45146879020612685, + "avg_response_length": 224.1875, + "avg_student_mask_ratio": 0.45146879020612685, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3276976759495483, + "epoch": 0.256, + "grad_norm": 0.30078125, + "kd_loss": 0.41461311469229256, + "learning_rate": 3e-06, + "loss": 0.6088, + "masked_tokens": 100.525, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 100.525 + }, + { + "avg_mask_ratio": 0.4356566035945434, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.4356566035945434, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.12710368948505674, + "epoch": 0.2773333333333333, + "grad_norm": 0.490234375, + "kd_loss": 0.23057804748218585, + "learning_rate": 3e-06, + "loss": 0.384, + "masked_tokens": 89.5625, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.5625 + }, + { + "avg_mask_ratio": 0.49419954856857656, + "avg_response_length": 255.625, + "avg_student_mask_ratio": 0.49419954856857656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48596099013025196, + "epoch": 0.2986666666666667, + "grad_norm": 0.859375, + "kd_loss": 0.5025483015746885, + "learning_rate": 3e-06, + "loss": 0.7892, + "masked_tokens": 136.575, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 136.575 + }, + { + "avg_mask_ratio": 0.4736677930341102, + "avg_response_length": 255.375, + "avg_student_mask_ratio": 0.4736677930341102, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5493089448234059, + "epoch": 0.32, + "grad_norm": 0.076171875, + "kd_loss": 0.4892602212316547, + "learning_rate": 3e-06, + "loss": 0.7918, + "masked_tokens": 126.575, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 126.575 + }, + { + "avg_mask_ratio": 0.5032523009285796, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.5032523009285796, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5929547422666019, + "epoch": 0.3413333333333333, + "grad_norm": 1.171875, + "kd_loss": 0.44711892502580214, + "learning_rate": 3e-06, + "loss": 0.7472, + "masked_tokens": 99.425, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 99.425 + }, + { + "avg_mask_ratio": 0.4806730231270194, + "avg_response_length": 190.325, + "avg_student_mask_ratio": 0.4806730231270194, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3583432949517601, + "epoch": 0.3626666666666667, + "grad_norm": 0.6171875, + "kd_loss": 0.4521343837219092, + "learning_rate": 3e-06, + "loss": 0.6358, + "masked_tokens": 89.825, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.825 + }, + { + "avg_mask_ratio": 0.45829249716189224, + "avg_response_length": 244.0, + "avg_student_mask_ratio": 0.45829249716189224, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3746713957985094, + "epoch": 0.384, + "grad_norm": 0.49609375, + "kd_loss": 0.34934306121722897, + "learning_rate": 3e-06, + "loss": 0.5747, + "masked_tokens": 110.075, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.075 + }, + { + "avg_mask_ratio": 0.4842760307248682, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.4842760307248682, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5090123614077584, + "epoch": 0.4053333333333333, + "grad_norm": 1.6171875, + "kd_loss": 0.43204482231294605, + "learning_rate": 3e-06, + "loss": 0.7055, + "masked_tokens": 109.5875, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.5875 + }, + { + "avg_mask_ratio": 0.4665210062637925, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4665210062637925, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27265903051802526, + "epoch": 0.4266666666666667, + "grad_norm": 0.318359375, + "kd_loss": 0.357759011555504, + "learning_rate": 3e-06, + "loss": 0.5013, + "masked_tokens": 97.0125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.0125 + }, + { + "avg_mask_ratio": 0.45183838629163803, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.45183838629163803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.21031314329709402, + "epoch": 0.448, + "grad_norm": 0.8359375, + "kd_loss": 0.38585986606940426, + "learning_rate": 3e-06, + "loss": 0.5103, + "masked_tokens": 96.0125, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.0125 + }, + { + "avg_mask_ratio": 0.5396152360364794, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5396152360364794, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6178499971098063, + "epoch": 0.4693333333333333, + "grad_norm": 0.96875, + "kd_loss": 0.46674597742967305, + "learning_rate": 3e-06, + "loss": 0.8495, + "masked_tokens": 125.35, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 125.35 + }, + { + "avg_mask_ratio": 0.4403991688624956, + "avg_response_length": 252.7, + "avg_student_mask_ratio": 0.4403991688624956, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25455478885055527, + "epoch": 0.49066666666666664, + "grad_norm": 0.5703125, + "kd_loss": 0.43305868929596725, + "learning_rate": 3e-06, + "loss": 0.6428, + "masked_tokens": 107.325, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.47419991258066146, + "avg_response_length": 212.85, + "avg_student_mask_ratio": 0.47419991258066146, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32057130943685763, + "epoch": 0.512, + "grad_norm": 0.43359375, + "kd_loss": 0.5083060303753086, + "learning_rate": 3e-06, + "loss": 0.6986, + "masked_tokens": 106.9, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 106.9 + }, + { + "avg_mask_ratio": 0.4464209079160355, + "avg_response_length": 243.475, + "avg_student_mask_ratio": 0.4464209079160355, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33636454603331456, + "epoch": 0.5333333333333333, + "grad_norm": 0.1142578125, + "kd_loss": 0.41649795620701296, + "learning_rate": 3e-06, + "loss": 0.5666, + "masked_tokens": 112.7375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.7375 + }, + { + "avg_mask_ratio": 0.4520751796895638, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.4520751796895638, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37478437887749577, + "epoch": 0.5546666666666666, + "grad_norm": 0.328125, + "kd_loss": 0.31532439299670545, + "learning_rate": 3e-06, + "loss": 0.5129, + "masked_tokens": 109.6375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 109.6375 + }, + { + "avg_mask_ratio": 0.5305180630879477, + "avg_response_length": 224.45, + "avg_student_mask_ratio": 0.5305180630879477, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42709534656005416, + "epoch": 0.576, + "grad_norm": 0.7578125, + "kd_loss": 0.5525495689224045, + "learning_rate": 3e-06, + "loss": 0.812, + "masked_tokens": 120.475, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 120.475 + }, + { + "avg_mask_ratio": 0.46451686368091033, + "avg_response_length": 254.825, + "avg_student_mask_ratio": 0.46451686368091033, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31382316479499084, + "epoch": 0.5973333333333334, + "grad_norm": 0.90234375, + "kd_loss": 0.3957495673693458, + "learning_rate": 3e-06, + "loss": 0.6028, + "masked_tokens": 129.225, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 129.225 + }, + { + "avg_mask_ratio": 0.389662017847877, + "avg_response_length": 245.9125, + "avg_student_mask_ratio": 0.389662017847877, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.23645576389110373, + "epoch": 0.6186666666666667, + "grad_norm": 0.302734375, + "kd_loss": 0.27728830450374853, + "learning_rate": 3e-06, + "loss": 0.4314, + "masked_tokens": 99.8625, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.44417000194080175, + "avg_response_length": 217.0375, + "avg_student_mask_ratio": 0.44417000194080175, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3240562055096575, + "epoch": 0.64, + "grad_norm": 1.09375, + "kd_loss": 0.31930388437995133, + "learning_rate": 3e-06, + "loss": 0.5264, + "masked_tokens": 104.625, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 104.625 + }, + { + "avg_mask_ratio": 0.4706685543409549, + "avg_response_length": 175.45, + "avg_student_mask_ratio": 0.4706685543409549, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34333510200582396, + "epoch": 0.6613333333333333, + "grad_norm": 1.234375, + "kd_loss": 0.5067149527083984, + "learning_rate": 3e-06, + "loss": 0.6534, + "masked_tokens": 84.875, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 84.875 + }, + { + "avg_mask_ratio": 0.4974605386145413, + "avg_response_length": 234.7875, + "avg_student_mask_ratio": 0.4974605386145413, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34462752127872137, + "epoch": 0.6826666666666666, + "grad_norm": 0.333984375, + "kd_loss": 0.3942846609736307, + "learning_rate": 3e-06, + "loss": 0.7133, + "masked_tokens": 119.6, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.5112370474264025, + "avg_response_length": 236.0625, + "avg_student_mask_ratio": 0.5112370474264025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2974585796398969, + "epoch": 0.704, + "grad_norm": 0.44140625, + "kd_loss": 0.4301003347501496, + "learning_rate": 3e-06, + "loss": 0.6754, + "masked_tokens": 129.425, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 129.425 + }, + { + "avg_mask_ratio": 0.44370225080056114, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44370225080056114, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3732590021626493, + "epoch": 0.7253333333333334, + "grad_norm": 0.98046875, + "kd_loss": 0.4610515360019235, + "learning_rate": 3e-06, + "loss": 0.6627, + "masked_tokens": 108.775, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 108.775 + }, + { + "avg_mask_ratio": 0.49959173843380994, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.49959173843380994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48515336151417615, + "epoch": 0.7466666666666667, + "grad_norm": 0.92578125, + "kd_loss": 0.5031771080357654, + "learning_rate": 3e-06, + "loss": 0.7668, + "masked_tokens": 125.625, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.625 + }, + { + "avg_mask_ratio": 0.4744729608530179, + "avg_response_length": 246.1625, + "avg_student_mask_ratio": 0.4744729608530179, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3005135279950082, + "epoch": 0.768, + "grad_norm": 0.169921875, + "kd_loss": 0.5216399239409879, + "learning_rate": 3e-06, + "loss": 0.6077, + "masked_tokens": 116.875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.875 + }, + { + "avg_mask_ratio": 0.4738045462174341, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.4738045462174341, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5349442186782426, + "epoch": 0.7893333333333333, + "grad_norm": 0.201171875, + "kd_loss": 0.6039233199480805, + "learning_rate": 3e-06, + "loss": 0.7196, + "masked_tokens": 127.4625, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 127.4625 + }, + { + "avg_mask_ratio": 0.4512475330149755, + "avg_response_length": 209.8, + "avg_student_mask_ratio": 0.4512475330149755, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.19145508916275275, + "epoch": 0.8106666666666666, + "grad_norm": 0.6875, + "kd_loss": 0.4029755606519984, + "learning_rate": 3e-06, + "loss": 0.5055, + "masked_tokens": 100.8375, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 100.8375 + }, + { + "avg_mask_ratio": 0.4752940105390735, + "avg_response_length": 219.5625, + "avg_student_mask_ratio": 0.4752940105390735, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4267096655552223, + "epoch": 0.832, + "grad_norm": 0.2578125, + "kd_loss": 0.4655849843487971, + "learning_rate": 3e-06, + "loss": 0.6749, + "masked_tokens": 112.375, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 112.375 + }, + { + "avg_mask_ratio": 0.47461870914557946, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.47461870914557946, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27868834779033025, + "epoch": 0.8533333333333334, + "grad_norm": 0.640625, + "kd_loss": 0.5299579592951205, + "learning_rate": 3e-06, + "loss": 0.6538, + "masked_tokens": 120.4125, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 120.4125 + }, + { + "avg_mask_ratio": 0.48321815438685006, + "avg_response_length": 228.15, + "avg_student_mask_ratio": 0.48321815438685006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.43057951200541994, + "epoch": 0.8746666666666667, + "grad_norm": 0.5390625, + "kd_loss": 0.504674318619719, + "learning_rate": 3e-06, + "loss": 0.7381, + "masked_tokens": 119.0, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 119.0 + }, + { + "avg_mask_ratio": 0.4379329536575824, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4379329536575824, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.132674143492045, + "epoch": 0.896, + "grad_norm": 1.09375, + "kd_loss": 0.27731474525324984, + "learning_rate": 3e-06, + "loss": 0.3953, + "masked_tokens": 85.525, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 85.525 + }, + { + "avg_mask_ratio": 0.4674084897618741, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4674084897618741, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37605725416574387, + "epoch": 0.9173333333333333, + "grad_norm": 0.43359375, + "kd_loss": 0.49442086774362226, + "learning_rate": 3e-06, + "loss": 0.6699, + "masked_tokens": 104.5625, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 104.5625 + }, + { + "avg_mask_ratio": 0.4415457699564286, + "avg_response_length": 241.0875, + "avg_student_mask_ratio": 0.4415457699564286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3754083825901603, + "epoch": 0.9386666666666666, + "grad_norm": 0.6328125, + "kd_loss": 0.45159815376919143, + "learning_rate": 3e-06, + "loss": 0.6585, + "masked_tokens": 113.0875, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 113.0875 + }, + { + "avg_mask_ratio": 0.42486972180195154, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42486972180195154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32457938515717616, + "epoch": 0.96, + "grad_norm": 0.6953125, + "kd_loss": 0.4011907008050457, + "learning_rate": 3e-06, + "loss": 0.5644, + "masked_tokens": 103.4, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.4 + }, + { + "avg_mask_ratio": 0.47578654896933587, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.47578654896933587, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32885359905767475, + "epoch": 0.9813333333333333, + "grad_norm": 0.16015625, + "kd_loss": 0.44463847501747294, + "learning_rate": 3e-06, + "loss": 0.635, + "masked_tokens": 105.3125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 105.3125 + }, + { + "avg_mask_ratio": 0.4782901787132557, + "avg_response_length": 224.0952380952381, + "avg_student_mask_ratio": 0.4782901787132557, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3393430382851702, + "epoch": 1.0042666666666666, + "grad_norm": 0.65625, + "kd_loss": 0.5178591865708675, + "learning_rate": 3e-06, + "loss": 0.7769, + "masked_tokens": 107.23809523809524, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 107.23809523809524 + }, + { + "avg_mask_ratio": 0.47575968883465974, + "avg_response_length": 249.4125, + "avg_student_mask_ratio": 0.47575968883465974, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.44613247805159517, + "epoch": 1.0256, + "grad_norm": 0.498046875, + "kd_loss": 0.5374264506522252, + "learning_rate": 3e-06, + "loss": 0.6772, + "masked_tokens": 118.35, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 118.35 + }, + { + "avg_mask_ratio": 0.4563717324635945, + "avg_response_length": 232.0375, + "avg_student_mask_ratio": 0.4563717324635945, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37626147485414096, + "epoch": 1.0469333333333333, + "grad_norm": 0.54296875, + "kd_loss": 0.392788901903657, + "learning_rate": 3e-06, + "loss": 0.6047, + "masked_tokens": 98.35, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.35 + }, + { + "avg_mask_ratio": 0.5079968665260821, + "avg_response_length": 253.7875, + "avg_student_mask_ratio": 0.5079968665260821, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.30954629559880686, + "epoch": 1.0682666666666667, + "grad_norm": 0.291015625, + "kd_loss": 0.4563873354276211, + "learning_rate": 3e-06, + "loss": 0.5996, + "masked_tokens": 128.225, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 128.225 + }, + { + "avg_mask_ratio": 0.5109448074479588, + "avg_response_length": 254.2, + "avg_student_mask_ratio": 0.5109448074479588, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2868076219221166, + "epoch": 1.0896, + "grad_norm": 2.515625, + "kd_loss": 0.5652106747879998, + "learning_rate": 3e-06, + "loss": 0.6398, + "masked_tokens": 137.5875, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 137.5875 + }, + { + "avg_mask_ratio": 0.45396183808334173, + "avg_response_length": 202.7625, + "avg_student_mask_ratio": 0.45396183808334173, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38311037250946356, + "epoch": 1.1109333333333333, + "grad_norm": 0.6171875, + "kd_loss": 0.423658079797778, + "learning_rate": 3e-06, + "loss": 0.6386, + "masked_tokens": 87.0625, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 87.0625 + }, + { + "avg_mask_ratio": 0.47015948037151245, + "avg_response_length": 214.275, + "avg_student_mask_ratio": 0.47015948037151245, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.47228433731506814, + "epoch": 1.1322666666666668, + "grad_norm": 0.609375, + "kd_loss": 0.45688082203427316, + "learning_rate": 3e-06, + "loss": 0.737, + "masked_tokens": 99.8625, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.4892866689246148, + "avg_response_length": 231.3125, + "avg_student_mask_ratio": 0.4892866689246148, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4080867745911064, + "epoch": 1.1536, + "grad_norm": 0.341796875, + "kd_loss": 0.5618651450654625, + "learning_rate": 3e-06, + "loss": 0.6922, + "masked_tokens": 107.375, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.4541942774085328, + "avg_response_length": 213.525, + "avg_student_mask_ratio": 0.4541942774085328, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22217674175137744, + "epoch": 1.1749333333333334, + "grad_norm": 0.2412109375, + "kd_loss": 0.3673438885498399, + "learning_rate": 3e-06, + "loss": 0.5008, + "masked_tokens": 97.8875, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 97.8875 + }, + { + "avg_mask_ratio": 0.39282396506750955, + "avg_response_length": 231.4125, + "avg_student_mask_ratio": 0.39282396506750955, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3512847523151777, + "epoch": 1.1962666666666666, + "grad_norm": 0.8828125, + "kd_loss": 0.48686740984790616, + "learning_rate": 3e-06, + "loss": 0.5823, + "masked_tokens": 99.2875, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 99.2875 + }, + { + "avg_mask_ratio": 0.4483634108910337, + "avg_response_length": 230.1625, + "avg_student_mask_ratio": 0.4483634108910337, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31345968546206676, + "epoch": 1.2176, + "grad_norm": 0.4453125, + "kd_loss": 0.41564053312727084, + "learning_rate": 3e-06, + "loss": 0.5898, + "masked_tokens": 108.9875, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 108.9875 + }, + { + "avg_mask_ratio": 0.465452536707744, + "avg_response_length": 267.4375, + "avg_student_mask_ratio": 0.465452536707744, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3618907347364768, + "epoch": 1.2389333333333332, + "grad_norm": 8.6875, + "kd_loss": 0.4481006292516895, + "learning_rate": 3e-06, + "loss": 0.6314, + "masked_tokens": 129.075, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 129.075 + }, + { + "avg_mask_ratio": 0.5225977989146486, + "avg_response_length": 228.45, + "avg_student_mask_ratio": 0.5225977989146486, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5639314363695348, + "epoch": 1.2602666666666666, + "grad_norm": 1.1328125, + "kd_loss": 0.5351108588445992, + "learning_rate": 3e-06, + "loss": 0.8274, + "masked_tokens": 121.675, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 121.675 + }, + { + "avg_mask_ratio": 0.44998724836623294, + "avg_response_length": 236.7, + "avg_student_mask_ratio": 0.44998724836623294, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3396833263838971, + "epoch": 1.2816, + "grad_norm": 0.365234375, + "kd_loss": 0.41761890975592914, + "learning_rate": 3e-06, + "loss": 0.5752, + "masked_tokens": 110.1625, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 110.1625 + }, + { + "avg_mask_ratio": 0.5042130865273066, + "avg_response_length": 230.3375, + "avg_student_mask_ratio": 0.5042130865273066, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.35890077192343595, + "epoch": 1.3029333333333333, + "grad_norm": 0.28515625, + "kd_loss": 0.5558427174539929, + "learning_rate": 3e-06, + "loss": 0.7657, + "masked_tokens": 112.625, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 112.625 + }, + { + "avg_mask_ratio": 0.49637898594373836, + "avg_response_length": 233.0625, + "avg_student_mask_ratio": 0.49637898594373836, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32318839170733327, + "epoch": 1.3242666666666667, + "grad_norm": 0.515625, + "kd_loss": 0.5518322235134179, + "learning_rate": 3e-06, + "loss": 0.6742, + "masked_tokens": 111.25, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 111.25 + }, + { + "avg_mask_ratio": 0.5177568581304512, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.5177568581304512, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5710563842050931, + "epoch": 1.3456000000000001, + "grad_norm": 1.3515625, + "kd_loss": 0.5316411310721378, + "learning_rate": 3e-06, + "loss": 0.8598, + "masked_tokens": 129.6125, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 129.6125 + }, + { + "avg_mask_ratio": 0.48226998368045315, + "avg_response_length": 237.7125, + "avg_student_mask_ratio": 0.48226998368045315, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2804489129174499, + "epoch": 1.3669333333333333, + "grad_norm": 0.2421875, + "kd_loss": 0.3663112932188085, + "learning_rate": 3e-06, + "loss": 0.4584, + "masked_tokens": 120.275, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 120.275 + }, + { + "avg_mask_ratio": 0.5306948523037136, + "avg_response_length": 238.0125, + "avg_student_mask_ratio": 0.5306948523037136, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.475157093159612, + "epoch": 1.3882666666666665, + "grad_norm": 1.8125, + "kd_loss": 0.5062341513834724, + "learning_rate": 3e-06, + "loss": 0.7115, + "masked_tokens": 133.25, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 133.25 + }, + { + "avg_mask_ratio": 0.4821273953886703, + "avg_response_length": 247.775, + "avg_student_mask_ratio": 0.4821273953886703, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41770620119971225, + "epoch": 1.4096, + "grad_norm": 0.9375, + "kd_loss": 0.425496905214095, + "learning_rate": 3e-06, + "loss": 0.6361, + "masked_tokens": 128.875, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 128.875 + }, + { + "avg_mask_ratio": 0.46056515555246735, + "avg_response_length": 240.4375, + "avg_student_mask_ratio": 0.46056515555246735, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.24846992658117414, + "epoch": 1.4309333333333334, + "grad_norm": 0.60546875, + "kd_loss": 0.34861083538812637, + "learning_rate": 3e-06, + "loss": 0.5112, + "masked_tokens": 119.85, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 119.85 + }, + { + "avg_mask_ratio": 0.4666106043441687, + "avg_response_length": 226.7375, + "avg_student_mask_ratio": 0.4666106043441687, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4541423492493323, + "epoch": 1.4522666666666666, + "grad_norm": 0.51953125, + "kd_loss": 0.4910934407485213, + "learning_rate": 3e-06, + "loss": 0.6946, + "masked_tokens": 107.4625, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4790851596510038, + "avg_response_length": 202.05, + "avg_student_mask_ratio": 0.4790851596510038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3711260147189023, + "epoch": 1.4736, + "grad_norm": 2.03125, + "kd_loss": 0.41718243765291446, + "learning_rate": 3e-06, + "loss": 0.6313, + "masked_tokens": 111.3125, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 111.3125 + }, + { + "avg_mask_ratio": 0.5250519359949977, + "avg_response_length": 228.125, + "avg_student_mask_ratio": 0.5250519359949977, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22230932631540554, + "epoch": 1.4949333333333334, + "grad_norm": 0.26171875, + "kd_loss": 0.6619142963969352, + "learning_rate": 3e-06, + "loss": 0.7717, + "masked_tokens": 132.55, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 132.55 + }, + { + "avg_mask_ratio": 0.4790433386107907, + "avg_response_length": 212.5, + "avg_student_mask_ratio": 0.4790433386107907, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.24621229091012536, + "epoch": 1.5162666666666667, + "grad_norm": 0.2099609375, + "kd_loss": 0.43454050603151584, + "learning_rate": 3e-06, + "loss": 0.5302, + "masked_tokens": 108.7375, + "mean_t": 0.5135623761918395, + "step": 710, + "student_masked_tokens": 108.7375 + }, + { + "avg_mask_ratio": 0.47950589570682495, + "avg_response_length": 227.075, + "avg_student_mask_ratio": 0.47950589570682495, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.36416104665024707, + "epoch": 1.5375999999999999, + "grad_norm": 0.75, + "kd_loss": 0.5665610315164941, + "learning_rate": 3e-06, + "loss": 0.7121, + "masked_tokens": 110.8, + "mean_t": 0.5117021896177902, + "step": 720, + "student_masked_tokens": 110.8 + }, + { + "avg_mask_ratio": 0.4604924251558259, + "avg_response_length": 232.925, + "avg_student_mask_ratio": 0.4604924251558259, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38923927966282007, + "epoch": 1.5589333333333333, + "grad_norm": 1.015625, + "kd_loss": 0.4302867329986782, + "learning_rate": 3e-06, + "loss": 0.639, + "masked_tokens": 104.9625, + "mean_t": 0.49050743713742123, + "step": 730, + "student_masked_tokens": 104.9625 + }, + { + "avg_mask_ratio": 0.5185885130194947, + "avg_response_length": 183.325, + "avg_student_mask_ratio": 0.5185885130194947, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3361817517367399, + "epoch": 1.5802666666666667, + "grad_norm": 0.40234375, + "kd_loss": 0.5340734164818514, + "learning_rate": 3e-06, + "loss": 0.7461, + "masked_tokens": 97.125, + "mean_t": 0.5505168779753149, + "step": 740, + "student_masked_tokens": 97.125 + }, + { + "avg_mask_ratio": 0.4191439319110941, + "avg_response_length": 223.65, + "avg_student_mask_ratio": 0.4191439319110941, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37429177601145514, + "epoch": 1.6016, + "grad_norm": 0.58203125, + "kd_loss": 0.5036597276406856, + "learning_rate": 3e-06, + "loss": 0.6491, + "masked_tokens": 95.3125, + "mean_t": 0.4437690361432033, + "step": 750, + "student_masked_tokens": 95.3125 + }, + { + "avg_mask_ratio": 0.46706983938929625, + "avg_response_length": 216.0625, + "avg_student_mask_ratio": 0.46706983938929625, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4449058656399984, + "epoch": 1.6229333333333333, + "grad_norm": 0.8203125, + "kd_loss": 0.5661326096985168, + "learning_rate": 3e-06, + "loss": 0.7233, + "masked_tokens": 107.7, + "mean_t": 0.49132869170280175, + "step": 760, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.44156218122225255, + "avg_response_length": 259.675, + "avg_student_mask_ratio": 0.44156218122225255, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25899335961771613, + "epoch": 1.6442666666666668, + "grad_norm": 0.396484375, + "kd_loss": 0.4095979654902003, + "learning_rate": 3e-06, + "loss": 0.5099, + "masked_tokens": 117.5, + "mean_t": 0.4667695587326307, + "step": 770, + "student_masked_tokens": 117.5 + }, + { + "avg_mask_ratio": 0.42836043585848527, + "avg_response_length": 258.5125, + "avg_student_mask_ratio": 0.42836043585848527, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2897560694203321, + "epoch": 1.6656, + "grad_norm": 0.2431640625, + "kd_loss": 0.34635278815572546, + "learning_rate": 3e-06, + "loss": 0.4802, + "masked_tokens": 119.0125, + "mean_t": 0.44942845597106496, + "step": 780, + "student_masked_tokens": 119.0125 + }, + { + "avg_mask_ratio": 0.46589430308085866, + "avg_response_length": 222.3125, + "avg_student_mask_ratio": 0.46589430308085866, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.21603642557238345, + "epoch": 1.6869333333333332, + "grad_norm": 0.140625, + "kd_loss": 0.33674514803767297, + "learning_rate": 3e-06, + "loss": 0.489, + "masked_tokens": 103.25, + "mean_t": 0.4993515375303105, + "step": 790, + "student_masked_tokens": 103.25 + }, + { + "avg_mask_ratio": 0.46366424662992356, + "avg_response_length": 219.6875, + "avg_student_mask_ratio": 0.46366424662992356, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2663005536277069, + "epoch": 1.7082666666666668, + "grad_norm": 0.23828125, + "kd_loss": 0.35138718315538425, + "learning_rate": 3e-06, + "loss": 0.5434, + "masked_tokens": 104.5, + "mean_t": 0.500370389316231, + "step": 800, + "student_masked_tokens": 104.5 + }, + { + "avg_mask_ratio": 0.503375941584818, + "avg_response_length": 237.85, + "avg_student_mask_ratio": 0.503375941584818, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4813590554784753, + "epoch": 1.7296, + "grad_norm": 1.6015625, + "kd_loss": 0.45312339970045057, + "learning_rate": 3e-06, + "loss": 0.706, + "masked_tokens": 118.2125, + "mean_t": 0.5317009104182944, + "step": 810, + "student_masked_tokens": 118.2125 + }, + { + "avg_mask_ratio": 0.5110091455746442, + "avg_response_length": 209.0875, + "avg_student_mask_ratio": 0.5110091455746442, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4535834654417954, + "epoch": 1.7509333333333332, + "grad_norm": 0.70703125, + "kd_loss": 0.5985253949772413, + "learning_rate": 3e-06, + "loss": 0.7794, + "masked_tokens": 120.95, + "mean_t": 0.5392061032878701, + "step": 820, + "student_masked_tokens": 120.95 + }, + { + "avg_mask_ratio": 0.49899387182667854, + "avg_response_length": 263.975, + "avg_student_mask_ratio": 0.49899387182667854, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.40083689643704473, + "epoch": 1.7722666666666667, + "grad_norm": 0.1708984375, + "kd_loss": 0.5644028104892641, + "learning_rate": 3e-06, + "loss": 0.7632, + "masked_tokens": 137.075, + "mean_t": 0.5238314627087675, + "step": 830, + "student_masked_tokens": 137.075 + }, + { + "avg_mask_ratio": 0.4997270987310912, + "avg_response_length": 221.9, + "avg_student_mask_ratio": 0.4997270987310912, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2870929398425915, + "epoch": 1.7936, + "grad_norm": 0.345703125, + "kd_loss": 0.4698917509396324, + "learning_rate": 3e-06, + "loss": 0.6327, + "masked_tokens": 114.525, + "mean_t": 0.5301066277665086, + "step": 840, + "student_masked_tokens": 114.525 + }, + { + "avg_mask_ratio": 0.4988076956477016, + "avg_response_length": 225.5, + "avg_student_mask_ratio": 0.4988076956477016, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3023421537889817, + "epoch": 1.8149333333333333, + "grad_norm": 0.443359375, + "kd_loss": 0.3271854338312551, + "learning_rate": 3e-06, + "loss": 0.5634, + "masked_tokens": 116.9125, + "mean_t": 0.5343429344706238, + "step": 850, + "student_masked_tokens": 116.9125 + }, + { + "avg_mask_ratio": 0.4635998342186213, + "avg_response_length": 229.125, + "avg_student_mask_ratio": 0.4635998342186213, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37467331880507115, + "epoch": 1.8362666666666667, + "grad_norm": 0.384765625, + "kd_loss": 0.4431717619034316, + "learning_rate": 3e-06, + "loss": 0.5956, + "masked_tokens": 109.675, + "mean_t": 0.4791536889737472, + "step": 860, + "student_masked_tokens": 109.675 + }, + { + "avg_mask_ratio": 0.49111039767740294, + "avg_response_length": 229.1, + "avg_student_mask_ratio": 0.49111039767740294, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3838037288314126, + "epoch": 1.8576000000000001, + "grad_norm": 0.333984375, + "kd_loss": 0.47523635068355363, + "learning_rate": 3e-06, + "loss": 0.6859, + "masked_tokens": 115.6625, + "mean_t": 0.5203817339061061, + "step": 870, + "student_masked_tokens": 115.6625 + }, + { + "avg_mask_ratio": 0.4427660425659269, + "avg_response_length": 198.5625, + "avg_student_mask_ratio": 0.4427660425659269, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33109274096627817, + "epoch": 1.8789333333333333, + "grad_norm": 1.0859375, + "kd_loss": 0.46695662873548827, + "learning_rate": 3e-06, + "loss": 0.6284, + "masked_tokens": 91.175, + "mean_t": 0.4875184997683391, + "step": 880, + "student_masked_tokens": 91.175 + }, + { + "avg_mask_ratio": 0.4464349385118112, + "avg_response_length": 225.8375, + "avg_student_mask_ratio": 0.4464349385118112, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22229116438190885, + "epoch": 1.9002666666666665, + "grad_norm": 0.12890625, + "kd_loss": 0.4006316699657759, + "learning_rate": 3e-06, + "loss": 0.4934, + "masked_tokens": 101.75, + "mean_t": 0.4766692223958671, + "step": 890, + "student_masked_tokens": 101.75 + }, + { + "avg_mask_ratio": 0.44976164362160487, + "avg_response_length": 227.7875, + "avg_student_mask_ratio": 0.44976164362160487, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38169105723031577, + "epoch": 1.9216, + "grad_norm": 1.765625, + "kd_loss": 0.47280531010078086, + "learning_rate": 3e-06, + "loss": 0.6337, + "masked_tokens": 103.475, + "mean_t": 0.487134758150205, + "step": 900, + "student_masked_tokens": 103.475 + }, + { + "avg_mask_ratio": 0.475579984736396, + "avg_response_length": 245.1625, + "avg_student_mask_ratio": 0.475579984736396, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27549623605577833, + "epoch": 1.9429333333333334, + "grad_norm": 0.451171875, + "kd_loss": 0.4638562942510987, + "learning_rate": 3e-06, + "loss": 0.5387, + "masked_tokens": 124.6375, + "mean_t": 0.5027793228859082, + "step": 910, + "student_masked_tokens": 124.6375 + }, + { + "avg_mask_ratio": 0.4688875659601763, + "avg_response_length": 226.2875, + "avg_student_mask_ratio": 0.4688875659601763, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2772836374151325, + "epoch": 1.9642666666666666, + "grad_norm": 0.416015625, + "kd_loss": 0.44530672791033793, + "learning_rate": 3e-06, + "loss": 0.6177, + "masked_tokens": 110.0125, + "mean_t": 0.49417946098838, + "step": 920, + "student_masked_tokens": 110.0125 + }, + { + "avg_mask_ratio": 0.47999348094454036, + "avg_response_length": 237.05, + "avg_student_mask_ratio": 0.47999348094454036, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2901802834984665, + "epoch": 1.9856, + "grad_norm": 0.37890625, + "kd_loss": 0.4553093938939094, + "learning_rate": 3e-06, + "loss": 0.5905, + "masked_tokens": 121.6, + "mean_t": 0.5045580042526125, + "step": 930, + "student_masked_tokens": 121.6 + }, + { + "avg_mask_ratio": 0.49413903727240505, + "avg_response_length": 224.79761904761904, + "avg_student_mask_ratio": 0.49413903727240505, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37941894131193166, + "epoch": 2.0085333333333333, + "grad_norm": 0.4921875, + "kd_loss": 0.4946319753903075, + "learning_rate": 3e-06, + "loss": 0.6668, + "masked_tokens": 120.5, + "mean_t": 0.5321138524893849, + "step": 940, + "student_masked_tokens": 120.5 + }, + { + "avg_mask_ratio": 0.4368605303927325, + "avg_response_length": 240.9125, + "avg_student_mask_ratio": 0.4368605303927325, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22575005246883392, + "epoch": 2.0298666666666665, + "grad_norm": 1.1875, + "kd_loss": 0.4342805288508771, + "learning_rate": 3e-06, + "loss": 0.5248, + "masked_tokens": 111.4125, + "mean_t": 0.4632946296595037, + "step": 950, + "student_masked_tokens": 111.4125 + }, + { + "avg_mask_ratio": 0.4988762516761199, + "avg_response_length": 275.3, + "avg_student_mask_ratio": 0.4988762516761199, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.49722497609602667, + "epoch": 2.0512, + "grad_norm": 0.40625, + "kd_loss": 0.5839257182941765, + "learning_rate": 3e-06, + "loss": 0.7523, + "masked_tokens": 143.825, + "mean_t": 0.5198000721400604, + "step": 960, + "student_masked_tokens": 143.825 + }, + { + "avg_mask_ratio": 0.437801384011982, + "avg_response_length": 236.2375, + "avg_student_mask_ratio": 0.437801384011982, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2855980422358698, + "epoch": 2.0725333333333333, + "grad_norm": 0.4765625, + "kd_loss": 0.35673561348757377, + "learning_rate": 3e-06, + "loss": 0.538, + "masked_tokens": 107.025, + "mean_t": 0.4703940597362816, + "step": 970, + "student_masked_tokens": 107.025 + }, + { + "avg_mask_ratio": 0.42220073882490394, + "avg_response_length": 230.8625, + "avg_student_mask_ratio": 0.42220073882490394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2906558813129777, + "epoch": 2.0938666666666665, + "grad_norm": 0.466796875, + "kd_loss": 0.36284122784349504, + "learning_rate": 3e-06, + "loss": 0.4889, + "masked_tokens": 97.7875, + "mean_t": 0.4511947895749472, + "step": 980, + "student_masked_tokens": 97.7875 + }, + { + "avg_mask_ratio": 0.4605769342277199, + "avg_response_length": 262.0375, + "avg_student_mask_ratio": 0.4605769342277199, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.18629460762591635, + "epoch": 2.1152, + "grad_norm": 0.625, + "kd_loss": 0.4187604939788798, + "learning_rate": 3e-06, + "loss": 0.5063, + "masked_tokens": 122.0, + "mean_t": 0.4923786667350214, + "step": 990, + "student_masked_tokens": 122.0 + }, + { + "avg_mask_ratio": 0.4547682981239632, + "avg_response_length": 215.3, + "avg_student_mask_ratio": 0.4547682981239632, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.26735156250199454, + "epoch": 2.1365333333333334, + "grad_norm": 0.26953125, + "kd_loss": 0.3440752963605235, + "learning_rate": 3e-06, + "loss": 0.5169, + "masked_tokens": 100.775, + "mean_t": 0.4773523230338469, + "step": 1000, + "student_masked_tokens": 100.775 + }, + { + "avg_mask_ratio": 0.43540415074676275, + "avg_response_length": 215.0, + "avg_student_mask_ratio": 0.43540415074676275, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.28448459618934974, + "epoch": 2.1578666666666666, + "grad_norm": 0.2216796875, + "kd_loss": 0.36393369872412384, + "learning_rate": 3e-06, + "loss": 0.503, + "masked_tokens": 88.65, + "mean_t": 0.4648138735938119, + "step": 1010, + "student_masked_tokens": 88.65 + }, + { + "avg_mask_ratio": 0.5063220548443497, + "avg_response_length": 206.9125, + "avg_student_mask_ratio": 0.5063220548443497, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3141316814458378, + "epoch": 2.1792, + "grad_norm": 0.328125, + "kd_loss": 0.49756694839059035, + "learning_rate": 3e-06, + "loss": 0.7133, + "masked_tokens": 110.25, + "mean_t": 0.5327763411332853, + "step": 1020, + "student_masked_tokens": 110.25 + }, + { + "avg_mask_ratio": 0.46985941788880153, + "avg_response_length": 220.05, + "avg_student_mask_ratio": 0.46985941788880153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4267214318231197, + "epoch": 2.2005333333333335, + "grad_norm": 0.423828125, + "kd_loss": 0.4489077641891422, + "learning_rate": 3e-06, + "loss": 0.6384, + "masked_tokens": 104.9, + "mean_t": 0.5033508580760099, + "step": 1030, + "student_masked_tokens": 104.9 + }, + { + "avg_mask_ratio": 0.49566771630197765, + "avg_response_length": 213.7, + "avg_student_mask_ratio": 0.49566771630197765, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2765686680849626, + "epoch": 2.2218666666666667, + "grad_norm": 0.74609375, + "kd_loss": 0.5419906556950081, + "learning_rate": 3e-06, + "loss": 0.6686, + "masked_tokens": 100.35, + "mean_t": 0.5349024560535327, + "step": 1040, + "student_masked_tokens": 100.35 + }, + { + "avg_mask_ratio": 0.5123252369463444, + "avg_response_length": 239.1125, + "avg_student_mask_ratio": 0.5123252369463444, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3092239250220928, + "epoch": 2.2432, + "grad_norm": 0.412109375, + "kd_loss": 0.5601085751741266, + "learning_rate": 3e-06, + "loss": 0.6416, + "masked_tokens": 123.0, + "mean_t": 0.5457118917722255, + "step": 1050, + "student_masked_tokens": 123.0 + }, + { + "avg_mask_ratio": 0.46218636581033934, + "avg_response_length": 273.7875, + "avg_student_mask_ratio": 0.46218636581033934, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33267747830594485, + "epoch": 2.2645333333333335, + "grad_norm": 0.3984375, + "kd_loss": 0.49028674410892564, + "learning_rate": 3e-06, + "loss": 0.5827, + "masked_tokens": 122.125, + "mean_t": 0.48194136443780733, + "step": 1060, + "student_masked_tokens": 122.125 + }, + { + "avg_mask_ratio": 0.48546303423354403, + "avg_response_length": 260.8125, + "avg_student_mask_ratio": 0.48546303423354403, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4495345233380249, + "epoch": 2.2858666666666667, + "grad_norm": 0.345703125, + "kd_loss": 0.422707377332182, + "learning_rate": 3e-06, + "loss": 0.6038, + "masked_tokens": 138.0375, + "mean_t": 0.5015889146190602, + "step": 1070, + "student_masked_tokens": 138.0375 + }, + { + "avg_mask_ratio": 0.4625250873621553, + "avg_response_length": 219.8875, + "avg_student_mask_ratio": 0.4625250873621553, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3075333785695989, + "epoch": 2.3072, + "grad_norm": 0.087890625, + "kd_loss": 0.3667106795892323, + "learning_rate": 3e-06, + "loss": 0.4549, + "masked_tokens": 100.5625, + "mean_t": 0.4983203248586506, + "step": 1080, + "student_masked_tokens": 100.5625 + }, + { + "avg_mask_ratio": 0.44248262273031286, + "avg_response_length": 213.55, + "avg_student_mask_ratio": 0.44248262273031286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.24968633314620092, + "epoch": 2.3285333333333336, + "grad_norm": 0.11572265625, + "kd_loss": 0.3935246549681978, + "learning_rate": 3e-06, + "loss": 0.5053, + "masked_tokens": 91.9125, + "mean_t": 0.47094749807147307, + "step": 1090, + "student_masked_tokens": 91.9125 + }, + { + "avg_mask_ratio": 0.5204601250356063, + "avg_response_length": 246.1125, + "avg_student_mask_ratio": 0.5204601250356063, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.40365264619552477, + "epoch": 2.3498666666666668, + "grad_norm": 0.37109375, + "kd_loss": 0.4355207666182196, + "learning_rate": 3e-06, + "loss": 0.6746, + "masked_tokens": 133.1875, + "mean_t": 0.5531192034482956, + "step": 1100, + "student_masked_tokens": 133.1875 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/training_args.bin b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6d9cde5cbc4fedb8fe690f2f561a165bf19ae18c --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ae704e41a4a62f6ca56789c45ce45887326cd2f8d1e97e398e5ada4a93398c +size 8312 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/README.md b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/adapter_config.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9d0f61db13b9b0a2854984efa3e7c726f3e0dbe1 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "up_proj", + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/adapter_model.safetensors b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0366afdc407c724600d79c9560e0dc3b4cc594dd --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb291f85216472941823a02c50c071b544197be99565c1a4cfeb03ab8292d425 +size 2406624648 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/optimizer.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f2e111ad8f8f43235277e23683a4cf2c8aee664 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d2d46298a65b9c03cf05313711e1a42dabb2761fb50ad9c342ca544cbbbd5d6 +size 671304442 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/rng_state_0.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..921a14eba8310d556048263a58727eadbc6dcc1b --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1394463a46489e6dce7c0369a296b9effad20c6a87b30dbb892b34b73b5d6365 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/rng_state_1.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3d70a0d655a420e01dc52403012951d226ff25d1 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c8dcfaaa26518a9158534ae7671344dd035fcb11f3b40b193e3c3bd47123883 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/scheduler.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5b1aeae281b5040d3bcd2aa5b378a5c2504e2b5 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f8c95a6d9085dfcee1e6620c88ede526366d3a02c5018932b1bc04809c0e0c7 +size 1064 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/trainer_state.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..757b5c2a1269ffa9f9815c213cfc7f06536affe1 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/trainer_state.json @@ -0,0 +1,3033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.5632, + "eval_steps": 500, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4931091487989761, + "avg_response_length": 225.975, + "avg_student_mask_ratio": 0.4931091487989761, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5570551689027979, + "epoch": 0.021333333333333333, + "grad_norm": 0.404296875, + "kd_loss": 0.5375588692116253, + "learning_rate": 3e-06, + "loss": 0.8247, + "masked_tokens": 111.95, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 111.95 + }, + { + "avg_mask_ratio": 0.42058031369233506, + "avg_response_length": 255.2625, + "avg_student_mask_ratio": 0.42058031369233506, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42826092825978324, + "epoch": 0.042666666666666665, + "grad_norm": 0.8046875, + "kd_loss": 0.4450965437417761, + "learning_rate": 3e-06, + "loss": 0.5724, + "masked_tokens": 97.35, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 97.35 + }, + { + "avg_mask_ratio": 0.4538542575784959, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4538542575784959, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4461815005188782, + "epoch": 0.064, + "grad_norm": 0.50390625, + "kd_loss": 0.5296064364436825, + "learning_rate": 3e-06, + "loss": 0.702, + "masked_tokens": 110.2, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 110.2 + }, + { + "avg_mask_ratio": 0.4207469140383182, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.4207469140383182, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38499989152683156, + "epoch": 0.08533333333333333, + "grad_norm": 1.671875, + "kd_loss": 0.33118802310931417, + "learning_rate": 3e-06, + "loss": 0.5529, + "masked_tokens": 98.1625, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.1625 + }, + { + "avg_mask_ratio": 0.43260439952719026, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.43260439952719026, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5240421466317912, + "epoch": 0.10666666666666667, + "grad_norm": 1.6875, + "kd_loss": 0.4270985169670894, + "learning_rate": 3e-06, + "loss": 0.671, + "masked_tokens": 85.05, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.05 + }, + { + "avg_mask_ratio": 0.46053453313652426, + "avg_response_length": 251.0875, + "avg_student_mask_ratio": 0.46053453313652426, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5027546818272185, + "epoch": 0.128, + "grad_norm": 0.17578125, + "kd_loss": 0.3904111967755945, + "learning_rate": 3e-06, + "loss": 0.6672, + "masked_tokens": 120.9, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 120.9 + }, + { + "avg_mask_ratio": 0.5112146578729153, + "avg_response_length": 202.5875, + "avg_student_mask_ratio": 0.5112146578729153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.7753003867959023, + "epoch": 0.14933333333333335, + "grad_norm": 0.953125, + "kd_loss": 0.4415664039527428, + "learning_rate": 3e-06, + "loss": 0.856, + "masked_tokens": 104.5875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 104.5875 + }, + { + "avg_mask_ratio": 0.37548826879356056, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.37548826879356056, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3791731233859082, + "epoch": 0.17066666666666666, + "grad_norm": 0.1552734375, + "kd_loss": 0.31052538527774515, + "learning_rate": 3e-06, + "loss": 0.4843, + "masked_tokens": 85.0625, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 85.0625 + }, + { + "avg_mask_ratio": 0.5001560213277116, + "avg_response_length": 229.75, + "avg_student_mask_ratio": 0.5001560213277116, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6899960007944174, + "epoch": 0.192, + "grad_norm": 1.25, + "kd_loss": 0.5995283465861896, + "learning_rate": 3e-06, + "loss": 0.9721, + "masked_tokens": 107.6625, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 107.6625 + }, + { + "avg_mask_ratio": 0.4576045103633078, + "avg_response_length": 208.0, + "avg_student_mask_ratio": 0.4576045103633078, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41132245859021166, + "epoch": 0.21333333333333335, + "grad_norm": 0.64453125, + "kd_loss": 0.3813956479015957, + "learning_rate": 3e-06, + "loss": 0.6635, + "masked_tokens": 104.1625, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 104.1625 + }, + { + "avg_mask_ratio": 0.4877026333590038, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.4877026333590038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4612084587922368, + "epoch": 0.23466666666666666, + "grad_norm": 0.64453125, + "kd_loss": 0.5074845846289577, + "learning_rate": 3e-06, + "loss": 0.7993, + "masked_tokens": 102.075, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 102.075 + }, + { + "avg_mask_ratio": 0.45146879020612685, + "avg_response_length": 224.1875, + "avg_student_mask_ratio": 0.45146879020612685, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3276976759495483, + "epoch": 0.256, + "grad_norm": 0.30078125, + "kd_loss": 0.41461311469229256, + "learning_rate": 3e-06, + "loss": 0.6088, + "masked_tokens": 100.525, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 100.525 + }, + { + "avg_mask_ratio": 0.4356566035945434, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.4356566035945434, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.12710368948505674, + "epoch": 0.2773333333333333, + "grad_norm": 0.490234375, + "kd_loss": 0.23057804748218585, + "learning_rate": 3e-06, + "loss": 0.384, + "masked_tokens": 89.5625, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.5625 + }, + { + "avg_mask_ratio": 0.49419954856857656, + "avg_response_length": 255.625, + "avg_student_mask_ratio": 0.49419954856857656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48596099013025196, + "epoch": 0.2986666666666667, + "grad_norm": 0.859375, + "kd_loss": 0.5025483015746885, + "learning_rate": 3e-06, + "loss": 0.7892, + "masked_tokens": 136.575, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 136.575 + }, + { + "avg_mask_ratio": 0.4736677930341102, + "avg_response_length": 255.375, + "avg_student_mask_ratio": 0.4736677930341102, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5493089448234059, + "epoch": 0.32, + "grad_norm": 0.076171875, + "kd_loss": 0.4892602212316547, + "learning_rate": 3e-06, + "loss": 0.7918, + "masked_tokens": 126.575, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 126.575 + }, + { + "avg_mask_ratio": 0.5032523009285796, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.5032523009285796, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5929547422666019, + "epoch": 0.3413333333333333, + "grad_norm": 1.171875, + "kd_loss": 0.44711892502580214, + "learning_rate": 3e-06, + "loss": 0.7472, + "masked_tokens": 99.425, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 99.425 + }, + { + "avg_mask_ratio": 0.4806730231270194, + "avg_response_length": 190.325, + "avg_student_mask_ratio": 0.4806730231270194, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3583432949517601, + "epoch": 0.3626666666666667, + "grad_norm": 0.6171875, + "kd_loss": 0.4521343837219092, + "learning_rate": 3e-06, + "loss": 0.6358, + "masked_tokens": 89.825, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.825 + }, + { + "avg_mask_ratio": 0.45829249716189224, + "avg_response_length": 244.0, + "avg_student_mask_ratio": 0.45829249716189224, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3746713957985094, + "epoch": 0.384, + "grad_norm": 0.49609375, + "kd_loss": 0.34934306121722897, + "learning_rate": 3e-06, + "loss": 0.5747, + "masked_tokens": 110.075, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.075 + }, + { + "avg_mask_ratio": 0.4842760307248682, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.4842760307248682, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5090123614077584, + "epoch": 0.4053333333333333, + "grad_norm": 1.6171875, + "kd_loss": 0.43204482231294605, + "learning_rate": 3e-06, + "loss": 0.7055, + "masked_tokens": 109.5875, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.5875 + }, + { + "avg_mask_ratio": 0.4665210062637925, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4665210062637925, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27265903051802526, + "epoch": 0.4266666666666667, + "grad_norm": 0.318359375, + "kd_loss": 0.357759011555504, + "learning_rate": 3e-06, + "loss": 0.5013, + "masked_tokens": 97.0125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.0125 + }, + { + "avg_mask_ratio": 0.45183838629163803, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.45183838629163803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.21031314329709402, + "epoch": 0.448, + "grad_norm": 0.8359375, + "kd_loss": 0.38585986606940426, + "learning_rate": 3e-06, + "loss": 0.5103, + "masked_tokens": 96.0125, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.0125 + }, + { + "avg_mask_ratio": 0.5396152360364794, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5396152360364794, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6178499971098063, + "epoch": 0.4693333333333333, + "grad_norm": 0.96875, + "kd_loss": 0.46674597742967305, + "learning_rate": 3e-06, + "loss": 0.8495, + "masked_tokens": 125.35, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 125.35 + }, + { + "avg_mask_ratio": 0.4403991688624956, + "avg_response_length": 252.7, + "avg_student_mask_ratio": 0.4403991688624956, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25455478885055527, + "epoch": 0.49066666666666664, + "grad_norm": 0.5703125, + "kd_loss": 0.43305868929596725, + "learning_rate": 3e-06, + "loss": 0.6428, + "masked_tokens": 107.325, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.47419991258066146, + "avg_response_length": 212.85, + "avg_student_mask_ratio": 0.47419991258066146, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32057130943685763, + "epoch": 0.512, + "grad_norm": 0.43359375, + "kd_loss": 0.5083060303753086, + "learning_rate": 3e-06, + "loss": 0.6986, + "masked_tokens": 106.9, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 106.9 + }, + { + "avg_mask_ratio": 0.4464209079160355, + "avg_response_length": 243.475, + "avg_student_mask_ratio": 0.4464209079160355, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33636454603331456, + "epoch": 0.5333333333333333, + "grad_norm": 0.1142578125, + "kd_loss": 0.41649795620701296, + "learning_rate": 3e-06, + "loss": 0.5666, + "masked_tokens": 112.7375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.7375 + }, + { + "avg_mask_ratio": 0.4520751796895638, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.4520751796895638, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37478437887749577, + "epoch": 0.5546666666666666, + "grad_norm": 0.328125, + "kd_loss": 0.31532439299670545, + "learning_rate": 3e-06, + "loss": 0.5129, + "masked_tokens": 109.6375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 109.6375 + }, + { + "avg_mask_ratio": 0.5305180630879477, + "avg_response_length": 224.45, + "avg_student_mask_ratio": 0.5305180630879477, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42709534656005416, + "epoch": 0.576, + "grad_norm": 0.7578125, + "kd_loss": 0.5525495689224045, + "learning_rate": 3e-06, + "loss": 0.812, + "masked_tokens": 120.475, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 120.475 + }, + { + "avg_mask_ratio": 0.46451686368091033, + "avg_response_length": 254.825, + "avg_student_mask_ratio": 0.46451686368091033, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31382316479499084, + "epoch": 0.5973333333333334, + "grad_norm": 0.90234375, + "kd_loss": 0.3957495673693458, + "learning_rate": 3e-06, + "loss": 0.6028, + "masked_tokens": 129.225, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 129.225 + }, + { + "avg_mask_ratio": 0.389662017847877, + "avg_response_length": 245.9125, + "avg_student_mask_ratio": 0.389662017847877, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.23645576389110373, + "epoch": 0.6186666666666667, + "grad_norm": 0.302734375, + "kd_loss": 0.27728830450374853, + "learning_rate": 3e-06, + "loss": 0.4314, + "masked_tokens": 99.8625, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.44417000194080175, + "avg_response_length": 217.0375, + "avg_student_mask_ratio": 0.44417000194080175, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3240562055096575, + "epoch": 0.64, + "grad_norm": 1.09375, + "kd_loss": 0.31930388437995133, + "learning_rate": 3e-06, + "loss": 0.5264, + "masked_tokens": 104.625, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 104.625 + }, + { + "avg_mask_ratio": 0.4706685543409549, + "avg_response_length": 175.45, + "avg_student_mask_ratio": 0.4706685543409549, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34333510200582396, + "epoch": 0.6613333333333333, + "grad_norm": 1.234375, + "kd_loss": 0.5067149527083984, + "learning_rate": 3e-06, + "loss": 0.6534, + "masked_tokens": 84.875, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 84.875 + }, + { + "avg_mask_ratio": 0.4974605386145413, + "avg_response_length": 234.7875, + "avg_student_mask_ratio": 0.4974605386145413, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34462752127872137, + "epoch": 0.6826666666666666, + "grad_norm": 0.333984375, + "kd_loss": 0.3942846609736307, + "learning_rate": 3e-06, + "loss": 0.7133, + "masked_tokens": 119.6, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.5112370474264025, + "avg_response_length": 236.0625, + "avg_student_mask_ratio": 0.5112370474264025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2974585796398969, + "epoch": 0.704, + "grad_norm": 0.44140625, + "kd_loss": 0.4301003347501496, + "learning_rate": 3e-06, + "loss": 0.6754, + "masked_tokens": 129.425, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 129.425 + }, + { + "avg_mask_ratio": 0.44370225080056114, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44370225080056114, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3732590021626493, + "epoch": 0.7253333333333334, + "grad_norm": 0.98046875, + "kd_loss": 0.4610515360019235, + "learning_rate": 3e-06, + "loss": 0.6627, + "masked_tokens": 108.775, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 108.775 + }, + { + "avg_mask_ratio": 0.49959173843380994, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.49959173843380994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48515336151417615, + "epoch": 0.7466666666666667, + "grad_norm": 0.92578125, + "kd_loss": 0.5031771080357654, + "learning_rate": 3e-06, + "loss": 0.7668, + "masked_tokens": 125.625, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.625 + }, + { + "avg_mask_ratio": 0.4744729608530179, + "avg_response_length": 246.1625, + "avg_student_mask_ratio": 0.4744729608530179, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3005135279950082, + "epoch": 0.768, + "grad_norm": 0.169921875, + "kd_loss": 0.5216399239409879, + "learning_rate": 3e-06, + "loss": 0.6077, + "masked_tokens": 116.875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.875 + }, + { + "avg_mask_ratio": 0.4738045462174341, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.4738045462174341, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5349442186782426, + "epoch": 0.7893333333333333, + "grad_norm": 0.201171875, + "kd_loss": 0.6039233199480805, + "learning_rate": 3e-06, + "loss": 0.7196, + "masked_tokens": 127.4625, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 127.4625 + }, + { + "avg_mask_ratio": 0.4512475330149755, + "avg_response_length": 209.8, + "avg_student_mask_ratio": 0.4512475330149755, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.19145508916275275, + "epoch": 0.8106666666666666, + "grad_norm": 0.6875, + "kd_loss": 0.4029755606519984, + "learning_rate": 3e-06, + "loss": 0.5055, + "masked_tokens": 100.8375, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 100.8375 + }, + { + "avg_mask_ratio": 0.4752940105390735, + "avg_response_length": 219.5625, + "avg_student_mask_ratio": 0.4752940105390735, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4267096655552223, + "epoch": 0.832, + "grad_norm": 0.2578125, + "kd_loss": 0.4655849843487971, + "learning_rate": 3e-06, + "loss": 0.6749, + "masked_tokens": 112.375, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 112.375 + }, + { + "avg_mask_ratio": 0.47461870914557946, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.47461870914557946, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27868834779033025, + "epoch": 0.8533333333333334, + "grad_norm": 0.640625, + "kd_loss": 0.5299579592951205, + "learning_rate": 3e-06, + "loss": 0.6538, + "masked_tokens": 120.4125, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 120.4125 + }, + { + "avg_mask_ratio": 0.48321815438685006, + "avg_response_length": 228.15, + "avg_student_mask_ratio": 0.48321815438685006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.43057951200541994, + "epoch": 0.8746666666666667, + "grad_norm": 0.5390625, + "kd_loss": 0.504674318619719, + "learning_rate": 3e-06, + "loss": 0.7381, + "masked_tokens": 119.0, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 119.0 + }, + { + "avg_mask_ratio": 0.4379329536575824, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4379329536575824, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.132674143492045, + "epoch": 0.896, + "grad_norm": 1.09375, + "kd_loss": 0.27731474525324984, + "learning_rate": 3e-06, + "loss": 0.3953, + "masked_tokens": 85.525, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 85.525 + }, + { + "avg_mask_ratio": 0.4674084897618741, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4674084897618741, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37605725416574387, + "epoch": 0.9173333333333333, + "grad_norm": 0.43359375, + "kd_loss": 0.49442086774362226, + "learning_rate": 3e-06, + "loss": 0.6699, + "masked_tokens": 104.5625, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 104.5625 + }, + { + "avg_mask_ratio": 0.4415457699564286, + "avg_response_length": 241.0875, + "avg_student_mask_ratio": 0.4415457699564286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3754083825901603, + "epoch": 0.9386666666666666, + "grad_norm": 0.6328125, + "kd_loss": 0.45159815376919143, + "learning_rate": 3e-06, + "loss": 0.6585, + "masked_tokens": 113.0875, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 113.0875 + }, + { + "avg_mask_ratio": 0.42486972180195154, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42486972180195154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32457938515717616, + "epoch": 0.96, + "grad_norm": 0.6953125, + "kd_loss": 0.4011907008050457, + "learning_rate": 3e-06, + "loss": 0.5644, + "masked_tokens": 103.4, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.4 + }, + { + "avg_mask_ratio": 0.47578654896933587, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.47578654896933587, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32885359905767475, + "epoch": 0.9813333333333333, + "grad_norm": 0.16015625, + "kd_loss": 0.44463847501747294, + "learning_rate": 3e-06, + "loss": 0.635, + "masked_tokens": 105.3125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 105.3125 + }, + { + "avg_mask_ratio": 0.4782901787132557, + "avg_response_length": 224.0952380952381, + "avg_student_mask_ratio": 0.4782901787132557, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3393430382851702, + "epoch": 1.0042666666666666, + "grad_norm": 0.65625, + "kd_loss": 0.5178591865708675, + "learning_rate": 3e-06, + "loss": 0.7769, + "masked_tokens": 107.23809523809524, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 107.23809523809524 + }, + { + "avg_mask_ratio": 0.47575968883465974, + "avg_response_length": 249.4125, + "avg_student_mask_ratio": 0.47575968883465974, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.44613247805159517, + "epoch": 1.0256, + "grad_norm": 0.498046875, + "kd_loss": 0.5374264506522252, + "learning_rate": 3e-06, + "loss": 0.6772, + "masked_tokens": 118.35, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 118.35 + }, + { + "avg_mask_ratio": 0.4563717324635945, + "avg_response_length": 232.0375, + "avg_student_mask_ratio": 0.4563717324635945, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37626147485414096, + "epoch": 1.0469333333333333, + "grad_norm": 0.54296875, + "kd_loss": 0.392788901903657, + "learning_rate": 3e-06, + "loss": 0.6047, + "masked_tokens": 98.35, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.35 + }, + { + "avg_mask_ratio": 0.5079968665260821, + "avg_response_length": 253.7875, + "avg_student_mask_ratio": 0.5079968665260821, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.30954629559880686, + "epoch": 1.0682666666666667, + "grad_norm": 0.291015625, + "kd_loss": 0.4563873354276211, + "learning_rate": 3e-06, + "loss": 0.5996, + "masked_tokens": 128.225, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 128.225 + }, + { + "avg_mask_ratio": 0.5109448074479588, + "avg_response_length": 254.2, + "avg_student_mask_ratio": 0.5109448074479588, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2868076219221166, + "epoch": 1.0896, + "grad_norm": 2.515625, + "kd_loss": 0.5652106747879998, + "learning_rate": 3e-06, + "loss": 0.6398, + "masked_tokens": 137.5875, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 137.5875 + }, + { + "avg_mask_ratio": 0.45396183808334173, + "avg_response_length": 202.7625, + "avg_student_mask_ratio": 0.45396183808334173, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38311037250946356, + "epoch": 1.1109333333333333, + "grad_norm": 0.6171875, + "kd_loss": 0.423658079797778, + "learning_rate": 3e-06, + "loss": 0.6386, + "masked_tokens": 87.0625, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 87.0625 + }, + { + "avg_mask_ratio": 0.47015948037151245, + "avg_response_length": 214.275, + "avg_student_mask_ratio": 0.47015948037151245, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.47228433731506814, + "epoch": 1.1322666666666668, + "grad_norm": 0.609375, + "kd_loss": 0.45688082203427316, + "learning_rate": 3e-06, + "loss": 0.737, + "masked_tokens": 99.8625, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.4892866689246148, + "avg_response_length": 231.3125, + "avg_student_mask_ratio": 0.4892866689246148, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4080867745911064, + "epoch": 1.1536, + "grad_norm": 0.341796875, + "kd_loss": 0.5618651450654625, + "learning_rate": 3e-06, + "loss": 0.6922, + "masked_tokens": 107.375, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.4541942774085328, + "avg_response_length": 213.525, + "avg_student_mask_ratio": 0.4541942774085328, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22217674175137744, + "epoch": 1.1749333333333334, + "grad_norm": 0.2412109375, + "kd_loss": 0.3673438885498399, + "learning_rate": 3e-06, + "loss": 0.5008, + "masked_tokens": 97.8875, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 97.8875 + }, + { + "avg_mask_ratio": 0.39282396506750955, + "avg_response_length": 231.4125, + "avg_student_mask_ratio": 0.39282396506750955, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3512847523151777, + "epoch": 1.1962666666666666, + "grad_norm": 0.8828125, + "kd_loss": 0.48686740984790616, + "learning_rate": 3e-06, + "loss": 0.5823, + "masked_tokens": 99.2875, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 99.2875 + }, + { + "avg_mask_ratio": 0.4483634108910337, + "avg_response_length": 230.1625, + "avg_student_mask_ratio": 0.4483634108910337, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31345968546206676, + "epoch": 1.2176, + "grad_norm": 0.4453125, + "kd_loss": 0.41564053312727084, + "learning_rate": 3e-06, + "loss": 0.5898, + "masked_tokens": 108.9875, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 108.9875 + }, + { + "avg_mask_ratio": 0.465452536707744, + "avg_response_length": 267.4375, + "avg_student_mask_ratio": 0.465452536707744, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3618907347364768, + "epoch": 1.2389333333333332, + "grad_norm": 8.6875, + "kd_loss": 0.4481006292516895, + "learning_rate": 3e-06, + "loss": 0.6314, + "masked_tokens": 129.075, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 129.075 + }, + { + "avg_mask_ratio": 0.5225977989146486, + "avg_response_length": 228.45, + "avg_student_mask_ratio": 0.5225977989146486, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5639314363695348, + "epoch": 1.2602666666666666, + "grad_norm": 1.1328125, + "kd_loss": 0.5351108588445992, + "learning_rate": 3e-06, + "loss": 0.8274, + "masked_tokens": 121.675, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 121.675 + }, + { + "avg_mask_ratio": 0.44998724836623294, + "avg_response_length": 236.7, + "avg_student_mask_ratio": 0.44998724836623294, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3396833263838971, + "epoch": 1.2816, + "grad_norm": 0.365234375, + "kd_loss": 0.41761890975592914, + "learning_rate": 3e-06, + "loss": 0.5752, + "masked_tokens": 110.1625, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 110.1625 + }, + { + "avg_mask_ratio": 0.5042130865273066, + "avg_response_length": 230.3375, + "avg_student_mask_ratio": 0.5042130865273066, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.35890077192343595, + "epoch": 1.3029333333333333, + "grad_norm": 0.28515625, + "kd_loss": 0.5558427174539929, + "learning_rate": 3e-06, + "loss": 0.7657, + "masked_tokens": 112.625, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 112.625 + }, + { + "avg_mask_ratio": 0.49637898594373836, + "avg_response_length": 233.0625, + "avg_student_mask_ratio": 0.49637898594373836, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32318839170733327, + "epoch": 1.3242666666666667, + "grad_norm": 0.515625, + "kd_loss": 0.5518322235134179, + "learning_rate": 3e-06, + "loss": 0.6742, + "masked_tokens": 111.25, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 111.25 + }, + { + "avg_mask_ratio": 0.5177568581304512, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.5177568581304512, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5710563842050931, + "epoch": 1.3456000000000001, + "grad_norm": 1.3515625, + "kd_loss": 0.5316411310721378, + "learning_rate": 3e-06, + "loss": 0.8598, + "masked_tokens": 129.6125, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 129.6125 + }, + { + "avg_mask_ratio": 0.48226998368045315, + "avg_response_length": 237.7125, + "avg_student_mask_ratio": 0.48226998368045315, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2804489129174499, + "epoch": 1.3669333333333333, + "grad_norm": 0.2421875, + "kd_loss": 0.3663112932188085, + "learning_rate": 3e-06, + "loss": 0.4584, + "masked_tokens": 120.275, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 120.275 + }, + { + "avg_mask_ratio": 0.5306948523037136, + "avg_response_length": 238.0125, + "avg_student_mask_ratio": 0.5306948523037136, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.475157093159612, + "epoch": 1.3882666666666665, + "grad_norm": 1.8125, + "kd_loss": 0.5062341513834724, + "learning_rate": 3e-06, + "loss": 0.7115, + "masked_tokens": 133.25, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 133.25 + }, + { + "avg_mask_ratio": 0.4821273953886703, + "avg_response_length": 247.775, + "avg_student_mask_ratio": 0.4821273953886703, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41770620119971225, + "epoch": 1.4096, + "grad_norm": 0.9375, + "kd_loss": 0.425496905214095, + "learning_rate": 3e-06, + "loss": 0.6361, + "masked_tokens": 128.875, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 128.875 + }, + { + "avg_mask_ratio": 0.46056515555246735, + "avg_response_length": 240.4375, + "avg_student_mask_ratio": 0.46056515555246735, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.24846992658117414, + "epoch": 1.4309333333333334, + "grad_norm": 0.60546875, + "kd_loss": 0.34861083538812637, + "learning_rate": 3e-06, + "loss": 0.5112, + "masked_tokens": 119.85, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 119.85 + }, + { + "avg_mask_ratio": 0.4666106043441687, + "avg_response_length": 226.7375, + "avg_student_mask_ratio": 0.4666106043441687, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4541423492493323, + "epoch": 1.4522666666666666, + "grad_norm": 0.51953125, + "kd_loss": 0.4910934407485213, + "learning_rate": 3e-06, + "loss": 0.6946, + "masked_tokens": 107.4625, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4790851596510038, + "avg_response_length": 202.05, + "avg_student_mask_ratio": 0.4790851596510038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3711260147189023, + "epoch": 1.4736, + "grad_norm": 2.03125, + "kd_loss": 0.41718243765291446, + "learning_rate": 3e-06, + "loss": 0.6313, + "masked_tokens": 111.3125, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 111.3125 + }, + { + "avg_mask_ratio": 0.5250519359949977, + "avg_response_length": 228.125, + "avg_student_mask_ratio": 0.5250519359949977, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22230932631540554, + "epoch": 1.4949333333333334, + "grad_norm": 0.26171875, + "kd_loss": 0.6619142963969352, + "learning_rate": 3e-06, + "loss": 0.7717, + "masked_tokens": 132.55, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 132.55 + }, + { + "avg_mask_ratio": 0.4790433386107907, + "avg_response_length": 212.5, + "avg_student_mask_ratio": 0.4790433386107907, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.24621229091012536, + "epoch": 1.5162666666666667, + "grad_norm": 0.2099609375, + "kd_loss": 0.43454050603151584, + "learning_rate": 3e-06, + "loss": 0.5302, + "masked_tokens": 108.7375, + "mean_t": 0.5135623761918395, + "step": 710, + "student_masked_tokens": 108.7375 + }, + { + "avg_mask_ratio": 0.47950589570682495, + "avg_response_length": 227.075, + "avg_student_mask_ratio": 0.47950589570682495, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.36416104665024707, + "epoch": 1.5375999999999999, + "grad_norm": 0.75, + "kd_loss": 0.5665610315164941, + "learning_rate": 3e-06, + "loss": 0.7121, + "masked_tokens": 110.8, + "mean_t": 0.5117021896177902, + "step": 720, + "student_masked_tokens": 110.8 + }, + { + "avg_mask_ratio": 0.4604924251558259, + "avg_response_length": 232.925, + "avg_student_mask_ratio": 0.4604924251558259, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38923927966282007, + "epoch": 1.5589333333333333, + "grad_norm": 1.015625, + "kd_loss": 0.4302867329986782, + "learning_rate": 3e-06, + "loss": 0.639, + "masked_tokens": 104.9625, + "mean_t": 0.49050743713742123, + "step": 730, + "student_masked_tokens": 104.9625 + }, + { + "avg_mask_ratio": 0.5185885130194947, + "avg_response_length": 183.325, + "avg_student_mask_ratio": 0.5185885130194947, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3361817517367399, + "epoch": 1.5802666666666667, + "grad_norm": 0.40234375, + "kd_loss": 0.5340734164818514, + "learning_rate": 3e-06, + "loss": 0.7461, + "masked_tokens": 97.125, + "mean_t": 0.5505168779753149, + "step": 740, + "student_masked_tokens": 97.125 + }, + { + "avg_mask_ratio": 0.4191439319110941, + "avg_response_length": 223.65, + "avg_student_mask_ratio": 0.4191439319110941, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37429177601145514, + "epoch": 1.6016, + "grad_norm": 0.58203125, + "kd_loss": 0.5036597276406856, + "learning_rate": 3e-06, + "loss": 0.6491, + "masked_tokens": 95.3125, + "mean_t": 0.4437690361432033, + "step": 750, + "student_masked_tokens": 95.3125 + }, + { + "avg_mask_ratio": 0.46706983938929625, + "avg_response_length": 216.0625, + "avg_student_mask_ratio": 0.46706983938929625, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4449058656399984, + "epoch": 1.6229333333333333, + "grad_norm": 0.8203125, + "kd_loss": 0.5661326096985168, + "learning_rate": 3e-06, + "loss": 0.7233, + "masked_tokens": 107.7, + "mean_t": 0.49132869170280175, + "step": 760, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.44156218122225255, + "avg_response_length": 259.675, + "avg_student_mask_ratio": 0.44156218122225255, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25899335961771613, + "epoch": 1.6442666666666668, + "grad_norm": 0.396484375, + "kd_loss": 0.4095979654902003, + "learning_rate": 3e-06, + "loss": 0.5099, + "masked_tokens": 117.5, + "mean_t": 0.4667695587326307, + "step": 770, + "student_masked_tokens": 117.5 + }, + { + "avg_mask_ratio": 0.42836043585848527, + "avg_response_length": 258.5125, + "avg_student_mask_ratio": 0.42836043585848527, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2897560694203321, + "epoch": 1.6656, + "grad_norm": 0.2431640625, + "kd_loss": 0.34635278815572546, + "learning_rate": 3e-06, + "loss": 0.4802, + "masked_tokens": 119.0125, + "mean_t": 0.44942845597106496, + "step": 780, + "student_masked_tokens": 119.0125 + }, + { + "avg_mask_ratio": 0.46589430308085866, + "avg_response_length": 222.3125, + "avg_student_mask_ratio": 0.46589430308085866, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.21603642557238345, + "epoch": 1.6869333333333332, + "grad_norm": 0.140625, + "kd_loss": 0.33674514803767297, + "learning_rate": 3e-06, + "loss": 0.489, + "masked_tokens": 103.25, + "mean_t": 0.4993515375303105, + "step": 790, + "student_masked_tokens": 103.25 + }, + { + "avg_mask_ratio": 0.46366424662992356, + "avg_response_length": 219.6875, + "avg_student_mask_ratio": 0.46366424662992356, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2663005536277069, + "epoch": 1.7082666666666668, + "grad_norm": 0.23828125, + "kd_loss": 0.35138718315538425, + "learning_rate": 3e-06, + "loss": 0.5434, + "masked_tokens": 104.5, + "mean_t": 0.500370389316231, + "step": 800, + "student_masked_tokens": 104.5 + }, + { + "avg_mask_ratio": 0.503375941584818, + "avg_response_length": 237.85, + "avg_student_mask_ratio": 0.503375941584818, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4813590554784753, + "epoch": 1.7296, + "grad_norm": 1.6015625, + "kd_loss": 0.45312339970045057, + "learning_rate": 3e-06, + "loss": 0.706, + "masked_tokens": 118.2125, + "mean_t": 0.5317009104182944, + "step": 810, + "student_masked_tokens": 118.2125 + }, + { + "avg_mask_ratio": 0.5110091455746442, + "avg_response_length": 209.0875, + "avg_student_mask_ratio": 0.5110091455746442, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4535834654417954, + "epoch": 1.7509333333333332, + "grad_norm": 0.70703125, + "kd_loss": 0.5985253949772413, + "learning_rate": 3e-06, + "loss": 0.7794, + "masked_tokens": 120.95, + "mean_t": 0.5392061032878701, + "step": 820, + "student_masked_tokens": 120.95 + }, + { + "avg_mask_ratio": 0.49899387182667854, + "avg_response_length": 263.975, + "avg_student_mask_ratio": 0.49899387182667854, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.40083689643704473, + "epoch": 1.7722666666666667, + "grad_norm": 0.1708984375, + "kd_loss": 0.5644028104892641, + "learning_rate": 3e-06, + "loss": 0.7632, + "masked_tokens": 137.075, + "mean_t": 0.5238314627087675, + "step": 830, + "student_masked_tokens": 137.075 + }, + { + "avg_mask_ratio": 0.4997270987310912, + "avg_response_length": 221.9, + "avg_student_mask_ratio": 0.4997270987310912, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2870929398425915, + "epoch": 1.7936, + "grad_norm": 0.345703125, + "kd_loss": 0.4698917509396324, + "learning_rate": 3e-06, + "loss": 0.6327, + "masked_tokens": 114.525, + "mean_t": 0.5301066277665086, + "step": 840, + "student_masked_tokens": 114.525 + }, + { + "avg_mask_ratio": 0.4988076956477016, + "avg_response_length": 225.5, + "avg_student_mask_ratio": 0.4988076956477016, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3023421537889817, + "epoch": 1.8149333333333333, + "grad_norm": 0.443359375, + "kd_loss": 0.3271854338312551, + "learning_rate": 3e-06, + "loss": 0.5634, + "masked_tokens": 116.9125, + "mean_t": 0.5343429344706238, + "step": 850, + "student_masked_tokens": 116.9125 + }, + { + "avg_mask_ratio": 0.4635998342186213, + "avg_response_length": 229.125, + "avg_student_mask_ratio": 0.4635998342186213, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37467331880507115, + "epoch": 1.8362666666666667, + "grad_norm": 0.384765625, + "kd_loss": 0.4431717619034316, + "learning_rate": 3e-06, + "loss": 0.5956, + "masked_tokens": 109.675, + "mean_t": 0.4791536889737472, + "step": 860, + "student_masked_tokens": 109.675 + }, + { + "avg_mask_ratio": 0.49111039767740294, + "avg_response_length": 229.1, + "avg_student_mask_ratio": 0.49111039767740294, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3838037288314126, + "epoch": 1.8576000000000001, + "grad_norm": 0.333984375, + "kd_loss": 0.47523635068355363, + "learning_rate": 3e-06, + "loss": 0.6859, + "masked_tokens": 115.6625, + "mean_t": 0.5203817339061061, + "step": 870, + "student_masked_tokens": 115.6625 + }, + { + "avg_mask_ratio": 0.4427660425659269, + "avg_response_length": 198.5625, + "avg_student_mask_ratio": 0.4427660425659269, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33109274096627817, + "epoch": 1.8789333333333333, + "grad_norm": 1.0859375, + "kd_loss": 0.46695662873548827, + "learning_rate": 3e-06, + "loss": 0.6284, + "masked_tokens": 91.175, + "mean_t": 0.4875184997683391, + "step": 880, + "student_masked_tokens": 91.175 + }, + { + "avg_mask_ratio": 0.4464349385118112, + "avg_response_length": 225.8375, + "avg_student_mask_ratio": 0.4464349385118112, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22229116438190885, + "epoch": 1.9002666666666665, + "grad_norm": 0.12890625, + "kd_loss": 0.4006316699657759, + "learning_rate": 3e-06, + "loss": 0.4934, + "masked_tokens": 101.75, + "mean_t": 0.4766692223958671, + "step": 890, + "student_masked_tokens": 101.75 + }, + { + "avg_mask_ratio": 0.44976164362160487, + "avg_response_length": 227.7875, + "avg_student_mask_ratio": 0.44976164362160487, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38169105723031577, + "epoch": 1.9216, + "grad_norm": 1.765625, + "kd_loss": 0.47280531010078086, + "learning_rate": 3e-06, + "loss": 0.6337, + "masked_tokens": 103.475, + "mean_t": 0.487134758150205, + "step": 900, + "student_masked_tokens": 103.475 + }, + { + "avg_mask_ratio": 0.475579984736396, + "avg_response_length": 245.1625, + "avg_student_mask_ratio": 0.475579984736396, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27549623605577833, + "epoch": 1.9429333333333334, + "grad_norm": 0.451171875, + "kd_loss": 0.4638562942510987, + "learning_rate": 3e-06, + "loss": 0.5387, + "masked_tokens": 124.6375, + "mean_t": 0.5027793228859082, + "step": 910, + "student_masked_tokens": 124.6375 + }, + { + "avg_mask_ratio": 0.4688875659601763, + "avg_response_length": 226.2875, + "avg_student_mask_ratio": 0.4688875659601763, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2772836374151325, + "epoch": 1.9642666666666666, + "grad_norm": 0.416015625, + "kd_loss": 0.44530672791033793, + "learning_rate": 3e-06, + "loss": 0.6177, + "masked_tokens": 110.0125, + "mean_t": 0.49417946098838, + "step": 920, + "student_masked_tokens": 110.0125 + }, + { + "avg_mask_ratio": 0.47999348094454036, + "avg_response_length": 237.05, + "avg_student_mask_ratio": 0.47999348094454036, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2901802834984665, + "epoch": 1.9856, + "grad_norm": 0.37890625, + "kd_loss": 0.4553093938939094, + "learning_rate": 3e-06, + "loss": 0.5905, + "masked_tokens": 121.6, + "mean_t": 0.5045580042526125, + "step": 930, + "student_masked_tokens": 121.6 + }, + { + "avg_mask_ratio": 0.49413903727240505, + "avg_response_length": 224.79761904761904, + "avg_student_mask_ratio": 0.49413903727240505, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37941894131193166, + "epoch": 2.0085333333333333, + "grad_norm": 0.4921875, + "kd_loss": 0.4946319753903075, + "learning_rate": 3e-06, + "loss": 0.6668, + "masked_tokens": 120.5, + "mean_t": 0.5321138524893849, + "step": 940, + "student_masked_tokens": 120.5 + }, + { + "avg_mask_ratio": 0.4368605303927325, + "avg_response_length": 240.9125, + "avg_student_mask_ratio": 0.4368605303927325, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22575005246883392, + "epoch": 2.0298666666666665, + "grad_norm": 1.1875, + "kd_loss": 0.4342805288508771, + "learning_rate": 3e-06, + "loss": 0.5248, + "masked_tokens": 111.4125, + "mean_t": 0.4632946296595037, + "step": 950, + "student_masked_tokens": 111.4125 + }, + { + "avg_mask_ratio": 0.4988762516761199, + "avg_response_length": 275.3, + "avg_student_mask_ratio": 0.4988762516761199, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.49722497609602667, + "epoch": 2.0512, + "grad_norm": 0.40625, + "kd_loss": 0.5839257182941765, + "learning_rate": 3e-06, + "loss": 0.7523, + "masked_tokens": 143.825, + "mean_t": 0.5198000721400604, + "step": 960, + "student_masked_tokens": 143.825 + }, + { + "avg_mask_ratio": 0.437801384011982, + "avg_response_length": 236.2375, + "avg_student_mask_ratio": 0.437801384011982, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2855980422358698, + "epoch": 2.0725333333333333, + "grad_norm": 0.4765625, + "kd_loss": 0.35673561348757377, + "learning_rate": 3e-06, + "loss": 0.538, + "masked_tokens": 107.025, + "mean_t": 0.4703940597362816, + "step": 970, + "student_masked_tokens": 107.025 + }, + { + "avg_mask_ratio": 0.42220073882490394, + "avg_response_length": 230.8625, + "avg_student_mask_ratio": 0.42220073882490394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2906558813129777, + "epoch": 2.0938666666666665, + "grad_norm": 0.466796875, + "kd_loss": 0.36284122784349504, + "learning_rate": 3e-06, + "loss": 0.4889, + "masked_tokens": 97.7875, + "mean_t": 0.4511947895749472, + "step": 980, + "student_masked_tokens": 97.7875 + }, + { + "avg_mask_ratio": 0.4605769342277199, + "avg_response_length": 262.0375, + "avg_student_mask_ratio": 0.4605769342277199, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.18629460762591635, + "epoch": 2.1152, + "grad_norm": 0.625, + "kd_loss": 0.4187604939788798, + "learning_rate": 3e-06, + "loss": 0.5063, + "masked_tokens": 122.0, + "mean_t": 0.4923786667350214, + "step": 990, + "student_masked_tokens": 122.0 + }, + { + "avg_mask_ratio": 0.4547682981239632, + "avg_response_length": 215.3, + "avg_student_mask_ratio": 0.4547682981239632, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.26735156250199454, + "epoch": 2.1365333333333334, + "grad_norm": 0.26953125, + "kd_loss": 0.3440752963605235, + "learning_rate": 3e-06, + "loss": 0.5169, + "masked_tokens": 100.775, + "mean_t": 0.4773523230338469, + "step": 1000, + "student_masked_tokens": 100.775 + }, + { + "avg_mask_ratio": 0.43540415074676275, + "avg_response_length": 215.0, + "avg_student_mask_ratio": 0.43540415074676275, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.28448459618934974, + "epoch": 2.1578666666666666, + "grad_norm": 0.2216796875, + "kd_loss": 0.36393369872412384, + "learning_rate": 3e-06, + "loss": 0.503, + "masked_tokens": 88.65, + "mean_t": 0.4648138735938119, + "step": 1010, + "student_masked_tokens": 88.65 + }, + { + "avg_mask_ratio": 0.5063220548443497, + "avg_response_length": 206.9125, + "avg_student_mask_ratio": 0.5063220548443497, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3141316814458378, + "epoch": 2.1792, + "grad_norm": 0.328125, + "kd_loss": 0.49756694839059035, + "learning_rate": 3e-06, + "loss": 0.7133, + "masked_tokens": 110.25, + "mean_t": 0.5327763411332853, + "step": 1020, + "student_masked_tokens": 110.25 + }, + { + "avg_mask_ratio": 0.46985941788880153, + "avg_response_length": 220.05, + "avg_student_mask_ratio": 0.46985941788880153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4267214318231197, + "epoch": 2.2005333333333335, + "grad_norm": 0.423828125, + "kd_loss": 0.4489077641891422, + "learning_rate": 3e-06, + "loss": 0.6384, + "masked_tokens": 104.9, + "mean_t": 0.5033508580760099, + "step": 1030, + "student_masked_tokens": 104.9 + }, + { + "avg_mask_ratio": 0.49566771630197765, + "avg_response_length": 213.7, + "avg_student_mask_ratio": 0.49566771630197765, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2765686680849626, + "epoch": 2.2218666666666667, + "grad_norm": 0.74609375, + "kd_loss": 0.5419906556950081, + "learning_rate": 3e-06, + "loss": 0.6686, + "masked_tokens": 100.35, + "mean_t": 0.5349024560535327, + "step": 1040, + "student_masked_tokens": 100.35 + }, + { + "avg_mask_ratio": 0.5123252369463444, + "avg_response_length": 239.1125, + "avg_student_mask_ratio": 0.5123252369463444, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3092239250220928, + "epoch": 2.2432, + "grad_norm": 0.412109375, + "kd_loss": 0.5601085751741266, + "learning_rate": 3e-06, + "loss": 0.6416, + "masked_tokens": 123.0, + "mean_t": 0.5457118917722255, + "step": 1050, + "student_masked_tokens": 123.0 + }, + { + "avg_mask_ratio": 0.46218636581033934, + "avg_response_length": 273.7875, + "avg_student_mask_ratio": 0.46218636581033934, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33267747830594485, + "epoch": 2.2645333333333335, + "grad_norm": 0.3984375, + "kd_loss": 0.49028674410892564, + "learning_rate": 3e-06, + "loss": 0.5827, + "masked_tokens": 122.125, + "mean_t": 0.48194136443780733, + "step": 1060, + "student_masked_tokens": 122.125 + }, + { + "avg_mask_ratio": 0.48546303423354403, + "avg_response_length": 260.8125, + "avg_student_mask_ratio": 0.48546303423354403, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4495345233380249, + "epoch": 2.2858666666666667, + "grad_norm": 0.345703125, + "kd_loss": 0.422707377332182, + "learning_rate": 3e-06, + "loss": 0.6038, + "masked_tokens": 138.0375, + "mean_t": 0.5015889146190602, + "step": 1070, + "student_masked_tokens": 138.0375 + }, + { + "avg_mask_ratio": 0.4625250873621553, + "avg_response_length": 219.8875, + "avg_student_mask_ratio": 0.4625250873621553, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3075333785695989, + "epoch": 2.3072, + "grad_norm": 0.087890625, + "kd_loss": 0.3667106795892323, + "learning_rate": 3e-06, + "loss": 0.4549, + "masked_tokens": 100.5625, + "mean_t": 0.4983203248586506, + "step": 1080, + "student_masked_tokens": 100.5625 + }, + { + "avg_mask_ratio": 0.44248262273031286, + "avg_response_length": 213.55, + "avg_student_mask_ratio": 0.44248262273031286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.24968633314620092, + "epoch": 2.3285333333333336, + "grad_norm": 0.11572265625, + "kd_loss": 0.3935246549681978, + "learning_rate": 3e-06, + "loss": 0.5053, + "masked_tokens": 91.9125, + "mean_t": 0.47094749807147307, + "step": 1090, + "student_masked_tokens": 91.9125 + }, + { + "avg_mask_ratio": 0.5204601250356063, + "avg_response_length": 246.1125, + "avg_student_mask_ratio": 0.5204601250356063, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.40365264619552477, + "epoch": 2.3498666666666668, + "grad_norm": 0.37109375, + "kd_loss": 0.4355207666182196, + "learning_rate": 3e-06, + "loss": 0.6746, + "masked_tokens": 133.1875, + "mean_t": 0.5531192034482956, + "step": 1100, + "student_masked_tokens": 133.1875 + }, + { + "avg_mask_ratio": 0.447697223268915, + "avg_response_length": 226.6375, + "avg_student_mask_ratio": 0.447697223268915, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.323583398951439, + "epoch": 2.3712, + "grad_norm": 1.0625, + "kd_loss": 0.3599243894114693, + "learning_rate": 3e-06, + "loss": 0.5035, + "masked_tokens": 103.475, + "mean_t": 0.4757364276825683, + "step": 1110, + "student_masked_tokens": 103.475 + }, + { + "avg_mask_ratio": 0.4670829998096451, + "avg_response_length": 245.0375, + "avg_student_mask_ratio": 0.4670829998096451, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2362255901227968, + "epoch": 2.392533333333333, + "grad_norm": 0.91015625, + "kd_loss": 0.350646685710654, + "learning_rate": 3e-06, + "loss": 0.5669, + "masked_tokens": 114.175, + "mean_t": 0.5013068238971755, + "step": 1120, + "student_masked_tokens": 114.175 + }, + { + "avg_mask_ratio": 0.5008096542558633, + "avg_response_length": 273.125, + "avg_student_mask_ratio": 0.5008096542558633, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.45858247512928757, + "epoch": 2.413866666666667, + "grad_norm": 0.8359375, + "kd_loss": 0.5058724594353151, + "learning_rate": 3e-06, + "loss": 0.6836, + "masked_tokens": 144.2625, + "mean_t": 0.5303254407714121, + "step": 1130, + "student_masked_tokens": 144.2625 + }, + { + "avg_mask_ratio": 0.4456572526367381, + "avg_response_length": 217.9125, + "avg_student_mask_ratio": 0.4456572526367381, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.20887070082535503, + "epoch": 2.4352, + "grad_norm": 7.21875, + "kd_loss": 0.44113304006314136, + "learning_rate": 3e-06, + "loss": 0.5599, + "masked_tokens": 103.125, + "mean_t": 0.4845335395424627, + "step": 1140, + "student_masked_tokens": 103.125 + }, + { + "avg_mask_ratio": 0.5291026248247362, + "avg_response_length": 212.5875, + "avg_student_mask_ratio": 0.5291026248247362, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3553736373823654, + "epoch": 2.4565333333333332, + "grad_norm": 0.365234375, + "kd_loss": 0.4455350318000953, + "learning_rate": 3e-06, + "loss": 0.7113, + "masked_tokens": 110.025, + "mean_t": 0.5690932425903157, + "step": 1150, + "student_masked_tokens": 110.025 + }, + { + "avg_mask_ratio": 0.46949602509848776, + "avg_response_length": 229.475, + "avg_student_mask_ratio": 0.46949602509848776, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3682716753066188, + "epoch": 2.4778666666666664, + "grad_norm": 0.98828125, + "kd_loss": 0.4137534074947894, + "learning_rate": 3e-06, + "loss": 0.5178, + "masked_tokens": 104.4375, + "mean_t": 0.5040684466948733, + "step": 1160, + "student_masked_tokens": 104.4375 + }, + { + "avg_mask_ratio": 0.480710746452678, + "avg_response_length": 247.7625, + "avg_student_mask_ratio": 0.480710746452678, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3276487763844216, + "epoch": 2.4992, + "grad_norm": 0.67578125, + "kd_loss": 0.39728453117754725, + "learning_rate": 3e-06, + "loss": 0.5996, + "masked_tokens": 123.9125, + "mean_t": 0.5114516971167177, + "step": 1170, + "student_masked_tokens": 123.9125 + }, + { + "avg_mask_ratio": 0.41960311922593974, + "avg_response_length": 215.025, + "avg_student_mask_ratio": 0.41960311922593974, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3255401685638958, + "epoch": 2.5205333333333333, + "grad_norm": 0.69140625, + "kd_loss": 0.31952214344050844, + "learning_rate": 3e-06, + "loss": 0.5148, + "masked_tokens": 85.3625, + "mean_t": 0.4491677140351385, + "step": 1180, + "student_masked_tokens": 85.3625 + }, + { + "avg_mask_ratio": 0.522994744987227, + "avg_response_length": 220.3375, + "avg_student_mask_ratio": 0.522994744987227, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.46823025978692384, + "epoch": 2.5418666666666665, + "grad_norm": 0.87890625, + "kd_loss": 0.48877327183990926, + "learning_rate": 3e-06, + "loss": 0.7125, + "masked_tokens": 124.2875, + "mean_t": 0.5590635397238657, + "step": 1190, + "student_masked_tokens": 124.2875 + }, + { + "avg_mask_ratio": 0.4730891800048994, + "avg_response_length": 215.675, + "avg_student_mask_ratio": 0.4730891800048994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25984036189172455, + "epoch": 2.5632, + "grad_norm": 0.34765625, + "kd_loss": 0.3999250433967063, + "learning_rate": 3e-06, + "loss": 0.5566, + "masked_tokens": 98.775, + "mean_t": 0.506370971655997, + "step": 1200, + "student_masked_tokens": 98.775 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/training_args.bin b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6d9cde5cbc4fedb8fe690f2f561a165bf19ae18c --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ae704e41a4a62f6ca56789c45ce45887326cd2f8d1e97e398e5ada4a93398c +size 8312 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/README.md b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/adapter_config.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9d0f61db13b9b0a2854984efa3e7c726f3e0dbe1 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "up_proj", + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/adapter_model.safetensors b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..27feeaeb0a870c486cf10665e3d2aedbebf4b94a --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:373a5e8685ef586dfb53f3d0e5d25bce673968b92f9aa3aa9a0b664f70d1aad9 +size 2406624648 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/optimizer.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4225d465f844430b9a4f1cd549f22797912afba0 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dad61a855c3db0b23d7549282c2eb83eae465eeae3ad5b3c593adc89daba8b5 +size 671304442 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/rng_state_0.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..2ebecfeaa71b9e56006937d85b525cf11fb0edda --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3d1d06bd17db661594f307eb1a293c09413ad06c18d1facb6bda1bbe2f3940a +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/rng_state_1.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..14a0a9f350721cb05144387ffeb3157287f34528 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e69b6548f92233a5d3cb22aa7c60d5d4d1f37d04ac969eb521f7a7c36271ae54 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/scheduler.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..421781b8dda6971ad78c51f1dc130f1fff19ce51 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e3f0e18fd4ce38e61410a1f0e851c2762584e71a80ec7ce0bc5150325adcecc +size 1064 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/trainer_state.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6b62f15b3094d8ab3addbf710861fc16df2aeeda --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/trainer_state.json @@ -0,0 +1,533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4266666666666667, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4931091487989761, + "avg_response_length": 225.975, + "avg_student_mask_ratio": 0.4931091487989761, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5570551689027979, + "epoch": 0.021333333333333333, + "grad_norm": 0.404296875, + "kd_loss": 0.5375588692116253, + "learning_rate": 3e-06, + "loss": 0.8247, + "masked_tokens": 111.95, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 111.95 + }, + { + "avg_mask_ratio": 0.42058031369233506, + "avg_response_length": 255.2625, + "avg_student_mask_ratio": 0.42058031369233506, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42826092825978324, + "epoch": 0.042666666666666665, + "grad_norm": 0.8046875, + "kd_loss": 0.4450965437417761, + "learning_rate": 3e-06, + "loss": 0.5724, + "masked_tokens": 97.35, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 97.35 + }, + { + "avg_mask_ratio": 0.4538542575784959, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4538542575784959, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4461815005188782, + "epoch": 0.064, + "grad_norm": 0.50390625, + "kd_loss": 0.5296064364436825, + "learning_rate": 3e-06, + "loss": 0.702, + "masked_tokens": 110.2, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 110.2 + }, + { + "avg_mask_ratio": 0.4207469140383182, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.4207469140383182, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38499989152683156, + "epoch": 0.08533333333333333, + "grad_norm": 1.671875, + "kd_loss": 0.33118802310931417, + "learning_rate": 3e-06, + "loss": 0.5529, + "masked_tokens": 98.1625, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.1625 + }, + { + "avg_mask_ratio": 0.43260439952719026, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.43260439952719026, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5240421466317912, + "epoch": 0.10666666666666667, + "grad_norm": 1.6875, + "kd_loss": 0.4270985169670894, + "learning_rate": 3e-06, + "loss": 0.671, + "masked_tokens": 85.05, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.05 + }, + { + "avg_mask_ratio": 0.46053453313652426, + "avg_response_length": 251.0875, + "avg_student_mask_ratio": 0.46053453313652426, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5027546818272185, + "epoch": 0.128, + "grad_norm": 0.17578125, + "kd_loss": 0.3904111967755945, + "learning_rate": 3e-06, + "loss": 0.6672, + "masked_tokens": 120.9, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 120.9 + }, + { + "avg_mask_ratio": 0.5112146578729153, + "avg_response_length": 202.5875, + "avg_student_mask_ratio": 0.5112146578729153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.7753003867959023, + "epoch": 0.14933333333333335, + "grad_norm": 0.953125, + "kd_loss": 0.4415664039527428, + "learning_rate": 3e-06, + "loss": 0.856, + "masked_tokens": 104.5875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 104.5875 + }, + { + "avg_mask_ratio": 0.37548826879356056, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.37548826879356056, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3791731233859082, + "epoch": 0.17066666666666666, + "grad_norm": 0.1552734375, + "kd_loss": 0.31052538527774515, + "learning_rate": 3e-06, + "loss": 0.4843, + "masked_tokens": 85.0625, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 85.0625 + }, + { + "avg_mask_ratio": 0.5001560213277116, + "avg_response_length": 229.75, + "avg_student_mask_ratio": 0.5001560213277116, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6899960007944174, + "epoch": 0.192, + "grad_norm": 1.25, + "kd_loss": 0.5995283465861896, + "learning_rate": 3e-06, + "loss": 0.9721, + "masked_tokens": 107.6625, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 107.6625 + }, + { + "avg_mask_ratio": 0.4576045103633078, + "avg_response_length": 208.0, + "avg_student_mask_ratio": 0.4576045103633078, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41132245859021166, + "epoch": 0.21333333333333335, + "grad_norm": 0.64453125, + "kd_loss": 0.3813956479015957, + "learning_rate": 3e-06, + "loss": 0.6635, + "masked_tokens": 104.1625, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 104.1625 + }, + { + "avg_mask_ratio": 0.4877026333590038, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.4877026333590038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4612084587922368, + "epoch": 0.23466666666666666, + "grad_norm": 0.64453125, + "kd_loss": 0.5074845846289577, + "learning_rate": 3e-06, + "loss": 0.7993, + "masked_tokens": 102.075, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 102.075 + }, + { + "avg_mask_ratio": 0.45146879020612685, + "avg_response_length": 224.1875, + "avg_student_mask_ratio": 0.45146879020612685, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3276976759495483, + "epoch": 0.256, + "grad_norm": 0.30078125, + "kd_loss": 0.41461311469229256, + "learning_rate": 3e-06, + "loss": 0.6088, + "masked_tokens": 100.525, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 100.525 + }, + { + "avg_mask_ratio": 0.4356566035945434, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.4356566035945434, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.12710368948505674, + "epoch": 0.2773333333333333, + "grad_norm": 0.490234375, + "kd_loss": 0.23057804748218585, + "learning_rate": 3e-06, + "loss": 0.384, + "masked_tokens": 89.5625, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.5625 + }, + { + "avg_mask_ratio": 0.49419954856857656, + "avg_response_length": 255.625, + "avg_student_mask_ratio": 0.49419954856857656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48596099013025196, + "epoch": 0.2986666666666667, + "grad_norm": 0.859375, + "kd_loss": 0.5025483015746885, + "learning_rate": 3e-06, + "loss": 0.7892, + "masked_tokens": 136.575, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 136.575 + }, + { + "avg_mask_ratio": 0.4736677930341102, + "avg_response_length": 255.375, + "avg_student_mask_ratio": 0.4736677930341102, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5493089448234059, + "epoch": 0.32, + "grad_norm": 0.076171875, + "kd_loss": 0.4892602212316547, + "learning_rate": 3e-06, + "loss": 0.7918, + "masked_tokens": 126.575, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 126.575 + }, + { + "avg_mask_ratio": 0.5032523009285796, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.5032523009285796, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5929547422666019, + "epoch": 0.3413333333333333, + "grad_norm": 1.171875, + "kd_loss": 0.44711892502580214, + "learning_rate": 3e-06, + "loss": 0.7472, + "masked_tokens": 99.425, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 99.425 + }, + { + "avg_mask_ratio": 0.4806730231270194, + "avg_response_length": 190.325, + "avg_student_mask_ratio": 0.4806730231270194, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3583432949517601, + "epoch": 0.3626666666666667, + "grad_norm": 0.6171875, + "kd_loss": 0.4521343837219092, + "learning_rate": 3e-06, + "loss": 0.6358, + "masked_tokens": 89.825, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.825 + }, + { + "avg_mask_ratio": 0.45829249716189224, + "avg_response_length": 244.0, + "avg_student_mask_ratio": 0.45829249716189224, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3746713957985094, + "epoch": 0.384, + "grad_norm": 0.49609375, + "kd_loss": 0.34934306121722897, + "learning_rate": 3e-06, + "loss": 0.5747, + "masked_tokens": 110.075, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.075 + }, + { + "avg_mask_ratio": 0.4842760307248682, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.4842760307248682, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5090123614077584, + "epoch": 0.4053333333333333, + "grad_norm": 1.6171875, + "kd_loss": 0.43204482231294605, + "learning_rate": 3e-06, + "loss": 0.7055, + "masked_tokens": 109.5875, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.5875 + }, + { + "avg_mask_ratio": 0.4665210062637925, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4665210062637925, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27265903051802526, + "epoch": 0.4266666666666667, + "grad_norm": 0.318359375, + "kd_loss": 0.357759011555504, + "learning_rate": 3e-06, + "loss": 0.5013, + "masked_tokens": 97.0125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.0125 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/training_args.bin b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6d9cde5cbc4fedb8fe690f2f561a165bf19ae18c --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ae704e41a4a62f6ca56789c45ce45887326cd2f8d1e97e398e5ada4a93398c +size 8312 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/README.md b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/adapter_config.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9d0f61db13b9b0a2854984efa3e7c726f3e0dbe1 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "up_proj", + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/adapter_model.safetensors b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..afa5d2908b01e5c9b1bd8937ea56214bb2a1cdf3 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0628a26dc3698e8b26ee3e8c5839bcdddc4a15d0501673a3faedd536e06be7ef +size 2406624648 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/optimizer.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2552b7332365b03bc9f4150e4c19cbfbbd7db63 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd2918a717be1688c778cbe8e58099e66551411a02a7d7a8ed6319e7827c1aca +size 671304442 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/rng_state_0.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5f663147613e01aad2acedee1255fea5adbb8875 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea249e27bdcc58f76757cc2d0238e340c88c1327490da7eed0c4fbe1cd131189 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/rng_state_1.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a8e5c08c0b601c9cbb71c7b266a8aa853c646317 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4362fa223b941d3d0aba37d42b420ac3c89882c4bc48d1c3e477d44eb4429605 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/scheduler.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b63622572f2f8fd0f5991a8ee55768496dcd77b8 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc305f845008b8f20405e65b1f962cf273957c5abdd0858e9cccb461f9b6d925 +size 1064 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/trainer_state.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..47624f74339a6b66a74dc7af7cdae791bbf3006e --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/trainer_state.json @@ -0,0 +1,783 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.64, + "eval_steps": 500, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4931091487989761, + "avg_response_length": 225.975, + "avg_student_mask_ratio": 0.4931091487989761, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5570551689027979, + "epoch": 0.021333333333333333, + "grad_norm": 0.404296875, + "kd_loss": 0.5375588692116253, + "learning_rate": 3e-06, + "loss": 0.8247, + "masked_tokens": 111.95, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 111.95 + }, + { + "avg_mask_ratio": 0.42058031369233506, + "avg_response_length": 255.2625, + "avg_student_mask_ratio": 0.42058031369233506, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42826092825978324, + "epoch": 0.042666666666666665, + "grad_norm": 0.8046875, + "kd_loss": 0.4450965437417761, + "learning_rate": 3e-06, + "loss": 0.5724, + "masked_tokens": 97.35, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 97.35 + }, + { + "avg_mask_ratio": 0.4538542575784959, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4538542575784959, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4461815005188782, + "epoch": 0.064, + "grad_norm": 0.50390625, + "kd_loss": 0.5296064364436825, + "learning_rate": 3e-06, + "loss": 0.702, + "masked_tokens": 110.2, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 110.2 + }, + { + "avg_mask_ratio": 0.4207469140383182, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.4207469140383182, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38499989152683156, + "epoch": 0.08533333333333333, + "grad_norm": 1.671875, + "kd_loss": 0.33118802310931417, + "learning_rate": 3e-06, + "loss": 0.5529, + "masked_tokens": 98.1625, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.1625 + }, + { + "avg_mask_ratio": 0.43260439952719026, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.43260439952719026, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5240421466317912, + "epoch": 0.10666666666666667, + "grad_norm": 1.6875, + "kd_loss": 0.4270985169670894, + "learning_rate": 3e-06, + "loss": 0.671, + "masked_tokens": 85.05, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.05 + }, + { + "avg_mask_ratio": 0.46053453313652426, + "avg_response_length": 251.0875, + "avg_student_mask_ratio": 0.46053453313652426, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5027546818272185, + "epoch": 0.128, + "grad_norm": 0.17578125, + "kd_loss": 0.3904111967755945, + "learning_rate": 3e-06, + "loss": 0.6672, + "masked_tokens": 120.9, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 120.9 + }, + { + "avg_mask_ratio": 0.5112146578729153, + "avg_response_length": 202.5875, + "avg_student_mask_ratio": 0.5112146578729153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.7753003867959023, + "epoch": 0.14933333333333335, + "grad_norm": 0.953125, + "kd_loss": 0.4415664039527428, + "learning_rate": 3e-06, + "loss": 0.856, + "masked_tokens": 104.5875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 104.5875 + }, + { + "avg_mask_ratio": 0.37548826879356056, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.37548826879356056, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3791731233859082, + "epoch": 0.17066666666666666, + "grad_norm": 0.1552734375, + "kd_loss": 0.31052538527774515, + "learning_rate": 3e-06, + "loss": 0.4843, + "masked_tokens": 85.0625, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 85.0625 + }, + { + "avg_mask_ratio": 0.5001560213277116, + "avg_response_length": 229.75, + "avg_student_mask_ratio": 0.5001560213277116, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6899960007944174, + "epoch": 0.192, + "grad_norm": 1.25, + "kd_loss": 0.5995283465861896, + "learning_rate": 3e-06, + "loss": 0.9721, + "masked_tokens": 107.6625, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 107.6625 + }, + { + "avg_mask_ratio": 0.4576045103633078, + "avg_response_length": 208.0, + "avg_student_mask_ratio": 0.4576045103633078, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41132245859021166, + "epoch": 0.21333333333333335, + "grad_norm": 0.64453125, + "kd_loss": 0.3813956479015957, + "learning_rate": 3e-06, + "loss": 0.6635, + "masked_tokens": 104.1625, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 104.1625 + }, + { + "avg_mask_ratio": 0.4877026333590038, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.4877026333590038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4612084587922368, + "epoch": 0.23466666666666666, + "grad_norm": 0.64453125, + "kd_loss": 0.5074845846289577, + "learning_rate": 3e-06, + "loss": 0.7993, + "masked_tokens": 102.075, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 102.075 + }, + { + "avg_mask_ratio": 0.45146879020612685, + "avg_response_length": 224.1875, + "avg_student_mask_ratio": 0.45146879020612685, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3276976759495483, + "epoch": 0.256, + "grad_norm": 0.30078125, + "kd_loss": 0.41461311469229256, + "learning_rate": 3e-06, + "loss": 0.6088, + "masked_tokens": 100.525, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 100.525 + }, + { + "avg_mask_ratio": 0.4356566035945434, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.4356566035945434, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.12710368948505674, + "epoch": 0.2773333333333333, + "grad_norm": 0.490234375, + "kd_loss": 0.23057804748218585, + "learning_rate": 3e-06, + "loss": 0.384, + "masked_tokens": 89.5625, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.5625 + }, + { + "avg_mask_ratio": 0.49419954856857656, + "avg_response_length": 255.625, + "avg_student_mask_ratio": 0.49419954856857656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48596099013025196, + "epoch": 0.2986666666666667, + "grad_norm": 0.859375, + "kd_loss": 0.5025483015746885, + "learning_rate": 3e-06, + "loss": 0.7892, + "masked_tokens": 136.575, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 136.575 + }, + { + "avg_mask_ratio": 0.4736677930341102, + "avg_response_length": 255.375, + "avg_student_mask_ratio": 0.4736677930341102, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5493089448234059, + "epoch": 0.32, + "grad_norm": 0.076171875, + "kd_loss": 0.4892602212316547, + "learning_rate": 3e-06, + "loss": 0.7918, + "masked_tokens": 126.575, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 126.575 + }, + { + "avg_mask_ratio": 0.5032523009285796, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.5032523009285796, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5929547422666019, + "epoch": 0.3413333333333333, + "grad_norm": 1.171875, + "kd_loss": 0.44711892502580214, + "learning_rate": 3e-06, + "loss": 0.7472, + "masked_tokens": 99.425, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 99.425 + }, + { + "avg_mask_ratio": 0.4806730231270194, + "avg_response_length": 190.325, + "avg_student_mask_ratio": 0.4806730231270194, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3583432949517601, + "epoch": 0.3626666666666667, + "grad_norm": 0.6171875, + "kd_loss": 0.4521343837219092, + "learning_rate": 3e-06, + "loss": 0.6358, + "masked_tokens": 89.825, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.825 + }, + { + "avg_mask_ratio": 0.45829249716189224, + "avg_response_length": 244.0, + "avg_student_mask_ratio": 0.45829249716189224, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3746713957985094, + "epoch": 0.384, + "grad_norm": 0.49609375, + "kd_loss": 0.34934306121722897, + "learning_rate": 3e-06, + "loss": 0.5747, + "masked_tokens": 110.075, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.075 + }, + { + "avg_mask_ratio": 0.4842760307248682, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.4842760307248682, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5090123614077584, + "epoch": 0.4053333333333333, + "grad_norm": 1.6171875, + "kd_loss": 0.43204482231294605, + "learning_rate": 3e-06, + "loss": 0.7055, + "masked_tokens": 109.5875, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.5875 + }, + { + "avg_mask_ratio": 0.4665210062637925, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4665210062637925, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27265903051802526, + "epoch": 0.4266666666666667, + "grad_norm": 0.318359375, + "kd_loss": 0.357759011555504, + "learning_rate": 3e-06, + "loss": 0.5013, + "masked_tokens": 97.0125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.0125 + }, + { + "avg_mask_ratio": 0.45183838629163803, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.45183838629163803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.21031314329709402, + "epoch": 0.448, + "grad_norm": 0.8359375, + "kd_loss": 0.38585986606940426, + "learning_rate": 3e-06, + "loss": 0.5103, + "masked_tokens": 96.0125, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.0125 + }, + { + "avg_mask_ratio": 0.5396152360364794, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5396152360364794, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6178499971098063, + "epoch": 0.4693333333333333, + "grad_norm": 0.96875, + "kd_loss": 0.46674597742967305, + "learning_rate": 3e-06, + "loss": 0.8495, + "masked_tokens": 125.35, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 125.35 + }, + { + "avg_mask_ratio": 0.4403991688624956, + "avg_response_length": 252.7, + "avg_student_mask_ratio": 0.4403991688624956, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25455478885055527, + "epoch": 0.49066666666666664, + "grad_norm": 0.5703125, + "kd_loss": 0.43305868929596725, + "learning_rate": 3e-06, + "loss": 0.6428, + "masked_tokens": 107.325, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.47419991258066146, + "avg_response_length": 212.85, + "avg_student_mask_ratio": 0.47419991258066146, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32057130943685763, + "epoch": 0.512, + "grad_norm": 0.43359375, + "kd_loss": 0.5083060303753086, + "learning_rate": 3e-06, + "loss": 0.6986, + "masked_tokens": 106.9, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 106.9 + }, + { + "avg_mask_ratio": 0.4464209079160355, + "avg_response_length": 243.475, + "avg_student_mask_ratio": 0.4464209079160355, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33636454603331456, + "epoch": 0.5333333333333333, + "grad_norm": 0.1142578125, + "kd_loss": 0.41649795620701296, + "learning_rate": 3e-06, + "loss": 0.5666, + "masked_tokens": 112.7375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.7375 + }, + { + "avg_mask_ratio": 0.4520751796895638, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.4520751796895638, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37478437887749577, + "epoch": 0.5546666666666666, + "grad_norm": 0.328125, + "kd_loss": 0.31532439299670545, + "learning_rate": 3e-06, + "loss": 0.5129, + "masked_tokens": 109.6375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 109.6375 + }, + { + "avg_mask_ratio": 0.5305180630879477, + "avg_response_length": 224.45, + "avg_student_mask_ratio": 0.5305180630879477, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42709534656005416, + "epoch": 0.576, + "grad_norm": 0.7578125, + "kd_loss": 0.5525495689224045, + "learning_rate": 3e-06, + "loss": 0.812, + "masked_tokens": 120.475, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 120.475 + }, + { + "avg_mask_ratio": 0.46451686368091033, + "avg_response_length": 254.825, + "avg_student_mask_ratio": 0.46451686368091033, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31382316479499084, + "epoch": 0.5973333333333334, + "grad_norm": 0.90234375, + "kd_loss": 0.3957495673693458, + "learning_rate": 3e-06, + "loss": 0.6028, + "masked_tokens": 129.225, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 129.225 + }, + { + "avg_mask_ratio": 0.389662017847877, + "avg_response_length": 245.9125, + "avg_student_mask_ratio": 0.389662017847877, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.23645576389110373, + "epoch": 0.6186666666666667, + "grad_norm": 0.302734375, + "kd_loss": 0.27728830450374853, + "learning_rate": 3e-06, + "loss": 0.4314, + "masked_tokens": 99.8625, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.44417000194080175, + "avg_response_length": 217.0375, + "avg_student_mask_ratio": 0.44417000194080175, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3240562055096575, + "epoch": 0.64, + "grad_norm": 1.09375, + "kd_loss": 0.31930388437995133, + "learning_rate": 3e-06, + "loss": 0.5264, + "masked_tokens": 104.625, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 104.625 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/training_args.bin b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6d9cde5cbc4fedb8fe690f2f561a165bf19ae18c --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ae704e41a4a62f6ca56789c45ce45887326cd2f8d1e97e398e5ada4a93398c +size 8312 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/README.md b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/adapter_config.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9d0f61db13b9b0a2854984efa3e7c726f3e0dbe1 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "up_proj", + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/adapter_model.safetensors b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f6c399a40a9a5995f5c9fb156c12a6bc449ae248 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c686c498f789fd497c60f24c4d6aeb649b7b39a9c9d4136e07f68c09b75303ef +size 2406624648 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/optimizer.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cdbb1d223d998faef8740648e6ec1e5eeb892f9 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b121946ef9e514acd6fe741de4855c033edd3a1d9b63ed87547c2d8a50c407d8 +size 671304442 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/rng_state_0.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..447a9c0caf77f3592c066c69fe75008e834efc90 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3f3f1d5bf66d651fe6d3be22482cc6816096f059815d7eeb9e15ab3e4bcd272 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/rng_state_1.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5e4b820b177e63c0715bb5ee5cd1c7c518c08a98 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8448f21bbdc1eaf87ce64616ae89476260da88ec36334556bcafe73d8fbe837b +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/scheduler.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c306ef4bb7e6a16b41c020edcc557eaff2f11b3f --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9830a27ccf317f0eed7689e600baef1933674a8f45133ca57c902cf16747aad2 +size 1064 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/trainer_state.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3146112eaa1cfb7ac0032ae71b538bb9d2026d64 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/trainer_state.json @@ -0,0 +1,1033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8533333333333334, + "eval_steps": 500, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4931091487989761, + "avg_response_length": 225.975, + "avg_student_mask_ratio": 0.4931091487989761, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5570551689027979, + "epoch": 0.021333333333333333, + "grad_norm": 0.404296875, + "kd_loss": 0.5375588692116253, + "learning_rate": 3e-06, + "loss": 0.8247, + "masked_tokens": 111.95, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 111.95 + }, + { + "avg_mask_ratio": 0.42058031369233506, + "avg_response_length": 255.2625, + "avg_student_mask_ratio": 0.42058031369233506, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42826092825978324, + "epoch": 0.042666666666666665, + "grad_norm": 0.8046875, + "kd_loss": 0.4450965437417761, + "learning_rate": 3e-06, + "loss": 0.5724, + "masked_tokens": 97.35, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 97.35 + }, + { + "avg_mask_ratio": 0.4538542575784959, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4538542575784959, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4461815005188782, + "epoch": 0.064, + "grad_norm": 0.50390625, + "kd_loss": 0.5296064364436825, + "learning_rate": 3e-06, + "loss": 0.702, + "masked_tokens": 110.2, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 110.2 + }, + { + "avg_mask_ratio": 0.4207469140383182, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.4207469140383182, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38499989152683156, + "epoch": 0.08533333333333333, + "grad_norm": 1.671875, + "kd_loss": 0.33118802310931417, + "learning_rate": 3e-06, + "loss": 0.5529, + "masked_tokens": 98.1625, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.1625 + }, + { + "avg_mask_ratio": 0.43260439952719026, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.43260439952719026, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5240421466317912, + "epoch": 0.10666666666666667, + "grad_norm": 1.6875, + "kd_loss": 0.4270985169670894, + "learning_rate": 3e-06, + "loss": 0.671, + "masked_tokens": 85.05, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.05 + }, + { + "avg_mask_ratio": 0.46053453313652426, + "avg_response_length": 251.0875, + "avg_student_mask_ratio": 0.46053453313652426, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5027546818272185, + "epoch": 0.128, + "grad_norm": 0.17578125, + "kd_loss": 0.3904111967755945, + "learning_rate": 3e-06, + "loss": 0.6672, + "masked_tokens": 120.9, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 120.9 + }, + { + "avg_mask_ratio": 0.5112146578729153, + "avg_response_length": 202.5875, + "avg_student_mask_ratio": 0.5112146578729153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.7753003867959023, + "epoch": 0.14933333333333335, + "grad_norm": 0.953125, + "kd_loss": 0.4415664039527428, + "learning_rate": 3e-06, + "loss": 0.856, + "masked_tokens": 104.5875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 104.5875 + }, + { + "avg_mask_ratio": 0.37548826879356056, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.37548826879356056, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3791731233859082, + "epoch": 0.17066666666666666, + "grad_norm": 0.1552734375, + "kd_loss": 0.31052538527774515, + "learning_rate": 3e-06, + "loss": 0.4843, + "masked_tokens": 85.0625, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 85.0625 + }, + { + "avg_mask_ratio": 0.5001560213277116, + "avg_response_length": 229.75, + "avg_student_mask_ratio": 0.5001560213277116, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6899960007944174, + "epoch": 0.192, + "grad_norm": 1.25, + "kd_loss": 0.5995283465861896, + "learning_rate": 3e-06, + "loss": 0.9721, + "masked_tokens": 107.6625, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 107.6625 + }, + { + "avg_mask_ratio": 0.4576045103633078, + "avg_response_length": 208.0, + "avg_student_mask_ratio": 0.4576045103633078, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41132245859021166, + "epoch": 0.21333333333333335, + "grad_norm": 0.64453125, + "kd_loss": 0.3813956479015957, + "learning_rate": 3e-06, + "loss": 0.6635, + "masked_tokens": 104.1625, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 104.1625 + }, + { + "avg_mask_ratio": 0.4877026333590038, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.4877026333590038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4612084587922368, + "epoch": 0.23466666666666666, + "grad_norm": 0.64453125, + "kd_loss": 0.5074845846289577, + "learning_rate": 3e-06, + "loss": 0.7993, + "masked_tokens": 102.075, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 102.075 + }, + { + "avg_mask_ratio": 0.45146879020612685, + "avg_response_length": 224.1875, + "avg_student_mask_ratio": 0.45146879020612685, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3276976759495483, + "epoch": 0.256, + "grad_norm": 0.30078125, + "kd_loss": 0.41461311469229256, + "learning_rate": 3e-06, + "loss": 0.6088, + "masked_tokens": 100.525, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 100.525 + }, + { + "avg_mask_ratio": 0.4356566035945434, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.4356566035945434, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.12710368948505674, + "epoch": 0.2773333333333333, + "grad_norm": 0.490234375, + "kd_loss": 0.23057804748218585, + "learning_rate": 3e-06, + "loss": 0.384, + "masked_tokens": 89.5625, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.5625 + }, + { + "avg_mask_ratio": 0.49419954856857656, + "avg_response_length": 255.625, + "avg_student_mask_ratio": 0.49419954856857656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48596099013025196, + "epoch": 0.2986666666666667, + "grad_norm": 0.859375, + "kd_loss": 0.5025483015746885, + "learning_rate": 3e-06, + "loss": 0.7892, + "masked_tokens": 136.575, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 136.575 + }, + { + "avg_mask_ratio": 0.4736677930341102, + "avg_response_length": 255.375, + "avg_student_mask_ratio": 0.4736677930341102, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5493089448234059, + "epoch": 0.32, + "grad_norm": 0.076171875, + "kd_loss": 0.4892602212316547, + "learning_rate": 3e-06, + "loss": 0.7918, + "masked_tokens": 126.575, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 126.575 + }, + { + "avg_mask_ratio": 0.5032523009285796, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.5032523009285796, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5929547422666019, + "epoch": 0.3413333333333333, + "grad_norm": 1.171875, + "kd_loss": 0.44711892502580214, + "learning_rate": 3e-06, + "loss": 0.7472, + "masked_tokens": 99.425, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 99.425 + }, + { + "avg_mask_ratio": 0.4806730231270194, + "avg_response_length": 190.325, + "avg_student_mask_ratio": 0.4806730231270194, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3583432949517601, + "epoch": 0.3626666666666667, + "grad_norm": 0.6171875, + "kd_loss": 0.4521343837219092, + "learning_rate": 3e-06, + "loss": 0.6358, + "masked_tokens": 89.825, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.825 + }, + { + "avg_mask_ratio": 0.45829249716189224, + "avg_response_length": 244.0, + "avg_student_mask_ratio": 0.45829249716189224, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3746713957985094, + "epoch": 0.384, + "grad_norm": 0.49609375, + "kd_loss": 0.34934306121722897, + "learning_rate": 3e-06, + "loss": 0.5747, + "masked_tokens": 110.075, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.075 + }, + { + "avg_mask_ratio": 0.4842760307248682, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.4842760307248682, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5090123614077584, + "epoch": 0.4053333333333333, + "grad_norm": 1.6171875, + "kd_loss": 0.43204482231294605, + "learning_rate": 3e-06, + "loss": 0.7055, + "masked_tokens": 109.5875, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.5875 + }, + { + "avg_mask_ratio": 0.4665210062637925, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4665210062637925, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27265903051802526, + "epoch": 0.4266666666666667, + "grad_norm": 0.318359375, + "kd_loss": 0.357759011555504, + "learning_rate": 3e-06, + "loss": 0.5013, + "masked_tokens": 97.0125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.0125 + }, + { + "avg_mask_ratio": 0.45183838629163803, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.45183838629163803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.21031314329709402, + "epoch": 0.448, + "grad_norm": 0.8359375, + "kd_loss": 0.38585986606940426, + "learning_rate": 3e-06, + "loss": 0.5103, + "masked_tokens": 96.0125, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.0125 + }, + { + "avg_mask_ratio": 0.5396152360364794, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5396152360364794, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6178499971098063, + "epoch": 0.4693333333333333, + "grad_norm": 0.96875, + "kd_loss": 0.46674597742967305, + "learning_rate": 3e-06, + "loss": 0.8495, + "masked_tokens": 125.35, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 125.35 + }, + { + "avg_mask_ratio": 0.4403991688624956, + "avg_response_length": 252.7, + "avg_student_mask_ratio": 0.4403991688624956, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25455478885055527, + "epoch": 0.49066666666666664, + "grad_norm": 0.5703125, + "kd_loss": 0.43305868929596725, + "learning_rate": 3e-06, + "loss": 0.6428, + "masked_tokens": 107.325, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.47419991258066146, + "avg_response_length": 212.85, + "avg_student_mask_ratio": 0.47419991258066146, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32057130943685763, + "epoch": 0.512, + "grad_norm": 0.43359375, + "kd_loss": 0.5083060303753086, + "learning_rate": 3e-06, + "loss": 0.6986, + "masked_tokens": 106.9, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 106.9 + }, + { + "avg_mask_ratio": 0.4464209079160355, + "avg_response_length": 243.475, + "avg_student_mask_ratio": 0.4464209079160355, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33636454603331456, + "epoch": 0.5333333333333333, + "grad_norm": 0.1142578125, + "kd_loss": 0.41649795620701296, + "learning_rate": 3e-06, + "loss": 0.5666, + "masked_tokens": 112.7375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.7375 + }, + { + "avg_mask_ratio": 0.4520751796895638, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.4520751796895638, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37478437887749577, + "epoch": 0.5546666666666666, + "grad_norm": 0.328125, + "kd_loss": 0.31532439299670545, + "learning_rate": 3e-06, + "loss": 0.5129, + "masked_tokens": 109.6375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 109.6375 + }, + { + "avg_mask_ratio": 0.5305180630879477, + "avg_response_length": 224.45, + "avg_student_mask_ratio": 0.5305180630879477, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42709534656005416, + "epoch": 0.576, + "grad_norm": 0.7578125, + "kd_loss": 0.5525495689224045, + "learning_rate": 3e-06, + "loss": 0.812, + "masked_tokens": 120.475, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 120.475 + }, + { + "avg_mask_ratio": 0.46451686368091033, + "avg_response_length": 254.825, + "avg_student_mask_ratio": 0.46451686368091033, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31382316479499084, + "epoch": 0.5973333333333334, + "grad_norm": 0.90234375, + "kd_loss": 0.3957495673693458, + "learning_rate": 3e-06, + "loss": 0.6028, + "masked_tokens": 129.225, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 129.225 + }, + { + "avg_mask_ratio": 0.389662017847877, + "avg_response_length": 245.9125, + "avg_student_mask_ratio": 0.389662017847877, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.23645576389110373, + "epoch": 0.6186666666666667, + "grad_norm": 0.302734375, + "kd_loss": 0.27728830450374853, + "learning_rate": 3e-06, + "loss": 0.4314, + "masked_tokens": 99.8625, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.44417000194080175, + "avg_response_length": 217.0375, + "avg_student_mask_ratio": 0.44417000194080175, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3240562055096575, + "epoch": 0.64, + "grad_norm": 1.09375, + "kd_loss": 0.31930388437995133, + "learning_rate": 3e-06, + "loss": 0.5264, + "masked_tokens": 104.625, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 104.625 + }, + { + "avg_mask_ratio": 0.4706685543409549, + "avg_response_length": 175.45, + "avg_student_mask_ratio": 0.4706685543409549, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34333510200582396, + "epoch": 0.6613333333333333, + "grad_norm": 1.234375, + "kd_loss": 0.5067149527083984, + "learning_rate": 3e-06, + "loss": 0.6534, + "masked_tokens": 84.875, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 84.875 + }, + { + "avg_mask_ratio": 0.4974605386145413, + "avg_response_length": 234.7875, + "avg_student_mask_ratio": 0.4974605386145413, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34462752127872137, + "epoch": 0.6826666666666666, + "grad_norm": 0.333984375, + "kd_loss": 0.3942846609736307, + "learning_rate": 3e-06, + "loss": 0.7133, + "masked_tokens": 119.6, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.5112370474264025, + "avg_response_length": 236.0625, + "avg_student_mask_ratio": 0.5112370474264025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2974585796398969, + "epoch": 0.704, + "grad_norm": 0.44140625, + "kd_loss": 0.4301003347501496, + "learning_rate": 3e-06, + "loss": 0.6754, + "masked_tokens": 129.425, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 129.425 + }, + { + "avg_mask_ratio": 0.44370225080056114, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44370225080056114, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3732590021626493, + "epoch": 0.7253333333333334, + "grad_norm": 0.98046875, + "kd_loss": 0.4610515360019235, + "learning_rate": 3e-06, + "loss": 0.6627, + "masked_tokens": 108.775, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 108.775 + }, + { + "avg_mask_ratio": 0.49959173843380994, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.49959173843380994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48515336151417615, + "epoch": 0.7466666666666667, + "grad_norm": 0.92578125, + "kd_loss": 0.5031771080357654, + "learning_rate": 3e-06, + "loss": 0.7668, + "masked_tokens": 125.625, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.625 + }, + { + "avg_mask_ratio": 0.4744729608530179, + "avg_response_length": 246.1625, + "avg_student_mask_ratio": 0.4744729608530179, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3005135279950082, + "epoch": 0.768, + "grad_norm": 0.169921875, + "kd_loss": 0.5216399239409879, + "learning_rate": 3e-06, + "loss": 0.6077, + "masked_tokens": 116.875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.875 + }, + { + "avg_mask_ratio": 0.4738045462174341, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.4738045462174341, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5349442186782426, + "epoch": 0.7893333333333333, + "grad_norm": 0.201171875, + "kd_loss": 0.6039233199480805, + "learning_rate": 3e-06, + "loss": 0.7196, + "masked_tokens": 127.4625, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 127.4625 + }, + { + "avg_mask_ratio": 0.4512475330149755, + "avg_response_length": 209.8, + "avg_student_mask_ratio": 0.4512475330149755, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.19145508916275275, + "epoch": 0.8106666666666666, + "grad_norm": 0.6875, + "kd_loss": 0.4029755606519984, + "learning_rate": 3e-06, + "loss": 0.5055, + "masked_tokens": 100.8375, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 100.8375 + }, + { + "avg_mask_ratio": 0.4752940105390735, + "avg_response_length": 219.5625, + "avg_student_mask_ratio": 0.4752940105390735, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4267096655552223, + "epoch": 0.832, + "grad_norm": 0.2578125, + "kd_loss": 0.4655849843487971, + "learning_rate": 3e-06, + "loss": 0.6749, + "masked_tokens": 112.375, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 112.375 + }, + { + "avg_mask_ratio": 0.47461870914557946, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.47461870914557946, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27868834779033025, + "epoch": 0.8533333333333334, + "grad_norm": 0.640625, + "kd_loss": 0.5299579592951205, + "learning_rate": 3e-06, + "loss": 0.6538, + "masked_tokens": 120.4125, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 120.4125 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/training_args.bin b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6d9cde5cbc4fedb8fe690f2f561a165bf19ae18c --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ae704e41a4a62f6ca56789c45ce45887326cd2f8d1e97e398e5ada4a93398c +size 8312 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/README.md b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/adapter_config.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9d0f61db13b9b0a2854984efa3e7c726f3e0dbe1 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "up_proj", + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/adapter_model.safetensors b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d43fd22f5be8f436afa8b81656a03c61de29c341 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3609bb4f85ba707001f11d04aa774b8a19173b2d45d9f0840351cefe683a381c +size 2406624648 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/optimizer.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c473909b3ea084a2ef4f3eea9cdfccb1063bcb06 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd106ea5bd24a3b3bca83cfbe148b4ecb7fdc1fff1af23278c2424781a95539 +size 671304442 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/rng_state_0.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..aa492d00bb4976206180efcab5062abe7656608e --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b043f63d1d4e4eeacfe119d7fa0dcc8d95ac6611adb290fca2511e28005ef6fd +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/rng_state_1.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..97f2ac2e82ead44d1d6cd0908d025d65186be5ab --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c2d655c6c83c46f64e878fe7d2966f6d5aa2aa796f3997aed0076c9825f2f72 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/scheduler.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..931e4be8f6a79592ab2ba42943c7a73e26c7bc07 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a517e97adb4dde873654de5d66064258ac3222271d3ace011285ec503f6a5b2 +size 1064 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/trainer_state.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a00f005e9fab1e9da02183fc4d57e3d422af68ea --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/trainer_state.json @@ -0,0 +1,1283 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0682666666666667, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4931091487989761, + "avg_response_length": 225.975, + "avg_student_mask_ratio": 0.4931091487989761, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5570551689027979, + "epoch": 0.021333333333333333, + "grad_norm": 0.404296875, + "kd_loss": 0.5375588692116253, + "learning_rate": 3e-06, + "loss": 0.8247, + "masked_tokens": 111.95, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 111.95 + }, + { + "avg_mask_ratio": 0.42058031369233506, + "avg_response_length": 255.2625, + "avg_student_mask_ratio": 0.42058031369233506, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42826092825978324, + "epoch": 0.042666666666666665, + "grad_norm": 0.8046875, + "kd_loss": 0.4450965437417761, + "learning_rate": 3e-06, + "loss": 0.5724, + "masked_tokens": 97.35, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 97.35 + }, + { + "avg_mask_ratio": 0.4538542575784959, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4538542575784959, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4461815005188782, + "epoch": 0.064, + "grad_norm": 0.50390625, + "kd_loss": 0.5296064364436825, + "learning_rate": 3e-06, + "loss": 0.702, + "masked_tokens": 110.2, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 110.2 + }, + { + "avg_mask_ratio": 0.4207469140383182, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.4207469140383182, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38499989152683156, + "epoch": 0.08533333333333333, + "grad_norm": 1.671875, + "kd_loss": 0.33118802310931417, + "learning_rate": 3e-06, + "loss": 0.5529, + "masked_tokens": 98.1625, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.1625 + }, + { + "avg_mask_ratio": 0.43260439952719026, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.43260439952719026, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5240421466317912, + "epoch": 0.10666666666666667, + "grad_norm": 1.6875, + "kd_loss": 0.4270985169670894, + "learning_rate": 3e-06, + "loss": 0.671, + "masked_tokens": 85.05, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.05 + }, + { + "avg_mask_ratio": 0.46053453313652426, + "avg_response_length": 251.0875, + "avg_student_mask_ratio": 0.46053453313652426, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5027546818272185, + "epoch": 0.128, + "grad_norm": 0.17578125, + "kd_loss": 0.3904111967755945, + "learning_rate": 3e-06, + "loss": 0.6672, + "masked_tokens": 120.9, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 120.9 + }, + { + "avg_mask_ratio": 0.5112146578729153, + "avg_response_length": 202.5875, + "avg_student_mask_ratio": 0.5112146578729153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.7753003867959023, + "epoch": 0.14933333333333335, + "grad_norm": 0.953125, + "kd_loss": 0.4415664039527428, + "learning_rate": 3e-06, + "loss": 0.856, + "masked_tokens": 104.5875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 104.5875 + }, + { + "avg_mask_ratio": 0.37548826879356056, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.37548826879356056, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3791731233859082, + "epoch": 0.17066666666666666, + "grad_norm": 0.1552734375, + "kd_loss": 0.31052538527774515, + "learning_rate": 3e-06, + "loss": 0.4843, + "masked_tokens": 85.0625, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 85.0625 + }, + { + "avg_mask_ratio": 0.5001560213277116, + "avg_response_length": 229.75, + "avg_student_mask_ratio": 0.5001560213277116, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6899960007944174, + "epoch": 0.192, + "grad_norm": 1.25, + "kd_loss": 0.5995283465861896, + "learning_rate": 3e-06, + "loss": 0.9721, + "masked_tokens": 107.6625, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 107.6625 + }, + { + "avg_mask_ratio": 0.4576045103633078, + "avg_response_length": 208.0, + "avg_student_mask_ratio": 0.4576045103633078, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41132245859021166, + "epoch": 0.21333333333333335, + "grad_norm": 0.64453125, + "kd_loss": 0.3813956479015957, + "learning_rate": 3e-06, + "loss": 0.6635, + "masked_tokens": 104.1625, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 104.1625 + }, + { + "avg_mask_ratio": 0.4877026333590038, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.4877026333590038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4612084587922368, + "epoch": 0.23466666666666666, + "grad_norm": 0.64453125, + "kd_loss": 0.5074845846289577, + "learning_rate": 3e-06, + "loss": 0.7993, + "masked_tokens": 102.075, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 102.075 + }, + { + "avg_mask_ratio": 0.45146879020612685, + "avg_response_length": 224.1875, + "avg_student_mask_ratio": 0.45146879020612685, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3276976759495483, + "epoch": 0.256, + "grad_norm": 0.30078125, + "kd_loss": 0.41461311469229256, + "learning_rate": 3e-06, + "loss": 0.6088, + "masked_tokens": 100.525, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 100.525 + }, + { + "avg_mask_ratio": 0.4356566035945434, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.4356566035945434, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.12710368948505674, + "epoch": 0.2773333333333333, + "grad_norm": 0.490234375, + "kd_loss": 0.23057804748218585, + "learning_rate": 3e-06, + "loss": 0.384, + "masked_tokens": 89.5625, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.5625 + }, + { + "avg_mask_ratio": 0.49419954856857656, + "avg_response_length": 255.625, + "avg_student_mask_ratio": 0.49419954856857656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48596099013025196, + "epoch": 0.2986666666666667, + "grad_norm": 0.859375, + "kd_loss": 0.5025483015746885, + "learning_rate": 3e-06, + "loss": 0.7892, + "masked_tokens": 136.575, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 136.575 + }, + { + "avg_mask_ratio": 0.4736677930341102, + "avg_response_length": 255.375, + "avg_student_mask_ratio": 0.4736677930341102, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5493089448234059, + "epoch": 0.32, + "grad_norm": 0.076171875, + "kd_loss": 0.4892602212316547, + "learning_rate": 3e-06, + "loss": 0.7918, + "masked_tokens": 126.575, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 126.575 + }, + { + "avg_mask_ratio": 0.5032523009285796, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.5032523009285796, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5929547422666019, + "epoch": 0.3413333333333333, + "grad_norm": 1.171875, + "kd_loss": 0.44711892502580214, + "learning_rate": 3e-06, + "loss": 0.7472, + "masked_tokens": 99.425, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 99.425 + }, + { + "avg_mask_ratio": 0.4806730231270194, + "avg_response_length": 190.325, + "avg_student_mask_ratio": 0.4806730231270194, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3583432949517601, + "epoch": 0.3626666666666667, + "grad_norm": 0.6171875, + "kd_loss": 0.4521343837219092, + "learning_rate": 3e-06, + "loss": 0.6358, + "masked_tokens": 89.825, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.825 + }, + { + "avg_mask_ratio": 0.45829249716189224, + "avg_response_length": 244.0, + "avg_student_mask_ratio": 0.45829249716189224, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3746713957985094, + "epoch": 0.384, + "grad_norm": 0.49609375, + "kd_loss": 0.34934306121722897, + "learning_rate": 3e-06, + "loss": 0.5747, + "masked_tokens": 110.075, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.075 + }, + { + "avg_mask_ratio": 0.4842760307248682, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.4842760307248682, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5090123614077584, + "epoch": 0.4053333333333333, + "grad_norm": 1.6171875, + "kd_loss": 0.43204482231294605, + "learning_rate": 3e-06, + "loss": 0.7055, + "masked_tokens": 109.5875, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.5875 + }, + { + "avg_mask_ratio": 0.4665210062637925, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4665210062637925, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27265903051802526, + "epoch": 0.4266666666666667, + "grad_norm": 0.318359375, + "kd_loss": 0.357759011555504, + "learning_rate": 3e-06, + "loss": 0.5013, + "masked_tokens": 97.0125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.0125 + }, + { + "avg_mask_ratio": 0.45183838629163803, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.45183838629163803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.21031314329709402, + "epoch": 0.448, + "grad_norm": 0.8359375, + "kd_loss": 0.38585986606940426, + "learning_rate": 3e-06, + "loss": 0.5103, + "masked_tokens": 96.0125, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.0125 + }, + { + "avg_mask_ratio": 0.5396152360364794, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5396152360364794, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6178499971098063, + "epoch": 0.4693333333333333, + "grad_norm": 0.96875, + "kd_loss": 0.46674597742967305, + "learning_rate": 3e-06, + "loss": 0.8495, + "masked_tokens": 125.35, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 125.35 + }, + { + "avg_mask_ratio": 0.4403991688624956, + "avg_response_length": 252.7, + "avg_student_mask_ratio": 0.4403991688624956, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25455478885055527, + "epoch": 0.49066666666666664, + "grad_norm": 0.5703125, + "kd_loss": 0.43305868929596725, + "learning_rate": 3e-06, + "loss": 0.6428, + "masked_tokens": 107.325, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.47419991258066146, + "avg_response_length": 212.85, + "avg_student_mask_ratio": 0.47419991258066146, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32057130943685763, + "epoch": 0.512, + "grad_norm": 0.43359375, + "kd_loss": 0.5083060303753086, + "learning_rate": 3e-06, + "loss": 0.6986, + "masked_tokens": 106.9, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 106.9 + }, + { + "avg_mask_ratio": 0.4464209079160355, + "avg_response_length": 243.475, + "avg_student_mask_ratio": 0.4464209079160355, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33636454603331456, + "epoch": 0.5333333333333333, + "grad_norm": 0.1142578125, + "kd_loss": 0.41649795620701296, + "learning_rate": 3e-06, + "loss": 0.5666, + "masked_tokens": 112.7375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.7375 + }, + { + "avg_mask_ratio": 0.4520751796895638, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.4520751796895638, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37478437887749577, + "epoch": 0.5546666666666666, + "grad_norm": 0.328125, + "kd_loss": 0.31532439299670545, + "learning_rate": 3e-06, + "loss": 0.5129, + "masked_tokens": 109.6375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 109.6375 + }, + { + "avg_mask_ratio": 0.5305180630879477, + "avg_response_length": 224.45, + "avg_student_mask_ratio": 0.5305180630879477, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42709534656005416, + "epoch": 0.576, + "grad_norm": 0.7578125, + "kd_loss": 0.5525495689224045, + "learning_rate": 3e-06, + "loss": 0.812, + "masked_tokens": 120.475, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 120.475 + }, + { + "avg_mask_ratio": 0.46451686368091033, + "avg_response_length": 254.825, + "avg_student_mask_ratio": 0.46451686368091033, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31382316479499084, + "epoch": 0.5973333333333334, + "grad_norm": 0.90234375, + "kd_loss": 0.3957495673693458, + "learning_rate": 3e-06, + "loss": 0.6028, + "masked_tokens": 129.225, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 129.225 + }, + { + "avg_mask_ratio": 0.389662017847877, + "avg_response_length": 245.9125, + "avg_student_mask_ratio": 0.389662017847877, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.23645576389110373, + "epoch": 0.6186666666666667, + "grad_norm": 0.302734375, + "kd_loss": 0.27728830450374853, + "learning_rate": 3e-06, + "loss": 0.4314, + "masked_tokens": 99.8625, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.44417000194080175, + "avg_response_length": 217.0375, + "avg_student_mask_ratio": 0.44417000194080175, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3240562055096575, + "epoch": 0.64, + "grad_norm": 1.09375, + "kd_loss": 0.31930388437995133, + "learning_rate": 3e-06, + "loss": 0.5264, + "masked_tokens": 104.625, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 104.625 + }, + { + "avg_mask_ratio": 0.4706685543409549, + "avg_response_length": 175.45, + "avg_student_mask_ratio": 0.4706685543409549, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34333510200582396, + "epoch": 0.6613333333333333, + "grad_norm": 1.234375, + "kd_loss": 0.5067149527083984, + "learning_rate": 3e-06, + "loss": 0.6534, + "masked_tokens": 84.875, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 84.875 + }, + { + "avg_mask_ratio": 0.4974605386145413, + "avg_response_length": 234.7875, + "avg_student_mask_ratio": 0.4974605386145413, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34462752127872137, + "epoch": 0.6826666666666666, + "grad_norm": 0.333984375, + "kd_loss": 0.3942846609736307, + "learning_rate": 3e-06, + "loss": 0.7133, + "masked_tokens": 119.6, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.5112370474264025, + "avg_response_length": 236.0625, + "avg_student_mask_ratio": 0.5112370474264025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2974585796398969, + "epoch": 0.704, + "grad_norm": 0.44140625, + "kd_loss": 0.4301003347501496, + "learning_rate": 3e-06, + "loss": 0.6754, + "masked_tokens": 129.425, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 129.425 + }, + { + "avg_mask_ratio": 0.44370225080056114, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44370225080056114, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3732590021626493, + "epoch": 0.7253333333333334, + "grad_norm": 0.98046875, + "kd_loss": 0.4610515360019235, + "learning_rate": 3e-06, + "loss": 0.6627, + "masked_tokens": 108.775, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 108.775 + }, + { + "avg_mask_ratio": 0.49959173843380994, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.49959173843380994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48515336151417615, + "epoch": 0.7466666666666667, + "grad_norm": 0.92578125, + "kd_loss": 0.5031771080357654, + "learning_rate": 3e-06, + "loss": 0.7668, + "masked_tokens": 125.625, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.625 + }, + { + "avg_mask_ratio": 0.4744729608530179, + "avg_response_length": 246.1625, + "avg_student_mask_ratio": 0.4744729608530179, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3005135279950082, + "epoch": 0.768, + "grad_norm": 0.169921875, + "kd_loss": 0.5216399239409879, + "learning_rate": 3e-06, + "loss": 0.6077, + "masked_tokens": 116.875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.875 + }, + { + "avg_mask_ratio": 0.4738045462174341, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.4738045462174341, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5349442186782426, + "epoch": 0.7893333333333333, + "grad_norm": 0.201171875, + "kd_loss": 0.6039233199480805, + "learning_rate": 3e-06, + "loss": 0.7196, + "masked_tokens": 127.4625, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 127.4625 + }, + { + "avg_mask_ratio": 0.4512475330149755, + "avg_response_length": 209.8, + "avg_student_mask_ratio": 0.4512475330149755, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.19145508916275275, + "epoch": 0.8106666666666666, + "grad_norm": 0.6875, + "kd_loss": 0.4029755606519984, + "learning_rate": 3e-06, + "loss": 0.5055, + "masked_tokens": 100.8375, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 100.8375 + }, + { + "avg_mask_ratio": 0.4752940105390735, + "avg_response_length": 219.5625, + "avg_student_mask_ratio": 0.4752940105390735, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4267096655552223, + "epoch": 0.832, + "grad_norm": 0.2578125, + "kd_loss": 0.4655849843487971, + "learning_rate": 3e-06, + "loss": 0.6749, + "masked_tokens": 112.375, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 112.375 + }, + { + "avg_mask_ratio": 0.47461870914557946, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.47461870914557946, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27868834779033025, + "epoch": 0.8533333333333334, + "grad_norm": 0.640625, + "kd_loss": 0.5299579592951205, + "learning_rate": 3e-06, + "loss": 0.6538, + "masked_tokens": 120.4125, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 120.4125 + }, + { + "avg_mask_ratio": 0.48321815438685006, + "avg_response_length": 228.15, + "avg_student_mask_ratio": 0.48321815438685006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.43057951200541994, + "epoch": 0.8746666666666667, + "grad_norm": 0.5390625, + "kd_loss": 0.504674318619719, + "learning_rate": 3e-06, + "loss": 0.7381, + "masked_tokens": 119.0, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 119.0 + }, + { + "avg_mask_ratio": 0.4379329536575824, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4379329536575824, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.132674143492045, + "epoch": 0.896, + "grad_norm": 1.09375, + "kd_loss": 0.27731474525324984, + "learning_rate": 3e-06, + "loss": 0.3953, + "masked_tokens": 85.525, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 85.525 + }, + { + "avg_mask_ratio": 0.4674084897618741, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4674084897618741, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37605725416574387, + "epoch": 0.9173333333333333, + "grad_norm": 0.43359375, + "kd_loss": 0.49442086774362226, + "learning_rate": 3e-06, + "loss": 0.6699, + "masked_tokens": 104.5625, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 104.5625 + }, + { + "avg_mask_ratio": 0.4415457699564286, + "avg_response_length": 241.0875, + "avg_student_mask_ratio": 0.4415457699564286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3754083825901603, + "epoch": 0.9386666666666666, + "grad_norm": 0.6328125, + "kd_loss": 0.45159815376919143, + "learning_rate": 3e-06, + "loss": 0.6585, + "masked_tokens": 113.0875, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 113.0875 + }, + { + "avg_mask_ratio": 0.42486972180195154, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42486972180195154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32457938515717616, + "epoch": 0.96, + "grad_norm": 0.6953125, + "kd_loss": 0.4011907008050457, + "learning_rate": 3e-06, + "loss": 0.5644, + "masked_tokens": 103.4, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.4 + }, + { + "avg_mask_ratio": 0.47578654896933587, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.47578654896933587, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32885359905767475, + "epoch": 0.9813333333333333, + "grad_norm": 0.16015625, + "kd_loss": 0.44463847501747294, + "learning_rate": 3e-06, + "loss": 0.635, + "masked_tokens": 105.3125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 105.3125 + }, + { + "avg_mask_ratio": 0.4782901787132557, + "avg_response_length": 224.0952380952381, + "avg_student_mask_ratio": 0.4782901787132557, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3393430382851702, + "epoch": 1.0042666666666666, + "grad_norm": 0.65625, + "kd_loss": 0.5178591865708675, + "learning_rate": 3e-06, + "loss": 0.7769, + "masked_tokens": 107.23809523809524, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 107.23809523809524 + }, + { + "avg_mask_ratio": 0.47575968883465974, + "avg_response_length": 249.4125, + "avg_student_mask_ratio": 0.47575968883465974, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.44613247805159517, + "epoch": 1.0256, + "grad_norm": 0.498046875, + "kd_loss": 0.5374264506522252, + "learning_rate": 3e-06, + "loss": 0.6772, + "masked_tokens": 118.35, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 118.35 + }, + { + "avg_mask_ratio": 0.4563717324635945, + "avg_response_length": 232.0375, + "avg_student_mask_ratio": 0.4563717324635945, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37626147485414096, + "epoch": 1.0469333333333333, + "grad_norm": 0.54296875, + "kd_loss": 0.392788901903657, + "learning_rate": 3e-06, + "loss": 0.6047, + "masked_tokens": 98.35, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.35 + }, + { + "avg_mask_ratio": 0.5079968665260821, + "avg_response_length": 253.7875, + "avg_student_mask_ratio": 0.5079968665260821, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.30954629559880686, + "epoch": 1.0682666666666667, + "grad_norm": 0.291015625, + "kd_loss": 0.4563873354276211, + "learning_rate": 3e-06, + "loss": 0.5996, + "masked_tokens": 128.225, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 128.225 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/training_args.bin b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6d9cde5cbc4fedb8fe690f2f561a165bf19ae18c --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ae704e41a4a62f6ca56789c45ce45887326cd2f8d1e97e398e5ada4a93398c +size 8312 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/README.md b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/adapter_config.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9d0f61db13b9b0a2854984efa3e7c726f3e0dbe1 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "up_proj", + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/adapter_model.safetensors b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..151c18dc0450b0a5f4ecfc2005cbc07be74f9436 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2303fc0a6c98874723ccebe65f512d43ac98ca2894c3da47726a2cc0fa3a6353 +size 2406624648 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/optimizer.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ba0e373efa58d6327c94ce150ada1e8b375990a --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f3e637dd97e346d3a52ed5783df488055eb1c5c83f32256bf63da34eaefd1a3 +size 671304442 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/rng_state_0.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ca4d93414c713bca8a3210710c45cb709d37e9e --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfe253e59db24d50fcce52c7305735daa27fac85ea89cdbc20049a1a78f05938 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/rng_state_1.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..4666cbed0c1c37cf57d093fc2ab5d0842a21077c --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac04232a65236923af955fb6966aebb81d4581fd531037354790603c6b2f8312 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/scheduler.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b530cce6692c8b72c51afea911741a4a11eef386 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f247b9a9f5a42bb05b5f94047806ee145b80e59e6134cfbac5720987816b080b +size 1064 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/trainer_state.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1648f1d9a901d2ed7f85091723b2f8683d924da8 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/trainer_state.json @@ -0,0 +1,1533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2816, + "eval_steps": 500, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4931091487989761, + "avg_response_length": 225.975, + "avg_student_mask_ratio": 0.4931091487989761, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5570551689027979, + "epoch": 0.021333333333333333, + "grad_norm": 0.404296875, + "kd_loss": 0.5375588692116253, + "learning_rate": 3e-06, + "loss": 0.8247, + "masked_tokens": 111.95, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 111.95 + }, + { + "avg_mask_ratio": 0.42058031369233506, + "avg_response_length": 255.2625, + "avg_student_mask_ratio": 0.42058031369233506, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42826092825978324, + "epoch": 0.042666666666666665, + "grad_norm": 0.8046875, + "kd_loss": 0.4450965437417761, + "learning_rate": 3e-06, + "loss": 0.5724, + "masked_tokens": 97.35, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 97.35 + }, + { + "avg_mask_ratio": 0.4538542575784959, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4538542575784959, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4461815005188782, + "epoch": 0.064, + "grad_norm": 0.50390625, + "kd_loss": 0.5296064364436825, + "learning_rate": 3e-06, + "loss": 0.702, + "masked_tokens": 110.2, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 110.2 + }, + { + "avg_mask_ratio": 0.4207469140383182, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.4207469140383182, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38499989152683156, + "epoch": 0.08533333333333333, + "grad_norm": 1.671875, + "kd_loss": 0.33118802310931417, + "learning_rate": 3e-06, + "loss": 0.5529, + "masked_tokens": 98.1625, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.1625 + }, + { + "avg_mask_ratio": 0.43260439952719026, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.43260439952719026, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5240421466317912, + "epoch": 0.10666666666666667, + "grad_norm": 1.6875, + "kd_loss": 0.4270985169670894, + "learning_rate": 3e-06, + "loss": 0.671, + "masked_tokens": 85.05, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.05 + }, + { + "avg_mask_ratio": 0.46053453313652426, + "avg_response_length": 251.0875, + "avg_student_mask_ratio": 0.46053453313652426, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5027546818272185, + "epoch": 0.128, + "grad_norm": 0.17578125, + "kd_loss": 0.3904111967755945, + "learning_rate": 3e-06, + "loss": 0.6672, + "masked_tokens": 120.9, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 120.9 + }, + { + "avg_mask_ratio": 0.5112146578729153, + "avg_response_length": 202.5875, + "avg_student_mask_ratio": 0.5112146578729153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.7753003867959023, + "epoch": 0.14933333333333335, + "grad_norm": 0.953125, + "kd_loss": 0.4415664039527428, + "learning_rate": 3e-06, + "loss": 0.856, + "masked_tokens": 104.5875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 104.5875 + }, + { + "avg_mask_ratio": 0.37548826879356056, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.37548826879356056, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3791731233859082, + "epoch": 0.17066666666666666, + "grad_norm": 0.1552734375, + "kd_loss": 0.31052538527774515, + "learning_rate": 3e-06, + "loss": 0.4843, + "masked_tokens": 85.0625, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 85.0625 + }, + { + "avg_mask_ratio": 0.5001560213277116, + "avg_response_length": 229.75, + "avg_student_mask_ratio": 0.5001560213277116, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6899960007944174, + "epoch": 0.192, + "grad_norm": 1.25, + "kd_loss": 0.5995283465861896, + "learning_rate": 3e-06, + "loss": 0.9721, + "masked_tokens": 107.6625, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 107.6625 + }, + { + "avg_mask_ratio": 0.4576045103633078, + "avg_response_length": 208.0, + "avg_student_mask_ratio": 0.4576045103633078, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41132245859021166, + "epoch": 0.21333333333333335, + "grad_norm": 0.64453125, + "kd_loss": 0.3813956479015957, + "learning_rate": 3e-06, + "loss": 0.6635, + "masked_tokens": 104.1625, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 104.1625 + }, + { + "avg_mask_ratio": 0.4877026333590038, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.4877026333590038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4612084587922368, + "epoch": 0.23466666666666666, + "grad_norm": 0.64453125, + "kd_loss": 0.5074845846289577, + "learning_rate": 3e-06, + "loss": 0.7993, + "masked_tokens": 102.075, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 102.075 + }, + { + "avg_mask_ratio": 0.45146879020612685, + "avg_response_length": 224.1875, + "avg_student_mask_ratio": 0.45146879020612685, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3276976759495483, + "epoch": 0.256, + "grad_norm": 0.30078125, + "kd_loss": 0.41461311469229256, + "learning_rate": 3e-06, + "loss": 0.6088, + "masked_tokens": 100.525, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 100.525 + }, + { + "avg_mask_ratio": 0.4356566035945434, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.4356566035945434, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.12710368948505674, + "epoch": 0.2773333333333333, + "grad_norm": 0.490234375, + "kd_loss": 0.23057804748218585, + "learning_rate": 3e-06, + "loss": 0.384, + "masked_tokens": 89.5625, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.5625 + }, + { + "avg_mask_ratio": 0.49419954856857656, + "avg_response_length": 255.625, + "avg_student_mask_ratio": 0.49419954856857656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48596099013025196, + "epoch": 0.2986666666666667, + "grad_norm": 0.859375, + "kd_loss": 0.5025483015746885, + "learning_rate": 3e-06, + "loss": 0.7892, + "masked_tokens": 136.575, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 136.575 + }, + { + "avg_mask_ratio": 0.4736677930341102, + "avg_response_length": 255.375, + "avg_student_mask_ratio": 0.4736677930341102, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5493089448234059, + "epoch": 0.32, + "grad_norm": 0.076171875, + "kd_loss": 0.4892602212316547, + "learning_rate": 3e-06, + "loss": 0.7918, + "masked_tokens": 126.575, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 126.575 + }, + { + "avg_mask_ratio": 0.5032523009285796, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.5032523009285796, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5929547422666019, + "epoch": 0.3413333333333333, + "grad_norm": 1.171875, + "kd_loss": 0.44711892502580214, + "learning_rate": 3e-06, + "loss": 0.7472, + "masked_tokens": 99.425, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 99.425 + }, + { + "avg_mask_ratio": 0.4806730231270194, + "avg_response_length": 190.325, + "avg_student_mask_ratio": 0.4806730231270194, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3583432949517601, + "epoch": 0.3626666666666667, + "grad_norm": 0.6171875, + "kd_loss": 0.4521343837219092, + "learning_rate": 3e-06, + "loss": 0.6358, + "masked_tokens": 89.825, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.825 + }, + { + "avg_mask_ratio": 0.45829249716189224, + "avg_response_length": 244.0, + "avg_student_mask_ratio": 0.45829249716189224, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3746713957985094, + "epoch": 0.384, + "grad_norm": 0.49609375, + "kd_loss": 0.34934306121722897, + "learning_rate": 3e-06, + "loss": 0.5747, + "masked_tokens": 110.075, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.075 + }, + { + "avg_mask_ratio": 0.4842760307248682, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.4842760307248682, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5090123614077584, + "epoch": 0.4053333333333333, + "grad_norm": 1.6171875, + "kd_loss": 0.43204482231294605, + "learning_rate": 3e-06, + "loss": 0.7055, + "masked_tokens": 109.5875, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.5875 + }, + { + "avg_mask_ratio": 0.4665210062637925, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4665210062637925, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27265903051802526, + "epoch": 0.4266666666666667, + "grad_norm": 0.318359375, + "kd_loss": 0.357759011555504, + "learning_rate": 3e-06, + "loss": 0.5013, + "masked_tokens": 97.0125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.0125 + }, + { + "avg_mask_ratio": 0.45183838629163803, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.45183838629163803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.21031314329709402, + "epoch": 0.448, + "grad_norm": 0.8359375, + "kd_loss": 0.38585986606940426, + "learning_rate": 3e-06, + "loss": 0.5103, + "masked_tokens": 96.0125, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.0125 + }, + { + "avg_mask_ratio": 0.5396152360364794, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5396152360364794, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6178499971098063, + "epoch": 0.4693333333333333, + "grad_norm": 0.96875, + "kd_loss": 0.46674597742967305, + "learning_rate": 3e-06, + "loss": 0.8495, + "masked_tokens": 125.35, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 125.35 + }, + { + "avg_mask_ratio": 0.4403991688624956, + "avg_response_length": 252.7, + "avg_student_mask_ratio": 0.4403991688624956, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25455478885055527, + "epoch": 0.49066666666666664, + "grad_norm": 0.5703125, + "kd_loss": 0.43305868929596725, + "learning_rate": 3e-06, + "loss": 0.6428, + "masked_tokens": 107.325, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.47419991258066146, + "avg_response_length": 212.85, + "avg_student_mask_ratio": 0.47419991258066146, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32057130943685763, + "epoch": 0.512, + "grad_norm": 0.43359375, + "kd_loss": 0.5083060303753086, + "learning_rate": 3e-06, + "loss": 0.6986, + "masked_tokens": 106.9, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 106.9 + }, + { + "avg_mask_ratio": 0.4464209079160355, + "avg_response_length": 243.475, + "avg_student_mask_ratio": 0.4464209079160355, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33636454603331456, + "epoch": 0.5333333333333333, + "grad_norm": 0.1142578125, + "kd_loss": 0.41649795620701296, + "learning_rate": 3e-06, + "loss": 0.5666, + "masked_tokens": 112.7375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.7375 + }, + { + "avg_mask_ratio": 0.4520751796895638, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.4520751796895638, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37478437887749577, + "epoch": 0.5546666666666666, + "grad_norm": 0.328125, + "kd_loss": 0.31532439299670545, + "learning_rate": 3e-06, + "loss": 0.5129, + "masked_tokens": 109.6375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 109.6375 + }, + { + "avg_mask_ratio": 0.5305180630879477, + "avg_response_length": 224.45, + "avg_student_mask_ratio": 0.5305180630879477, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42709534656005416, + "epoch": 0.576, + "grad_norm": 0.7578125, + "kd_loss": 0.5525495689224045, + "learning_rate": 3e-06, + "loss": 0.812, + "masked_tokens": 120.475, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 120.475 + }, + { + "avg_mask_ratio": 0.46451686368091033, + "avg_response_length": 254.825, + "avg_student_mask_ratio": 0.46451686368091033, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31382316479499084, + "epoch": 0.5973333333333334, + "grad_norm": 0.90234375, + "kd_loss": 0.3957495673693458, + "learning_rate": 3e-06, + "loss": 0.6028, + "masked_tokens": 129.225, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 129.225 + }, + { + "avg_mask_ratio": 0.389662017847877, + "avg_response_length": 245.9125, + "avg_student_mask_ratio": 0.389662017847877, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.23645576389110373, + "epoch": 0.6186666666666667, + "grad_norm": 0.302734375, + "kd_loss": 0.27728830450374853, + "learning_rate": 3e-06, + "loss": 0.4314, + "masked_tokens": 99.8625, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.44417000194080175, + "avg_response_length": 217.0375, + "avg_student_mask_ratio": 0.44417000194080175, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3240562055096575, + "epoch": 0.64, + "grad_norm": 1.09375, + "kd_loss": 0.31930388437995133, + "learning_rate": 3e-06, + "loss": 0.5264, + "masked_tokens": 104.625, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 104.625 + }, + { + "avg_mask_ratio": 0.4706685543409549, + "avg_response_length": 175.45, + "avg_student_mask_ratio": 0.4706685543409549, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34333510200582396, + "epoch": 0.6613333333333333, + "grad_norm": 1.234375, + "kd_loss": 0.5067149527083984, + "learning_rate": 3e-06, + "loss": 0.6534, + "masked_tokens": 84.875, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 84.875 + }, + { + "avg_mask_ratio": 0.4974605386145413, + "avg_response_length": 234.7875, + "avg_student_mask_ratio": 0.4974605386145413, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34462752127872137, + "epoch": 0.6826666666666666, + "grad_norm": 0.333984375, + "kd_loss": 0.3942846609736307, + "learning_rate": 3e-06, + "loss": 0.7133, + "masked_tokens": 119.6, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.5112370474264025, + "avg_response_length": 236.0625, + "avg_student_mask_ratio": 0.5112370474264025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2974585796398969, + "epoch": 0.704, + "grad_norm": 0.44140625, + "kd_loss": 0.4301003347501496, + "learning_rate": 3e-06, + "loss": 0.6754, + "masked_tokens": 129.425, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 129.425 + }, + { + "avg_mask_ratio": 0.44370225080056114, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44370225080056114, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3732590021626493, + "epoch": 0.7253333333333334, + "grad_norm": 0.98046875, + "kd_loss": 0.4610515360019235, + "learning_rate": 3e-06, + "loss": 0.6627, + "masked_tokens": 108.775, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 108.775 + }, + { + "avg_mask_ratio": 0.49959173843380994, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.49959173843380994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48515336151417615, + "epoch": 0.7466666666666667, + "grad_norm": 0.92578125, + "kd_loss": 0.5031771080357654, + "learning_rate": 3e-06, + "loss": 0.7668, + "masked_tokens": 125.625, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.625 + }, + { + "avg_mask_ratio": 0.4744729608530179, + "avg_response_length": 246.1625, + "avg_student_mask_ratio": 0.4744729608530179, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3005135279950082, + "epoch": 0.768, + "grad_norm": 0.169921875, + "kd_loss": 0.5216399239409879, + "learning_rate": 3e-06, + "loss": 0.6077, + "masked_tokens": 116.875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.875 + }, + { + "avg_mask_ratio": 0.4738045462174341, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.4738045462174341, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5349442186782426, + "epoch": 0.7893333333333333, + "grad_norm": 0.201171875, + "kd_loss": 0.6039233199480805, + "learning_rate": 3e-06, + "loss": 0.7196, + "masked_tokens": 127.4625, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 127.4625 + }, + { + "avg_mask_ratio": 0.4512475330149755, + "avg_response_length": 209.8, + "avg_student_mask_ratio": 0.4512475330149755, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.19145508916275275, + "epoch": 0.8106666666666666, + "grad_norm": 0.6875, + "kd_loss": 0.4029755606519984, + "learning_rate": 3e-06, + "loss": 0.5055, + "masked_tokens": 100.8375, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 100.8375 + }, + { + "avg_mask_ratio": 0.4752940105390735, + "avg_response_length": 219.5625, + "avg_student_mask_ratio": 0.4752940105390735, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4267096655552223, + "epoch": 0.832, + "grad_norm": 0.2578125, + "kd_loss": 0.4655849843487971, + "learning_rate": 3e-06, + "loss": 0.6749, + "masked_tokens": 112.375, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 112.375 + }, + { + "avg_mask_ratio": 0.47461870914557946, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.47461870914557946, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27868834779033025, + "epoch": 0.8533333333333334, + "grad_norm": 0.640625, + "kd_loss": 0.5299579592951205, + "learning_rate": 3e-06, + "loss": 0.6538, + "masked_tokens": 120.4125, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 120.4125 + }, + { + "avg_mask_ratio": 0.48321815438685006, + "avg_response_length": 228.15, + "avg_student_mask_ratio": 0.48321815438685006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.43057951200541994, + "epoch": 0.8746666666666667, + "grad_norm": 0.5390625, + "kd_loss": 0.504674318619719, + "learning_rate": 3e-06, + "loss": 0.7381, + "masked_tokens": 119.0, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 119.0 + }, + { + "avg_mask_ratio": 0.4379329536575824, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4379329536575824, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.132674143492045, + "epoch": 0.896, + "grad_norm": 1.09375, + "kd_loss": 0.27731474525324984, + "learning_rate": 3e-06, + "loss": 0.3953, + "masked_tokens": 85.525, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 85.525 + }, + { + "avg_mask_ratio": 0.4674084897618741, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4674084897618741, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37605725416574387, + "epoch": 0.9173333333333333, + "grad_norm": 0.43359375, + "kd_loss": 0.49442086774362226, + "learning_rate": 3e-06, + "loss": 0.6699, + "masked_tokens": 104.5625, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 104.5625 + }, + { + "avg_mask_ratio": 0.4415457699564286, + "avg_response_length": 241.0875, + "avg_student_mask_ratio": 0.4415457699564286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3754083825901603, + "epoch": 0.9386666666666666, + "grad_norm": 0.6328125, + "kd_loss": 0.45159815376919143, + "learning_rate": 3e-06, + "loss": 0.6585, + "masked_tokens": 113.0875, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 113.0875 + }, + { + "avg_mask_ratio": 0.42486972180195154, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42486972180195154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32457938515717616, + "epoch": 0.96, + "grad_norm": 0.6953125, + "kd_loss": 0.4011907008050457, + "learning_rate": 3e-06, + "loss": 0.5644, + "masked_tokens": 103.4, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.4 + }, + { + "avg_mask_ratio": 0.47578654896933587, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.47578654896933587, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32885359905767475, + "epoch": 0.9813333333333333, + "grad_norm": 0.16015625, + "kd_loss": 0.44463847501747294, + "learning_rate": 3e-06, + "loss": 0.635, + "masked_tokens": 105.3125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 105.3125 + }, + { + "avg_mask_ratio": 0.4782901787132557, + "avg_response_length": 224.0952380952381, + "avg_student_mask_ratio": 0.4782901787132557, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3393430382851702, + "epoch": 1.0042666666666666, + "grad_norm": 0.65625, + "kd_loss": 0.5178591865708675, + "learning_rate": 3e-06, + "loss": 0.7769, + "masked_tokens": 107.23809523809524, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 107.23809523809524 + }, + { + "avg_mask_ratio": 0.47575968883465974, + "avg_response_length": 249.4125, + "avg_student_mask_ratio": 0.47575968883465974, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.44613247805159517, + "epoch": 1.0256, + "grad_norm": 0.498046875, + "kd_loss": 0.5374264506522252, + "learning_rate": 3e-06, + "loss": 0.6772, + "masked_tokens": 118.35, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 118.35 + }, + { + "avg_mask_ratio": 0.4563717324635945, + "avg_response_length": 232.0375, + "avg_student_mask_ratio": 0.4563717324635945, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37626147485414096, + "epoch": 1.0469333333333333, + "grad_norm": 0.54296875, + "kd_loss": 0.392788901903657, + "learning_rate": 3e-06, + "loss": 0.6047, + "masked_tokens": 98.35, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.35 + }, + { + "avg_mask_ratio": 0.5079968665260821, + "avg_response_length": 253.7875, + "avg_student_mask_ratio": 0.5079968665260821, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.30954629559880686, + "epoch": 1.0682666666666667, + "grad_norm": 0.291015625, + "kd_loss": 0.4563873354276211, + "learning_rate": 3e-06, + "loss": 0.5996, + "masked_tokens": 128.225, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 128.225 + }, + { + "avg_mask_ratio": 0.5109448074479588, + "avg_response_length": 254.2, + "avg_student_mask_ratio": 0.5109448074479588, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2868076219221166, + "epoch": 1.0896, + "grad_norm": 2.515625, + "kd_loss": 0.5652106747879998, + "learning_rate": 3e-06, + "loss": 0.6398, + "masked_tokens": 137.5875, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 137.5875 + }, + { + "avg_mask_ratio": 0.45396183808334173, + "avg_response_length": 202.7625, + "avg_student_mask_ratio": 0.45396183808334173, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38311037250946356, + "epoch": 1.1109333333333333, + "grad_norm": 0.6171875, + "kd_loss": 0.423658079797778, + "learning_rate": 3e-06, + "loss": 0.6386, + "masked_tokens": 87.0625, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 87.0625 + }, + { + "avg_mask_ratio": 0.47015948037151245, + "avg_response_length": 214.275, + "avg_student_mask_ratio": 0.47015948037151245, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.47228433731506814, + "epoch": 1.1322666666666668, + "grad_norm": 0.609375, + "kd_loss": 0.45688082203427316, + "learning_rate": 3e-06, + "loss": 0.737, + "masked_tokens": 99.8625, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.4892866689246148, + "avg_response_length": 231.3125, + "avg_student_mask_ratio": 0.4892866689246148, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4080867745911064, + "epoch": 1.1536, + "grad_norm": 0.341796875, + "kd_loss": 0.5618651450654625, + "learning_rate": 3e-06, + "loss": 0.6922, + "masked_tokens": 107.375, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.4541942774085328, + "avg_response_length": 213.525, + "avg_student_mask_ratio": 0.4541942774085328, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22217674175137744, + "epoch": 1.1749333333333334, + "grad_norm": 0.2412109375, + "kd_loss": 0.3673438885498399, + "learning_rate": 3e-06, + "loss": 0.5008, + "masked_tokens": 97.8875, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 97.8875 + }, + { + "avg_mask_ratio": 0.39282396506750955, + "avg_response_length": 231.4125, + "avg_student_mask_ratio": 0.39282396506750955, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3512847523151777, + "epoch": 1.1962666666666666, + "grad_norm": 0.8828125, + "kd_loss": 0.48686740984790616, + "learning_rate": 3e-06, + "loss": 0.5823, + "masked_tokens": 99.2875, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 99.2875 + }, + { + "avg_mask_ratio": 0.4483634108910337, + "avg_response_length": 230.1625, + "avg_student_mask_ratio": 0.4483634108910337, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31345968546206676, + "epoch": 1.2176, + "grad_norm": 0.4453125, + "kd_loss": 0.41564053312727084, + "learning_rate": 3e-06, + "loss": 0.5898, + "masked_tokens": 108.9875, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 108.9875 + }, + { + "avg_mask_ratio": 0.465452536707744, + "avg_response_length": 267.4375, + "avg_student_mask_ratio": 0.465452536707744, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3618907347364768, + "epoch": 1.2389333333333332, + "grad_norm": 8.6875, + "kd_loss": 0.4481006292516895, + "learning_rate": 3e-06, + "loss": 0.6314, + "masked_tokens": 129.075, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 129.075 + }, + { + "avg_mask_ratio": 0.5225977989146486, + "avg_response_length": 228.45, + "avg_student_mask_ratio": 0.5225977989146486, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5639314363695348, + "epoch": 1.2602666666666666, + "grad_norm": 1.1328125, + "kd_loss": 0.5351108588445992, + "learning_rate": 3e-06, + "loss": 0.8274, + "masked_tokens": 121.675, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 121.675 + }, + { + "avg_mask_ratio": 0.44998724836623294, + "avg_response_length": 236.7, + "avg_student_mask_ratio": 0.44998724836623294, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3396833263838971, + "epoch": 1.2816, + "grad_norm": 0.365234375, + "kd_loss": 0.41761890975592914, + "learning_rate": 3e-06, + "loss": 0.5752, + "masked_tokens": 110.1625, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 110.1625 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/training_args.bin b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6d9cde5cbc4fedb8fe690f2f561a165bf19ae18c --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ae704e41a4a62f6ca56789c45ce45887326cd2f8d1e97e398e5ada4a93398c +size 8312 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/README.md b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/adapter_config.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9d0f61db13b9b0a2854984efa3e7c726f3e0dbe1 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "up_proj", + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/adapter_model.safetensors b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9c92bb5fbccf3454a722507262a7f1317c89ef28 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e9089b2e164cd2731931999db6da1d851c01823999bca0bdae50941af012abd +size 2406624648 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/optimizer.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0db1e2093e589d234cf5b0569fa5c9871e3024c1 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e36fd52abd4a9f66aab6705c5212bfbf871b026a32b56d7901af811c5e4c0b5f +size 671304442 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/rng_state_0.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..500e5ec920565aa1de527248cebefade3800997c --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71255ed6179013d58ea121d2387431fd41bdf5b5e2ca8cd71dccc5054540d2bc +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/rng_state_1.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..faed7d6b13793e8fd5c0267f73a5684c5ae79f90 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b51688ece3ccd2ea3d0ba03351fa420516a12cb3019c6851b496fa50b12f37f +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/scheduler.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..45cb6d6e9c58698e39624654c64f68865acc1e8c --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d31fc166825c283cd6e21942858b480fed83fd7716de86c3ed00fd14e8e22122 +size 1064 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/trainer_state.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..964a4e562935e954580b6c703eba117017717d20 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/trainer_state.json @@ -0,0 +1,1783 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4949333333333334, + "eval_steps": 500, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4931091487989761, + "avg_response_length": 225.975, + "avg_student_mask_ratio": 0.4931091487989761, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5570551689027979, + "epoch": 0.021333333333333333, + "grad_norm": 0.404296875, + "kd_loss": 0.5375588692116253, + "learning_rate": 3e-06, + "loss": 0.8247, + "masked_tokens": 111.95, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 111.95 + }, + { + "avg_mask_ratio": 0.42058031369233506, + "avg_response_length": 255.2625, + "avg_student_mask_ratio": 0.42058031369233506, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42826092825978324, + "epoch": 0.042666666666666665, + "grad_norm": 0.8046875, + "kd_loss": 0.4450965437417761, + "learning_rate": 3e-06, + "loss": 0.5724, + "masked_tokens": 97.35, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 97.35 + }, + { + "avg_mask_ratio": 0.4538542575784959, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4538542575784959, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4461815005188782, + "epoch": 0.064, + "grad_norm": 0.50390625, + "kd_loss": 0.5296064364436825, + "learning_rate": 3e-06, + "loss": 0.702, + "masked_tokens": 110.2, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 110.2 + }, + { + "avg_mask_ratio": 0.4207469140383182, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.4207469140383182, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38499989152683156, + "epoch": 0.08533333333333333, + "grad_norm": 1.671875, + "kd_loss": 0.33118802310931417, + "learning_rate": 3e-06, + "loss": 0.5529, + "masked_tokens": 98.1625, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.1625 + }, + { + "avg_mask_ratio": 0.43260439952719026, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.43260439952719026, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5240421466317912, + "epoch": 0.10666666666666667, + "grad_norm": 1.6875, + "kd_loss": 0.4270985169670894, + "learning_rate": 3e-06, + "loss": 0.671, + "masked_tokens": 85.05, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.05 + }, + { + "avg_mask_ratio": 0.46053453313652426, + "avg_response_length": 251.0875, + "avg_student_mask_ratio": 0.46053453313652426, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5027546818272185, + "epoch": 0.128, + "grad_norm": 0.17578125, + "kd_loss": 0.3904111967755945, + "learning_rate": 3e-06, + "loss": 0.6672, + "masked_tokens": 120.9, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 120.9 + }, + { + "avg_mask_ratio": 0.5112146578729153, + "avg_response_length": 202.5875, + "avg_student_mask_ratio": 0.5112146578729153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.7753003867959023, + "epoch": 0.14933333333333335, + "grad_norm": 0.953125, + "kd_loss": 0.4415664039527428, + "learning_rate": 3e-06, + "loss": 0.856, + "masked_tokens": 104.5875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 104.5875 + }, + { + "avg_mask_ratio": 0.37548826879356056, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.37548826879356056, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3791731233859082, + "epoch": 0.17066666666666666, + "grad_norm": 0.1552734375, + "kd_loss": 0.31052538527774515, + "learning_rate": 3e-06, + "loss": 0.4843, + "masked_tokens": 85.0625, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 85.0625 + }, + { + "avg_mask_ratio": 0.5001560213277116, + "avg_response_length": 229.75, + "avg_student_mask_ratio": 0.5001560213277116, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6899960007944174, + "epoch": 0.192, + "grad_norm": 1.25, + "kd_loss": 0.5995283465861896, + "learning_rate": 3e-06, + "loss": 0.9721, + "masked_tokens": 107.6625, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 107.6625 + }, + { + "avg_mask_ratio": 0.4576045103633078, + "avg_response_length": 208.0, + "avg_student_mask_ratio": 0.4576045103633078, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41132245859021166, + "epoch": 0.21333333333333335, + "grad_norm": 0.64453125, + "kd_loss": 0.3813956479015957, + "learning_rate": 3e-06, + "loss": 0.6635, + "masked_tokens": 104.1625, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 104.1625 + }, + { + "avg_mask_ratio": 0.4877026333590038, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.4877026333590038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4612084587922368, + "epoch": 0.23466666666666666, + "grad_norm": 0.64453125, + "kd_loss": 0.5074845846289577, + "learning_rate": 3e-06, + "loss": 0.7993, + "masked_tokens": 102.075, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 102.075 + }, + { + "avg_mask_ratio": 0.45146879020612685, + "avg_response_length": 224.1875, + "avg_student_mask_ratio": 0.45146879020612685, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3276976759495483, + "epoch": 0.256, + "grad_norm": 0.30078125, + "kd_loss": 0.41461311469229256, + "learning_rate": 3e-06, + "loss": 0.6088, + "masked_tokens": 100.525, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 100.525 + }, + { + "avg_mask_ratio": 0.4356566035945434, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.4356566035945434, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.12710368948505674, + "epoch": 0.2773333333333333, + "grad_norm": 0.490234375, + "kd_loss": 0.23057804748218585, + "learning_rate": 3e-06, + "loss": 0.384, + "masked_tokens": 89.5625, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.5625 + }, + { + "avg_mask_ratio": 0.49419954856857656, + "avg_response_length": 255.625, + "avg_student_mask_ratio": 0.49419954856857656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48596099013025196, + "epoch": 0.2986666666666667, + "grad_norm": 0.859375, + "kd_loss": 0.5025483015746885, + "learning_rate": 3e-06, + "loss": 0.7892, + "masked_tokens": 136.575, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 136.575 + }, + { + "avg_mask_ratio": 0.4736677930341102, + "avg_response_length": 255.375, + "avg_student_mask_ratio": 0.4736677930341102, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5493089448234059, + "epoch": 0.32, + "grad_norm": 0.076171875, + "kd_loss": 0.4892602212316547, + "learning_rate": 3e-06, + "loss": 0.7918, + "masked_tokens": 126.575, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 126.575 + }, + { + "avg_mask_ratio": 0.5032523009285796, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.5032523009285796, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5929547422666019, + "epoch": 0.3413333333333333, + "grad_norm": 1.171875, + "kd_loss": 0.44711892502580214, + "learning_rate": 3e-06, + "loss": 0.7472, + "masked_tokens": 99.425, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 99.425 + }, + { + "avg_mask_ratio": 0.4806730231270194, + "avg_response_length": 190.325, + "avg_student_mask_ratio": 0.4806730231270194, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3583432949517601, + "epoch": 0.3626666666666667, + "grad_norm": 0.6171875, + "kd_loss": 0.4521343837219092, + "learning_rate": 3e-06, + "loss": 0.6358, + "masked_tokens": 89.825, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.825 + }, + { + "avg_mask_ratio": 0.45829249716189224, + "avg_response_length": 244.0, + "avg_student_mask_ratio": 0.45829249716189224, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3746713957985094, + "epoch": 0.384, + "grad_norm": 0.49609375, + "kd_loss": 0.34934306121722897, + "learning_rate": 3e-06, + "loss": 0.5747, + "masked_tokens": 110.075, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.075 + }, + { + "avg_mask_ratio": 0.4842760307248682, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.4842760307248682, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5090123614077584, + "epoch": 0.4053333333333333, + "grad_norm": 1.6171875, + "kd_loss": 0.43204482231294605, + "learning_rate": 3e-06, + "loss": 0.7055, + "masked_tokens": 109.5875, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.5875 + }, + { + "avg_mask_ratio": 0.4665210062637925, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4665210062637925, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27265903051802526, + "epoch": 0.4266666666666667, + "grad_norm": 0.318359375, + "kd_loss": 0.357759011555504, + "learning_rate": 3e-06, + "loss": 0.5013, + "masked_tokens": 97.0125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.0125 + }, + { + "avg_mask_ratio": 0.45183838629163803, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.45183838629163803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.21031314329709402, + "epoch": 0.448, + "grad_norm": 0.8359375, + "kd_loss": 0.38585986606940426, + "learning_rate": 3e-06, + "loss": 0.5103, + "masked_tokens": 96.0125, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.0125 + }, + { + "avg_mask_ratio": 0.5396152360364794, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5396152360364794, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6178499971098063, + "epoch": 0.4693333333333333, + "grad_norm": 0.96875, + "kd_loss": 0.46674597742967305, + "learning_rate": 3e-06, + "loss": 0.8495, + "masked_tokens": 125.35, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 125.35 + }, + { + "avg_mask_ratio": 0.4403991688624956, + "avg_response_length": 252.7, + "avg_student_mask_ratio": 0.4403991688624956, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25455478885055527, + "epoch": 0.49066666666666664, + "grad_norm": 0.5703125, + "kd_loss": 0.43305868929596725, + "learning_rate": 3e-06, + "loss": 0.6428, + "masked_tokens": 107.325, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.47419991258066146, + "avg_response_length": 212.85, + "avg_student_mask_ratio": 0.47419991258066146, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32057130943685763, + "epoch": 0.512, + "grad_norm": 0.43359375, + "kd_loss": 0.5083060303753086, + "learning_rate": 3e-06, + "loss": 0.6986, + "masked_tokens": 106.9, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 106.9 + }, + { + "avg_mask_ratio": 0.4464209079160355, + "avg_response_length": 243.475, + "avg_student_mask_ratio": 0.4464209079160355, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33636454603331456, + "epoch": 0.5333333333333333, + "grad_norm": 0.1142578125, + "kd_loss": 0.41649795620701296, + "learning_rate": 3e-06, + "loss": 0.5666, + "masked_tokens": 112.7375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.7375 + }, + { + "avg_mask_ratio": 0.4520751796895638, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.4520751796895638, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37478437887749577, + "epoch": 0.5546666666666666, + "grad_norm": 0.328125, + "kd_loss": 0.31532439299670545, + "learning_rate": 3e-06, + "loss": 0.5129, + "masked_tokens": 109.6375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 109.6375 + }, + { + "avg_mask_ratio": 0.5305180630879477, + "avg_response_length": 224.45, + "avg_student_mask_ratio": 0.5305180630879477, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42709534656005416, + "epoch": 0.576, + "grad_norm": 0.7578125, + "kd_loss": 0.5525495689224045, + "learning_rate": 3e-06, + "loss": 0.812, + "masked_tokens": 120.475, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 120.475 + }, + { + "avg_mask_ratio": 0.46451686368091033, + "avg_response_length": 254.825, + "avg_student_mask_ratio": 0.46451686368091033, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31382316479499084, + "epoch": 0.5973333333333334, + "grad_norm": 0.90234375, + "kd_loss": 0.3957495673693458, + "learning_rate": 3e-06, + "loss": 0.6028, + "masked_tokens": 129.225, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 129.225 + }, + { + "avg_mask_ratio": 0.389662017847877, + "avg_response_length": 245.9125, + "avg_student_mask_ratio": 0.389662017847877, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.23645576389110373, + "epoch": 0.6186666666666667, + "grad_norm": 0.302734375, + "kd_loss": 0.27728830450374853, + "learning_rate": 3e-06, + "loss": 0.4314, + "masked_tokens": 99.8625, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.44417000194080175, + "avg_response_length": 217.0375, + "avg_student_mask_ratio": 0.44417000194080175, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3240562055096575, + "epoch": 0.64, + "grad_norm": 1.09375, + "kd_loss": 0.31930388437995133, + "learning_rate": 3e-06, + "loss": 0.5264, + "masked_tokens": 104.625, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 104.625 + }, + { + "avg_mask_ratio": 0.4706685543409549, + "avg_response_length": 175.45, + "avg_student_mask_ratio": 0.4706685543409549, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34333510200582396, + "epoch": 0.6613333333333333, + "grad_norm": 1.234375, + "kd_loss": 0.5067149527083984, + "learning_rate": 3e-06, + "loss": 0.6534, + "masked_tokens": 84.875, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 84.875 + }, + { + "avg_mask_ratio": 0.4974605386145413, + "avg_response_length": 234.7875, + "avg_student_mask_ratio": 0.4974605386145413, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34462752127872137, + "epoch": 0.6826666666666666, + "grad_norm": 0.333984375, + "kd_loss": 0.3942846609736307, + "learning_rate": 3e-06, + "loss": 0.7133, + "masked_tokens": 119.6, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.5112370474264025, + "avg_response_length": 236.0625, + "avg_student_mask_ratio": 0.5112370474264025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2974585796398969, + "epoch": 0.704, + "grad_norm": 0.44140625, + "kd_loss": 0.4301003347501496, + "learning_rate": 3e-06, + "loss": 0.6754, + "masked_tokens": 129.425, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 129.425 + }, + { + "avg_mask_ratio": 0.44370225080056114, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44370225080056114, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3732590021626493, + "epoch": 0.7253333333333334, + "grad_norm": 0.98046875, + "kd_loss": 0.4610515360019235, + "learning_rate": 3e-06, + "loss": 0.6627, + "masked_tokens": 108.775, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 108.775 + }, + { + "avg_mask_ratio": 0.49959173843380994, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.49959173843380994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48515336151417615, + "epoch": 0.7466666666666667, + "grad_norm": 0.92578125, + "kd_loss": 0.5031771080357654, + "learning_rate": 3e-06, + "loss": 0.7668, + "masked_tokens": 125.625, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.625 + }, + { + "avg_mask_ratio": 0.4744729608530179, + "avg_response_length": 246.1625, + "avg_student_mask_ratio": 0.4744729608530179, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3005135279950082, + "epoch": 0.768, + "grad_norm": 0.169921875, + "kd_loss": 0.5216399239409879, + "learning_rate": 3e-06, + "loss": 0.6077, + "masked_tokens": 116.875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.875 + }, + { + "avg_mask_ratio": 0.4738045462174341, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.4738045462174341, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5349442186782426, + "epoch": 0.7893333333333333, + "grad_norm": 0.201171875, + "kd_loss": 0.6039233199480805, + "learning_rate": 3e-06, + "loss": 0.7196, + "masked_tokens": 127.4625, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 127.4625 + }, + { + "avg_mask_ratio": 0.4512475330149755, + "avg_response_length": 209.8, + "avg_student_mask_ratio": 0.4512475330149755, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.19145508916275275, + "epoch": 0.8106666666666666, + "grad_norm": 0.6875, + "kd_loss": 0.4029755606519984, + "learning_rate": 3e-06, + "loss": 0.5055, + "masked_tokens": 100.8375, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 100.8375 + }, + { + "avg_mask_ratio": 0.4752940105390735, + "avg_response_length": 219.5625, + "avg_student_mask_ratio": 0.4752940105390735, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4267096655552223, + "epoch": 0.832, + "grad_norm": 0.2578125, + "kd_loss": 0.4655849843487971, + "learning_rate": 3e-06, + "loss": 0.6749, + "masked_tokens": 112.375, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 112.375 + }, + { + "avg_mask_ratio": 0.47461870914557946, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.47461870914557946, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27868834779033025, + "epoch": 0.8533333333333334, + "grad_norm": 0.640625, + "kd_loss": 0.5299579592951205, + "learning_rate": 3e-06, + "loss": 0.6538, + "masked_tokens": 120.4125, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 120.4125 + }, + { + "avg_mask_ratio": 0.48321815438685006, + "avg_response_length": 228.15, + "avg_student_mask_ratio": 0.48321815438685006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.43057951200541994, + "epoch": 0.8746666666666667, + "grad_norm": 0.5390625, + "kd_loss": 0.504674318619719, + "learning_rate": 3e-06, + "loss": 0.7381, + "masked_tokens": 119.0, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 119.0 + }, + { + "avg_mask_ratio": 0.4379329536575824, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4379329536575824, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.132674143492045, + "epoch": 0.896, + "grad_norm": 1.09375, + "kd_loss": 0.27731474525324984, + "learning_rate": 3e-06, + "loss": 0.3953, + "masked_tokens": 85.525, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 85.525 + }, + { + "avg_mask_ratio": 0.4674084897618741, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4674084897618741, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37605725416574387, + "epoch": 0.9173333333333333, + "grad_norm": 0.43359375, + "kd_loss": 0.49442086774362226, + "learning_rate": 3e-06, + "loss": 0.6699, + "masked_tokens": 104.5625, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 104.5625 + }, + { + "avg_mask_ratio": 0.4415457699564286, + "avg_response_length": 241.0875, + "avg_student_mask_ratio": 0.4415457699564286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3754083825901603, + "epoch": 0.9386666666666666, + "grad_norm": 0.6328125, + "kd_loss": 0.45159815376919143, + "learning_rate": 3e-06, + "loss": 0.6585, + "masked_tokens": 113.0875, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 113.0875 + }, + { + "avg_mask_ratio": 0.42486972180195154, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42486972180195154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32457938515717616, + "epoch": 0.96, + "grad_norm": 0.6953125, + "kd_loss": 0.4011907008050457, + "learning_rate": 3e-06, + "loss": 0.5644, + "masked_tokens": 103.4, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.4 + }, + { + "avg_mask_ratio": 0.47578654896933587, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.47578654896933587, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32885359905767475, + "epoch": 0.9813333333333333, + "grad_norm": 0.16015625, + "kd_loss": 0.44463847501747294, + "learning_rate": 3e-06, + "loss": 0.635, + "masked_tokens": 105.3125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 105.3125 + }, + { + "avg_mask_ratio": 0.4782901787132557, + "avg_response_length": 224.0952380952381, + "avg_student_mask_ratio": 0.4782901787132557, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3393430382851702, + "epoch": 1.0042666666666666, + "grad_norm": 0.65625, + "kd_loss": 0.5178591865708675, + "learning_rate": 3e-06, + "loss": 0.7769, + "masked_tokens": 107.23809523809524, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 107.23809523809524 + }, + { + "avg_mask_ratio": 0.47575968883465974, + "avg_response_length": 249.4125, + "avg_student_mask_ratio": 0.47575968883465974, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.44613247805159517, + "epoch": 1.0256, + "grad_norm": 0.498046875, + "kd_loss": 0.5374264506522252, + "learning_rate": 3e-06, + "loss": 0.6772, + "masked_tokens": 118.35, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 118.35 + }, + { + "avg_mask_ratio": 0.4563717324635945, + "avg_response_length": 232.0375, + "avg_student_mask_ratio": 0.4563717324635945, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37626147485414096, + "epoch": 1.0469333333333333, + "grad_norm": 0.54296875, + "kd_loss": 0.392788901903657, + "learning_rate": 3e-06, + "loss": 0.6047, + "masked_tokens": 98.35, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.35 + }, + { + "avg_mask_ratio": 0.5079968665260821, + "avg_response_length": 253.7875, + "avg_student_mask_ratio": 0.5079968665260821, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.30954629559880686, + "epoch": 1.0682666666666667, + "grad_norm": 0.291015625, + "kd_loss": 0.4563873354276211, + "learning_rate": 3e-06, + "loss": 0.5996, + "masked_tokens": 128.225, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 128.225 + }, + { + "avg_mask_ratio": 0.5109448074479588, + "avg_response_length": 254.2, + "avg_student_mask_ratio": 0.5109448074479588, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2868076219221166, + "epoch": 1.0896, + "grad_norm": 2.515625, + "kd_loss": 0.5652106747879998, + "learning_rate": 3e-06, + "loss": 0.6398, + "masked_tokens": 137.5875, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 137.5875 + }, + { + "avg_mask_ratio": 0.45396183808334173, + "avg_response_length": 202.7625, + "avg_student_mask_ratio": 0.45396183808334173, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38311037250946356, + "epoch": 1.1109333333333333, + "grad_norm": 0.6171875, + "kd_loss": 0.423658079797778, + "learning_rate": 3e-06, + "loss": 0.6386, + "masked_tokens": 87.0625, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 87.0625 + }, + { + "avg_mask_ratio": 0.47015948037151245, + "avg_response_length": 214.275, + "avg_student_mask_ratio": 0.47015948037151245, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.47228433731506814, + "epoch": 1.1322666666666668, + "grad_norm": 0.609375, + "kd_loss": 0.45688082203427316, + "learning_rate": 3e-06, + "loss": 0.737, + "masked_tokens": 99.8625, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.4892866689246148, + "avg_response_length": 231.3125, + "avg_student_mask_ratio": 0.4892866689246148, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4080867745911064, + "epoch": 1.1536, + "grad_norm": 0.341796875, + "kd_loss": 0.5618651450654625, + "learning_rate": 3e-06, + "loss": 0.6922, + "masked_tokens": 107.375, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.4541942774085328, + "avg_response_length": 213.525, + "avg_student_mask_ratio": 0.4541942774085328, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22217674175137744, + "epoch": 1.1749333333333334, + "grad_norm": 0.2412109375, + "kd_loss": 0.3673438885498399, + "learning_rate": 3e-06, + "loss": 0.5008, + "masked_tokens": 97.8875, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 97.8875 + }, + { + "avg_mask_ratio": 0.39282396506750955, + "avg_response_length": 231.4125, + "avg_student_mask_ratio": 0.39282396506750955, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3512847523151777, + "epoch": 1.1962666666666666, + "grad_norm": 0.8828125, + "kd_loss": 0.48686740984790616, + "learning_rate": 3e-06, + "loss": 0.5823, + "masked_tokens": 99.2875, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 99.2875 + }, + { + "avg_mask_ratio": 0.4483634108910337, + "avg_response_length": 230.1625, + "avg_student_mask_ratio": 0.4483634108910337, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31345968546206676, + "epoch": 1.2176, + "grad_norm": 0.4453125, + "kd_loss": 0.41564053312727084, + "learning_rate": 3e-06, + "loss": 0.5898, + "masked_tokens": 108.9875, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 108.9875 + }, + { + "avg_mask_ratio": 0.465452536707744, + "avg_response_length": 267.4375, + "avg_student_mask_ratio": 0.465452536707744, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3618907347364768, + "epoch": 1.2389333333333332, + "grad_norm": 8.6875, + "kd_loss": 0.4481006292516895, + "learning_rate": 3e-06, + "loss": 0.6314, + "masked_tokens": 129.075, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 129.075 + }, + { + "avg_mask_ratio": 0.5225977989146486, + "avg_response_length": 228.45, + "avg_student_mask_ratio": 0.5225977989146486, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5639314363695348, + "epoch": 1.2602666666666666, + "grad_norm": 1.1328125, + "kd_loss": 0.5351108588445992, + "learning_rate": 3e-06, + "loss": 0.8274, + "masked_tokens": 121.675, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 121.675 + }, + { + "avg_mask_ratio": 0.44998724836623294, + "avg_response_length": 236.7, + "avg_student_mask_ratio": 0.44998724836623294, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3396833263838971, + "epoch": 1.2816, + "grad_norm": 0.365234375, + "kd_loss": 0.41761890975592914, + "learning_rate": 3e-06, + "loss": 0.5752, + "masked_tokens": 110.1625, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 110.1625 + }, + { + "avg_mask_ratio": 0.5042130865273066, + "avg_response_length": 230.3375, + "avg_student_mask_ratio": 0.5042130865273066, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.35890077192343595, + "epoch": 1.3029333333333333, + "grad_norm": 0.28515625, + "kd_loss": 0.5558427174539929, + "learning_rate": 3e-06, + "loss": 0.7657, + "masked_tokens": 112.625, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 112.625 + }, + { + "avg_mask_ratio": 0.49637898594373836, + "avg_response_length": 233.0625, + "avg_student_mask_ratio": 0.49637898594373836, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32318839170733327, + "epoch": 1.3242666666666667, + "grad_norm": 0.515625, + "kd_loss": 0.5518322235134179, + "learning_rate": 3e-06, + "loss": 0.6742, + "masked_tokens": 111.25, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 111.25 + }, + { + "avg_mask_ratio": 0.5177568581304512, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.5177568581304512, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5710563842050931, + "epoch": 1.3456000000000001, + "grad_norm": 1.3515625, + "kd_loss": 0.5316411310721378, + "learning_rate": 3e-06, + "loss": 0.8598, + "masked_tokens": 129.6125, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 129.6125 + }, + { + "avg_mask_ratio": 0.48226998368045315, + "avg_response_length": 237.7125, + "avg_student_mask_ratio": 0.48226998368045315, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2804489129174499, + "epoch": 1.3669333333333333, + "grad_norm": 0.2421875, + "kd_loss": 0.3663112932188085, + "learning_rate": 3e-06, + "loss": 0.4584, + "masked_tokens": 120.275, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 120.275 + }, + { + "avg_mask_ratio": 0.5306948523037136, + "avg_response_length": 238.0125, + "avg_student_mask_ratio": 0.5306948523037136, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.475157093159612, + "epoch": 1.3882666666666665, + "grad_norm": 1.8125, + "kd_loss": 0.5062341513834724, + "learning_rate": 3e-06, + "loss": 0.7115, + "masked_tokens": 133.25, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 133.25 + }, + { + "avg_mask_ratio": 0.4821273953886703, + "avg_response_length": 247.775, + "avg_student_mask_ratio": 0.4821273953886703, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41770620119971225, + "epoch": 1.4096, + "grad_norm": 0.9375, + "kd_loss": 0.425496905214095, + "learning_rate": 3e-06, + "loss": 0.6361, + "masked_tokens": 128.875, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 128.875 + }, + { + "avg_mask_ratio": 0.46056515555246735, + "avg_response_length": 240.4375, + "avg_student_mask_ratio": 0.46056515555246735, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.24846992658117414, + "epoch": 1.4309333333333334, + "grad_norm": 0.60546875, + "kd_loss": 0.34861083538812637, + "learning_rate": 3e-06, + "loss": 0.5112, + "masked_tokens": 119.85, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 119.85 + }, + { + "avg_mask_ratio": 0.4666106043441687, + "avg_response_length": 226.7375, + "avg_student_mask_ratio": 0.4666106043441687, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4541423492493323, + "epoch": 1.4522666666666666, + "grad_norm": 0.51953125, + "kd_loss": 0.4910934407485213, + "learning_rate": 3e-06, + "loss": 0.6946, + "masked_tokens": 107.4625, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4790851596510038, + "avg_response_length": 202.05, + "avg_student_mask_ratio": 0.4790851596510038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3711260147189023, + "epoch": 1.4736, + "grad_norm": 2.03125, + "kd_loss": 0.41718243765291446, + "learning_rate": 3e-06, + "loss": 0.6313, + "masked_tokens": 111.3125, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 111.3125 + }, + { + "avg_mask_ratio": 0.5250519359949977, + "avg_response_length": 228.125, + "avg_student_mask_ratio": 0.5250519359949977, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22230932631540554, + "epoch": 1.4949333333333334, + "grad_norm": 0.26171875, + "kd_loss": 0.6619142963969352, + "learning_rate": 3e-06, + "loss": 0.7717, + "masked_tokens": 132.55, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 132.55 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/training_args.bin b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6d9cde5cbc4fedb8fe690f2f561a165bf19ae18c --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ae704e41a4a62f6ca56789c45ce45887326cd2f8d1e97e398e5ada4a93398c +size 8312 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/README.md b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/adapter_config.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9d0f61db13b9b0a2854984efa3e7c726f3e0dbe1 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "up_proj", + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/adapter_model.safetensors b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..390580f05b7428c7e3bc953ada8b8913cf16598f --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d3a23a203edf07279ccb30cdf165cbf6ffa8b50d0de486d296e0f41d651c7cc +size 2406624648 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/optimizer.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f10f63f70e69928d0d4e3a4e9c68b4e329185854 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc252056482b2aed6588ed57184606ac04c3f0c06b5f63e8c33244da7713d020 +size 671304442 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/rng_state_0.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c590ef1e1d2681fa6c95a13325ca91bce0729f9a --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:610a9a4f78d7f85134c92f927b7d166865d8dd7ec8daa53878a554654e35dc7a +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/rng_state_1.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..04902e543aac92631e5f758f8bea6d2a42562afa --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee55d2eee3a1d73e9461cf205044bfc49da1a90f0dc09712f1f2004548134848 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/scheduler.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..de962ad5d0694c2759ebc84569f3ce66309888ee --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae0971d510e1111e0fef1ce3a2af63a62a1fc1c7d7b17a17e0c2de3f5ab7c9d0 +size 1064 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/trainer_state.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5c10cc093a4186f8ceb79653e4963fed15bb520e --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/trainer_state.json @@ -0,0 +1,2033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7082666666666668, + "eval_steps": 500, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4931091487989761, + "avg_response_length": 225.975, + "avg_student_mask_ratio": 0.4931091487989761, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5570551689027979, + "epoch": 0.021333333333333333, + "grad_norm": 0.404296875, + "kd_loss": 0.5375588692116253, + "learning_rate": 3e-06, + "loss": 0.8247, + "masked_tokens": 111.95, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 111.95 + }, + { + "avg_mask_ratio": 0.42058031369233506, + "avg_response_length": 255.2625, + "avg_student_mask_ratio": 0.42058031369233506, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42826092825978324, + "epoch": 0.042666666666666665, + "grad_norm": 0.8046875, + "kd_loss": 0.4450965437417761, + "learning_rate": 3e-06, + "loss": 0.5724, + "masked_tokens": 97.35, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 97.35 + }, + { + "avg_mask_ratio": 0.4538542575784959, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4538542575784959, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4461815005188782, + "epoch": 0.064, + "grad_norm": 0.50390625, + "kd_loss": 0.5296064364436825, + "learning_rate": 3e-06, + "loss": 0.702, + "masked_tokens": 110.2, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 110.2 + }, + { + "avg_mask_ratio": 0.4207469140383182, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.4207469140383182, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38499989152683156, + "epoch": 0.08533333333333333, + "grad_norm": 1.671875, + "kd_loss": 0.33118802310931417, + "learning_rate": 3e-06, + "loss": 0.5529, + "masked_tokens": 98.1625, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.1625 + }, + { + "avg_mask_ratio": 0.43260439952719026, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.43260439952719026, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5240421466317912, + "epoch": 0.10666666666666667, + "grad_norm": 1.6875, + "kd_loss": 0.4270985169670894, + "learning_rate": 3e-06, + "loss": 0.671, + "masked_tokens": 85.05, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.05 + }, + { + "avg_mask_ratio": 0.46053453313652426, + "avg_response_length": 251.0875, + "avg_student_mask_ratio": 0.46053453313652426, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5027546818272185, + "epoch": 0.128, + "grad_norm": 0.17578125, + "kd_loss": 0.3904111967755945, + "learning_rate": 3e-06, + "loss": 0.6672, + "masked_tokens": 120.9, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 120.9 + }, + { + "avg_mask_ratio": 0.5112146578729153, + "avg_response_length": 202.5875, + "avg_student_mask_ratio": 0.5112146578729153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.7753003867959023, + "epoch": 0.14933333333333335, + "grad_norm": 0.953125, + "kd_loss": 0.4415664039527428, + "learning_rate": 3e-06, + "loss": 0.856, + "masked_tokens": 104.5875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 104.5875 + }, + { + "avg_mask_ratio": 0.37548826879356056, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.37548826879356056, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3791731233859082, + "epoch": 0.17066666666666666, + "grad_norm": 0.1552734375, + "kd_loss": 0.31052538527774515, + "learning_rate": 3e-06, + "loss": 0.4843, + "masked_tokens": 85.0625, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 85.0625 + }, + { + "avg_mask_ratio": 0.5001560213277116, + "avg_response_length": 229.75, + "avg_student_mask_ratio": 0.5001560213277116, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6899960007944174, + "epoch": 0.192, + "grad_norm": 1.25, + "kd_loss": 0.5995283465861896, + "learning_rate": 3e-06, + "loss": 0.9721, + "masked_tokens": 107.6625, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 107.6625 + }, + { + "avg_mask_ratio": 0.4576045103633078, + "avg_response_length": 208.0, + "avg_student_mask_ratio": 0.4576045103633078, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41132245859021166, + "epoch": 0.21333333333333335, + "grad_norm": 0.64453125, + "kd_loss": 0.3813956479015957, + "learning_rate": 3e-06, + "loss": 0.6635, + "masked_tokens": 104.1625, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 104.1625 + }, + { + "avg_mask_ratio": 0.4877026333590038, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.4877026333590038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4612084587922368, + "epoch": 0.23466666666666666, + "grad_norm": 0.64453125, + "kd_loss": 0.5074845846289577, + "learning_rate": 3e-06, + "loss": 0.7993, + "masked_tokens": 102.075, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 102.075 + }, + { + "avg_mask_ratio": 0.45146879020612685, + "avg_response_length": 224.1875, + "avg_student_mask_ratio": 0.45146879020612685, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3276976759495483, + "epoch": 0.256, + "grad_norm": 0.30078125, + "kd_loss": 0.41461311469229256, + "learning_rate": 3e-06, + "loss": 0.6088, + "masked_tokens": 100.525, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 100.525 + }, + { + "avg_mask_ratio": 0.4356566035945434, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.4356566035945434, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.12710368948505674, + "epoch": 0.2773333333333333, + "grad_norm": 0.490234375, + "kd_loss": 0.23057804748218585, + "learning_rate": 3e-06, + "loss": 0.384, + "masked_tokens": 89.5625, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.5625 + }, + { + "avg_mask_ratio": 0.49419954856857656, + "avg_response_length": 255.625, + "avg_student_mask_ratio": 0.49419954856857656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48596099013025196, + "epoch": 0.2986666666666667, + "grad_norm": 0.859375, + "kd_loss": 0.5025483015746885, + "learning_rate": 3e-06, + "loss": 0.7892, + "masked_tokens": 136.575, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 136.575 + }, + { + "avg_mask_ratio": 0.4736677930341102, + "avg_response_length": 255.375, + "avg_student_mask_ratio": 0.4736677930341102, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5493089448234059, + "epoch": 0.32, + "grad_norm": 0.076171875, + "kd_loss": 0.4892602212316547, + "learning_rate": 3e-06, + "loss": 0.7918, + "masked_tokens": 126.575, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 126.575 + }, + { + "avg_mask_ratio": 0.5032523009285796, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.5032523009285796, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5929547422666019, + "epoch": 0.3413333333333333, + "grad_norm": 1.171875, + "kd_loss": 0.44711892502580214, + "learning_rate": 3e-06, + "loss": 0.7472, + "masked_tokens": 99.425, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 99.425 + }, + { + "avg_mask_ratio": 0.4806730231270194, + "avg_response_length": 190.325, + "avg_student_mask_ratio": 0.4806730231270194, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3583432949517601, + "epoch": 0.3626666666666667, + "grad_norm": 0.6171875, + "kd_loss": 0.4521343837219092, + "learning_rate": 3e-06, + "loss": 0.6358, + "masked_tokens": 89.825, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.825 + }, + { + "avg_mask_ratio": 0.45829249716189224, + "avg_response_length": 244.0, + "avg_student_mask_ratio": 0.45829249716189224, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3746713957985094, + "epoch": 0.384, + "grad_norm": 0.49609375, + "kd_loss": 0.34934306121722897, + "learning_rate": 3e-06, + "loss": 0.5747, + "masked_tokens": 110.075, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.075 + }, + { + "avg_mask_ratio": 0.4842760307248682, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.4842760307248682, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5090123614077584, + "epoch": 0.4053333333333333, + "grad_norm": 1.6171875, + "kd_loss": 0.43204482231294605, + "learning_rate": 3e-06, + "loss": 0.7055, + "masked_tokens": 109.5875, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.5875 + }, + { + "avg_mask_ratio": 0.4665210062637925, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4665210062637925, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27265903051802526, + "epoch": 0.4266666666666667, + "grad_norm": 0.318359375, + "kd_loss": 0.357759011555504, + "learning_rate": 3e-06, + "loss": 0.5013, + "masked_tokens": 97.0125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.0125 + }, + { + "avg_mask_ratio": 0.45183838629163803, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.45183838629163803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.21031314329709402, + "epoch": 0.448, + "grad_norm": 0.8359375, + "kd_loss": 0.38585986606940426, + "learning_rate": 3e-06, + "loss": 0.5103, + "masked_tokens": 96.0125, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.0125 + }, + { + "avg_mask_ratio": 0.5396152360364794, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5396152360364794, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6178499971098063, + "epoch": 0.4693333333333333, + "grad_norm": 0.96875, + "kd_loss": 0.46674597742967305, + "learning_rate": 3e-06, + "loss": 0.8495, + "masked_tokens": 125.35, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 125.35 + }, + { + "avg_mask_ratio": 0.4403991688624956, + "avg_response_length": 252.7, + "avg_student_mask_ratio": 0.4403991688624956, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25455478885055527, + "epoch": 0.49066666666666664, + "grad_norm": 0.5703125, + "kd_loss": 0.43305868929596725, + "learning_rate": 3e-06, + "loss": 0.6428, + "masked_tokens": 107.325, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.47419991258066146, + "avg_response_length": 212.85, + "avg_student_mask_ratio": 0.47419991258066146, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32057130943685763, + "epoch": 0.512, + "grad_norm": 0.43359375, + "kd_loss": 0.5083060303753086, + "learning_rate": 3e-06, + "loss": 0.6986, + "masked_tokens": 106.9, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 106.9 + }, + { + "avg_mask_ratio": 0.4464209079160355, + "avg_response_length": 243.475, + "avg_student_mask_ratio": 0.4464209079160355, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33636454603331456, + "epoch": 0.5333333333333333, + "grad_norm": 0.1142578125, + "kd_loss": 0.41649795620701296, + "learning_rate": 3e-06, + "loss": 0.5666, + "masked_tokens": 112.7375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.7375 + }, + { + "avg_mask_ratio": 0.4520751796895638, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.4520751796895638, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37478437887749577, + "epoch": 0.5546666666666666, + "grad_norm": 0.328125, + "kd_loss": 0.31532439299670545, + "learning_rate": 3e-06, + "loss": 0.5129, + "masked_tokens": 109.6375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 109.6375 + }, + { + "avg_mask_ratio": 0.5305180630879477, + "avg_response_length": 224.45, + "avg_student_mask_ratio": 0.5305180630879477, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42709534656005416, + "epoch": 0.576, + "grad_norm": 0.7578125, + "kd_loss": 0.5525495689224045, + "learning_rate": 3e-06, + "loss": 0.812, + "masked_tokens": 120.475, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 120.475 + }, + { + "avg_mask_ratio": 0.46451686368091033, + "avg_response_length": 254.825, + "avg_student_mask_ratio": 0.46451686368091033, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31382316479499084, + "epoch": 0.5973333333333334, + "grad_norm": 0.90234375, + "kd_loss": 0.3957495673693458, + "learning_rate": 3e-06, + "loss": 0.6028, + "masked_tokens": 129.225, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 129.225 + }, + { + "avg_mask_ratio": 0.389662017847877, + "avg_response_length": 245.9125, + "avg_student_mask_ratio": 0.389662017847877, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.23645576389110373, + "epoch": 0.6186666666666667, + "grad_norm": 0.302734375, + "kd_loss": 0.27728830450374853, + "learning_rate": 3e-06, + "loss": 0.4314, + "masked_tokens": 99.8625, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.44417000194080175, + "avg_response_length": 217.0375, + "avg_student_mask_ratio": 0.44417000194080175, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3240562055096575, + "epoch": 0.64, + "grad_norm": 1.09375, + "kd_loss": 0.31930388437995133, + "learning_rate": 3e-06, + "loss": 0.5264, + "masked_tokens": 104.625, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 104.625 + }, + { + "avg_mask_ratio": 0.4706685543409549, + "avg_response_length": 175.45, + "avg_student_mask_ratio": 0.4706685543409549, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34333510200582396, + "epoch": 0.6613333333333333, + "grad_norm": 1.234375, + "kd_loss": 0.5067149527083984, + "learning_rate": 3e-06, + "loss": 0.6534, + "masked_tokens": 84.875, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 84.875 + }, + { + "avg_mask_ratio": 0.4974605386145413, + "avg_response_length": 234.7875, + "avg_student_mask_ratio": 0.4974605386145413, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34462752127872137, + "epoch": 0.6826666666666666, + "grad_norm": 0.333984375, + "kd_loss": 0.3942846609736307, + "learning_rate": 3e-06, + "loss": 0.7133, + "masked_tokens": 119.6, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.5112370474264025, + "avg_response_length": 236.0625, + "avg_student_mask_ratio": 0.5112370474264025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2974585796398969, + "epoch": 0.704, + "grad_norm": 0.44140625, + "kd_loss": 0.4301003347501496, + "learning_rate": 3e-06, + "loss": 0.6754, + "masked_tokens": 129.425, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 129.425 + }, + { + "avg_mask_ratio": 0.44370225080056114, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44370225080056114, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3732590021626493, + "epoch": 0.7253333333333334, + "grad_norm": 0.98046875, + "kd_loss": 0.4610515360019235, + "learning_rate": 3e-06, + "loss": 0.6627, + "masked_tokens": 108.775, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 108.775 + }, + { + "avg_mask_ratio": 0.49959173843380994, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.49959173843380994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48515336151417615, + "epoch": 0.7466666666666667, + "grad_norm": 0.92578125, + "kd_loss": 0.5031771080357654, + "learning_rate": 3e-06, + "loss": 0.7668, + "masked_tokens": 125.625, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.625 + }, + { + "avg_mask_ratio": 0.4744729608530179, + "avg_response_length": 246.1625, + "avg_student_mask_ratio": 0.4744729608530179, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3005135279950082, + "epoch": 0.768, + "grad_norm": 0.169921875, + "kd_loss": 0.5216399239409879, + "learning_rate": 3e-06, + "loss": 0.6077, + "masked_tokens": 116.875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.875 + }, + { + "avg_mask_ratio": 0.4738045462174341, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.4738045462174341, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5349442186782426, + "epoch": 0.7893333333333333, + "grad_norm": 0.201171875, + "kd_loss": 0.6039233199480805, + "learning_rate": 3e-06, + "loss": 0.7196, + "masked_tokens": 127.4625, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 127.4625 + }, + { + "avg_mask_ratio": 0.4512475330149755, + "avg_response_length": 209.8, + "avg_student_mask_ratio": 0.4512475330149755, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.19145508916275275, + "epoch": 0.8106666666666666, + "grad_norm": 0.6875, + "kd_loss": 0.4029755606519984, + "learning_rate": 3e-06, + "loss": 0.5055, + "masked_tokens": 100.8375, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 100.8375 + }, + { + "avg_mask_ratio": 0.4752940105390735, + "avg_response_length": 219.5625, + "avg_student_mask_ratio": 0.4752940105390735, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4267096655552223, + "epoch": 0.832, + "grad_norm": 0.2578125, + "kd_loss": 0.4655849843487971, + "learning_rate": 3e-06, + "loss": 0.6749, + "masked_tokens": 112.375, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 112.375 + }, + { + "avg_mask_ratio": 0.47461870914557946, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.47461870914557946, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27868834779033025, + "epoch": 0.8533333333333334, + "grad_norm": 0.640625, + "kd_loss": 0.5299579592951205, + "learning_rate": 3e-06, + "loss": 0.6538, + "masked_tokens": 120.4125, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 120.4125 + }, + { + "avg_mask_ratio": 0.48321815438685006, + "avg_response_length": 228.15, + "avg_student_mask_ratio": 0.48321815438685006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.43057951200541994, + "epoch": 0.8746666666666667, + "grad_norm": 0.5390625, + "kd_loss": 0.504674318619719, + "learning_rate": 3e-06, + "loss": 0.7381, + "masked_tokens": 119.0, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 119.0 + }, + { + "avg_mask_ratio": 0.4379329536575824, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4379329536575824, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.132674143492045, + "epoch": 0.896, + "grad_norm": 1.09375, + "kd_loss": 0.27731474525324984, + "learning_rate": 3e-06, + "loss": 0.3953, + "masked_tokens": 85.525, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 85.525 + }, + { + "avg_mask_ratio": 0.4674084897618741, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4674084897618741, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37605725416574387, + "epoch": 0.9173333333333333, + "grad_norm": 0.43359375, + "kd_loss": 0.49442086774362226, + "learning_rate": 3e-06, + "loss": 0.6699, + "masked_tokens": 104.5625, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 104.5625 + }, + { + "avg_mask_ratio": 0.4415457699564286, + "avg_response_length": 241.0875, + "avg_student_mask_ratio": 0.4415457699564286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3754083825901603, + "epoch": 0.9386666666666666, + "grad_norm": 0.6328125, + "kd_loss": 0.45159815376919143, + "learning_rate": 3e-06, + "loss": 0.6585, + "masked_tokens": 113.0875, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 113.0875 + }, + { + "avg_mask_ratio": 0.42486972180195154, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42486972180195154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32457938515717616, + "epoch": 0.96, + "grad_norm": 0.6953125, + "kd_loss": 0.4011907008050457, + "learning_rate": 3e-06, + "loss": 0.5644, + "masked_tokens": 103.4, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.4 + }, + { + "avg_mask_ratio": 0.47578654896933587, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.47578654896933587, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32885359905767475, + "epoch": 0.9813333333333333, + "grad_norm": 0.16015625, + "kd_loss": 0.44463847501747294, + "learning_rate": 3e-06, + "loss": 0.635, + "masked_tokens": 105.3125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 105.3125 + }, + { + "avg_mask_ratio": 0.4782901787132557, + "avg_response_length": 224.0952380952381, + "avg_student_mask_ratio": 0.4782901787132557, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3393430382851702, + "epoch": 1.0042666666666666, + "grad_norm": 0.65625, + "kd_loss": 0.5178591865708675, + "learning_rate": 3e-06, + "loss": 0.7769, + "masked_tokens": 107.23809523809524, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 107.23809523809524 + }, + { + "avg_mask_ratio": 0.47575968883465974, + "avg_response_length": 249.4125, + "avg_student_mask_ratio": 0.47575968883465974, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.44613247805159517, + "epoch": 1.0256, + "grad_norm": 0.498046875, + "kd_loss": 0.5374264506522252, + "learning_rate": 3e-06, + "loss": 0.6772, + "masked_tokens": 118.35, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 118.35 + }, + { + "avg_mask_ratio": 0.4563717324635945, + "avg_response_length": 232.0375, + "avg_student_mask_ratio": 0.4563717324635945, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37626147485414096, + "epoch": 1.0469333333333333, + "grad_norm": 0.54296875, + "kd_loss": 0.392788901903657, + "learning_rate": 3e-06, + "loss": 0.6047, + "masked_tokens": 98.35, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.35 + }, + { + "avg_mask_ratio": 0.5079968665260821, + "avg_response_length": 253.7875, + "avg_student_mask_ratio": 0.5079968665260821, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.30954629559880686, + "epoch": 1.0682666666666667, + "grad_norm": 0.291015625, + "kd_loss": 0.4563873354276211, + "learning_rate": 3e-06, + "loss": 0.5996, + "masked_tokens": 128.225, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 128.225 + }, + { + "avg_mask_ratio": 0.5109448074479588, + "avg_response_length": 254.2, + "avg_student_mask_ratio": 0.5109448074479588, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2868076219221166, + "epoch": 1.0896, + "grad_norm": 2.515625, + "kd_loss": 0.5652106747879998, + "learning_rate": 3e-06, + "loss": 0.6398, + "masked_tokens": 137.5875, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 137.5875 + }, + { + "avg_mask_ratio": 0.45396183808334173, + "avg_response_length": 202.7625, + "avg_student_mask_ratio": 0.45396183808334173, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38311037250946356, + "epoch": 1.1109333333333333, + "grad_norm": 0.6171875, + "kd_loss": 0.423658079797778, + "learning_rate": 3e-06, + "loss": 0.6386, + "masked_tokens": 87.0625, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 87.0625 + }, + { + "avg_mask_ratio": 0.47015948037151245, + "avg_response_length": 214.275, + "avg_student_mask_ratio": 0.47015948037151245, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.47228433731506814, + "epoch": 1.1322666666666668, + "grad_norm": 0.609375, + "kd_loss": 0.45688082203427316, + "learning_rate": 3e-06, + "loss": 0.737, + "masked_tokens": 99.8625, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.4892866689246148, + "avg_response_length": 231.3125, + "avg_student_mask_ratio": 0.4892866689246148, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4080867745911064, + "epoch": 1.1536, + "grad_norm": 0.341796875, + "kd_loss": 0.5618651450654625, + "learning_rate": 3e-06, + "loss": 0.6922, + "masked_tokens": 107.375, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.4541942774085328, + "avg_response_length": 213.525, + "avg_student_mask_ratio": 0.4541942774085328, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22217674175137744, + "epoch": 1.1749333333333334, + "grad_norm": 0.2412109375, + "kd_loss": 0.3673438885498399, + "learning_rate": 3e-06, + "loss": 0.5008, + "masked_tokens": 97.8875, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 97.8875 + }, + { + "avg_mask_ratio": 0.39282396506750955, + "avg_response_length": 231.4125, + "avg_student_mask_ratio": 0.39282396506750955, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3512847523151777, + "epoch": 1.1962666666666666, + "grad_norm": 0.8828125, + "kd_loss": 0.48686740984790616, + "learning_rate": 3e-06, + "loss": 0.5823, + "masked_tokens": 99.2875, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 99.2875 + }, + { + "avg_mask_ratio": 0.4483634108910337, + "avg_response_length": 230.1625, + "avg_student_mask_ratio": 0.4483634108910337, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31345968546206676, + "epoch": 1.2176, + "grad_norm": 0.4453125, + "kd_loss": 0.41564053312727084, + "learning_rate": 3e-06, + "loss": 0.5898, + "masked_tokens": 108.9875, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 108.9875 + }, + { + "avg_mask_ratio": 0.465452536707744, + "avg_response_length": 267.4375, + "avg_student_mask_ratio": 0.465452536707744, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3618907347364768, + "epoch": 1.2389333333333332, + "grad_norm": 8.6875, + "kd_loss": 0.4481006292516895, + "learning_rate": 3e-06, + "loss": 0.6314, + "masked_tokens": 129.075, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 129.075 + }, + { + "avg_mask_ratio": 0.5225977989146486, + "avg_response_length": 228.45, + "avg_student_mask_ratio": 0.5225977989146486, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5639314363695348, + "epoch": 1.2602666666666666, + "grad_norm": 1.1328125, + "kd_loss": 0.5351108588445992, + "learning_rate": 3e-06, + "loss": 0.8274, + "masked_tokens": 121.675, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 121.675 + }, + { + "avg_mask_ratio": 0.44998724836623294, + "avg_response_length": 236.7, + "avg_student_mask_ratio": 0.44998724836623294, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3396833263838971, + "epoch": 1.2816, + "grad_norm": 0.365234375, + "kd_loss": 0.41761890975592914, + "learning_rate": 3e-06, + "loss": 0.5752, + "masked_tokens": 110.1625, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 110.1625 + }, + { + "avg_mask_ratio": 0.5042130865273066, + "avg_response_length": 230.3375, + "avg_student_mask_ratio": 0.5042130865273066, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.35890077192343595, + "epoch": 1.3029333333333333, + "grad_norm": 0.28515625, + "kd_loss": 0.5558427174539929, + "learning_rate": 3e-06, + "loss": 0.7657, + "masked_tokens": 112.625, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 112.625 + }, + { + "avg_mask_ratio": 0.49637898594373836, + "avg_response_length": 233.0625, + "avg_student_mask_ratio": 0.49637898594373836, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32318839170733327, + "epoch": 1.3242666666666667, + "grad_norm": 0.515625, + "kd_loss": 0.5518322235134179, + "learning_rate": 3e-06, + "loss": 0.6742, + "masked_tokens": 111.25, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 111.25 + }, + { + "avg_mask_ratio": 0.5177568581304512, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.5177568581304512, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5710563842050931, + "epoch": 1.3456000000000001, + "grad_norm": 1.3515625, + "kd_loss": 0.5316411310721378, + "learning_rate": 3e-06, + "loss": 0.8598, + "masked_tokens": 129.6125, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 129.6125 + }, + { + "avg_mask_ratio": 0.48226998368045315, + "avg_response_length": 237.7125, + "avg_student_mask_ratio": 0.48226998368045315, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2804489129174499, + "epoch": 1.3669333333333333, + "grad_norm": 0.2421875, + "kd_loss": 0.3663112932188085, + "learning_rate": 3e-06, + "loss": 0.4584, + "masked_tokens": 120.275, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 120.275 + }, + { + "avg_mask_ratio": 0.5306948523037136, + "avg_response_length": 238.0125, + "avg_student_mask_ratio": 0.5306948523037136, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.475157093159612, + "epoch": 1.3882666666666665, + "grad_norm": 1.8125, + "kd_loss": 0.5062341513834724, + "learning_rate": 3e-06, + "loss": 0.7115, + "masked_tokens": 133.25, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 133.25 + }, + { + "avg_mask_ratio": 0.4821273953886703, + "avg_response_length": 247.775, + "avg_student_mask_ratio": 0.4821273953886703, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41770620119971225, + "epoch": 1.4096, + "grad_norm": 0.9375, + "kd_loss": 0.425496905214095, + "learning_rate": 3e-06, + "loss": 0.6361, + "masked_tokens": 128.875, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 128.875 + }, + { + "avg_mask_ratio": 0.46056515555246735, + "avg_response_length": 240.4375, + "avg_student_mask_ratio": 0.46056515555246735, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.24846992658117414, + "epoch": 1.4309333333333334, + "grad_norm": 0.60546875, + "kd_loss": 0.34861083538812637, + "learning_rate": 3e-06, + "loss": 0.5112, + "masked_tokens": 119.85, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 119.85 + }, + { + "avg_mask_ratio": 0.4666106043441687, + "avg_response_length": 226.7375, + "avg_student_mask_ratio": 0.4666106043441687, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4541423492493323, + "epoch": 1.4522666666666666, + "grad_norm": 0.51953125, + "kd_loss": 0.4910934407485213, + "learning_rate": 3e-06, + "loss": 0.6946, + "masked_tokens": 107.4625, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4790851596510038, + "avg_response_length": 202.05, + "avg_student_mask_ratio": 0.4790851596510038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3711260147189023, + "epoch": 1.4736, + "grad_norm": 2.03125, + "kd_loss": 0.41718243765291446, + "learning_rate": 3e-06, + "loss": 0.6313, + "masked_tokens": 111.3125, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 111.3125 + }, + { + "avg_mask_ratio": 0.5250519359949977, + "avg_response_length": 228.125, + "avg_student_mask_ratio": 0.5250519359949977, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22230932631540554, + "epoch": 1.4949333333333334, + "grad_norm": 0.26171875, + "kd_loss": 0.6619142963969352, + "learning_rate": 3e-06, + "loss": 0.7717, + "masked_tokens": 132.55, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 132.55 + }, + { + "avg_mask_ratio": 0.4790433386107907, + "avg_response_length": 212.5, + "avg_student_mask_ratio": 0.4790433386107907, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.24621229091012536, + "epoch": 1.5162666666666667, + "grad_norm": 0.2099609375, + "kd_loss": 0.43454050603151584, + "learning_rate": 3e-06, + "loss": 0.5302, + "masked_tokens": 108.7375, + "mean_t": 0.5135623761918395, + "step": 710, + "student_masked_tokens": 108.7375 + }, + { + "avg_mask_ratio": 0.47950589570682495, + "avg_response_length": 227.075, + "avg_student_mask_ratio": 0.47950589570682495, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.36416104665024707, + "epoch": 1.5375999999999999, + "grad_norm": 0.75, + "kd_loss": 0.5665610315164941, + "learning_rate": 3e-06, + "loss": 0.7121, + "masked_tokens": 110.8, + "mean_t": 0.5117021896177902, + "step": 720, + "student_masked_tokens": 110.8 + }, + { + "avg_mask_ratio": 0.4604924251558259, + "avg_response_length": 232.925, + "avg_student_mask_ratio": 0.4604924251558259, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38923927966282007, + "epoch": 1.5589333333333333, + "grad_norm": 1.015625, + "kd_loss": 0.4302867329986782, + "learning_rate": 3e-06, + "loss": 0.639, + "masked_tokens": 104.9625, + "mean_t": 0.49050743713742123, + "step": 730, + "student_masked_tokens": 104.9625 + }, + { + "avg_mask_ratio": 0.5185885130194947, + "avg_response_length": 183.325, + "avg_student_mask_ratio": 0.5185885130194947, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3361817517367399, + "epoch": 1.5802666666666667, + "grad_norm": 0.40234375, + "kd_loss": 0.5340734164818514, + "learning_rate": 3e-06, + "loss": 0.7461, + "masked_tokens": 97.125, + "mean_t": 0.5505168779753149, + "step": 740, + "student_masked_tokens": 97.125 + }, + { + "avg_mask_ratio": 0.4191439319110941, + "avg_response_length": 223.65, + "avg_student_mask_ratio": 0.4191439319110941, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37429177601145514, + "epoch": 1.6016, + "grad_norm": 0.58203125, + "kd_loss": 0.5036597276406856, + "learning_rate": 3e-06, + "loss": 0.6491, + "masked_tokens": 95.3125, + "mean_t": 0.4437690361432033, + "step": 750, + "student_masked_tokens": 95.3125 + }, + { + "avg_mask_ratio": 0.46706983938929625, + "avg_response_length": 216.0625, + "avg_student_mask_ratio": 0.46706983938929625, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4449058656399984, + "epoch": 1.6229333333333333, + "grad_norm": 0.8203125, + "kd_loss": 0.5661326096985168, + "learning_rate": 3e-06, + "loss": 0.7233, + "masked_tokens": 107.7, + "mean_t": 0.49132869170280175, + "step": 760, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.44156218122225255, + "avg_response_length": 259.675, + "avg_student_mask_ratio": 0.44156218122225255, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25899335961771613, + "epoch": 1.6442666666666668, + "grad_norm": 0.396484375, + "kd_loss": 0.4095979654902003, + "learning_rate": 3e-06, + "loss": 0.5099, + "masked_tokens": 117.5, + "mean_t": 0.4667695587326307, + "step": 770, + "student_masked_tokens": 117.5 + }, + { + "avg_mask_ratio": 0.42836043585848527, + "avg_response_length": 258.5125, + "avg_student_mask_ratio": 0.42836043585848527, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2897560694203321, + "epoch": 1.6656, + "grad_norm": 0.2431640625, + "kd_loss": 0.34635278815572546, + "learning_rate": 3e-06, + "loss": 0.4802, + "masked_tokens": 119.0125, + "mean_t": 0.44942845597106496, + "step": 780, + "student_masked_tokens": 119.0125 + }, + { + "avg_mask_ratio": 0.46589430308085866, + "avg_response_length": 222.3125, + "avg_student_mask_ratio": 0.46589430308085866, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.21603642557238345, + "epoch": 1.6869333333333332, + "grad_norm": 0.140625, + "kd_loss": 0.33674514803767297, + "learning_rate": 3e-06, + "loss": 0.489, + "masked_tokens": 103.25, + "mean_t": 0.4993515375303105, + "step": 790, + "student_masked_tokens": 103.25 + }, + { + "avg_mask_ratio": 0.46366424662992356, + "avg_response_length": 219.6875, + "avg_student_mask_ratio": 0.46366424662992356, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2663005536277069, + "epoch": 1.7082666666666668, + "grad_norm": 0.23828125, + "kd_loss": 0.35138718315538425, + "learning_rate": 3e-06, + "loss": 0.5434, + "masked_tokens": 104.5, + "mean_t": 0.500370389316231, + "step": 800, + "student_masked_tokens": 104.5 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/training_args.bin b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6d9cde5cbc4fedb8fe690f2f561a165bf19ae18c --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ae704e41a4a62f6ca56789c45ce45887326cd2f8d1e97e398e5ada4a93398c +size 8312 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/README.md b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/adapter_config.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9d0f61db13b9b0a2854984efa3e7c726f3e0dbe1 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "up_proj", + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/adapter_model.safetensors b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..52d15b5bc18a04923af519e4dc1b0e5418a0c570 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de89f1c971653ef741ed64779ae532661a8eceb7873619d1a15faea28878a048 +size 2406624648 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/optimizer.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b94f164e07d7c0268540d39e76a6b9c6f53fe94 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f820cd1221b8efab1520cea36d4234803e5ac718e3dee66a9fe978af0f098784 +size 671304442 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/rng_state_0.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..05559ab6cb992c8421309076d32d66327d6035eb --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4292a87262703401e211211269f119ddf5ea5bcbc9d2e6f7a2d1cd1322766b37 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/rng_state_1.pth b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..24d31ffcf58450c7267c35d498fde775e6294508 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c144fddcdd55a704b2cf6b874efb51bdf3703f4906a99e702ddffe42a716c98 +size 14512 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/scheduler.pt b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..186dc458d73d11481b005defcffdd17b8b9b8a93 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf0430fbf8ed72ad90ba29a6f885082e3cf20a4095c07f619baeb5e62ae385d +size 1064 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/trainer_state.json b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f7f796ac045f28061538faf61c9baedd58ae0ee5 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/trainer_state.json @@ -0,0 +1,2283 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9216, + "eval_steps": 500, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4931091487989761, + "avg_response_length": 225.975, + "avg_student_mask_ratio": 0.4931091487989761, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5570551689027979, + "epoch": 0.021333333333333333, + "grad_norm": 0.404296875, + "kd_loss": 0.5375588692116253, + "learning_rate": 3e-06, + "loss": 0.8247, + "masked_tokens": 111.95, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 111.95 + }, + { + "avg_mask_ratio": 0.42058031369233506, + "avg_response_length": 255.2625, + "avg_student_mask_ratio": 0.42058031369233506, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42826092825978324, + "epoch": 0.042666666666666665, + "grad_norm": 0.8046875, + "kd_loss": 0.4450965437417761, + "learning_rate": 3e-06, + "loss": 0.5724, + "masked_tokens": 97.35, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 97.35 + }, + { + "avg_mask_ratio": 0.4538542575784959, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4538542575784959, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4461815005188782, + "epoch": 0.064, + "grad_norm": 0.50390625, + "kd_loss": 0.5296064364436825, + "learning_rate": 3e-06, + "loss": 0.702, + "masked_tokens": 110.2, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 110.2 + }, + { + "avg_mask_ratio": 0.4207469140383182, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.4207469140383182, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38499989152683156, + "epoch": 0.08533333333333333, + "grad_norm": 1.671875, + "kd_loss": 0.33118802310931417, + "learning_rate": 3e-06, + "loss": 0.5529, + "masked_tokens": 98.1625, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.1625 + }, + { + "avg_mask_ratio": 0.43260439952719026, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.43260439952719026, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5240421466317912, + "epoch": 0.10666666666666667, + "grad_norm": 1.6875, + "kd_loss": 0.4270985169670894, + "learning_rate": 3e-06, + "loss": 0.671, + "masked_tokens": 85.05, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.05 + }, + { + "avg_mask_ratio": 0.46053453313652426, + "avg_response_length": 251.0875, + "avg_student_mask_ratio": 0.46053453313652426, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5027546818272185, + "epoch": 0.128, + "grad_norm": 0.17578125, + "kd_loss": 0.3904111967755945, + "learning_rate": 3e-06, + "loss": 0.6672, + "masked_tokens": 120.9, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 120.9 + }, + { + "avg_mask_ratio": 0.5112146578729153, + "avg_response_length": 202.5875, + "avg_student_mask_ratio": 0.5112146578729153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.7753003867959023, + "epoch": 0.14933333333333335, + "grad_norm": 0.953125, + "kd_loss": 0.4415664039527428, + "learning_rate": 3e-06, + "loss": 0.856, + "masked_tokens": 104.5875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 104.5875 + }, + { + "avg_mask_ratio": 0.37548826879356056, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.37548826879356056, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3791731233859082, + "epoch": 0.17066666666666666, + "grad_norm": 0.1552734375, + "kd_loss": 0.31052538527774515, + "learning_rate": 3e-06, + "loss": 0.4843, + "masked_tokens": 85.0625, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 85.0625 + }, + { + "avg_mask_ratio": 0.5001560213277116, + "avg_response_length": 229.75, + "avg_student_mask_ratio": 0.5001560213277116, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6899960007944174, + "epoch": 0.192, + "grad_norm": 1.25, + "kd_loss": 0.5995283465861896, + "learning_rate": 3e-06, + "loss": 0.9721, + "masked_tokens": 107.6625, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 107.6625 + }, + { + "avg_mask_ratio": 0.4576045103633078, + "avg_response_length": 208.0, + "avg_student_mask_ratio": 0.4576045103633078, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41132245859021166, + "epoch": 0.21333333333333335, + "grad_norm": 0.64453125, + "kd_loss": 0.3813956479015957, + "learning_rate": 3e-06, + "loss": 0.6635, + "masked_tokens": 104.1625, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 104.1625 + }, + { + "avg_mask_ratio": 0.4877026333590038, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.4877026333590038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4612084587922368, + "epoch": 0.23466666666666666, + "grad_norm": 0.64453125, + "kd_loss": 0.5074845846289577, + "learning_rate": 3e-06, + "loss": 0.7993, + "masked_tokens": 102.075, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 102.075 + }, + { + "avg_mask_ratio": 0.45146879020612685, + "avg_response_length": 224.1875, + "avg_student_mask_ratio": 0.45146879020612685, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3276976759495483, + "epoch": 0.256, + "grad_norm": 0.30078125, + "kd_loss": 0.41461311469229256, + "learning_rate": 3e-06, + "loss": 0.6088, + "masked_tokens": 100.525, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 100.525 + }, + { + "avg_mask_ratio": 0.4356566035945434, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.4356566035945434, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.12710368948505674, + "epoch": 0.2773333333333333, + "grad_norm": 0.490234375, + "kd_loss": 0.23057804748218585, + "learning_rate": 3e-06, + "loss": 0.384, + "masked_tokens": 89.5625, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.5625 + }, + { + "avg_mask_ratio": 0.49419954856857656, + "avg_response_length": 255.625, + "avg_student_mask_ratio": 0.49419954856857656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48596099013025196, + "epoch": 0.2986666666666667, + "grad_norm": 0.859375, + "kd_loss": 0.5025483015746885, + "learning_rate": 3e-06, + "loss": 0.7892, + "masked_tokens": 136.575, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 136.575 + }, + { + "avg_mask_ratio": 0.4736677930341102, + "avg_response_length": 255.375, + "avg_student_mask_ratio": 0.4736677930341102, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5493089448234059, + "epoch": 0.32, + "grad_norm": 0.076171875, + "kd_loss": 0.4892602212316547, + "learning_rate": 3e-06, + "loss": 0.7918, + "masked_tokens": 126.575, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 126.575 + }, + { + "avg_mask_ratio": 0.5032523009285796, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.5032523009285796, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5929547422666019, + "epoch": 0.3413333333333333, + "grad_norm": 1.171875, + "kd_loss": 0.44711892502580214, + "learning_rate": 3e-06, + "loss": 0.7472, + "masked_tokens": 99.425, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 99.425 + }, + { + "avg_mask_ratio": 0.4806730231270194, + "avg_response_length": 190.325, + "avg_student_mask_ratio": 0.4806730231270194, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3583432949517601, + "epoch": 0.3626666666666667, + "grad_norm": 0.6171875, + "kd_loss": 0.4521343837219092, + "learning_rate": 3e-06, + "loss": 0.6358, + "masked_tokens": 89.825, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.825 + }, + { + "avg_mask_ratio": 0.45829249716189224, + "avg_response_length": 244.0, + "avg_student_mask_ratio": 0.45829249716189224, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3746713957985094, + "epoch": 0.384, + "grad_norm": 0.49609375, + "kd_loss": 0.34934306121722897, + "learning_rate": 3e-06, + "loss": 0.5747, + "masked_tokens": 110.075, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.075 + }, + { + "avg_mask_ratio": 0.4842760307248682, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.4842760307248682, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5090123614077584, + "epoch": 0.4053333333333333, + "grad_norm": 1.6171875, + "kd_loss": 0.43204482231294605, + "learning_rate": 3e-06, + "loss": 0.7055, + "masked_tokens": 109.5875, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.5875 + }, + { + "avg_mask_ratio": 0.4665210062637925, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4665210062637925, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27265903051802526, + "epoch": 0.4266666666666667, + "grad_norm": 0.318359375, + "kd_loss": 0.357759011555504, + "learning_rate": 3e-06, + "loss": 0.5013, + "masked_tokens": 97.0125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.0125 + }, + { + "avg_mask_ratio": 0.45183838629163803, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.45183838629163803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.21031314329709402, + "epoch": 0.448, + "grad_norm": 0.8359375, + "kd_loss": 0.38585986606940426, + "learning_rate": 3e-06, + "loss": 0.5103, + "masked_tokens": 96.0125, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.0125 + }, + { + "avg_mask_ratio": 0.5396152360364794, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5396152360364794, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.6178499971098063, + "epoch": 0.4693333333333333, + "grad_norm": 0.96875, + "kd_loss": 0.46674597742967305, + "learning_rate": 3e-06, + "loss": 0.8495, + "masked_tokens": 125.35, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 125.35 + }, + { + "avg_mask_ratio": 0.4403991688624956, + "avg_response_length": 252.7, + "avg_student_mask_ratio": 0.4403991688624956, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25455478885055527, + "epoch": 0.49066666666666664, + "grad_norm": 0.5703125, + "kd_loss": 0.43305868929596725, + "learning_rate": 3e-06, + "loss": 0.6428, + "masked_tokens": 107.325, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.47419991258066146, + "avg_response_length": 212.85, + "avg_student_mask_ratio": 0.47419991258066146, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32057130943685763, + "epoch": 0.512, + "grad_norm": 0.43359375, + "kd_loss": 0.5083060303753086, + "learning_rate": 3e-06, + "loss": 0.6986, + "masked_tokens": 106.9, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 106.9 + }, + { + "avg_mask_ratio": 0.4464209079160355, + "avg_response_length": 243.475, + "avg_student_mask_ratio": 0.4464209079160355, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33636454603331456, + "epoch": 0.5333333333333333, + "grad_norm": 0.1142578125, + "kd_loss": 0.41649795620701296, + "learning_rate": 3e-06, + "loss": 0.5666, + "masked_tokens": 112.7375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.7375 + }, + { + "avg_mask_ratio": 0.4520751796895638, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.4520751796895638, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37478437887749577, + "epoch": 0.5546666666666666, + "grad_norm": 0.328125, + "kd_loss": 0.31532439299670545, + "learning_rate": 3e-06, + "loss": 0.5129, + "masked_tokens": 109.6375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 109.6375 + }, + { + "avg_mask_ratio": 0.5305180630879477, + "avg_response_length": 224.45, + "avg_student_mask_ratio": 0.5305180630879477, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.42709534656005416, + "epoch": 0.576, + "grad_norm": 0.7578125, + "kd_loss": 0.5525495689224045, + "learning_rate": 3e-06, + "loss": 0.812, + "masked_tokens": 120.475, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 120.475 + }, + { + "avg_mask_ratio": 0.46451686368091033, + "avg_response_length": 254.825, + "avg_student_mask_ratio": 0.46451686368091033, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31382316479499084, + "epoch": 0.5973333333333334, + "grad_norm": 0.90234375, + "kd_loss": 0.3957495673693458, + "learning_rate": 3e-06, + "loss": 0.6028, + "masked_tokens": 129.225, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 129.225 + }, + { + "avg_mask_ratio": 0.389662017847877, + "avg_response_length": 245.9125, + "avg_student_mask_ratio": 0.389662017847877, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.23645576389110373, + "epoch": 0.6186666666666667, + "grad_norm": 0.302734375, + "kd_loss": 0.27728830450374853, + "learning_rate": 3e-06, + "loss": 0.4314, + "masked_tokens": 99.8625, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.44417000194080175, + "avg_response_length": 217.0375, + "avg_student_mask_ratio": 0.44417000194080175, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3240562055096575, + "epoch": 0.64, + "grad_norm": 1.09375, + "kd_loss": 0.31930388437995133, + "learning_rate": 3e-06, + "loss": 0.5264, + "masked_tokens": 104.625, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 104.625 + }, + { + "avg_mask_ratio": 0.4706685543409549, + "avg_response_length": 175.45, + "avg_student_mask_ratio": 0.4706685543409549, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34333510200582396, + "epoch": 0.6613333333333333, + "grad_norm": 1.234375, + "kd_loss": 0.5067149527083984, + "learning_rate": 3e-06, + "loss": 0.6534, + "masked_tokens": 84.875, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 84.875 + }, + { + "avg_mask_ratio": 0.4974605386145413, + "avg_response_length": 234.7875, + "avg_student_mask_ratio": 0.4974605386145413, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.34462752127872137, + "epoch": 0.6826666666666666, + "grad_norm": 0.333984375, + "kd_loss": 0.3942846609736307, + "learning_rate": 3e-06, + "loss": 0.7133, + "masked_tokens": 119.6, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.5112370474264025, + "avg_response_length": 236.0625, + "avg_student_mask_ratio": 0.5112370474264025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2974585796398969, + "epoch": 0.704, + "grad_norm": 0.44140625, + "kd_loss": 0.4301003347501496, + "learning_rate": 3e-06, + "loss": 0.6754, + "masked_tokens": 129.425, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 129.425 + }, + { + "avg_mask_ratio": 0.44370225080056114, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44370225080056114, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3732590021626493, + "epoch": 0.7253333333333334, + "grad_norm": 0.98046875, + "kd_loss": 0.4610515360019235, + "learning_rate": 3e-06, + "loss": 0.6627, + "masked_tokens": 108.775, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 108.775 + }, + { + "avg_mask_ratio": 0.49959173843380994, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.49959173843380994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.48515336151417615, + "epoch": 0.7466666666666667, + "grad_norm": 0.92578125, + "kd_loss": 0.5031771080357654, + "learning_rate": 3e-06, + "loss": 0.7668, + "masked_tokens": 125.625, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.625 + }, + { + "avg_mask_ratio": 0.4744729608530179, + "avg_response_length": 246.1625, + "avg_student_mask_ratio": 0.4744729608530179, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3005135279950082, + "epoch": 0.768, + "grad_norm": 0.169921875, + "kd_loss": 0.5216399239409879, + "learning_rate": 3e-06, + "loss": 0.6077, + "masked_tokens": 116.875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.875 + }, + { + "avg_mask_ratio": 0.4738045462174341, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.4738045462174341, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5349442186782426, + "epoch": 0.7893333333333333, + "grad_norm": 0.201171875, + "kd_loss": 0.6039233199480805, + "learning_rate": 3e-06, + "loss": 0.7196, + "masked_tokens": 127.4625, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 127.4625 + }, + { + "avg_mask_ratio": 0.4512475330149755, + "avg_response_length": 209.8, + "avg_student_mask_ratio": 0.4512475330149755, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.19145508916275275, + "epoch": 0.8106666666666666, + "grad_norm": 0.6875, + "kd_loss": 0.4029755606519984, + "learning_rate": 3e-06, + "loss": 0.5055, + "masked_tokens": 100.8375, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 100.8375 + }, + { + "avg_mask_ratio": 0.4752940105390735, + "avg_response_length": 219.5625, + "avg_student_mask_ratio": 0.4752940105390735, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4267096655552223, + "epoch": 0.832, + "grad_norm": 0.2578125, + "kd_loss": 0.4655849843487971, + "learning_rate": 3e-06, + "loss": 0.6749, + "masked_tokens": 112.375, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 112.375 + }, + { + "avg_mask_ratio": 0.47461870914557946, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.47461870914557946, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.27868834779033025, + "epoch": 0.8533333333333334, + "grad_norm": 0.640625, + "kd_loss": 0.5299579592951205, + "learning_rate": 3e-06, + "loss": 0.6538, + "masked_tokens": 120.4125, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 120.4125 + }, + { + "avg_mask_ratio": 0.48321815438685006, + "avg_response_length": 228.15, + "avg_student_mask_ratio": 0.48321815438685006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.43057951200541994, + "epoch": 0.8746666666666667, + "grad_norm": 0.5390625, + "kd_loss": 0.504674318619719, + "learning_rate": 3e-06, + "loss": 0.7381, + "masked_tokens": 119.0, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 119.0 + }, + { + "avg_mask_ratio": 0.4379329536575824, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4379329536575824, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.132674143492045, + "epoch": 0.896, + "grad_norm": 1.09375, + "kd_loss": 0.27731474525324984, + "learning_rate": 3e-06, + "loss": 0.3953, + "masked_tokens": 85.525, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 85.525 + }, + { + "avg_mask_ratio": 0.4674084897618741, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4674084897618741, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37605725416574387, + "epoch": 0.9173333333333333, + "grad_norm": 0.43359375, + "kd_loss": 0.49442086774362226, + "learning_rate": 3e-06, + "loss": 0.6699, + "masked_tokens": 104.5625, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 104.5625 + }, + { + "avg_mask_ratio": 0.4415457699564286, + "avg_response_length": 241.0875, + "avg_student_mask_ratio": 0.4415457699564286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3754083825901603, + "epoch": 0.9386666666666666, + "grad_norm": 0.6328125, + "kd_loss": 0.45159815376919143, + "learning_rate": 3e-06, + "loss": 0.6585, + "masked_tokens": 113.0875, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 113.0875 + }, + { + "avg_mask_ratio": 0.42486972180195154, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42486972180195154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32457938515717616, + "epoch": 0.96, + "grad_norm": 0.6953125, + "kd_loss": 0.4011907008050457, + "learning_rate": 3e-06, + "loss": 0.5644, + "masked_tokens": 103.4, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.4 + }, + { + "avg_mask_ratio": 0.47578654896933587, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.47578654896933587, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32885359905767475, + "epoch": 0.9813333333333333, + "grad_norm": 0.16015625, + "kd_loss": 0.44463847501747294, + "learning_rate": 3e-06, + "loss": 0.635, + "masked_tokens": 105.3125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 105.3125 + }, + { + "avg_mask_ratio": 0.4782901787132557, + "avg_response_length": 224.0952380952381, + "avg_student_mask_ratio": 0.4782901787132557, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3393430382851702, + "epoch": 1.0042666666666666, + "grad_norm": 0.65625, + "kd_loss": 0.5178591865708675, + "learning_rate": 3e-06, + "loss": 0.7769, + "masked_tokens": 107.23809523809524, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 107.23809523809524 + }, + { + "avg_mask_ratio": 0.47575968883465974, + "avg_response_length": 249.4125, + "avg_student_mask_ratio": 0.47575968883465974, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.44613247805159517, + "epoch": 1.0256, + "grad_norm": 0.498046875, + "kd_loss": 0.5374264506522252, + "learning_rate": 3e-06, + "loss": 0.6772, + "masked_tokens": 118.35, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 118.35 + }, + { + "avg_mask_ratio": 0.4563717324635945, + "avg_response_length": 232.0375, + "avg_student_mask_ratio": 0.4563717324635945, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37626147485414096, + "epoch": 1.0469333333333333, + "grad_norm": 0.54296875, + "kd_loss": 0.392788901903657, + "learning_rate": 3e-06, + "loss": 0.6047, + "masked_tokens": 98.35, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.35 + }, + { + "avg_mask_ratio": 0.5079968665260821, + "avg_response_length": 253.7875, + "avg_student_mask_ratio": 0.5079968665260821, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.30954629559880686, + "epoch": 1.0682666666666667, + "grad_norm": 0.291015625, + "kd_loss": 0.4563873354276211, + "learning_rate": 3e-06, + "loss": 0.5996, + "masked_tokens": 128.225, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 128.225 + }, + { + "avg_mask_ratio": 0.5109448074479588, + "avg_response_length": 254.2, + "avg_student_mask_ratio": 0.5109448074479588, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2868076219221166, + "epoch": 1.0896, + "grad_norm": 2.515625, + "kd_loss": 0.5652106747879998, + "learning_rate": 3e-06, + "loss": 0.6398, + "masked_tokens": 137.5875, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 137.5875 + }, + { + "avg_mask_ratio": 0.45396183808334173, + "avg_response_length": 202.7625, + "avg_student_mask_ratio": 0.45396183808334173, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38311037250946356, + "epoch": 1.1109333333333333, + "grad_norm": 0.6171875, + "kd_loss": 0.423658079797778, + "learning_rate": 3e-06, + "loss": 0.6386, + "masked_tokens": 87.0625, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 87.0625 + }, + { + "avg_mask_ratio": 0.47015948037151245, + "avg_response_length": 214.275, + "avg_student_mask_ratio": 0.47015948037151245, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.47228433731506814, + "epoch": 1.1322666666666668, + "grad_norm": 0.609375, + "kd_loss": 0.45688082203427316, + "learning_rate": 3e-06, + "loss": 0.737, + "masked_tokens": 99.8625, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.4892866689246148, + "avg_response_length": 231.3125, + "avg_student_mask_ratio": 0.4892866689246148, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4080867745911064, + "epoch": 1.1536, + "grad_norm": 0.341796875, + "kd_loss": 0.5618651450654625, + "learning_rate": 3e-06, + "loss": 0.6922, + "masked_tokens": 107.375, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.4541942774085328, + "avg_response_length": 213.525, + "avg_student_mask_ratio": 0.4541942774085328, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22217674175137744, + "epoch": 1.1749333333333334, + "grad_norm": 0.2412109375, + "kd_loss": 0.3673438885498399, + "learning_rate": 3e-06, + "loss": 0.5008, + "masked_tokens": 97.8875, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 97.8875 + }, + { + "avg_mask_ratio": 0.39282396506750955, + "avg_response_length": 231.4125, + "avg_student_mask_ratio": 0.39282396506750955, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3512847523151777, + "epoch": 1.1962666666666666, + "grad_norm": 0.8828125, + "kd_loss": 0.48686740984790616, + "learning_rate": 3e-06, + "loss": 0.5823, + "masked_tokens": 99.2875, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 99.2875 + }, + { + "avg_mask_ratio": 0.4483634108910337, + "avg_response_length": 230.1625, + "avg_student_mask_ratio": 0.4483634108910337, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.31345968546206676, + "epoch": 1.2176, + "grad_norm": 0.4453125, + "kd_loss": 0.41564053312727084, + "learning_rate": 3e-06, + "loss": 0.5898, + "masked_tokens": 108.9875, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 108.9875 + }, + { + "avg_mask_ratio": 0.465452536707744, + "avg_response_length": 267.4375, + "avg_student_mask_ratio": 0.465452536707744, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3618907347364768, + "epoch": 1.2389333333333332, + "grad_norm": 8.6875, + "kd_loss": 0.4481006292516895, + "learning_rate": 3e-06, + "loss": 0.6314, + "masked_tokens": 129.075, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 129.075 + }, + { + "avg_mask_ratio": 0.5225977989146486, + "avg_response_length": 228.45, + "avg_student_mask_ratio": 0.5225977989146486, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5639314363695348, + "epoch": 1.2602666666666666, + "grad_norm": 1.1328125, + "kd_loss": 0.5351108588445992, + "learning_rate": 3e-06, + "loss": 0.8274, + "masked_tokens": 121.675, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 121.675 + }, + { + "avg_mask_ratio": 0.44998724836623294, + "avg_response_length": 236.7, + "avg_student_mask_ratio": 0.44998724836623294, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3396833263838971, + "epoch": 1.2816, + "grad_norm": 0.365234375, + "kd_loss": 0.41761890975592914, + "learning_rate": 3e-06, + "loss": 0.5752, + "masked_tokens": 110.1625, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 110.1625 + }, + { + "avg_mask_ratio": 0.5042130865273066, + "avg_response_length": 230.3375, + "avg_student_mask_ratio": 0.5042130865273066, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.35890077192343595, + "epoch": 1.3029333333333333, + "grad_norm": 0.28515625, + "kd_loss": 0.5558427174539929, + "learning_rate": 3e-06, + "loss": 0.7657, + "masked_tokens": 112.625, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 112.625 + }, + { + "avg_mask_ratio": 0.49637898594373836, + "avg_response_length": 233.0625, + "avg_student_mask_ratio": 0.49637898594373836, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.32318839170733327, + "epoch": 1.3242666666666667, + "grad_norm": 0.515625, + "kd_loss": 0.5518322235134179, + "learning_rate": 3e-06, + "loss": 0.6742, + "masked_tokens": 111.25, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 111.25 + }, + { + "avg_mask_ratio": 0.5177568581304512, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.5177568581304512, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.5710563842050931, + "epoch": 1.3456000000000001, + "grad_norm": 1.3515625, + "kd_loss": 0.5316411310721378, + "learning_rate": 3e-06, + "loss": 0.8598, + "masked_tokens": 129.6125, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 129.6125 + }, + { + "avg_mask_ratio": 0.48226998368045315, + "avg_response_length": 237.7125, + "avg_student_mask_ratio": 0.48226998368045315, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2804489129174499, + "epoch": 1.3669333333333333, + "grad_norm": 0.2421875, + "kd_loss": 0.3663112932188085, + "learning_rate": 3e-06, + "loss": 0.4584, + "masked_tokens": 120.275, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 120.275 + }, + { + "avg_mask_ratio": 0.5306948523037136, + "avg_response_length": 238.0125, + "avg_student_mask_ratio": 0.5306948523037136, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.475157093159612, + "epoch": 1.3882666666666665, + "grad_norm": 1.8125, + "kd_loss": 0.5062341513834724, + "learning_rate": 3e-06, + "loss": 0.7115, + "masked_tokens": 133.25, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 133.25 + }, + { + "avg_mask_ratio": 0.4821273953886703, + "avg_response_length": 247.775, + "avg_student_mask_ratio": 0.4821273953886703, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.41770620119971225, + "epoch": 1.4096, + "grad_norm": 0.9375, + "kd_loss": 0.425496905214095, + "learning_rate": 3e-06, + "loss": 0.6361, + "masked_tokens": 128.875, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 128.875 + }, + { + "avg_mask_ratio": 0.46056515555246735, + "avg_response_length": 240.4375, + "avg_student_mask_ratio": 0.46056515555246735, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.24846992658117414, + "epoch": 1.4309333333333334, + "grad_norm": 0.60546875, + "kd_loss": 0.34861083538812637, + "learning_rate": 3e-06, + "loss": 0.5112, + "masked_tokens": 119.85, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 119.85 + }, + { + "avg_mask_ratio": 0.4666106043441687, + "avg_response_length": 226.7375, + "avg_student_mask_ratio": 0.4666106043441687, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4541423492493323, + "epoch": 1.4522666666666666, + "grad_norm": 0.51953125, + "kd_loss": 0.4910934407485213, + "learning_rate": 3e-06, + "loss": 0.6946, + "masked_tokens": 107.4625, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4790851596510038, + "avg_response_length": 202.05, + "avg_student_mask_ratio": 0.4790851596510038, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3711260147189023, + "epoch": 1.4736, + "grad_norm": 2.03125, + "kd_loss": 0.41718243765291446, + "learning_rate": 3e-06, + "loss": 0.6313, + "masked_tokens": 111.3125, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 111.3125 + }, + { + "avg_mask_ratio": 0.5250519359949977, + "avg_response_length": 228.125, + "avg_student_mask_ratio": 0.5250519359949977, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22230932631540554, + "epoch": 1.4949333333333334, + "grad_norm": 0.26171875, + "kd_loss": 0.6619142963969352, + "learning_rate": 3e-06, + "loss": 0.7717, + "masked_tokens": 132.55, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 132.55 + }, + { + "avg_mask_ratio": 0.4790433386107907, + "avg_response_length": 212.5, + "avg_student_mask_ratio": 0.4790433386107907, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.24621229091012536, + "epoch": 1.5162666666666667, + "grad_norm": 0.2099609375, + "kd_loss": 0.43454050603151584, + "learning_rate": 3e-06, + "loss": 0.5302, + "masked_tokens": 108.7375, + "mean_t": 0.5135623761918395, + "step": 710, + "student_masked_tokens": 108.7375 + }, + { + "avg_mask_ratio": 0.47950589570682495, + "avg_response_length": 227.075, + "avg_student_mask_ratio": 0.47950589570682495, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.36416104665024707, + "epoch": 1.5375999999999999, + "grad_norm": 0.75, + "kd_loss": 0.5665610315164941, + "learning_rate": 3e-06, + "loss": 0.7121, + "masked_tokens": 110.8, + "mean_t": 0.5117021896177902, + "step": 720, + "student_masked_tokens": 110.8 + }, + { + "avg_mask_ratio": 0.4604924251558259, + "avg_response_length": 232.925, + "avg_student_mask_ratio": 0.4604924251558259, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38923927966282007, + "epoch": 1.5589333333333333, + "grad_norm": 1.015625, + "kd_loss": 0.4302867329986782, + "learning_rate": 3e-06, + "loss": 0.639, + "masked_tokens": 104.9625, + "mean_t": 0.49050743713742123, + "step": 730, + "student_masked_tokens": 104.9625 + }, + { + "avg_mask_ratio": 0.5185885130194947, + "avg_response_length": 183.325, + "avg_student_mask_ratio": 0.5185885130194947, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3361817517367399, + "epoch": 1.5802666666666667, + "grad_norm": 0.40234375, + "kd_loss": 0.5340734164818514, + "learning_rate": 3e-06, + "loss": 0.7461, + "masked_tokens": 97.125, + "mean_t": 0.5505168779753149, + "step": 740, + "student_masked_tokens": 97.125 + }, + { + "avg_mask_ratio": 0.4191439319110941, + "avg_response_length": 223.65, + "avg_student_mask_ratio": 0.4191439319110941, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37429177601145514, + "epoch": 1.6016, + "grad_norm": 0.58203125, + "kd_loss": 0.5036597276406856, + "learning_rate": 3e-06, + "loss": 0.6491, + "masked_tokens": 95.3125, + "mean_t": 0.4437690361432033, + "step": 750, + "student_masked_tokens": 95.3125 + }, + { + "avg_mask_ratio": 0.46706983938929625, + "avg_response_length": 216.0625, + "avg_student_mask_ratio": 0.46706983938929625, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4449058656399984, + "epoch": 1.6229333333333333, + "grad_norm": 0.8203125, + "kd_loss": 0.5661326096985168, + "learning_rate": 3e-06, + "loss": 0.7233, + "masked_tokens": 107.7, + "mean_t": 0.49132869170280175, + "step": 760, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.44156218122225255, + "avg_response_length": 259.675, + "avg_student_mask_ratio": 0.44156218122225255, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.25899335961771613, + "epoch": 1.6442666666666668, + "grad_norm": 0.396484375, + "kd_loss": 0.4095979654902003, + "learning_rate": 3e-06, + "loss": 0.5099, + "masked_tokens": 117.5, + "mean_t": 0.4667695587326307, + "step": 770, + "student_masked_tokens": 117.5 + }, + { + "avg_mask_ratio": 0.42836043585848527, + "avg_response_length": 258.5125, + "avg_student_mask_ratio": 0.42836043585848527, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2897560694203321, + "epoch": 1.6656, + "grad_norm": 0.2431640625, + "kd_loss": 0.34635278815572546, + "learning_rate": 3e-06, + "loss": 0.4802, + "masked_tokens": 119.0125, + "mean_t": 0.44942845597106496, + "step": 780, + "student_masked_tokens": 119.0125 + }, + { + "avg_mask_ratio": 0.46589430308085866, + "avg_response_length": 222.3125, + "avg_student_mask_ratio": 0.46589430308085866, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.21603642557238345, + "epoch": 1.6869333333333332, + "grad_norm": 0.140625, + "kd_loss": 0.33674514803767297, + "learning_rate": 3e-06, + "loss": 0.489, + "masked_tokens": 103.25, + "mean_t": 0.4993515375303105, + "step": 790, + "student_masked_tokens": 103.25 + }, + { + "avg_mask_ratio": 0.46366424662992356, + "avg_response_length": 219.6875, + "avg_student_mask_ratio": 0.46366424662992356, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2663005536277069, + "epoch": 1.7082666666666668, + "grad_norm": 0.23828125, + "kd_loss": 0.35138718315538425, + "learning_rate": 3e-06, + "loss": 0.5434, + "masked_tokens": 104.5, + "mean_t": 0.500370389316231, + "step": 800, + "student_masked_tokens": 104.5 + }, + { + "avg_mask_ratio": 0.503375941584818, + "avg_response_length": 237.85, + "avg_student_mask_ratio": 0.503375941584818, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4813590554784753, + "epoch": 1.7296, + "grad_norm": 1.6015625, + "kd_loss": 0.45312339970045057, + "learning_rate": 3e-06, + "loss": 0.706, + "masked_tokens": 118.2125, + "mean_t": 0.5317009104182944, + "step": 810, + "student_masked_tokens": 118.2125 + }, + { + "avg_mask_ratio": 0.5110091455746442, + "avg_response_length": 209.0875, + "avg_student_mask_ratio": 0.5110091455746442, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.4535834654417954, + "epoch": 1.7509333333333332, + "grad_norm": 0.70703125, + "kd_loss": 0.5985253949772413, + "learning_rate": 3e-06, + "loss": 0.7794, + "masked_tokens": 120.95, + "mean_t": 0.5392061032878701, + "step": 820, + "student_masked_tokens": 120.95 + }, + { + "avg_mask_ratio": 0.49899387182667854, + "avg_response_length": 263.975, + "avg_student_mask_ratio": 0.49899387182667854, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.40083689643704473, + "epoch": 1.7722666666666667, + "grad_norm": 0.1708984375, + "kd_loss": 0.5644028104892641, + "learning_rate": 3e-06, + "loss": 0.7632, + "masked_tokens": 137.075, + "mean_t": 0.5238314627087675, + "step": 830, + "student_masked_tokens": 137.075 + }, + { + "avg_mask_ratio": 0.4997270987310912, + "avg_response_length": 221.9, + "avg_student_mask_ratio": 0.4997270987310912, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.2870929398425915, + "epoch": 1.7936, + "grad_norm": 0.345703125, + "kd_loss": 0.4698917509396324, + "learning_rate": 3e-06, + "loss": 0.6327, + "masked_tokens": 114.525, + "mean_t": 0.5301066277665086, + "step": 840, + "student_masked_tokens": 114.525 + }, + { + "avg_mask_ratio": 0.4988076956477016, + "avg_response_length": 225.5, + "avg_student_mask_ratio": 0.4988076956477016, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3023421537889817, + "epoch": 1.8149333333333333, + "grad_norm": 0.443359375, + "kd_loss": 0.3271854338312551, + "learning_rate": 3e-06, + "loss": 0.5634, + "masked_tokens": 116.9125, + "mean_t": 0.5343429344706238, + "step": 850, + "student_masked_tokens": 116.9125 + }, + { + "avg_mask_ratio": 0.4635998342186213, + "avg_response_length": 229.125, + "avg_student_mask_ratio": 0.4635998342186213, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.37467331880507115, + "epoch": 1.8362666666666667, + "grad_norm": 0.384765625, + "kd_loss": 0.4431717619034316, + "learning_rate": 3e-06, + "loss": 0.5956, + "masked_tokens": 109.675, + "mean_t": 0.4791536889737472, + "step": 860, + "student_masked_tokens": 109.675 + }, + { + "avg_mask_ratio": 0.49111039767740294, + "avg_response_length": 229.1, + "avg_student_mask_ratio": 0.49111039767740294, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.3838037288314126, + "epoch": 1.8576000000000001, + "grad_norm": 0.333984375, + "kd_loss": 0.47523635068355363, + "learning_rate": 3e-06, + "loss": 0.6859, + "masked_tokens": 115.6625, + "mean_t": 0.5203817339061061, + "step": 870, + "student_masked_tokens": 115.6625 + }, + { + "avg_mask_ratio": 0.4427660425659269, + "avg_response_length": 198.5625, + "avg_student_mask_ratio": 0.4427660425659269, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.33109274096627817, + "epoch": 1.8789333333333333, + "grad_norm": 1.0859375, + "kd_loss": 0.46695662873548827, + "learning_rate": 3e-06, + "loss": 0.6284, + "masked_tokens": 91.175, + "mean_t": 0.4875184997683391, + "step": 880, + "student_masked_tokens": 91.175 + }, + { + "avg_mask_ratio": 0.4464349385118112, + "avg_response_length": 225.8375, + "avg_student_mask_ratio": 0.4464349385118112, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.22229116438190885, + "epoch": 1.9002666666666665, + "grad_norm": 0.12890625, + "kd_loss": 0.4006316699657759, + "learning_rate": 3e-06, + "loss": 0.4934, + "masked_tokens": 101.75, + "mean_t": 0.4766692223958671, + "step": 890, + "student_masked_tokens": 101.75 + }, + { + "avg_mask_ratio": 0.44976164362160487, + "avg_response_length": 227.7875, + "avg_student_mask_ratio": 0.44976164362160487, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 1.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_par_frac": 0.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.38169105723031577, + "epoch": 1.9216, + "grad_norm": 1.765625, + "kd_loss": 0.47280531010078086, + "learning_rate": 3e-06, + "loss": 0.6337, + "masked_tokens": 103.475, + "mean_t": 0.487134758150205, + "step": 900, + "student_masked_tokens": 103.475 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/training_args.bin b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6d9cde5cbc4fedb8fe690f2f561a165bf19ae18c --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/checkpoint-900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ae704e41a4a62f6ca56789c45ce45887326cd2f8d1e97e398e5ada4a93398c +size 8312 diff --git a/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/debug_training_examples.jsonl b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/debug_training_examples.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f1d9df73163403e20533920bba66b154f98661a7 --- /dev/null +++ b/math/INP/unmask_tags_leave_last_step_gold1_target1_ce0.5/debug_training_examples.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff2ddf04fa603864921dd0a04d25ff2dbd8245abdd0fb56d5db2bf9fa95866ac +size 1332804489