diff --git a/.gitattributes b/.gitattributes index 89138329e0953734b768a5a21bdb77b3ef90d228..9bedd0ab2c6ba0cb6247c5765e6eb6913936987d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -40,3 +40,4 @@ math/inp/SD-INP/math_self_distill_INP_u0.8-1.0_gold1_target1_ce0.5/debug_trainin math/SFT/inp-onehot_gold1_target1_ce0.5/debug_training_examples.jsonl filter=lfs diff=lfs merge=lfs -text math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.5/debug_training_examples.jsonl filter=lfs diff=lfs merge=lfs -text math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/debug_training_examples.jsonl filter=lfs diff=lfs merge=lfs -text +math/INP-PAR/unmask_tags_gold1_target1_ce0.0/debug_training_examples.jsonl filter=lfs diff=lfs merge=lfs -text diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/README.md b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/adapter_config.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..aebf9ea6690ed4ecc23ae3af9402e39470fff9f3 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/adapter_model.safetensors b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7e6c424b2ef9cd482aa6f5388b9c6e4eb1590ded --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e0c745ab29c29d058af475735923aebe2c8f1ad9261e13068a1445d99180deb +size 2406624648 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/optimizer.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e545429ffc616fd0a048102c8147930df8c0d49 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:431b247c6736b42b33ac8dea459f3c87476c22b983a06f88c6df8f6fe05384ec +size 671304442 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/rng_state_0.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..efc426c3d9b0aeae6c0cf6c045e396a03b585dd5 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64fc209e4f8339f7f64f09116abc3335d3843ca29d9410394ed6bfd3431afc50 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/rng_state_1.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..bebd6f8aea042602cdbea7c81b9f67d21dc1bb50 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27f1021fb57112918a3a6091b09b0ccd50cb071a2324c12ae9afcc9851ee8bd3 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/scheduler.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e899126a25538ff85c74c1e363ffbd951d4dda1e --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d8cdfceac9f7917b978dca661a3b8e04187faea5d5f6bd7b462d61d8234d57f +size 1064 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/trainer_state.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9f3597e5af554d690a636163cb0db0d202df1f03 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/trainer_state.json @@ -0,0 +1,273 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.21333333333333335, + "eval_steps": 500, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4933756332669873, + "avg_response_length": 252.0625, + "avg_student_mask_ratio": 0.4933756332669873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1650390625, + "kd_loss": 0.24795629351958723, + "learning_rate": 3e-06, + "loss": 0.2758, + "masked_tokens": 120.975, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 120.975 + }, + { + "avg_mask_ratio": 0.41923869140446185, + "avg_response_length": 221.7125, + "avg_student_mask_ratio": 0.41923869140446185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.388671875, + "kd_loss": 0.21509853232191353, + "learning_rate": 3e-06, + "loss": 0.2046, + "masked_tokens": 86.1875, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 86.1875 + }, + { + "avg_mask_ratio": 0.4542569225654006, + "avg_response_length": 231.45, + "avg_student_mask_ratio": 0.4542569225654006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.52734375, + "kd_loss": 0.19756361616970963, + "learning_rate": 3e-06, + "loss": 0.1976, + "masked_tokens": 116.2875, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 116.2875 + }, + { + "avg_mask_ratio": 0.41855402445653456, + "avg_response_length": 214.125, + "avg_student_mask_ratio": 0.41855402445653456, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.263671875, + "kd_loss": 0.1576978968325534, + "learning_rate": 3e-06, + "loss": 0.1551, + "masked_tokens": 94.225, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 94.225 + }, + { + "avg_mask_ratio": 0.4331560767372139, + "avg_response_length": 222.225, + "avg_student_mask_ratio": 0.4331560767372139, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.125, + "kd_loss": 0.17712681048956255, + "learning_rate": 3e-06, + "loss": 0.1648, + "masked_tokens": 97.825, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 97.825 + }, + { + "avg_mask_ratio": 0.4547638618387282, + "avg_response_length": 242.9, + "avg_student_mask_ratio": 0.4547638618387282, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.130859375, + "kd_loss": 0.1745696667137963, + "learning_rate": 3e-06, + "loss": 0.1745, + "masked_tokens": 119.125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.125 + }, + { + "avg_mask_ratio": 0.5151988173020072, + "avg_response_length": 214.3375, + "avg_student_mask_ratio": 0.5151988173020072, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.17578125, + "kd_loss": 0.1657758254527316, + "learning_rate": 3e-06, + "loss": 0.1972, + "masked_tokens": 111.7875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 111.7875 + }, + { + "avg_mask_ratio": 0.37937068473547697, + "avg_response_length": 250.2, + "avg_student_mask_ratio": 0.37937068473547697, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.051513671875, + "kd_loss": 0.13968983994418097, + "learning_rate": 3e-06, + "loss": 0.1369, + "masked_tokens": 95.1875, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 95.1875 + }, + { + "avg_mask_ratio": 0.5006106478627771, + "avg_response_length": 242.1125, + "avg_student_mask_ratio": 0.5006106478627771, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.22265625, + "kd_loss": 0.20869405062871707, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 116.3875, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 116.3875 + }, + { + "avg_mask_ratio": 0.4596128500183113, + "avg_response_length": 229.0625, + "avg_student_mask_ratio": 0.4596128500183113, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.1865234375, + "kd_loss": 0.17640120884607313, + "learning_rate": 3e-06, + "loss": 0.1864, + "masked_tokens": 109.7125, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 109.7125 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/training_args.bin b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7e94a11824a7a1de5f3a0a00320426e3e4de0eff --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c347df37da6e348160afc3fbb65d12595e5064bd8e5bfd591004e5e86a703f42 +size 7992 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/README.md b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/adapter_config.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..aebf9ea6690ed4ecc23ae3af9402e39470fff9f3 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/adapter_model.safetensors b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6aad478407b0adc9f5125419d6fe7d71ca56fe6e --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dadb69409043d110f0f8c1c799b3827bf022897cf007c0fbc08d6f74f5022d9 +size 2406624648 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/optimizer.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..083d469a64f1c21349ba517966a4eaffb06ff80c --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feef74a9920629ff20ab788a1fadcf97aad0450770804f32f725ceb5a5c3731f +size 671304442 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/rng_state_0.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9bef0a4cfb05a0d1c7d93cdd76a62c34be65e408 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cbbccc533dc6035b9eb3e81ab0c37a3544ee2638528b1cb900a84d35f5b76b2 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/rng_state_1.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..572cd86274b349eeda032872955f2209c42f9836 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:112d5e4d41a547b60596d50c5c50332e7991e1c7fbc0e60870f3b01c8a6d0f47 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/scheduler.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f90e16891856bcfb31d679597efff574807cb3ce --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29d9aa99505fc60c0db1b9cdacaa08b06e8a85c8aaaab4e389667a719fafb9bf +size 1064 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/trainer_state.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ae8b507128d9a3ffdea637293a286cde3e9a7fdd --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/trainer_state.json @@ -0,0 +1,2433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.1365333333333334, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4933756332669873, + "avg_response_length": 252.0625, + "avg_student_mask_ratio": 0.4933756332669873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1650390625, + "kd_loss": 0.24795629351958723, + "learning_rate": 3e-06, + "loss": 0.2758, + "masked_tokens": 120.975, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 120.975 + }, + { + "avg_mask_ratio": 0.41923869140446185, + "avg_response_length": 221.7125, + "avg_student_mask_ratio": 0.41923869140446185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.388671875, + "kd_loss": 0.21509853232191353, + "learning_rate": 3e-06, + "loss": 0.2046, + "masked_tokens": 86.1875, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 86.1875 + }, + { + "avg_mask_ratio": 0.4542569225654006, + "avg_response_length": 231.45, + "avg_student_mask_ratio": 0.4542569225654006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.52734375, + "kd_loss": 0.19756361616970963, + "learning_rate": 3e-06, + "loss": 0.1976, + "masked_tokens": 116.2875, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 116.2875 + }, + { + "avg_mask_ratio": 0.41855402445653456, + "avg_response_length": 214.125, + "avg_student_mask_ratio": 0.41855402445653456, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.263671875, + "kd_loss": 0.1576978968325534, + "learning_rate": 3e-06, + "loss": 0.1551, + "masked_tokens": 94.225, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 94.225 + }, + { + "avg_mask_ratio": 0.4331560767372139, + "avg_response_length": 222.225, + "avg_student_mask_ratio": 0.4331560767372139, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.125, + "kd_loss": 0.17712681048956255, + "learning_rate": 3e-06, + "loss": 0.1648, + "masked_tokens": 97.825, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 97.825 + }, + { + "avg_mask_ratio": 0.4547638618387282, + "avg_response_length": 242.9, + "avg_student_mask_ratio": 0.4547638618387282, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.130859375, + "kd_loss": 0.1745696667137963, + "learning_rate": 3e-06, + "loss": 0.1745, + "masked_tokens": 119.125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.125 + }, + { + "avg_mask_ratio": 0.5151988173020072, + "avg_response_length": 214.3375, + "avg_student_mask_ratio": 0.5151988173020072, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.17578125, + "kd_loss": 0.1657758254527316, + "learning_rate": 3e-06, + "loss": 0.1972, + "masked_tokens": 111.7875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 111.7875 + }, + { + "avg_mask_ratio": 0.37937068473547697, + "avg_response_length": 250.2, + "avg_student_mask_ratio": 0.37937068473547697, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.051513671875, + "kd_loss": 0.13968983994418097, + "learning_rate": 3e-06, + "loss": 0.1369, + "masked_tokens": 95.1875, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 95.1875 + }, + { + "avg_mask_ratio": 0.5006106478627771, + "avg_response_length": 242.1125, + "avg_student_mask_ratio": 0.5006106478627771, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.22265625, + "kd_loss": 0.20869405062871707, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 116.3875, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 116.3875 + }, + { + "avg_mask_ratio": 0.4596128500183113, + "avg_response_length": 229.0625, + "avg_student_mask_ratio": 0.4596128500183113, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.1865234375, + "kd_loss": 0.17640120884607313, + "learning_rate": 3e-06, + "loss": 0.1864, + "masked_tokens": 109.7125, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 109.7125 + }, + { + "avg_mask_ratio": 0.4920400592498481, + "avg_response_length": 229.9875, + "avg_student_mask_ratio": 0.4920400592498481, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.326171875, + "kd_loss": 0.2011610215539008, + "learning_rate": 3e-06, + "loss": 0.2334, + "masked_tokens": 109.4, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 109.4 + }, + { + "avg_mask_ratio": 0.450224511185661, + "avg_response_length": 229.225, + "avg_student_mask_ratio": 0.450224511185661, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.171875, + "kd_loss": 0.21365654302464918, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 102.6375, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 102.6375 + }, + { + "avg_mask_ratio": 0.4397759020910598, + "avg_response_length": 214.15, + "avg_student_mask_ratio": 0.4397759020910598, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.173828125, + "kd_loss": 0.13860440934267615, + "learning_rate": 3e-06, + "loss": 0.1362, + "masked_tokens": 98.4, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 98.4 + }, + { + "avg_mask_ratio": 0.4890626976499334, + "avg_response_length": 242.1625, + "avg_student_mask_ratio": 0.4890626976499334, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.27734375, + "kd_loss": 0.2106460814328841, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 129.725, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 129.725 + }, + { + "avg_mask_ratio": 0.47298577734036373, + "avg_response_length": 262.9875, + "avg_student_mask_ratio": 0.47298577734036373, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.0673828125, + "kd_loss": 0.2408599746217078, + "learning_rate": 3e-06, + "loss": 0.2276, + "masked_tokens": 128.375, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 128.375 + }, + { + "avg_mask_ratio": 0.5043223856599071, + "avg_response_length": 217.5, + "avg_student_mask_ratio": 0.5043223856599071, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.20484722793833043, + "learning_rate": 3e-06, + "loss": 0.2154, + "masked_tokens": 106.025, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 106.025 + }, + { + "avg_mask_ratio": 0.48419030708028005, + "avg_response_length": 196.2625, + "avg_student_mask_ratio": 0.48419030708028005, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.1611328125, + "kd_loss": 0.20407032655223248, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 96.2625, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 96.2625 + }, + { + "avg_mask_ratio": 0.4485494759515859, + "avg_response_length": 228.8625, + "avg_student_mask_ratio": 0.4485494759515859, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.14453125, + "kd_loss": 0.16957379382825621, + "learning_rate": 3e-06, + "loss": 0.1796, + "masked_tokens": 101.275, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 101.275 + }, + { + "avg_mask_ratio": 0.48665878190658984, + "avg_response_length": 230.5, + "avg_student_mask_ratio": 0.48665878190658984, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.3515625, + "kd_loss": 0.22313492714965832, + "learning_rate": 3e-06, + "loss": 0.2112, + "masked_tokens": 107.975, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 107.975 + }, + { + "avg_mask_ratio": 0.4670982737792656, + "avg_response_length": 210.9125, + "avg_student_mask_ratio": 0.4670982737792656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.142578125, + "kd_loss": 0.15220829088375468, + "learning_rate": 3e-06, + "loss": 0.1609, + "masked_tokens": 98.2125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 98.2125 + }, + { + "avg_mask_ratio": 0.4568137794849463, + "avg_response_length": 218.7375, + "avg_student_mask_ratio": 0.4568137794849463, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.1826171875, + "kd_loss": 0.1580252643583208, + "learning_rate": 3e-06, + "loss": 0.1798, + "masked_tokens": 99.15, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 99.15 + }, + { + "avg_mask_ratio": 0.5379857187625021, + "avg_response_length": 252.1375, + "avg_student_mask_ratio": 0.5379857187625021, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.3203125, + "kd_loss": 0.2511090909683844, + "learning_rate": 3e-06, + "loss": 0.2583, + "masked_tokens": 135.4, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 135.4 + }, + { + "avg_mask_ratio": 0.43395056116278286, + "avg_response_length": 245.2625, + "avg_student_mask_ratio": 0.43395056116278286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.1767578125, + "kd_loss": 0.14414861655371852, + "learning_rate": 3e-06, + "loss": 0.1964, + "masked_tokens": 102.5125, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 102.5125 + }, + { + "avg_mask_ratio": 0.46948411157354714, + "avg_response_length": 202.975, + "avg_student_mask_ratio": 0.46948411157354714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.1181640625, + "kd_loss": 0.2197965504183493, + "learning_rate": 3e-06, + "loss": 0.243, + "masked_tokens": 97.0625, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 97.0625 + }, + { + "avg_mask_ratio": 0.44631263689370826, + "avg_response_length": 243.425, + "avg_student_mask_ratio": 0.44631263689370826, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.1064453125, + "kd_loss": 0.2151024747882957, + "learning_rate": 3e-06, + "loss": 0.1892, + "masked_tokens": 107.4625, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4607314572727773, + "avg_response_length": 243.375, + "avg_student_mask_ratio": 0.4607314572727773, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.203125, + "kd_loss": 0.176242933875335, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 110.8875, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.8875 + }, + { + "avg_mask_ratio": 0.5309946444118395, + "avg_response_length": 231.6875, + "avg_student_mask_ratio": 0.5309946444118395, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.208984375, + "kd_loss": 0.26651088480309115, + "learning_rate": 3e-06, + "loss": 0.2828, + "masked_tokens": 123.2875, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 123.2875 + }, + { + "avg_mask_ratio": 0.45879559536697345, + "avg_response_length": 251.8, + "avg_student_mask_ratio": 0.45879559536697345, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.11474609375, + "kd_loss": 0.145786481295454, + "learning_rate": 3e-06, + "loss": 0.1439, + "masked_tokens": 125.425, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 125.425 + }, + { + "avg_mask_ratio": 0.3955249205115251, + "avg_response_length": 238.675, + "avg_student_mask_ratio": 0.3955249205115251, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.057373046875, + "kd_loss": 0.15104623195453543, + "learning_rate": 3e-06, + "loss": 0.1578, + "masked_tokens": 95.2125, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 95.2125 + }, + { + "avg_mask_ratio": 0.4504710016073659, + "avg_response_length": 202.575, + "avg_student_mask_ratio": 0.4504710016073659, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.150390625, + "kd_loss": 0.16011972622800386, + "learning_rate": 3e-06, + "loss": 0.179, + "masked_tokens": 90.3375, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 90.3375 + }, + { + "avg_mask_ratio": 0.4822002159198746, + "avg_response_length": 189.9875, + "avg_student_mask_ratio": 0.4822002159198746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.1630859375, + "kd_loss": 0.21744939284749734, + "learning_rate": 3e-06, + "loss": 0.201, + "masked_tokens": 92.7, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 92.7 + }, + { + "avg_mask_ratio": 0.5024422638700343, + "avg_response_length": 237.2625, + "avg_student_mask_ratio": 0.5024422638700343, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.1259765625, + "kd_loss": 0.21489343987664142, + "learning_rate": 3e-06, + "loss": 0.2268, + "masked_tokens": 123.8125, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 123.8125 + }, + { + "avg_mask_ratio": 0.510216062690597, + "avg_response_length": 257.525, + "avg_student_mask_ratio": 0.510216062690597, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.1337890625, + "kd_loss": 0.17950079924535203, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 142.8125, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 142.8125 + }, + { + "avg_mask_ratio": 0.45074162067612633, + "avg_response_length": 245.525, + "avg_student_mask_ratio": 0.45074162067612633, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.310546875, + "kd_loss": 0.14043198096701417, + "learning_rate": 3e-06, + "loss": 0.1669, + "masked_tokens": 116.0875, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 116.0875 + }, + { + "avg_mask_ratio": 0.4926802407717332, + "avg_response_length": 238.2, + "avg_student_mask_ratio": 0.4926802407717332, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.330078125, + "kd_loss": 0.21977804936059328, + "learning_rate": 3e-06, + "loss": 0.2497, + "masked_tokens": 127.4875, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 127.4875 + }, + { + "avg_mask_ratio": 0.4706261330051348, + "avg_response_length": 251.7375, + "avg_student_mask_ratio": 0.4706261330051348, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.056640625, + "kd_loss": 0.29630907970476983, + "learning_rate": 3e-06, + "loss": 0.2329, + "masked_tokens": 116.8625, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.8625 + }, + { + "avg_mask_ratio": 0.48096118308603764, + "avg_response_length": 262.25, + "avg_student_mask_ratio": 0.48096118308603764, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.12255859375, + "kd_loss": 0.20822112379132704, + "learning_rate": 3e-06, + "loss": 0.186, + "masked_tokens": 132.2, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 132.2 + }, + { + "avg_mask_ratio": 0.4433969090110622, + "avg_response_length": 209.7125, + "avg_student_mask_ratio": 0.4433969090110622, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.123046875, + "kd_loss": 0.14427866181035826, + "learning_rate": 3e-06, + "loss": 0.159, + "masked_tokens": 95.8625, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 95.8625 + }, + { + "avg_mask_ratio": 0.4764250977139454, + "avg_response_length": 226.3875, + "avg_student_mask_ratio": 0.4764250977139454, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.185546875, + "kd_loss": 0.18584371552193701, + "learning_rate": 3e-06, + "loss": 0.1823, + "masked_tokens": 113.95, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 113.95 + }, + { + "avg_mask_ratio": 0.47088071387261154, + "avg_response_length": 233.2125, + "avg_student_mask_ratio": 0.47088071387261154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.2734375, + "kd_loss": 0.22232839192147366, + "learning_rate": 3e-06, + "loss": 0.1961, + "masked_tokens": 116.675, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 116.675 + }, + { + "avg_mask_ratio": 0.4870647343515884, + "avg_response_length": 228.3875, + "avg_student_mask_ratio": 0.4870647343515884, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.25390625, + "kd_loss": 0.25729746209006665, + "learning_rate": 3e-06, + "loss": 0.2306, + "masked_tokens": 114.3625, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 114.3625 + }, + { + "avg_mask_ratio": 0.440834702400025, + "avg_response_length": 209.85, + "avg_student_mask_ratio": 0.440834702400025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.2275390625, + "kd_loss": 0.15747290870124503, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 87.575, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 87.575 + }, + { + "avg_mask_ratio": 0.4660509963519871, + "avg_response_length": 250.9125, + "avg_student_mask_ratio": 0.4660509963519871, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.392578125, + "kd_loss": 0.17299866449352522, + "learning_rate": 3e-06, + "loss": 0.178, + "masked_tokens": 109.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 109.675 + }, + { + "avg_mask_ratio": 0.4507861359976232, + "avg_response_length": 235.875, + "avg_student_mask_ratio": 0.4507861359976232, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2113636662043291, + "learning_rate": 3e-06, + "loss": 0.1795, + "masked_tokens": 106.95, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 106.95 + }, + { + "avg_mask_ratio": 0.4283985076006502, + "avg_response_length": 230.95, + "avg_student_mask_ratio": 0.4283985076006502, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.16015625, + "kd_loss": 0.15304818171161969, + "learning_rate": 3e-06, + "loss": 0.1724, + "masked_tokens": 101.15, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 101.15 + }, + { + "avg_mask_ratio": 0.47474822774529457, + "avg_response_length": 233.1, + "avg_student_mask_ratio": 0.47474822774529457, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.06689453125, + "kd_loss": 0.1363761811357108, + "learning_rate": 3e-06, + "loss": 0.171, + "masked_tokens": 112.725, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 112.725 + }, + { + "avg_mask_ratio": 0.4808142688901474, + "avg_response_length": 238.54761904761904, + "avg_student_mask_ratio": 0.4808142688901474, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.22379483340171732, + "learning_rate": 3e-06, + "loss": 0.2466, + "masked_tokens": 113.67857142857143, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 113.67857142857143 + }, + { + "avg_mask_ratio": 0.47175657459301873, + "avg_response_length": 249.9625, + "avg_student_mask_ratio": 0.47175657459301873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.166015625, + "kd_loss": 0.17491777570117745, + "learning_rate": 3e-06, + "loss": 0.2029, + "masked_tokens": 119.4625, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 119.4625 + }, + { + "avg_mask_ratio": 0.4564988439786248, + "avg_response_length": 238.8875, + "avg_student_mask_ratio": 0.4564988439786248, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.1279296875, + "kd_loss": 0.12884608846077866, + "learning_rate": 3e-06, + "loss": 0.1536, + "masked_tokens": 104.0, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 104.0 + }, + { + "avg_mask_ratio": 0.5083174118888565, + "avg_response_length": 258.1375, + "avg_student_mask_ratio": 0.5083174118888565, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.1357421875, + "kd_loss": 0.18128383785248586, + "learning_rate": 3e-06, + "loss": 0.1811, + "masked_tokens": 133.7125, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 133.7125 + }, + { + "avg_mask_ratio": 0.5130103683215566, + "avg_response_length": 246.4875, + "avg_student_mask_ratio": 0.5130103683215566, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0896, + "grad_norm": 0.11181640625, + "kd_loss": 0.23847924997493805, + "learning_rate": 3e-06, + "loss": 0.2289, + "masked_tokens": 132.5625, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 132.5625 + }, + { + "avg_mask_ratio": 0.4543681625276804, + "avg_response_length": 199.65, + "avg_student_mask_ratio": 0.4543681625276804, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1109333333333333, + "grad_norm": 0.1533203125, + "kd_loss": 0.1353249137787543, + "learning_rate": 3e-06, + "loss": 0.1403, + "masked_tokens": 87.9875, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 87.9875 + }, + { + "avg_mask_ratio": 0.46718079667771234, + "avg_response_length": 218.0875, + "avg_student_mask_ratio": 0.46718079667771234, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1322666666666668, + "grad_norm": 0.2109375, + "kd_loss": 0.15268151032492625, + "learning_rate": 3e-06, + "loss": 0.1789, + "masked_tokens": 101.3875, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 101.3875 + }, + { + "avg_mask_ratio": 0.4890203754650429, + "avg_response_length": 244.3875, + "avg_student_mask_ratio": 0.4890203754650429, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1536, + "grad_norm": 0.27734375, + "kd_loss": 0.17835129436630268, + "learning_rate": 3e-06, + "loss": 0.2173, + "masked_tokens": 116.175, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 116.175 + }, + { + "avg_mask_ratio": 0.45064474650425834, + "avg_response_length": 217.8375, + "avg_student_mask_ratio": 0.45064474650425834, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1749333333333334, + "grad_norm": 0.17578125, + "kd_loss": 0.16049880692362706, + "learning_rate": 3e-06, + "loss": 0.1855, + "masked_tokens": 98.6375, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 98.6375 + }, + { + "avg_mask_ratio": 0.3844255942822201, + "avg_response_length": 240.825, + "avg_student_mask_ratio": 0.3844255942822201, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1962666666666666, + "grad_norm": 0.5859375, + "kd_loss": 0.17605857607457268, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 96.85, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 96.85 + }, + { + "avg_mask_ratio": 0.45103558609262107, + "avg_response_length": 231.025, + "avg_student_mask_ratio": 0.45103558609262107, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2176, + "grad_norm": 0.234375, + "kd_loss": 0.1386162672638477, + "learning_rate": 3e-06, + "loss": 0.1681, + "masked_tokens": 105.6875, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 105.6875 + }, + { + "avg_mask_ratio": 0.47033366551622746, + "avg_response_length": 248.3875, + "avg_student_mask_ratio": 0.47033366551622746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2389333333333332, + "grad_norm": 0.26953125, + "kd_loss": 0.17702910760917803, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 125.05, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 125.05 + }, + { + "avg_mask_ratio": 0.5230229062028229, + "avg_response_length": 241.8125, + "avg_student_mask_ratio": 0.5230229062028229, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2602666666666666, + "grad_norm": 0.20703125, + "kd_loss": 0.22054996666956866, + "learning_rate": 3e-06, + "loss": 0.2233, + "masked_tokens": 129.0, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 129.0 + }, + { + "avg_mask_ratio": 0.44929012526990847, + "avg_response_length": 246.7375, + "avg_student_mask_ratio": 0.44929012526990847, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2816, + "grad_norm": 0.212890625, + "kd_loss": 0.15257543138572202, + "learning_rate": 3e-06, + "loss": 0.1581, + "masked_tokens": 113.9375, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 113.9375 + }, + { + "avg_mask_ratio": 0.5061312943696976, + "avg_response_length": 237.175, + "avg_student_mask_ratio": 0.5061312943696976, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3029333333333333, + "grad_norm": 0.06884765625, + "kd_loss": 0.21158275993875578, + "learning_rate": 3e-06, + "loss": 0.1942, + "masked_tokens": 116.675, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 116.675 + }, + { + "avg_mask_ratio": 0.4925117701757699, + "avg_response_length": 233.2625, + "avg_student_mask_ratio": 0.4925117701757699, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3242666666666667, + "grad_norm": 0.1787109375, + "kd_loss": 0.1805886138310143, + "learning_rate": 3e-06, + "loss": 0.1744, + "masked_tokens": 111.6875, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 111.6875 + }, + { + "avg_mask_ratio": 0.5307835865532979, + "avg_response_length": 249.65, + "avg_student_mask_ratio": 0.5307835865532979, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3456000000000001, + "grad_norm": 0.28125, + "kd_loss": 0.25403604302136956, + "learning_rate": 3e-06, + "loss": 0.2627, + "masked_tokens": 126.6875, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 126.6875 + }, + { + "avg_mask_ratio": 0.48631439574528484, + "avg_response_length": 241.475, + "avg_student_mask_ratio": 0.48631439574528484, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3669333333333333, + "grad_norm": 0.2412109375, + "kd_loss": 0.1642333460577902, + "learning_rate": 3e-06, + "loss": 0.1731, + "masked_tokens": 122.1125, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 122.1125 + }, + { + "avg_mask_ratio": 0.5248487972887232, + "avg_response_length": 231.5375, + "avg_student_mask_ratio": 0.5248487972887232, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3882666666666665, + "grad_norm": 0.193359375, + "kd_loss": 0.2508082524812494, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 127.175, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 127.175 + }, + { + "avg_mask_ratio": 0.48489007767057046, + "avg_response_length": 246.0125, + "avg_student_mask_ratio": 0.48489007767057046, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4096, + "grad_norm": 0.28125, + "kd_loss": 0.18781521328146483, + "learning_rate": 3e-06, + "loss": 0.2045, + "masked_tokens": 125.225, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 125.225 + }, + { + "avg_mask_ratio": 0.4605832444969565, + "avg_response_length": 244.5, + "avg_student_mask_ratio": 0.4605832444969565, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4309333333333334, + "grad_norm": 0.19140625, + "kd_loss": 0.1806626110754223, + "learning_rate": 3e-06, + "loss": 0.1702, + "masked_tokens": 120.825, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 120.825 + }, + { + "avg_mask_ratio": 0.4662455078505445, + "avg_response_length": 244.6125, + "avg_student_mask_ratio": 0.4662455078505445, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4522666666666666, + "grad_norm": 0.16796875, + "kd_loss": 0.20038694294766798, + "learning_rate": 3e-06, + "loss": 0.1824, + "masked_tokens": 114.2, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 114.2 + }, + { + "avg_mask_ratio": 0.4820630593923852, + "avg_response_length": 217.3, + "avg_student_mask_ratio": 0.4820630593923852, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4736, + "grad_norm": 0.11279296875, + "kd_loss": 0.16563009086588637, + "learning_rate": 3e-06, + "loss": 0.17, + "masked_tokens": 118.4875, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 118.4875 + }, + { + "avg_mask_ratio": 0.5206489040749147, + "avg_response_length": 216.45, + "avg_student_mask_ratio": 0.5206489040749147, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4949333333333334, + "grad_norm": 0.236328125, + "kd_loss": 0.23649522811845144, + "learning_rate": 3e-06, + "loss": 0.2241, + "masked_tokens": 118.4375, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 118.4375 + }, + { + "avg_mask_ratio": 0.4955552547937259, + "avg_response_length": 211.175, + "avg_student_mask_ratio": 0.4955552547937259, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5162666666666667, + "grad_norm": 0.275390625, + "kd_loss": 0.23970817765721222, + "learning_rate": 3e-06, + "loss": 0.2184, + "masked_tokens": 112.3375, + "mean_t": 0.5135623761918395, + "step": 710, + "student_masked_tokens": 112.3375 + }, + { + "avg_mask_ratio": 0.4820543818641454, + "avg_response_length": 229.1375, + "avg_student_mask_ratio": 0.4820543818641454, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5375999999999999, + "grad_norm": 0.435546875, + "kd_loss": 0.18955910701470202, + "learning_rate": 3e-06, + "loss": 0.1978, + "masked_tokens": 114.55, + "mean_t": 0.5117021896177902, + "step": 720, + "student_masked_tokens": 114.55 + }, + { + "avg_mask_ratio": 0.4605119539948646, + "avg_response_length": 245.5625, + "avg_student_mask_ratio": 0.4605119539948646, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5589333333333333, + "grad_norm": 0.1103515625, + "kd_loss": 0.16859328244926958, + "learning_rate": 3e-06, + "loss": 0.1779, + "masked_tokens": 113.85, + "mean_t": 0.49050743713742123, + "step": 730, + "student_masked_tokens": 113.85 + }, + { + "avg_mask_ratio": 0.5134038798511028, + "avg_response_length": 194.125, + "avg_student_mask_ratio": 0.5134038798511028, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5802666666666667, + "grad_norm": 0.0830078125, + "kd_loss": 0.17122714665274544, + "learning_rate": 3e-06, + "loss": 0.2018, + "masked_tokens": 102.9375, + "mean_t": 0.5505168779753149, + "step": 740, + "student_masked_tokens": 102.9375 + }, + { + "avg_mask_ratio": 0.4201362137740944, + "avg_response_length": 223.55, + "avg_student_mask_ratio": 0.4201362137740944, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6016, + "grad_norm": 0.16796875, + "kd_loss": 0.19197621339357057, + "learning_rate": 3e-06, + "loss": 0.1792, + "masked_tokens": 94.7125, + "mean_t": 0.4437690361432033, + "step": 750, + "student_masked_tokens": 94.7125 + }, + { + "avg_mask_ratio": 0.46924527404480615, + "avg_response_length": 227.35, + "avg_student_mask_ratio": 0.46924527404480615, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6229333333333333, + "grad_norm": 0.8125, + "kd_loss": 0.24938117066874382, + "learning_rate": 3e-06, + "loss": 0.2591, + "masked_tokens": 107.5375, + "mean_t": 0.49132869170280175, + "step": 760, + "student_masked_tokens": 107.5375 + }, + { + "avg_mask_ratio": 0.445551612455165, + "avg_response_length": 268.6625, + "avg_student_mask_ratio": 0.445551612455165, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6442666666666668, + "grad_norm": 0.1435546875, + "kd_loss": 0.1928954417056957, + "learning_rate": 3e-06, + "loss": 0.1563, + "masked_tokens": 124.0875, + "mean_t": 0.4667695587326307, + "step": 770, + "student_masked_tokens": 124.0875 + }, + { + "avg_mask_ratio": 0.4321410794305848, + "avg_response_length": 256.1625, + "avg_student_mask_ratio": 0.4321410794305848, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6656, + "grad_norm": 0.2216796875, + "kd_loss": 0.15059620087446887, + "learning_rate": 3e-06, + "loss": 0.1534, + "masked_tokens": 117.7, + "mean_t": 0.44942845597106496, + "step": 780, + "student_masked_tokens": 117.7 + }, + { + "avg_mask_ratio": 0.4697655299096368, + "avg_response_length": 240.6125, + "avg_student_mask_ratio": 0.4697655299096368, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6869333333333332, + "grad_norm": 0.255859375, + "kd_loss": 0.16427693545232777, + "learning_rate": 3e-06, + "loss": 0.1489, + "masked_tokens": 113.5375, + "mean_t": 0.4993515375303105, + "step": 790, + "student_masked_tokens": 113.5375 + }, + { + "avg_mask_ratio": 0.4635992758907378, + "avg_response_length": 224.15, + "avg_student_mask_ratio": 0.4635992758907378, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7082666666666668, + "grad_norm": 0.197265625, + "kd_loss": 0.15171801659575976, + "learning_rate": 3e-06, + "loss": 0.1526, + "masked_tokens": 107.5, + "mean_t": 0.500370389316231, + "step": 800, + "student_masked_tokens": 107.5 + }, + { + "avg_mask_ratio": 0.5018501321552321, + "avg_response_length": 235.1625, + "avg_student_mask_ratio": 0.5018501321552321, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7296, + "grad_norm": 0.1630859375, + "kd_loss": 0.18931926304685476, + "learning_rate": 3e-06, + "loss": 0.2031, + "masked_tokens": 116.6125, + "mean_t": 0.5317009104182944, + "step": 810, + "student_masked_tokens": 116.6125 + }, + { + "avg_mask_ratio": 0.5050785383209586, + "avg_response_length": 207.6875, + "avg_student_mask_ratio": 0.5050785383209586, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7509333333333332, + "grad_norm": 0.5546875, + "kd_loss": 0.24824176937403308, + "learning_rate": 3e-06, + "loss": 0.2566, + "masked_tokens": 119.175, + "mean_t": 0.5392061032878701, + "step": 820, + "student_masked_tokens": 119.175 + }, + { + "avg_mask_ratio": 0.4980328972451389, + "avg_response_length": 270.325, + "avg_student_mask_ratio": 0.4980328972451389, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7722666666666667, + "grad_norm": 0.09912109375, + "kd_loss": 0.1924194690429431, + "learning_rate": 3e-06, + "loss": 0.2006, + "masked_tokens": 141.925, + "mean_t": 0.5238314627087675, + "step": 830, + "student_masked_tokens": 141.925 + }, + { + "avg_mask_ratio": 0.493249478796497, + "avg_response_length": 226.025, + "avg_student_mask_ratio": 0.493249478796497, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7936, + "grad_norm": 0.158203125, + "kd_loss": 0.15751813794203606, + "learning_rate": 3e-06, + "loss": 0.1624, + "masked_tokens": 118.825, + "mean_t": 0.5301066277665086, + "step": 840, + "student_masked_tokens": 118.825 + }, + { + "avg_mask_ratio": 0.5009000841644593, + "avg_response_length": 233.025, + "avg_student_mask_ratio": 0.5009000841644593, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8149333333333333, + "grad_norm": 0.10205078125, + "kd_loss": 0.1860738446495816, + "learning_rate": 3e-06, + "loss": 0.1737, + "masked_tokens": 117.3125, + "mean_t": 0.5343429344706238, + "step": 850, + "student_masked_tokens": 117.3125 + }, + { + "avg_mask_ratio": 0.46293387678451836, + "avg_response_length": 231.3625, + "avg_student_mask_ratio": 0.46293387678451836, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8362666666666667, + "grad_norm": 0.1103515625, + "kd_loss": 0.19740513321539765, + "learning_rate": 3e-06, + "loss": 0.1841, + "masked_tokens": 110.5625, + "mean_t": 0.4791536889737472, + "step": 860, + "student_masked_tokens": 110.5625 + }, + { + "avg_mask_ratio": 0.4846805231412873, + "avg_response_length": 220.7375, + "avg_student_mask_ratio": 0.4846805231412873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8576000000000001, + "grad_norm": 0.228515625, + "kd_loss": 0.19436422403705364, + "learning_rate": 3e-06, + "loss": 0.2012, + "masked_tokens": 113.7625, + "mean_t": 0.5203817339061061, + "step": 870, + "student_masked_tokens": 113.7625 + }, + { + "avg_mask_ratio": 0.4508363194297999, + "avg_response_length": 203.2875, + "avg_student_mask_ratio": 0.4508363194297999, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8789333333333333, + "grad_norm": 0.1962890625, + "kd_loss": 0.16288868402702406, + "learning_rate": 3e-06, + "loss": 0.1845, + "masked_tokens": 95.3875, + "mean_t": 0.4875184997683391, + "step": 880, + "student_masked_tokens": 95.3875 + }, + { + "avg_mask_ratio": 0.43862658384023234, + "avg_response_length": 227.9375, + "avg_student_mask_ratio": 0.43862658384023234, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9002666666666665, + "grad_norm": 0.08251953125, + "kd_loss": 0.11281866748422545, + "learning_rate": 3e-06, + "loss": 0.142, + "masked_tokens": 101.2625, + "mean_t": 0.4766692223958671, + "step": 890, + "student_masked_tokens": 101.2625 + }, + { + "avg_mask_ratio": 0.44909207145101393, + "avg_response_length": 237.2375, + "avg_student_mask_ratio": 0.44909207145101393, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9216, + "grad_norm": 0.1591796875, + "kd_loss": 0.15684176656744747, + "learning_rate": 3e-06, + "loss": 0.1737, + "masked_tokens": 103.7, + "mean_t": 0.487134758150205, + "step": 900, + "student_masked_tokens": 103.7 + }, + { + "avg_mask_ratio": 0.47512493645772336, + "avg_response_length": 246.175, + "avg_student_mask_ratio": 0.47512493645772336, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9429333333333334, + "grad_norm": 0.1982421875, + "kd_loss": 0.16909069961529893, + "learning_rate": 3e-06, + "loss": 0.1717, + "masked_tokens": 117.2125, + "mean_t": 0.5027793228859082, + "step": 910, + "student_masked_tokens": 117.2125 + }, + { + "avg_mask_ratio": 0.46720519906957636, + "avg_response_length": 229.0875, + "avg_student_mask_ratio": 0.46720519906957636, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9642666666666666, + "grad_norm": 0.203125, + "kd_loss": 0.17078794931742322, + "learning_rate": 3e-06, + "loss": 0.1844, + "masked_tokens": 115.325, + "mean_t": 0.49417946098838, + "step": 920, + "student_masked_tokens": 115.325 + }, + { + "avg_mask_ratio": 0.4805813999380916, + "avg_response_length": 236.1625, + "avg_student_mask_ratio": 0.4805813999380916, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9856, + "grad_norm": 0.21484375, + "kd_loss": 0.20637793739751942, + "learning_rate": 3e-06, + "loss": 0.176, + "masked_tokens": 119.65, + "mean_t": 0.5045580042526125, + "step": 930, + "student_masked_tokens": 119.65 + }, + { + "avg_mask_ratio": 0.49833715503059683, + "avg_response_length": 229.39285714285714, + "avg_student_mask_ratio": 0.49833715503059683, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0085333333333333, + "grad_norm": 0.259765625, + "kd_loss": 0.2117233988543549, + "learning_rate": 3e-06, + "loss": 0.2023, + "masked_tokens": 121.5952380952381, + "mean_t": 0.5321138524893849, + "step": 940, + "student_masked_tokens": 121.5952380952381 + }, + { + "avg_mask_ratio": 0.4347397161007393, + "avg_response_length": 226.925, + "avg_student_mask_ratio": 0.4347397161007393, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0298666666666665, + "grad_norm": 0.15234375, + "kd_loss": 0.16830119832033005, + "learning_rate": 3e-06, + "loss": 0.1549, + "masked_tokens": 103.4125, + "mean_t": 0.4632946296595037, + "step": 950, + "student_masked_tokens": 103.4125 + }, + { + "avg_mask_ratio": 0.4989688004134223, + "avg_response_length": 273.3125, + "avg_student_mask_ratio": 0.4989688004134223, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0512, + "grad_norm": 0.123046875, + "kd_loss": 0.2304358719712809, + "learning_rate": 3e-06, + "loss": 0.22, + "masked_tokens": 137.125, + "mean_t": 0.5198000721400604, + "step": 960, + "student_masked_tokens": 137.125 + }, + { + "avg_mask_ratio": 0.4374056361732073, + "avg_response_length": 239.125, + "avg_student_mask_ratio": 0.4374056361732073, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0725333333333333, + "grad_norm": 0.1962890625, + "kd_loss": 0.158095879086882, + "learning_rate": 3e-06, + "loss": 0.1573, + "masked_tokens": 107.4625, + "mean_t": 0.4703940597362816, + "step": 970, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4216216350672767, + "avg_response_length": 236.0125, + "avg_student_mask_ratio": 0.4216216350672767, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0938666666666665, + "grad_norm": 0.177734375, + "kd_loss": 0.15213593625503563, + "learning_rate": 3e-06, + "loss": 0.1305, + "masked_tokens": 103.05, + "mean_t": 0.4511947895749472, + "step": 980, + "student_masked_tokens": 103.05 + }, + { + "avg_mask_ratio": 0.46076959141064433, + "avg_response_length": 257.375, + "avg_student_mask_ratio": 0.46076959141064433, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1152, + "grad_norm": 0.1484375, + "kd_loss": 0.18778659292394478, + "learning_rate": 3e-06, + "loss": 0.1727, + "masked_tokens": 121.2125, + "mean_t": 0.4923786667350214, + "step": 990, + "student_masked_tokens": 121.2125 + }, + { + "avg_mask_ratio": 0.45376153094694016, + "avg_response_length": 223.4125, + "avg_student_mask_ratio": 0.45376153094694016, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1365333333333334, + "grad_norm": 0.10498046875, + "kd_loss": 0.16195435594947866, + "learning_rate": 3e-06, + "loss": 0.1554, + "masked_tokens": 102.175, + "mean_t": 0.4773523230338469, + "step": 1000, + "student_masked_tokens": 102.175 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/training_args.bin b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7e94a11824a7a1de5f3a0a00320426e3e4de0eff --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c347df37da6e348160afc3fbb65d12595e5064bd8e5bfd591004e5e86a703f42 +size 7992 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/README.md b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/adapter_config.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..aebf9ea6690ed4ecc23ae3af9402e39470fff9f3 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/adapter_model.safetensors b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..13ac09d9cb0166cdc758269ba964a068b2728381 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca8cf9c4c96d199e2c6fc1014d956d44848bb3e22b70c3c1f8147514eca399ae +size 2406624648 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/optimizer.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..76571d203cc723a606df1e6f4d14a335d4530556 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3ec26485c65c40a39c48574ff60fee7ab396303873ba5b68ff02a9708afca38 +size 671304442 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/rng_state_0.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5a414893d506cea6e26edc9aee4315ab3b08e349 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:907e39dff0cf7ad1a1affaa1e7047653794ab16e25c6977ce7b5524769fdf799 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/rng_state_1.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea63ca1a93770d7356b817ed7c6200ffbed55cd3 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f7938e18b41a6ad67840013f5aac6c4576f92425b90ea79226bacc627193b9e +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/scheduler.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..125c51aef8c1558b284b7ffdb401f40b1199eb92 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25795e3b7374d0f6abdd7ab4b34fbf7ab0447ba73c04014500c2ab8b5acec5b4 +size 1064 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/trainer_state.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a48cefd23fc3b55556374b0708e5849fe5516709 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/trainer_state.json @@ -0,0 +1,2673 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.3498666666666668, + "eval_steps": 500, + "global_step": 1100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4933756332669873, + "avg_response_length": 252.0625, + "avg_student_mask_ratio": 0.4933756332669873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1650390625, + "kd_loss": 0.24795629351958723, + "learning_rate": 3e-06, + "loss": 0.2758, + "masked_tokens": 120.975, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 120.975 + }, + { + "avg_mask_ratio": 0.41923869140446185, + "avg_response_length": 221.7125, + "avg_student_mask_ratio": 0.41923869140446185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.388671875, + "kd_loss": 0.21509853232191353, + "learning_rate": 3e-06, + "loss": 0.2046, + "masked_tokens": 86.1875, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 86.1875 + }, + { + "avg_mask_ratio": 0.4542569225654006, + "avg_response_length": 231.45, + "avg_student_mask_ratio": 0.4542569225654006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.52734375, + "kd_loss": 0.19756361616970963, + "learning_rate": 3e-06, + "loss": 0.1976, + "masked_tokens": 116.2875, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 116.2875 + }, + { + "avg_mask_ratio": 0.41855402445653456, + "avg_response_length": 214.125, + "avg_student_mask_ratio": 0.41855402445653456, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.263671875, + "kd_loss": 0.1576978968325534, + "learning_rate": 3e-06, + "loss": 0.1551, + "masked_tokens": 94.225, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 94.225 + }, + { + "avg_mask_ratio": 0.4331560767372139, + "avg_response_length": 222.225, + "avg_student_mask_ratio": 0.4331560767372139, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.125, + "kd_loss": 0.17712681048956255, + "learning_rate": 3e-06, + "loss": 0.1648, + "masked_tokens": 97.825, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 97.825 + }, + { + "avg_mask_ratio": 0.4547638618387282, + "avg_response_length": 242.9, + "avg_student_mask_ratio": 0.4547638618387282, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.130859375, + "kd_loss": 0.1745696667137963, + "learning_rate": 3e-06, + "loss": 0.1745, + "masked_tokens": 119.125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.125 + }, + { + "avg_mask_ratio": 0.5151988173020072, + "avg_response_length": 214.3375, + "avg_student_mask_ratio": 0.5151988173020072, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.17578125, + "kd_loss": 0.1657758254527316, + "learning_rate": 3e-06, + "loss": 0.1972, + "masked_tokens": 111.7875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 111.7875 + }, + { + "avg_mask_ratio": 0.37937068473547697, + "avg_response_length": 250.2, + "avg_student_mask_ratio": 0.37937068473547697, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.051513671875, + "kd_loss": 0.13968983994418097, + "learning_rate": 3e-06, + "loss": 0.1369, + "masked_tokens": 95.1875, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 95.1875 + }, + { + "avg_mask_ratio": 0.5006106478627771, + "avg_response_length": 242.1125, + "avg_student_mask_ratio": 0.5006106478627771, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.22265625, + "kd_loss": 0.20869405062871707, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 116.3875, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 116.3875 + }, + { + "avg_mask_ratio": 0.4596128500183113, + "avg_response_length": 229.0625, + "avg_student_mask_ratio": 0.4596128500183113, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.1865234375, + "kd_loss": 0.17640120884607313, + "learning_rate": 3e-06, + "loss": 0.1864, + "masked_tokens": 109.7125, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 109.7125 + }, + { + "avg_mask_ratio": 0.4920400592498481, + "avg_response_length": 229.9875, + "avg_student_mask_ratio": 0.4920400592498481, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.326171875, + "kd_loss": 0.2011610215539008, + "learning_rate": 3e-06, + "loss": 0.2334, + "masked_tokens": 109.4, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 109.4 + }, + { + "avg_mask_ratio": 0.450224511185661, + "avg_response_length": 229.225, + "avg_student_mask_ratio": 0.450224511185661, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.171875, + "kd_loss": 0.21365654302464918, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 102.6375, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 102.6375 + }, + { + "avg_mask_ratio": 0.4397759020910598, + "avg_response_length": 214.15, + "avg_student_mask_ratio": 0.4397759020910598, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.173828125, + "kd_loss": 0.13860440934267615, + "learning_rate": 3e-06, + "loss": 0.1362, + "masked_tokens": 98.4, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 98.4 + }, + { + "avg_mask_ratio": 0.4890626976499334, + "avg_response_length": 242.1625, + "avg_student_mask_ratio": 0.4890626976499334, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.27734375, + "kd_loss": 0.2106460814328841, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 129.725, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 129.725 + }, + { + "avg_mask_ratio": 0.47298577734036373, + "avg_response_length": 262.9875, + "avg_student_mask_ratio": 0.47298577734036373, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.0673828125, + "kd_loss": 0.2408599746217078, + "learning_rate": 3e-06, + "loss": 0.2276, + "masked_tokens": 128.375, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 128.375 + }, + { + "avg_mask_ratio": 0.5043223856599071, + "avg_response_length": 217.5, + "avg_student_mask_ratio": 0.5043223856599071, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.20484722793833043, + "learning_rate": 3e-06, + "loss": 0.2154, + "masked_tokens": 106.025, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 106.025 + }, + { + "avg_mask_ratio": 0.48419030708028005, + "avg_response_length": 196.2625, + "avg_student_mask_ratio": 0.48419030708028005, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.1611328125, + "kd_loss": 0.20407032655223248, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 96.2625, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 96.2625 + }, + { + "avg_mask_ratio": 0.4485494759515859, + "avg_response_length": 228.8625, + "avg_student_mask_ratio": 0.4485494759515859, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.14453125, + "kd_loss": 0.16957379382825621, + "learning_rate": 3e-06, + "loss": 0.1796, + "masked_tokens": 101.275, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 101.275 + }, + { + "avg_mask_ratio": 0.48665878190658984, + "avg_response_length": 230.5, + "avg_student_mask_ratio": 0.48665878190658984, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.3515625, + "kd_loss": 0.22313492714965832, + "learning_rate": 3e-06, + "loss": 0.2112, + "masked_tokens": 107.975, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 107.975 + }, + { + "avg_mask_ratio": 0.4670982737792656, + "avg_response_length": 210.9125, + "avg_student_mask_ratio": 0.4670982737792656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.142578125, + "kd_loss": 0.15220829088375468, + "learning_rate": 3e-06, + "loss": 0.1609, + "masked_tokens": 98.2125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 98.2125 + }, + { + "avg_mask_ratio": 0.4568137794849463, + "avg_response_length": 218.7375, + "avg_student_mask_ratio": 0.4568137794849463, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.1826171875, + "kd_loss": 0.1580252643583208, + "learning_rate": 3e-06, + "loss": 0.1798, + "masked_tokens": 99.15, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 99.15 + }, + { + "avg_mask_ratio": 0.5379857187625021, + "avg_response_length": 252.1375, + "avg_student_mask_ratio": 0.5379857187625021, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.3203125, + "kd_loss": 0.2511090909683844, + "learning_rate": 3e-06, + "loss": 0.2583, + "masked_tokens": 135.4, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 135.4 + }, + { + "avg_mask_ratio": 0.43395056116278286, + "avg_response_length": 245.2625, + "avg_student_mask_ratio": 0.43395056116278286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.1767578125, + "kd_loss": 0.14414861655371852, + "learning_rate": 3e-06, + "loss": 0.1964, + "masked_tokens": 102.5125, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 102.5125 + }, + { + "avg_mask_ratio": 0.46948411157354714, + "avg_response_length": 202.975, + "avg_student_mask_ratio": 0.46948411157354714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.1181640625, + "kd_loss": 0.2197965504183493, + "learning_rate": 3e-06, + "loss": 0.243, + "masked_tokens": 97.0625, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 97.0625 + }, + { + "avg_mask_ratio": 0.44631263689370826, + "avg_response_length": 243.425, + "avg_student_mask_ratio": 0.44631263689370826, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.1064453125, + "kd_loss": 0.2151024747882957, + "learning_rate": 3e-06, + "loss": 0.1892, + "masked_tokens": 107.4625, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4607314572727773, + "avg_response_length": 243.375, + "avg_student_mask_ratio": 0.4607314572727773, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.203125, + "kd_loss": 0.176242933875335, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 110.8875, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.8875 + }, + { + "avg_mask_ratio": 0.5309946444118395, + "avg_response_length": 231.6875, + "avg_student_mask_ratio": 0.5309946444118395, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.208984375, + "kd_loss": 0.26651088480309115, + "learning_rate": 3e-06, + "loss": 0.2828, + "masked_tokens": 123.2875, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 123.2875 + }, + { + "avg_mask_ratio": 0.45879559536697345, + "avg_response_length": 251.8, + "avg_student_mask_ratio": 0.45879559536697345, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.11474609375, + "kd_loss": 0.145786481295454, + "learning_rate": 3e-06, + "loss": 0.1439, + "masked_tokens": 125.425, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 125.425 + }, + { + "avg_mask_ratio": 0.3955249205115251, + "avg_response_length": 238.675, + "avg_student_mask_ratio": 0.3955249205115251, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.057373046875, + "kd_loss": 0.15104623195453543, + "learning_rate": 3e-06, + "loss": 0.1578, + "masked_tokens": 95.2125, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 95.2125 + }, + { + "avg_mask_ratio": 0.4504710016073659, + "avg_response_length": 202.575, + "avg_student_mask_ratio": 0.4504710016073659, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.150390625, + "kd_loss": 0.16011972622800386, + "learning_rate": 3e-06, + "loss": 0.179, + "masked_tokens": 90.3375, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 90.3375 + }, + { + "avg_mask_ratio": 0.4822002159198746, + "avg_response_length": 189.9875, + "avg_student_mask_ratio": 0.4822002159198746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.1630859375, + "kd_loss": 0.21744939284749734, + "learning_rate": 3e-06, + "loss": 0.201, + "masked_tokens": 92.7, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 92.7 + }, + { + "avg_mask_ratio": 0.5024422638700343, + "avg_response_length": 237.2625, + "avg_student_mask_ratio": 0.5024422638700343, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.1259765625, + "kd_loss": 0.21489343987664142, + "learning_rate": 3e-06, + "loss": 0.2268, + "masked_tokens": 123.8125, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 123.8125 + }, + { + "avg_mask_ratio": 0.510216062690597, + "avg_response_length": 257.525, + "avg_student_mask_ratio": 0.510216062690597, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.1337890625, + "kd_loss": 0.17950079924535203, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 142.8125, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 142.8125 + }, + { + "avg_mask_ratio": 0.45074162067612633, + "avg_response_length": 245.525, + "avg_student_mask_ratio": 0.45074162067612633, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.310546875, + "kd_loss": 0.14043198096701417, + "learning_rate": 3e-06, + "loss": 0.1669, + "masked_tokens": 116.0875, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 116.0875 + }, + { + "avg_mask_ratio": 0.4926802407717332, + "avg_response_length": 238.2, + "avg_student_mask_ratio": 0.4926802407717332, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.330078125, + "kd_loss": 0.21977804936059328, + "learning_rate": 3e-06, + "loss": 0.2497, + "masked_tokens": 127.4875, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 127.4875 + }, + { + "avg_mask_ratio": 0.4706261330051348, + "avg_response_length": 251.7375, + "avg_student_mask_ratio": 0.4706261330051348, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.056640625, + "kd_loss": 0.29630907970476983, + "learning_rate": 3e-06, + "loss": 0.2329, + "masked_tokens": 116.8625, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.8625 + }, + { + "avg_mask_ratio": 0.48096118308603764, + "avg_response_length": 262.25, + "avg_student_mask_ratio": 0.48096118308603764, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.12255859375, + "kd_loss": 0.20822112379132704, + "learning_rate": 3e-06, + "loss": 0.186, + "masked_tokens": 132.2, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 132.2 + }, + { + "avg_mask_ratio": 0.4433969090110622, + "avg_response_length": 209.7125, + "avg_student_mask_ratio": 0.4433969090110622, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.123046875, + "kd_loss": 0.14427866181035826, + "learning_rate": 3e-06, + "loss": 0.159, + "masked_tokens": 95.8625, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 95.8625 + }, + { + "avg_mask_ratio": 0.4764250977139454, + "avg_response_length": 226.3875, + "avg_student_mask_ratio": 0.4764250977139454, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.185546875, + "kd_loss": 0.18584371552193701, + "learning_rate": 3e-06, + "loss": 0.1823, + "masked_tokens": 113.95, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 113.95 + }, + { + "avg_mask_ratio": 0.47088071387261154, + "avg_response_length": 233.2125, + "avg_student_mask_ratio": 0.47088071387261154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.2734375, + "kd_loss": 0.22232839192147366, + "learning_rate": 3e-06, + "loss": 0.1961, + "masked_tokens": 116.675, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 116.675 + }, + { + "avg_mask_ratio": 0.4870647343515884, + "avg_response_length": 228.3875, + "avg_student_mask_ratio": 0.4870647343515884, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.25390625, + "kd_loss": 0.25729746209006665, + "learning_rate": 3e-06, + "loss": 0.2306, + "masked_tokens": 114.3625, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 114.3625 + }, + { + "avg_mask_ratio": 0.440834702400025, + "avg_response_length": 209.85, + "avg_student_mask_ratio": 0.440834702400025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.2275390625, + "kd_loss": 0.15747290870124503, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 87.575, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 87.575 + }, + { + "avg_mask_ratio": 0.4660509963519871, + "avg_response_length": 250.9125, + "avg_student_mask_ratio": 0.4660509963519871, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.392578125, + "kd_loss": 0.17299866449352522, + "learning_rate": 3e-06, + "loss": 0.178, + "masked_tokens": 109.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 109.675 + }, + { + "avg_mask_ratio": 0.4507861359976232, + "avg_response_length": 235.875, + "avg_student_mask_ratio": 0.4507861359976232, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2113636662043291, + "learning_rate": 3e-06, + "loss": 0.1795, + "masked_tokens": 106.95, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 106.95 + }, + { + "avg_mask_ratio": 0.4283985076006502, + "avg_response_length": 230.95, + "avg_student_mask_ratio": 0.4283985076006502, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.16015625, + "kd_loss": 0.15304818171161969, + "learning_rate": 3e-06, + "loss": 0.1724, + "masked_tokens": 101.15, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 101.15 + }, + { + "avg_mask_ratio": 0.47474822774529457, + "avg_response_length": 233.1, + "avg_student_mask_ratio": 0.47474822774529457, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.06689453125, + "kd_loss": 0.1363761811357108, + "learning_rate": 3e-06, + "loss": 0.171, + "masked_tokens": 112.725, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 112.725 + }, + { + "avg_mask_ratio": 0.4808142688901474, + "avg_response_length": 238.54761904761904, + "avg_student_mask_ratio": 0.4808142688901474, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.22379483340171732, + "learning_rate": 3e-06, + "loss": 0.2466, + "masked_tokens": 113.67857142857143, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 113.67857142857143 + }, + { + "avg_mask_ratio": 0.47175657459301873, + "avg_response_length": 249.9625, + "avg_student_mask_ratio": 0.47175657459301873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.166015625, + "kd_loss": 0.17491777570117745, + "learning_rate": 3e-06, + "loss": 0.2029, + "masked_tokens": 119.4625, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 119.4625 + }, + { + "avg_mask_ratio": 0.4564988439786248, + "avg_response_length": 238.8875, + "avg_student_mask_ratio": 0.4564988439786248, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.1279296875, + "kd_loss": 0.12884608846077866, + "learning_rate": 3e-06, + "loss": 0.1536, + "masked_tokens": 104.0, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 104.0 + }, + { + "avg_mask_ratio": 0.5083174118888565, + "avg_response_length": 258.1375, + "avg_student_mask_ratio": 0.5083174118888565, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.1357421875, + "kd_loss": 0.18128383785248586, + "learning_rate": 3e-06, + "loss": 0.1811, + "masked_tokens": 133.7125, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 133.7125 + }, + { + "avg_mask_ratio": 0.5130103683215566, + "avg_response_length": 246.4875, + "avg_student_mask_ratio": 0.5130103683215566, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0896, + "grad_norm": 0.11181640625, + "kd_loss": 0.23847924997493805, + "learning_rate": 3e-06, + "loss": 0.2289, + "masked_tokens": 132.5625, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 132.5625 + }, + { + "avg_mask_ratio": 0.4543681625276804, + "avg_response_length": 199.65, + "avg_student_mask_ratio": 0.4543681625276804, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1109333333333333, + "grad_norm": 0.1533203125, + "kd_loss": 0.1353249137787543, + "learning_rate": 3e-06, + "loss": 0.1403, + "masked_tokens": 87.9875, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 87.9875 + }, + { + "avg_mask_ratio": 0.46718079667771234, + "avg_response_length": 218.0875, + "avg_student_mask_ratio": 0.46718079667771234, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1322666666666668, + "grad_norm": 0.2109375, + "kd_loss": 0.15268151032492625, + "learning_rate": 3e-06, + "loss": 0.1789, + "masked_tokens": 101.3875, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 101.3875 + }, + { + "avg_mask_ratio": 0.4890203754650429, + "avg_response_length": 244.3875, + "avg_student_mask_ratio": 0.4890203754650429, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1536, + "grad_norm": 0.27734375, + "kd_loss": 0.17835129436630268, + "learning_rate": 3e-06, + "loss": 0.2173, + "masked_tokens": 116.175, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 116.175 + }, + { + "avg_mask_ratio": 0.45064474650425834, + "avg_response_length": 217.8375, + "avg_student_mask_ratio": 0.45064474650425834, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1749333333333334, + "grad_norm": 0.17578125, + "kd_loss": 0.16049880692362706, + "learning_rate": 3e-06, + "loss": 0.1855, + "masked_tokens": 98.6375, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 98.6375 + }, + { + "avg_mask_ratio": 0.3844255942822201, + "avg_response_length": 240.825, + "avg_student_mask_ratio": 0.3844255942822201, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1962666666666666, + "grad_norm": 0.5859375, + "kd_loss": 0.17605857607457268, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 96.85, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 96.85 + }, + { + "avg_mask_ratio": 0.45103558609262107, + "avg_response_length": 231.025, + "avg_student_mask_ratio": 0.45103558609262107, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2176, + "grad_norm": 0.234375, + "kd_loss": 0.1386162672638477, + "learning_rate": 3e-06, + "loss": 0.1681, + "masked_tokens": 105.6875, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 105.6875 + }, + { + "avg_mask_ratio": 0.47033366551622746, + "avg_response_length": 248.3875, + "avg_student_mask_ratio": 0.47033366551622746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2389333333333332, + "grad_norm": 0.26953125, + "kd_loss": 0.17702910760917803, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 125.05, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 125.05 + }, + { + "avg_mask_ratio": 0.5230229062028229, + "avg_response_length": 241.8125, + "avg_student_mask_ratio": 0.5230229062028229, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2602666666666666, + "grad_norm": 0.20703125, + "kd_loss": 0.22054996666956866, + "learning_rate": 3e-06, + "loss": 0.2233, + "masked_tokens": 129.0, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 129.0 + }, + { + "avg_mask_ratio": 0.44929012526990847, + "avg_response_length": 246.7375, + "avg_student_mask_ratio": 0.44929012526990847, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2816, + "grad_norm": 0.212890625, + "kd_loss": 0.15257543138572202, + "learning_rate": 3e-06, + "loss": 0.1581, + "masked_tokens": 113.9375, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 113.9375 + }, + { + "avg_mask_ratio": 0.5061312943696976, + "avg_response_length": 237.175, + "avg_student_mask_ratio": 0.5061312943696976, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3029333333333333, + "grad_norm": 0.06884765625, + "kd_loss": 0.21158275993875578, + "learning_rate": 3e-06, + "loss": 0.1942, + "masked_tokens": 116.675, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 116.675 + }, + { + "avg_mask_ratio": 0.4925117701757699, + "avg_response_length": 233.2625, + "avg_student_mask_ratio": 0.4925117701757699, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3242666666666667, + "grad_norm": 0.1787109375, + "kd_loss": 0.1805886138310143, + "learning_rate": 3e-06, + "loss": 0.1744, + "masked_tokens": 111.6875, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 111.6875 + }, + { + "avg_mask_ratio": 0.5307835865532979, + "avg_response_length": 249.65, + "avg_student_mask_ratio": 0.5307835865532979, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3456000000000001, + "grad_norm": 0.28125, + "kd_loss": 0.25403604302136956, + "learning_rate": 3e-06, + "loss": 0.2627, + "masked_tokens": 126.6875, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 126.6875 + }, + { + "avg_mask_ratio": 0.48631439574528484, + "avg_response_length": 241.475, + "avg_student_mask_ratio": 0.48631439574528484, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3669333333333333, + "grad_norm": 0.2412109375, + "kd_loss": 0.1642333460577902, + "learning_rate": 3e-06, + "loss": 0.1731, + "masked_tokens": 122.1125, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 122.1125 + }, + { + "avg_mask_ratio": 0.5248487972887232, + "avg_response_length": 231.5375, + "avg_student_mask_ratio": 0.5248487972887232, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3882666666666665, + "grad_norm": 0.193359375, + "kd_loss": 0.2508082524812494, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 127.175, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 127.175 + }, + { + "avg_mask_ratio": 0.48489007767057046, + "avg_response_length": 246.0125, + "avg_student_mask_ratio": 0.48489007767057046, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4096, + "grad_norm": 0.28125, + "kd_loss": 0.18781521328146483, + "learning_rate": 3e-06, + "loss": 0.2045, + "masked_tokens": 125.225, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 125.225 + }, + { + "avg_mask_ratio": 0.4605832444969565, + "avg_response_length": 244.5, + "avg_student_mask_ratio": 0.4605832444969565, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4309333333333334, + "grad_norm": 0.19140625, + "kd_loss": 0.1806626110754223, + "learning_rate": 3e-06, + "loss": 0.1702, + "masked_tokens": 120.825, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 120.825 + }, + { + "avg_mask_ratio": 0.4662455078505445, + "avg_response_length": 244.6125, + "avg_student_mask_ratio": 0.4662455078505445, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4522666666666666, + "grad_norm": 0.16796875, + "kd_loss": 0.20038694294766798, + "learning_rate": 3e-06, + "loss": 0.1824, + "masked_tokens": 114.2, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 114.2 + }, + { + "avg_mask_ratio": 0.4820630593923852, + "avg_response_length": 217.3, + "avg_student_mask_ratio": 0.4820630593923852, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4736, + "grad_norm": 0.11279296875, + "kd_loss": 0.16563009086588637, + "learning_rate": 3e-06, + "loss": 0.17, + "masked_tokens": 118.4875, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 118.4875 + }, + { + "avg_mask_ratio": 0.5206489040749147, + "avg_response_length": 216.45, + "avg_student_mask_ratio": 0.5206489040749147, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4949333333333334, + "grad_norm": 0.236328125, + "kd_loss": 0.23649522811845144, + "learning_rate": 3e-06, + "loss": 0.2241, + "masked_tokens": 118.4375, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 118.4375 + }, + { + "avg_mask_ratio": 0.4955552547937259, + "avg_response_length": 211.175, + "avg_student_mask_ratio": 0.4955552547937259, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5162666666666667, + "grad_norm": 0.275390625, + "kd_loss": 0.23970817765721222, + "learning_rate": 3e-06, + "loss": 0.2184, + "masked_tokens": 112.3375, + "mean_t": 0.5135623761918395, + "step": 710, + "student_masked_tokens": 112.3375 + }, + { + "avg_mask_ratio": 0.4820543818641454, + "avg_response_length": 229.1375, + "avg_student_mask_ratio": 0.4820543818641454, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5375999999999999, + "grad_norm": 0.435546875, + "kd_loss": 0.18955910701470202, + "learning_rate": 3e-06, + "loss": 0.1978, + "masked_tokens": 114.55, + "mean_t": 0.5117021896177902, + "step": 720, + "student_masked_tokens": 114.55 + }, + { + "avg_mask_ratio": 0.4605119539948646, + "avg_response_length": 245.5625, + "avg_student_mask_ratio": 0.4605119539948646, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5589333333333333, + "grad_norm": 0.1103515625, + "kd_loss": 0.16859328244926958, + "learning_rate": 3e-06, + "loss": 0.1779, + "masked_tokens": 113.85, + "mean_t": 0.49050743713742123, + "step": 730, + "student_masked_tokens": 113.85 + }, + { + "avg_mask_ratio": 0.5134038798511028, + "avg_response_length": 194.125, + "avg_student_mask_ratio": 0.5134038798511028, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5802666666666667, + "grad_norm": 0.0830078125, + "kd_loss": 0.17122714665274544, + "learning_rate": 3e-06, + "loss": 0.2018, + "masked_tokens": 102.9375, + "mean_t": 0.5505168779753149, + "step": 740, + "student_masked_tokens": 102.9375 + }, + { + "avg_mask_ratio": 0.4201362137740944, + "avg_response_length": 223.55, + "avg_student_mask_ratio": 0.4201362137740944, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6016, + "grad_norm": 0.16796875, + "kd_loss": 0.19197621339357057, + "learning_rate": 3e-06, + "loss": 0.1792, + "masked_tokens": 94.7125, + "mean_t": 0.4437690361432033, + "step": 750, + "student_masked_tokens": 94.7125 + }, + { + "avg_mask_ratio": 0.46924527404480615, + "avg_response_length": 227.35, + "avg_student_mask_ratio": 0.46924527404480615, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6229333333333333, + "grad_norm": 0.8125, + "kd_loss": 0.24938117066874382, + "learning_rate": 3e-06, + "loss": 0.2591, + "masked_tokens": 107.5375, + "mean_t": 0.49132869170280175, + "step": 760, + "student_masked_tokens": 107.5375 + }, + { + "avg_mask_ratio": 0.445551612455165, + "avg_response_length": 268.6625, + "avg_student_mask_ratio": 0.445551612455165, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6442666666666668, + "grad_norm": 0.1435546875, + "kd_loss": 0.1928954417056957, + "learning_rate": 3e-06, + "loss": 0.1563, + "masked_tokens": 124.0875, + "mean_t": 0.4667695587326307, + "step": 770, + "student_masked_tokens": 124.0875 + }, + { + "avg_mask_ratio": 0.4321410794305848, + "avg_response_length": 256.1625, + "avg_student_mask_ratio": 0.4321410794305848, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6656, + "grad_norm": 0.2216796875, + "kd_loss": 0.15059620087446887, + "learning_rate": 3e-06, + "loss": 0.1534, + "masked_tokens": 117.7, + "mean_t": 0.44942845597106496, + "step": 780, + "student_masked_tokens": 117.7 + }, + { + "avg_mask_ratio": 0.4697655299096368, + "avg_response_length": 240.6125, + "avg_student_mask_ratio": 0.4697655299096368, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6869333333333332, + "grad_norm": 0.255859375, + "kd_loss": 0.16427693545232777, + "learning_rate": 3e-06, + "loss": 0.1489, + "masked_tokens": 113.5375, + "mean_t": 0.4993515375303105, + "step": 790, + "student_masked_tokens": 113.5375 + }, + { + "avg_mask_ratio": 0.4635992758907378, + "avg_response_length": 224.15, + "avg_student_mask_ratio": 0.4635992758907378, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7082666666666668, + "grad_norm": 0.197265625, + "kd_loss": 0.15171801659575976, + "learning_rate": 3e-06, + "loss": 0.1526, + "masked_tokens": 107.5, + "mean_t": 0.500370389316231, + "step": 800, + "student_masked_tokens": 107.5 + }, + { + "avg_mask_ratio": 0.5018501321552321, + "avg_response_length": 235.1625, + "avg_student_mask_ratio": 0.5018501321552321, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7296, + "grad_norm": 0.1630859375, + "kd_loss": 0.18931926304685476, + "learning_rate": 3e-06, + "loss": 0.2031, + "masked_tokens": 116.6125, + "mean_t": 0.5317009104182944, + "step": 810, + "student_masked_tokens": 116.6125 + }, + { + "avg_mask_ratio": 0.5050785383209586, + "avg_response_length": 207.6875, + "avg_student_mask_ratio": 0.5050785383209586, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7509333333333332, + "grad_norm": 0.5546875, + "kd_loss": 0.24824176937403308, + "learning_rate": 3e-06, + "loss": 0.2566, + "masked_tokens": 119.175, + "mean_t": 0.5392061032878701, + "step": 820, + "student_masked_tokens": 119.175 + }, + { + "avg_mask_ratio": 0.4980328972451389, + "avg_response_length": 270.325, + "avg_student_mask_ratio": 0.4980328972451389, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7722666666666667, + "grad_norm": 0.09912109375, + "kd_loss": 0.1924194690429431, + "learning_rate": 3e-06, + "loss": 0.2006, + "masked_tokens": 141.925, + "mean_t": 0.5238314627087675, + "step": 830, + "student_masked_tokens": 141.925 + }, + { + "avg_mask_ratio": 0.493249478796497, + "avg_response_length": 226.025, + "avg_student_mask_ratio": 0.493249478796497, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7936, + "grad_norm": 0.158203125, + "kd_loss": 0.15751813794203606, + "learning_rate": 3e-06, + "loss": 0.1624, + "masked_tokens": 118.825, + "mean_t": 0.5301066277665086, + "step": 840, + "student_masked_tokens": 118.825 + }, + { + "avg_mask_ratio": 0.5009000841644593, + "avg_response_length": 233.025, + "avg_student_mask_ratio": 0.5009000841644593, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8149333333333333, + "grad_norm": 0.10205078125, + "kd_loss": 0.1860738446495816, + "learning_rate": 3e-06, + "loss": 0.1737, + "masked_tokens": 117.3125, + "mean_t": 0.5343429344706238, + "step": 850, + "student_masked_tokens": 117.3125 + }, + { + "avg_mask_ratio": 0.46293387678451836, + "avg_response_length": 231.3625, + "avg_student_mask_ratio": 0.46293387678451836, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8362666666666667, + "grad_norm": 0.1103515625, + "kd_loss": 0.19740513321539765, + "learning_rate": 3e-06, + "loss": 0.1841, + "masked_tokens": 110.5625, + "mean_t": 0.4791536889737472, + "step": 860, + "student_masked_tokens": 110.5625 + }, + { + "avg_mask_ratio": 0.4846805231412873, + "avg_response_length": 220.7375, + "avg_student_mask_ratio": 0.4846805231412873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8576000000000001, + "grad_norm": 0.228515625, + "kd_loss": 0.19436422403705364, + "learning_rate": 3e-06, + "loss": 0.2012, + "masked_tokens": 113.7625, + "mean_t": 0.5203817339061061, + "step": 870, + "student_masked_tokens": 113.7625 + }, + { + "avg_mask_ratio": 0.4508363194297999, + "avg_response_length": 203.2875, + "avg_student_mask_ratio": 0.4508363194297999, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8789333333333333, + "grad_norm": 0.1962890625, + "kd_loss": 0.16288868402702406, + "learning_rate": 3e-06, + "loss": 0.1845, + "masked_tokens": 95.3875, + "mean_t": 0.4875184997683391, + "step": 880, + "student_masked_tokens": 95.3875 + }, + { + "avg_mask_ratio": 0.43862658384023234, + "avg_response_length": 227.9375, + "avg_student_mask_ratio": 0.43862658384023234, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9002666666666665, + "grad_norm": 0.08251953125, + "kd_loss": 0.11281866748422545, + "learning_rate": 3e-06, + "loss": 0.142, + "masked_tokens": 101.2625, + "mean_t": 0.4766692223958671, + "step": 890, + "student_masked_tokens": 101.2625 + }, + { + "avg_mask_ratio": 0.44909207145101393, + "avg_response_length": 237.2375, + "avg_student_mask_ratio": 0.44909207145101393, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9216, + "grad_norm": 0.1591796875, + "kd_loss": 0.15684176656744747, + "learning_rate": 3e-06, + "loss": 0.1737, + "masked_tokens": 103.7, + "mean_t": 0.487134758150205, + "step": 900, + "student_masked_tokens": 103.7 + }, + { + "avg_mask_ratio": 0.47512493645772336, + "avg_response_length": 246.175, + "avg_student_mask_ratio": 0.47512493645772336, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9429333333333334, + "grad_norm": 0.1982421875, + "kd_loss": 0.16909069961529893, + "learning_rate": 3e-06, + "loss": 0.1717, + "masked_tokens": 117.2125, + "mean_t": 0.5027793228859082, + "step": 910, + "student_masked_tokens": 117.2125 + }, + { + "avg_mask_ratio": 0.46720519906957636, + "avg_response_length": 229.0875, + "avg_student_mask_ratio": 0.46720519906957636, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9642666666666666, + "grad_norm": 0.203125, + "kd_loss": 0.17078794931742322, + "learning_rate": 3e-06, + "loss": 0.1844, + "masked_tokens": 115.325, + "mean_t": 0.49417946098838, + "step": 920, + "student_masked_tokens": 115.325 + }, + { + "avg_mask_ratio": 0.4805813999380916, + "avg_response_length": 236.1625, + "avg_student_mask_ratio": 0.4805813999380916, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9856, + "grad_norm": 0.21484375, + "kd_loss": 0.20637793739751942, + "learning_rate": 3e-06, + "loss": 0.176, + "masked_tokens": 119.65, + "mean_t": 0.5045580042526125, + "step": 930, + "student_masked_tokens": 119.65 + }, + { + "avg_mask_ratio": 0.49833715503059683, + "avg_response_length": 229.39285714285714, + "avg_student_mask_ratio": 0.49833715503059683, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0085333333333333, + "grad_norm": 0.259765625, + "kd_loss": 0.2117233988543549, + "learning_rate": 3e-06, + "loss": 0.2023, + "masked_tokens": 121.5952380952381, + "mean_t": 0.5321138524893849, + "step": 940, + "student_masked_tokens": 121.5952380952381 + }, + { + "avg_mask_ratio": 0.4347397161007393, + "avg_response_length": 226.925, + "avg_student_mask_ratio": 0.4347397161007393, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0298666666666665, + "grad_norm": 0.15234375, + "kd_loss": 0.16830119832033005, + "learning_rate": 3e-06, + "loss": 0.1549, + "masked_tokens": 103.4125, + "mean_t": 0.4632946296595037, + "step": 950, + "student_masked_tokens": 103.4125 + }, + { + "avg_mask_ratio": 0.4989688004134223, + "avg_response_length": 273.3125, + "avg_student_mask_ratio": 0.4989688004134223, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0512, + "grad_norm": 0.123046875, + "kd_loss": 0.2304358719712809, + "learning_rate": 3e-06, + "loss": 0.22, + "masked_tokens": 137.125, + "mean_t": 0.5198000721400604, + "step": 960, + "student_masked_tokens": 137.125 + }, + { + "avg_mask_ratio": 0.4374056361732073, + "avg_response_length": 239.125, + "avg_student_mask_ratio": 0.4374056361732073, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0725333333333333, + "grad_norm": 0.1962890625, + "kd_loss": 0.158095879086882, + "learning_rate": 3e-06, + "loss": 0.1573, + "masked_tokens": 107.4625, + "mean_t": 0.4703940597362816, + "step": 970, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4216216350672767, + "avg_response_length": 236.0125, + "avg_student_mask_ratio": 0.4216216350672767, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0938666666666665, + "grad_norm": 0.177734375, + "kd_loss": 0.15213593625503563, + "learning_rate": 3e-06, + "loss": 0.1305, + "masked_tokens": 103.05, + "mean_t": 0.4511947895749472, + "step": 980, + "student_masked_tokens": 103.05 + }, + { + "avg_mask_ratio": 0.46076959141064433, + "avg_response_length": 257.375, + "avg_student_mask_ratio": 0.46076959141064433, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1152, + "grad_norm": 0.1484375, + "kd_loss": 0.18778659292394478, + "learning_rate": 3e-06, + "loss": 0.1727, + "masked_tokens": 121.2125, + "mean_t": 0.4923786667350214, + "step": 990, + "student_masked_tokens": 121.2125 + }, + { + "avg_mask_ratio": 0.45376153094694016, + "avg_response_length": 223.4125, + "avg_student_mask_ratio": 0.45376153094694016, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1365333333333334, + "grad_norm": 0.10498046875, + "kd_loss": 0.16195435594947866, + "learning_rate": 3e-06, + "loss": 0.1554, + "masked_tokens": 102.175, + "mean_t": 0.4773523230338469, + "step": 1000, + "student_masked_tokens": 102.175 + }, + { + "avg_mask_ratio": 0.4360893844394013, + "avg_response_length": 230.8375, + "avg_student_mask_ratio": 0.4360893844394013, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1578666666666666, + "grad_norm": 0.09130859375, + "kd_loss": 0.15129900982686878, + "learning_rate": 3e-06, + "loss": 0.1525, + "masked_tokens": 95.2875, + "mean_t": 0.4648138735938119, + "step": 1010, + "student_masked_tokens": 95.2875 + }, + { + "avg_mask_ratio": 0.5085378198185936, + "avg_response_length": 228.975, + "avg_student_mask_ratio": 0.5085378198185936, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1792, + "grad_norm": 0.1279296875, + "kd_loss": 0.1835148020709127, + "learning_rate": 3e-06, + "loss": 0.2059, + "masked_tokens": 118.825, + "mean_t": 0.5327763411332853, + "step": 1020, + "student_masked_tokens": 118.825 + }, + { + "avg_mask_ratio": 0.4720251938910224, + "avg_response_length": 237.7125, + "avg_student_mask_ratio": 0.4720251938910224, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2005333333333335, + "grad_norm": 0.27734375, + "kd_loss": 0.19297441139430588, + "learning_rate": 3e-06, + "loss": 0.195, + "masked_tokens": 113.1, + "mean_t": 0.5033508580760099, + "step": 1030, + "student_masked_tokens": 113.1 + }, + { + "avg_mask_ratio": 0.4877920758561231, + "avg_response_length": 206.2875, + "avg_student_mask_ratio": 0.4877920758561231, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2218666666666667, + "grad_norm": 0.1865234375, + "kd_loss": 0.18438395577541086, + "learning_rate": 3e-06, + "loss": 0.1806, + "masked_tokens": 99.8625, + "mean_t": 0.5349024560535327, + "step": 1040, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.5081706921104342, + "avg_response_length": 237.05, + "avg_student_mask_ratio": 0.5081706921104342, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2432, + "grad_norm": 0.158203125, + "kd_loss": 0.1950792422781717, + "learning_rate": 3e-06, + "loss": 0.1941, + "masked_tokens": 122.7375, + "mean_t": 0.5457118917722255, + "step": 1050, + "student_masked_tokens": 122.7375 + }, + { + "avg_mask_ratio": 0.4656533743953332, + "avg_response_length": 259.25, + "avg_student_mask_ratio": 0.4656533743953332, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2645333333333335, + "grad_norm": 0.1552734375, + "kd_loss": 0.14457174434564876, + "learning_rate": 3e-06, + "loss": 0.1245, + "masked_tokens": 124.0875, + "mean_t": 0.48194136443780733, + "step": 1060, + "student_masked_tokens": 124.0875 + }, + { + "avg_mask_ratio": 0.4821103142516222, + "avg_response_length": 265.35, + "avg_student_mask_ratio": 0.4821103142516222, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2858666666666667, + "grad_norm": 0.146484375, + "kd_loss": 0.1856000915321232, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 142.1875, + "mean_t": 0.5015889146190602, + "step": 1070, + "student_masked_tokens": 142.1875 + }, + { + "avg_mask_ratio": 0.461884257895872, + "avg_response_length": 232.4, + "avg_student_mask_ratio": 0.461884257895872, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.3072, + "grad_norm": 0.09033203125, + "kd_loss": 0.16016541003627935, + "learning_rate": 3e-06, + "loss": 0.1633, + "masked_tokens": 106.725, + "mean_t": 0.4983203248586506, + "step": 1080, + "student_masked_tokens": 106.725 + }, + { + "avg_mask_ratio": 0.4401097826776095, + "avg_response_length": 207.325, + "avg_student_mask_ratio": 0.4401097826776095, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.3285333333333336, + "grad_norm": 0.0908203125, + "kd_loss": 0.1443735401110633, + "learning_rate": 3e-06, + "loss": 0.1676, + "masked_tokens": 88.5375, + "mean_t": 0.47094749807147307, + "step": 1090, + "student_masked_tokens": 88.5375 + }, + { + "avg_mask_ratio": 0.5224281516042538, + "avg_response_length": 257.675, + "avg_student_mask_ratio": 0.5224281516042538, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.3498666666666668, + "grad_norm": 0.1953125, + "kd_loss": 0.18995505797054618, + "learning_rate": 3e-06, + "loss": 0.2181, + "masked_tokens": 142.15, + "mean_t": 0.5531192034482956, + "step": 1100, + "student_masked_tokens": 142.15 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/training_args.bin b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7e94a11824a7a1de5f3a0a00320426e3e4de0eff --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c347df37da6e348160afc3fbb65d12595e5064bd8e5bfd591004e5e86a703f42 +size 7992 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/README.md b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/adapter_config.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..aebf9ea6690ed4ecc23ae3af9402e39470fff9f3 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/adapter_model.safetensors b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..31310ee723f16b4a42b04a3e5f10a55852d8e392 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2dca70abd85b31b564a1982f92388cdf4292993f9da9e8b65efe47e5ccbc6db +size 2406624648 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/optimizer.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3a76791fb98a2472e353263e1c5e1845c359f78 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfe31031f6599e908b2bf7a71f0b3239c56a319ecf746fa7cae7cd9230e52f9a +size 671304442 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/rng_state_0.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..921a14eba8310d556048263a58727eadbc6dcc1b --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1394463a46489e6dce7c0369a296b9effad20c6a87b30dbb892b34b73b5d6365 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/rng_state_1.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..546887788f5f75be851e17cf9048a7f5b51ab7d2 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4761cd452b58b0c04d9b6cd8ff1e6d128acb7cabdb300e86daf48f850e7e941 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/scheduler.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5b1aeae281b5040d3bcd2aa5b378a5c2504e2b5 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f8c95a6d9085dfcee1e6620c88ede526366d3a02c5018932b1bc04809c0e0c7 +size 1064 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/trainer_state.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..06e9d6d1145a0e7040e22f255a529afe252fc300 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/trainer_state.json @@ -0,0 +1,2913 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.5632, + "eval_steps": 500, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4933756332669873, + "avg_response_length": 252.0625, + "avg_student_mask_ratio": 0.4933756332669873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1650390625, + "kd_loss": 0.24795629351958723, + "learning_rate": 3e-06, + "loss": 0.2758, + "masked_tokens": 120.975, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 120.975 + }, + { + "avg_mask_ratio": 0.41923869140446185, + "avg_response_length": 221.7125, + "avg_student_mask_ratio": 0.41923869140446185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.388671875, + "kd_loss": 0.21509853232191353, + "learning_rate": 3e-06, + "loss": 0.2046, + "masked_tokens": 86.1875, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 86.1875 + }, + { + "avg_mask_ratio": 0.4542569225654006, + "avg_response_length": 231.45, + "avg_student_mask_ratio": 0.4542569225654006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.52734375, + "kd_loss": 0.19756361616970963, + "learning_rate": 3e-06, + "loss": 0.1976, + "masked_tokens": 116.2875, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 116.2875 + }, + { + "avg_mask_ratio": 0.41855402445653456, + "avg_response_length": 214.125, + "avg_student_mask_ratio": 0.41855402445653456, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.263671875, + "kd_loss": 0.1576978968325534, + "learning_rate": 3e-06, + "loss": 0.1551, + "masked_tokens": 94.225, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 94.225 + }, + { + "avg_mask_ratio": 0.4331560767372139, + "avg_response_length": 222.225, + "avg_student_mask_ratio": 0.4331560767372139, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.125, + "kd_loss": 0.17712681048956255, + "learning_rate": 3e-06, + "loss": 0.1648, + "masked_tokens": 97.825, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 97.825 + }, + { + "avg_mask_ratio": 0.4547638618387282, + "avg_response_length": 242.9, + "avg_student_mask_ratio": 0.4547638618387282, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.130859375, + "kd_loss": 0.1745696667137963, + "learning_rate": 3e-06, + "loss": 0.1745, + "masked_tokens": 119.125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.125 + }, + { + "avg_mask_ratio": 0.5151988173020072, + "avg_response_length": 214.3375, + "avg_student_mask_ratio": 0.5151988173020072, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.17578125, + "kd_loss": 0.1657758254527316, + "learning_rate": 3e-06, + "loss": 0.1972, + "masked_tokens": 111.7875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 111.7875 + }, + { + "avg_mask_ratio": 0.37937068473547697, + "avg_response_length": 250.2, + "avg_student_mask_ratio": 0.37937068473547697, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.051513671875, + "kd_loss": 0.13968983994418097, + "learning_rate": 3e-06, + "loss": 0.1369, + "masked_tokens": 95.1875, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 95.1875 + }, + { + "avg_mask_ratio": 0.5006106478627771, + "avg_response_length": 242.1125, + "avg_student_mask_ratio": 0.5006106478627771, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.22265625, + "kd_loss": 0.20869405062871707, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 116.3875, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 116.3875 + }, + { + "avg_mask_ratio": 0.4596128500183113, + "avg_response_length": 229.0625, + "avg_student_mask_ratio": 0.4596128500183113, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.1865234375, + "kd_loss": 0.17640120884607313, + "learning_rate": 3e-06, + "loss": 0.1864, + "masked_tokens": 109.7125, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 109.7125 + }, + { + "avg_mask_ratio": 0.4920400592498481, + "avg_response_length": 229.9875, + "avg_student_mask_ratio": 0.4920400592498481, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.326171875, + "kd_loss": 0.2011610215539008, + "learning_rate": 3e-06, + "loss": 0.2334, + "masked_tokens": 109.4, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 109.4 + }, + { + "avg_mask_ratio": 0.450224511185661, + "avg_response_length": 229.225, + "avg_student_mask_ratio": 0.450224511185661, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.171875, + "kd_loss": 0.21365654302464918, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 102.6375, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 102.6375 + }, + { + "avg_mask_ratio": 0.4397759020910598, + "avg_response_length": 214.15, + "avg_student_mask_ratio": 0.4397759020910598, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.173828125, + "kd_loss": 0.13860440934267615, + "learning_rate": 3e-06, + "loss": 0.1362, + "masked_tokens": 98.4, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 98.4 + }, + { + "avg_mask_ratio": 0.4890626976499334, + "avg_response_length": 242.1625, + "avg_student_mask_ratio": 0.4890626976499334, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.27734375, + "kd_loss": 0.2106460814328841, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 129.725, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 129.725 + }, + { + "avg_mask_ratio": 0.47298577734036373, + "avg_response_length": 262.9875, + "avg_student_mask_ratio": 0.47298577734036373, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.0673828125, + "kd_loss": 0.2408599746217078, + "learning_rate": 3e-06, + "loss": 0.2276, + "masked_tokens": 128.375, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 128.375 + }, + { + "avg_mask_ratio": 0.5043223856599071, + "avg_response_length": 217.5, + "avg_student_mask_ratio": 0.5043223856599071, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.20484722793833043, + "learning_rate": 3e-06, + "loss": 0.2154, + "masked_tokens": 106.025, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 106.025 + }, + { + "avg_mask_ratio": 0.48419030708028005, + "avg_response_length": 196.2625, + "avg_student_mask_ratio": 0.48419030708028005, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.1611328125, + "kd_loss": 0.20407032655223248, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 96.2625, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 96.2625 + }, + { + "avg_mask_ratio": 0.4485494759515859, + "avg_response_length": 228.8625, + "avg_student_mask_ratio": 0.4485494759515859, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.14453125, + "kd_loss": 0.16957379382825621, + "learning_rate": 3e-06, + "loss": 0.1796, + "masked_tokens": 101.275, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 101.275 + }, + { + "avg_mask_ratio": 0.48665878190658984, + "avg_response_length": 230.5, + "avg_student_mask_ratio": 0.48665878190658984, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.3515625, + "kd_loss": 0.22313492714965832, + "learning_rate": 3e-06, + "loss": 0.2112, + "masked_tokens": 107.975, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 107.975 + }, + { + "avg_mask_ratio": 0.4670982737792656, + "avg_response_length": 210.9125, + "avg_student_mask_ratio": 0.4670982737792656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.142578125, + "kd_loss": 0.15220829088375468, + "learning_rate": 3e-06, + "loss": 0.1609, + "masked_tokens": 98.2125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 98.2125 + }, + { + "avg_mask_ratio": 0.4568137794849463, + "avg_response_length": 218.7375, + "avg_student_mask_ratio": 0.4568137794849463, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.1826171875, + "kd_loss": 0.1580252643583208, + "learning_rate": 3e-06, + "loss": 0.1798, + "masked_tokens": 99.15, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 99.15 + }, + { + "avg_mask_ratio": 0.5379857187625021, + "avg_response_length": 252.1375, + "avg_student_mask_ratio": 0.5379857187625021, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.3203125, + "kd_loss": 0.2511090909683844, + "learning_rate": 3e-06, + "loss": 0.2583, + "masked_tokens": 135.4, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 135.4 + }, + { + "avg_mask_ratio": 0.43395056116278286, + "avg_response_length": 245.2625, + "avg_student_mask_ratio": 0.43395056116278286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.1767578125, + "kd_loss": 0.14414861655371852, + "learning_rate": 3e-06, + "loss": 0.1964, + "masked_tokens": 102.5125, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 102.5125 + }, + { + "avg_mask_ratio": 0.46948411157354714, + "avg_response_length": 202.975, + "avg_student_mask_ratio": 0.46948411157354714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.1181640625, + "kd_loss": 0.2197965504183493, + "learning_rate": 3e-06, + "loss": 0.243, + "masked_tokens": 97.0625, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 97.0625 + }, + { + "avg_mask_ratio": 0.44631263689370826, + "avg_response_length": 243.425, + "avg_student_mask_ratio": 0.44631263689370826, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.1064453125, + "kd_loss": 0.2151024747882957, + "learning_rate": 3e-06, + "loss": 0.1892, + "masked_tokens": 107.4625, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4607314572727773, + "avg_response_length": 243.375, + "avg_student_mask_ratio": 0.4607314572727773, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.203125, + "kd_loss": 0.176242933875335, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 110.8875, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.8875 + }, + { + "avg_mask_ratio": 0.5309946444118395, + "avg_response_length": 231.6875, + "avg_student_mask_ratio": 0.5309946444118395, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.208984375, + "kd_loss": 0.26651088480309115, + "learning_rate": 3e-06, + "loss": 0.2828, + "masked_tokens": 123.2875, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 123.2875 + }, + { + "avg_mask_ratio": 0.45879559536697345, + "avg_response_length": 251.8, + "avg_student_mask_ratio": 0.45879559536697345, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.11474609375, + "kd_loss": 0.145786481295454, + "learning_rate": 3e-06, + "loss": 0.1439, + "masked_tokens": 125.425, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 125.425 + }, + { + "avg_mask_ratio": 0.3955249205115251, + "avg_response_length": 238.675, + "avg_student_mask_ratio": 0.3955249205115251, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.057373046875, + "kd_loss": 0.15104623195453543, + "learning_rate": 3e-06, + "loss": 0.1578, + "masked_tokens": 95.2125, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 95.2125 + }, + { + "avg_mask_ratio": 0.4504710016073659, + "avg_response_length": 202.575, + "avg_student_mask_ratio": 0.4504710016073659, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.150390625, + "kd_loss": 0.16011972622800386, + "learning_rate": 3e-06, + "loss": 0.179, + "masked_tokens": 90.3375, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 90.3375 + }, + { + "avg_mask_ratio": 0.4822002159198746, + "avg_response_length": 189.9875, + "avg_student_mask_ratio": 0.4822002159198746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.1630859375, + "kd_loss": 0.21744939284749734, + "learning_rate": 3e-06, + "loss": 0.201, + "masked_tokens": 92.7, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 92.7 + }, + { + "avg_mask_ratio": 0.5024422638700343, + "avg_response_length": 237.2625, + "avg_student_mask_ratio": 0.5024422638700343, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.1259765625, + "kd_loss": 0.21489343987664142, + "learning_rate": 3e-06, + "loss": 0.2268, + "masked_tokens": 123.8125, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 123.8125 + }, + { + "avg_mask_ratio": 0.510216062690597, + "avg_response_length": 257.525, + "avg_student_mask_ratio": 0.510216062690597, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.1337890625, + "kd_loss": 0.17950079924535203, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 142.8125, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 142.8125 + }, + { + "avg_mask_ratio": 0.45074162067612633, + "avg_response_length": 245.525, + "avg_student_mask_ratio": 0.45074162067612633, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.310546875, + "kd_loss": 0.14043198096701417, + "learning_rate": 3e-06, + "loss": 0.1669, + "masked_tokens": 116.0875, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 116.0875 + }, + { + "avg_mask_ratio": 0.4926802407717332, + "avg_response_length": 238.2, + "avg_student_mask_ratio": 0.4926802407717332, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.330078125, + "kd_loss": 0.21977804936059328, + "learning_rate": 3e-06, + "loss": 0.2497, + "masked_tokens": 127.4875, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 127.4875 + }, + { + "avg_mask_ratio": 0.4706261330051348, + "avg_response_length": 251.7375, + "avg_student_mask_ratio": 0.4706261330051348, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.056640625, + "kd_loss": 0.29630907970476983, + "learning_rate": 3e-06, + "loss": 0.2329, + "masked_tokens": 116.8625, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.8625 + }, + { + "avg_mask_ratio": 0.48096118308603764, + "avg_response_length": 262.25, + "avg_student_mask_ratio": 0.48096118308603764, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.12255859375, + "kd_loss": 0.20822112379132704, + "learning_rate": 3e-06, + "loss": 0.186, + "masked_tokens": 132.2, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 132.2 + }, + { + "avg_mask_ratio": 0.4433969090110622, + "avg_response_length": 209.7125, + "avg_student_mask_ratio": 0.4433969090110622, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.123046875, + "kd_loss": 0.14427866181035826, + "learning_rate": 3e-06, + "loss": 0.159, + "masked_tokens": 95.8625, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 95.8625 + }, + { + "avg_mask_ratio": 0.4764250977139454, + "avg_response_length": 226.3875, + "avg_student_mask_ratio": 0.4764250977139454, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.185546875, + "kd_loss": 0.18584371552193701, + "learning_rate": 3e-06, + "loss": 0.1823, + "masked_tokens": 113.95, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 113.95 + }, + { + "avg_mask_ratio": 0.47088071387261154, + "avg_response_length": 233.2125, + "avg_student_mask_ratio": 0.47088071387261154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.2734375, + "kd_loss": 0.22232839192147366, + "learning_rate": 3e-06, + "loss": 0.1961, + "masked_tokens": 116.675, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 116.675 + }, + { + "avg_mask_ratio": 0.4870647343515884, + "avg_response_length": 228.3875, + "avg_student_mask_ratio": 0.4870647343515884, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.25390625, + "kd_loss": 0.25729746209006665, + "learning_rate": 3e-06, + "loss": 0.2306, + "masked_tokens": 114.3625, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 114.3625 + }, + { + "avg_mask_ratio": 0.440834702400025, + "avg_response_length": 209.85, + "avg_student_mask_ratio": 0.440834702400025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.2275390625, + "kd_loss": 0.15747290870124503, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 87.575, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 87.575 + }, + { + "avg_mask_ratio": 0.4660509963519871, + "avg_response_length": 250.9125, + "avg_student_mask_ratio": 0.4660509963519871, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.392578125, + "kd_loss": 0.17299866449352522, + "learning_rate": 3e-06, + "loss": 0.178, + "masked_tokens": 109.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 109.675 + }, + { + "avg_mask_ratio": 0.4507861359976232, + "avg_response_length": 235.875, + "avg_student_mask_ratio": 0.4507861359976232, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2113636662043291, + "learning_rate": 3e-06, + "loss": 0.1795, + "masked_tokens": 106.95, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 106.95 + }, + { + "avg_mask_ratio": 0.4283985076006502, + "avg_response_length": 230.95, + "avg_student_mask_ratio": 0.4283985076006502, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.16015625, + "kd_loss": 0.15304818171161969, + "learning_rate": 3e-06, + "loss": 0.1724, + "masked_tokens": 101.15, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 101.15 + }, + { + "avg_mask_ratio": 0.47474822774529457, + "avg_response_length": 233.1, + "avg_student_mask_ratio": 0.47474822774529457, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.06689453125, + "kd_loss": 0.1363761811357108, + "learning_rate": 3e-06, + "loss": 0.171, + "masked_tokens": 112.725, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 112.725 + }, + { + "avg_mask_ratio": 0.4808142688901474, + "avg_response_length": 238.54761904761904, + "avg_student_mask_ratio": 0.4808142688901474, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.22379483340171732, + "learning_rate": 3e-06, + "loss": 0.2466, + "masked_tokens": 113.67857142857143, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 113.67857142857143 + }, + { + "avg_mask_ratio": 0.47175657459301873, + "avg_response_length": 249.9625, + "avg_student_mask_ratio": 0.47175657459301873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.166015625, + "kd_loss": 0.17491777570117745, + "learning_rate": 3e-06, + "loss": 0.2029, + "masked_tokens": 119.4625, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 119.4625 + }, + { + "avg_mask_ratio": 0.4564988439786248, + "avg_response_length": 238.8875, + "avg_student_mask_ratio": 0.4564988439786248, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.1279296875, + "kd_loss": 0.12884608846077866, + "learning_rate": 3e-06, + "loss": 0.1536, + "masked_tokens": 104.0, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 104.0 + }, + { + "avg_mask_ratio": 0.5083174118888565, + "avg_response_length": 258.1375, + "avg_student_mask_ratio": 0.5083174118888565, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.1357421875, + "kd_loss": 0.18128383785248586, + "learning_rate": 3e-06, + "loss": 0.1811, + "masked_tokens": 133.7125, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 133.7125 + }, + { + "avg_mask_ratio": 0.5130103683215566, + "avg_response_length": 246.4875, + "avg_student_mask_ratio": 0.5130103683215566, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0896, + "grad_norm": 0.11181640625, + "kd_loss": 0.23847924997493805, + "learning_rate": 3e-06, + "loss": 0.2289, + "masked_tokens": 132.5625, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 132.5625 + }, + { + "avg_mask_ratio": 0.4543681625276804, + "avg_response_length": 199.65, + "avg_student_mask_ratio": 0.4543681625276804, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1109333333333333, + "grad_norm": 0.1533203125, + "kd_loss": 0.1353249137787543, + "learning_rate": 3e-06, + "loss": 0.1403, + "masked_tokens": 87.9875, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 87.9875 + }, + { + "avg_mask_ratio": 0.46718079667771234, + "avg_response_length": 218.0875, + "avg_student_mask_ratio": 0.46718079667771234, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1322666666666668, + "grad_norm": 0.2109375, + "kd_loss": 0.15268151032492625, + "learning_rate": 3e-06, + "loss": 0.1789, + "masked_tokens": 101.3875, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 101.3875 + }, + { + "avg_mask_ratio": 0.4890203754650429, + "avg_response_length": 244.3875, + "avg_student_mask_ratio": 0.4890203754650429, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1536, + "grad_norm": 0.27734375, + "kd_loss": 0.17835129436630268, + "learning_rate": 3e-06, + "loss": 0.2173, + "masked_tokens": 116.175, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 116.175 + }, + { + "avg_mask_ratio": 0.45064474650425834, + "avg_response_length": 217.8375, + "avg_student_mask_ratio": 0.45064474650425834, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1749333333333334, + "grad_norm": 0.17578125, + "kd_loss": 0.16049880692362706, + "learning_rate": 3e-06, + "loss": 0.1855, + "masked_tokens": 98.6375, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 98.6375 + }, + { + "avg_mask_ratio": 0.3844255942822201, + "avg_response_length": 240.825, + "avg_student_mask_ratio": 0.3844255942822201, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1962666666666666, + "grad_norm": 0.5859375, + "kd_loss": 0.17605857607457268, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 96.85, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 96.85 + }, + { + "avg_mask_ratio": 0.45103558609262107, + "avg_response_length": 231.025, + "avg_student_mask_ratio": 0.45103558609262107, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2176, + "grad_norm": 0.234375, + "kd_loss": 0.1386162672638477, + "learning_rate": 3e-06, + "loss": 0.1681, + "masked_tokens": 105.6875, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 105.6875 + }, + { + "avg_mask_ratio": 0.47033366551622746, + "avg_response_length": 248.3875, + "avg_student_mask_ratio": 0.47033366551622746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2389333333333332, + "grad_norm": 0.26953125, + "kd_loss": 0.17702910760917803, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 125.05, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 125.05 + }, + { + "avg_mask_ratio": 0.5230229062028229, + "avg_response_length": 241.8125, + "avg_student_mask_ratio": 0.5230229062028229, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2602666666666666, + "grad_norm": 0.20703125, + "kd_loss": 0.22054996666956866, + "learning_rate": 3e-06, + "loss": 0.2233, + "masked_tokens": 129.0, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 129.0 + }, + { + "avg_mask_ratio": 0.44929012526990847, + "avg_response_length": 246.7375, + "avg_student_mask_ratio": 0.44929012526990847, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2816, + "grad_norm": 0.212890625, + "kd_loss": 0.15257543138572202, + "learning_rate": 3e-06, + "loss": 0.1581, + "masked_tokens": 113.9375, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 113.9375 + }, + { + "avg_mask_ratio": 0.5061312943696976, + "avg_response_length": 237.175, + "avg_student_mask_ratio": 0.5061312943696976, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3029333333333333, + "grad_norm": 0.06884765625, + "kd_loss": 0.21158275993875578, + "learning_rate": 3e-06, + "loss": 0.1942, + "masked_tokens": 116.675, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 116.675 + }, + { + "avg_mask_ratio": 0.4925117701757699, + "avg_response_length": 233.2625, + "avg_student_mask_ratio": 0.4925117701757699, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3242666666666667, + "grad_norm": 0.1787109375, + "kd_loss": 0.1805886138310143, + "learning_rate": 3e-06, + "loss": 0.1744, + "masked_tokens": 111.6875, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 111.6875 + }, + { + "avg_mask_ratio": 0.5307835865532979, + "avg_response_length": 249.65, + "avg_student_mask_ratio": 0.5307835865532979, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3456000000000001, + "grad_norm": 0.28125, + "kd_loss": 0.25403604302136956, + "learning_rate": 3e-06, + "loss": 0.2627, + "masked_tokens": 126.6875, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 126.6875 + }, + { + "avg_mask_ratio": 0.48631439574528484, + "avg_response_length": 241.475, + "avg_student_mask_ratio": 0.48631439574528484, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3669333333333333, + "grad_norm": 0.2412109375, + "kd_loss": 0.1642333460577902, + "learning_rate": 3e-06, + "loss": 0.1731, + "masked_tokens": 122.1125, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 122.1125 + }, + { + "avg_mask_ratio": 0.5248487972887232, + "avg_response_length": 231.5375, + "avg_student_mask_ratio": 0.5248487972887232, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3882666666666665, + "grad_norm": 0.193359375, + "kd_loss": 0.2508082524812494, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 127.175, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 127.175 + }, + { + "avg_mask_ratio": 0.48489007767057046, + "avg_response_length": 246.0125, + "avg_student_mask_ratio": 0.48489007767057046, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4096, + "grad_norm": 0.28125, + "kd_loss": 0.18781521328146483, + "learning_rate": 3e-06, + "loss": 0.2045, + "masked_tokens": 125.225, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 125.225 + }, + { + "avg_mask_ratio": 0.4605832444969565, + "avg_response_length": 244.5, + "avg_student_mask_ratio": 0.4605832444969565, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4309333333333334, + "grad_norm": 0.19140625, + "kd_loss": 0.1806626110754223, + "learning_rate": 3e-06, + "loss": 0.1702, + "masked_tokens": 120.825, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 120.825 + }, + { + "avg_mask_ratio": 0.4662455078505445, + "avg_response_length": 244.6125, + "avg_student_mask_ratio": 0.4662455078505445, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4522666666666666, + "grad_norm": 0.16796875, + "kd_loss": 0.20038694294766798, + "learning_rate": 3e-06, + "loss": 0.1824, + "masked_tokens": 114.2, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 114.2 + }, + { + "avg_mask_ratio": 0.4820630593923852, + "avg_response_length": 217.3, + "avg_student_mask_ratio": 0.4820630593923852, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4736, + "grad_norm": 0.11279296875, + "kd_loss": 0.16563009086588637, + "learning_rate": 3e-06, + "loss": 0.17, + "masked_tokens": 118.4875, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 118.4875 + }, + { + "avg_mask_ratio": 0.5206489040749147, + "avg_response_length": 216.45, + "avg_student_mask_ratio": 0.5206489040749147, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4949333333333334, + "grad_norm": 0.236328125, + "kd_loss": 0.23649522811845144, + "learning_rate": 3e-06, + "loss": 0.2241, + "masked_tokens": 118.4375, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 118.4375 + }, + { + "avg_mask_ratio": 0.4955552547937259, + "avg_response_length": 211.175, + "avg_student_mask_ratio": 0.4955552547937259, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5162666666666667, + "grad_norm": 0.275390625, + "kd_loss": 0.23970817765721222, + "learning_rate": 3e-06, + "loss": 0.2184, + "masked_tokens": 112.3375, + "mean_t": 0.5135623761918395, + "step": 710, + "student_masked_tokens": 112.3375 + }, + { + "avg_mask_ratio": 0.4820543818641454, + "avg_response_length": 229.1375, + "avg_student_mask_ratio": 0.4820543818641454, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5375999999999999, + "grad_norm": 0.435546875, + "kd_loss": 0.18955910701470202, + "learning_rate": 3e-06, + "loss": 0.1978, + "masked_tokens": 114.55, + "mean_t": 0.5117021896177902, + "step": 720, + "student_masked_tokens": 114.55 + }, + { + "avg_mask_ratio": 0.4605119539948646, + "avg_response_length": 245.5625, + "avg_student_mask_ratio": 0.4605119539948646, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5589333333333333, + "grad_norm": 0.1103515625, + "kd_loss": 0.16859328244926958, + "learning_rate": 3e-06, + "loss": 0.1779, + "masked_tokens": 113.85, + "mean_t": 0.49050743713742123, + "step": 730, + "student_masked_tokens": 113.85 + }, + { + "avg_mask_ratio": 0.5134038798511028, + "avg_response_length": 194.125, + "avg_student_mask_ratio": 0.5134038798511028, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5802666666666667, + "grad_norm": 0.0830078125, + "kd_loss": 0.17122714665274544, + "learning_rate": 3e-06, + "loss": 0.2018, + "masked_tokens": 102.9375, + "mean_t": 0.5505168779753149, + "step": 740, + "student_masked_tokens": 102.9375 + }, + { + "avg_mask_ratio": 0.4201362137740944, + "avg_response_length": 223.55, + "avg_student_mask_ratio": 0.4201362137740944, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6016, + "grad_norm": 0.16796875, + "kd_loss": 0.19197621339357057, + "learning_rate": 3e-06, + "loss": 0.1792, + "masked_tokens": 94.7125, + "mean_t": 0.4437690361432033, + "step": 750, + "student_masked_tokens": 94.7125 + }, + { + "avg_mask_ratio": 0.46924527404480615, + "avg_response_length": 227.35, + "avg_student_mask_ratio": 0.46924527404480615, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6229333333333333, + "grad_norm": 0.8125, + "kd_loss": 0.24938117066874382, + "learning_rate": 3e-06, + "loss": 0.2591, + "masked_tokens": 107.5375, + "mean_t": 0.49132869170280175, + "step": 760, + "student_masked_tokens": 107.5375 + }, + { + "avg_mask_ratio": 0.445551612455165, + "avg_response_length": 268.6625, + "avg_student_mask_ratio": 0.445551612455165, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6442666666666668, + "grad_norm": 0.1435546875, + "kd_loss": 0.1928954417056957, + "learning_rate": 3e-06, + "loss": 0.1563, + "masked_tokens": 124.0875, + "mean_t": 0.4667695587326307, + "step": 770, + "student_masked_tokens": 124.0875 + }, + { + "avg_mask_ratio": 0.4321410794305848, + "avg_response_length": 256.1625, + "avg_student_mask_ratio": 0.4321410794305848, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6656, + "grad_norm": 0.2216796875, + "kd_loss": 0.15059620087446887, + "learning_rate": 3e-06, + "loss": 0.1534, + "masked_tokens": 117.7, + "mean_t": 0.44942845597106496, + "step": 780, + "student_masked_tokens": 117.7 + }, + { + "avg_mask_ratio": 0.4697655299096368, + "avg_response_length": 240.6125, + "avg_student_mask_ratio": 0.4697655299096368, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6869333333333332, + "grad_norm": 0.255859375, + "kd_loss": 0.16427693545232777, + "learning_rate": 3e-06, + "loss": 0.1489, + "masked_tokens": 113.5375, + "mean_t": 0.4993515375303105, + "step": 790, + "student_masked_tokens": 113.5375 + }, + { + "avg_mask_ratio": 0.4635992758907378, + "avg_response_length": 224.15, + "avg_student_mask_ratio": 0.4635992758907378, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7082666666666668, + "grad_norm": 0.197265625, + "kd_loss": 0.15171801659575976, + "learning_rate": 3e-06, + "loss": 0.1526, + "masked_tokens": 107.5, + "mean_t": 0.500370389316231, + "step": 800, + "student_masked_tokens": 107.5 + }, + { + "avg_mask_ratio": 0.5018501321552321, + "avg_response_length": 235.1625, + "avg_student_mask_ratio": 0.5018501321552321, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7296, + "grad_norm": 0.1630859375, + "kd_loss": 0.18931926304685476, + "learning_rate": 3e-06, + "loss": 0.2031, + "masked_tokens": 116.6125, + "mean_t": 0.5317009104182944, + "step": 810, + "student_masked_tokens": 116.6125 + }, + { + "avg_mask_ratio": 0.5050785383209586, + "avg_response_length": 207.6875, + "avg_student_mask_ratio": 0.5050785383209586, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7509333333333332, + "grad_norm": 0.5546875, + "kd_loss": 0.24824176937403308, + "learning_rate": 3e-06, + "loss": 0.2566, + "masked_tokens": 119.175, + "mean_t": 0.5392061032878701, + "step": 820, + "student_masked_tokens": 119.175 + }, + { + "avg_mask_ratio": 0.4980328972451389, + "avg_response_length": 270.325, + "avg_student_mask_ratio": 0.4980328972451389, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7722666666666667, + "grad_norm": 0.09912109375, + "kd_loss": 0.1924194690429431, + "learning_rate": 3e-06, + "loss": 0.2006, + "masked_tokens": 141.925, + "mean_t": 0.5238314627087675, + "step": 830, + "student_masked_tokens": 141.925 + }, + { + "avg_mask_ratio": 0.493249478796497, + "avg_response_length": 226.025, + "avg_student_mask_ratio": 0.493249478796497, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7936, + "grad_norm": 0.158203125, + "kd_loss": 0.15751813794203606, + "learning_rate": 3e-06, + "loss": 0.1624, + "masked_tokens": 118.825, + "mean_t": 0.5301066277665086, + "step": 840, + "student_masked_tokens": 118.825 + }, + { + "avg_mask_ratio": 0.5009000841644593, + "avg_response_length": 233.025, + "avg_student_mask_ratio": 0.5009000841644593, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8149333333333333, + "grad_norm": 0.10205078125, + "kd_loss": 0.1860738446495816, + "learning_rate": 3e-06, + "loss": 0.1737, + "masked_tokens": 117.3125, + "mean_t": 0.5343429344706238, + "step": 850, + "student_masked_tokens": 117.3125 + }, + { + "avg_mask_ratio": 0.46293387678451836, + "avg_response_length": 231.3625, + "avg_student_mask_ratio": 0.46293387678451836, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8362666666666667, + "grad_norm": 0.1103515625, + "kd_loss": 0.19740513321539765, + "learning_rate": 3e-06, + "loss": 0.1841, + "masked_tokens": 110.5625, + "mean_t": 0.4791536889737472, + "step": 860, + "student_masked_tokens": 110.5625 + }, + { + "avg_mask_ratio": 0.4846805231412873, + "avg_response_length": 220.7375, + "avg_student_mask_ratio": 0.4846805231412873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8576000000000001, + "grad_norm": 0.228515625, + "kd_loss": 0.19436422403705364, + "learning_rate": 3e-06, + "loss": 0.2012, + "masked_tokens": 113.7625, + "mean_t": 0.5203817339061061, + "step": 870, + "student_masked_tokens": 113.7625 + }, + { + "avg_mask_ratio": 0.4508363194297999, + "avg_response_length": 203.2875, + "avg_student_mask_ratio": 0.4508363194297999, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8789333333333333, + "grad_norm": 0.1962890625, + "kd_loss": 0.16288868402702406, + "learning_rate": 3e-06, + "loss": 0.1845, + "masked_tokens": 95.3875, + "mean_t": 0.4875184997683391, + "step": 880, + "student_masked_tokens": 95.3875 + }, + { + "avg_mask_ratio": 0.43862658384023234, + "avg_response_length": 227.9375, + "avg_student_mask_ratio": 0.43862658384023234, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9002666666666665, + "grad_norm": 0.08251953125, + "kd_loss": 0.11281866748422545, + "learning_rate": 3e-06, + "loss": 0.142, + "masked_tokens": 101.2625, + "mean_t": 0.4766692223958671, + "step": 890, + "student_masked_tokens": 101.2625 + }, + { + "avg_mask_ratio": 0.44909207145101393, + "avg_response_length": 237.2375, + "avg_student_mask_ratio": 0.44909207145101393, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9216, + "grad_norm": 0.1591796875, + "kd_loss": 0.15684176656744747, + "learning_rate": 3e-06, + "loss": 0.1737, + "masked_tokens": 103.7, + "mean_t": 0.487134758150205, + "step": 900, + "student_masked_tokens": 103.7 + }, + { + "avg_mask_ratio": 0.47512493645772336, + "avg_response_length": 246.175, + "avg_student_mask_ratio": 0.47512493645772336, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9429333333333334, + "grad_norm": 0.1982421875, + "kd_loss": 0.16909069961529893, + "learning_rate": 3e-06, + "loss": 0.1717, + "masked_tokens": 117.2125, + "mean_t": 0.5027793228859082, + "step": 910, + "student_masked_tokens": 117.2125 + }, + { + "avg_mask_ratio": 0.46720519906957636, + "avg_response_length": 229.0875, + "avg_student_mask_ratio": 0.46720519906957636, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9642666666666666, + "grad_norm": 0.203125, + "kd_loss": 0.17078794931742322, + "learning_rate": 3e-06, + "loss": 0.1844, + "masked_tokens": 115.325, + "mean_t": 0.49417946098838, + "step": 920, + "student_masked_tokens": 115.325 + }, + { + "avg_mask_ratio": 0.4805813999380916, + "avg_response_length": 236.1625, + "avg_student_mask_ratio": 0.4805813999380916, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9856, + "grad_norm": 0.21484375, + "kd_loss": 0.20637793739751942, + "learning_rate": 3e-06, + "loss": 0.176, + "masked_tokens": 119.65, + "mean_t": 0.5045580042526125, + "step": 930, + "student_masked_tokens": 119.65 + }, + { + "avg_mask_ratio": 0.49833715503059683, + "avg_response_length": 229.39285714285714, + "avg_student_mask_ratio": 0.49833715503059683, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0085333333333333, + "grad_norm": 0.259765625, + "kd_loss": 0.2117233988543549, + "learning_rate": 3e-06, + "loss": 0.2023, + "masked_tokens": 121.5952380952381, + "mean_t": 0.5321138524893849, + "step": 940, + "student_masked_tokens": 121.5952380952381 + }, + { + "avg_mask_ratio": 0.4347397161007393, + "avg_response_length": 226.925, + "avg_student_mask_ratio": 0.4347397161007393, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0298666666666665, + "grad_norm": 0.15234375, + "kd_loss": 0.16830119832033005, + "learning_rate": 3e-06, + "loss": 0.1549, + "masked_tokens": 103.4125, + "mean_t": 0.4632946296595037, + "step": 950, + "student_masked_tokens": 103.4125 + }, + { + "avg_mask_ratio": 0.4989688004134223, + "avg_response_length": 273.3125, + "avg_student_mask_ratio": 0.4989688004134223, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0512, + "grad_norm": 0.123046875, + "kd_loss": 0.2304358719712809, + "learning_rate": 3e-06, + "loss": 0.22, + "masked_tokens": 137.125, + "mean_t": 0.5198000721400604, + "step": 960, + "student_masked_tokens": 137.125 + }, + { + "avg_mask_ratio": 0.4374056361732073, + "avg_response_length": 239.125, + "avg_student_mask_ratio": 0.4374056361732073, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0725333333333333, + "grad_norm": 0.1962890625, + "kd_loss": 0.158095879086882, + "learning_rate": 3e-06, + "loss": 0.1573, + "masked_tokens": 107.4625, + "mean_t": 0.4703940597362816, + "step": 970, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4216216350672767, + "avg_response_length": 236.0125, + "avg_student_mask_ratio": 0.4216216350672767, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0938666666666665, + "grad_norm": 0.177734375, + "kd_loss": 0.15213593625503563, + "learning_rate": 3e-06, + "loss": 0.1305, + "masked_tokens": 103.05, + "mean_t": 0.4511947895749472, + "step": 980, + "student_masked_tokens": 103.05 + }, + { + "avg_mask_ratio": 0.46076959141064433, + "avg_response_length": 257.375, + "avg_student_mask_ratio": 0.46076959141064433, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1152, + "grad_norm": 0.1484375, + "kd_loss": 0.18778659292394478, + "learning_rate": 3e-06, + "loss": 0.1727, + "masked_tokens": 121.2125, + "mean_t": 0.4923786667350214, + "step": 990, + "student_masked_tokens": 121.2125 + }, + { + "avg_mask_ratio": 0.45376153094694016, + "avg_response_length": 223.4125, + "avg_student_mask_ratio": 0.45376153094694016, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1365333333333334, + "grad_norm": 0.10498046875, + "kd_loss": 0.16195435594947866, + "learning_rate": 3e-06, + "loss": 0.1554, + "masked_tokens": 102.175, + "mean_t": 0.4773523230338469, + "step": 1000, + "student_masked_tokens": 102.175 + }, + { + "avg_mask_ratio": 0.4360893844394013, + "avg_response_length": 230.8375, + "avg_student_mask_ratio": 0.4360893844394013, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1578666666666666, + "grad_norm": 0.09130859375, + "kd_loss": 0.15129900982686878, + "learning_rate": 3e-06, + "loss": 0.1525, + "masked_tokens": 95.2875, + "mean_t": 0.4648138735938119, + "step": 1010, + "student_masked_tokens": 95.2875 + }, + { + "avg_mask_ratio": 0.5085378198185936, + "avg_response_length": 228.975, + "avg_student_mask_ratio": 0.5085378198185936, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1792, + "grad_norm": 0.1279296875, + "kd_loss": 0.1835148020709127, + "learning_rate": 3e-06, + "loss": 0.2059, + "masked_tokens": 118.825, + "mean_t": 0.5327763411332853, + "step": 1020, + "student_masked_tokens": 118.825 + }, + { + "avg_mask_ratio": 0.4720251938910224, + "avg_response_length": 237.7125, + "avg_student_mask_ratio": 0.4720251938910224, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2005333333333335, + "grad_norm": 0.27734375, + "kd_loss": 0.19297441139430588, + "learning_rate": 3e-06, + "loss": 0.195, + "masked_tokens": 113.1, + "mean_t": 0.5033508580760099, + "step": 1030, + "student_masked_tokens": 113.1 + }, + { + "avg_mask_ratio": 0.4877920758561231, + "avg_response_length": 206.2875, + "avg_student_mask_ratio": 0.4877920758561231, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2218666666666667, + "grad_norm": 0.1865234375, + "kd_loss": 0.18438395577541086, + "learning_rate": 3e-06, + "loss": 0.1806, + "masked_tokens": 99.8625, + "mean_t": 0.5349024560535327, + "step": 1040, + "student_masked_tokens": 99.8625 + }, + { + "avg_mask_ratio": 0.5081706921104342, + "avg_response_length": 237.05, + "avg_student_mask_ratio": 0.5081706921104342, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2432, + "grad_norm": 0.158203125, + "kd_loss": 0.1950792422781717, + "learning_rate": 3e-06, + "loss": 0.1941, + "masked_tokens": 122.7375, + "mean_t": 0.5457118917722255, + "step": 1050, + "student_masked_tokens": 122.7375 + }, + { + "avg_mask_ratio": 0.4656533743953332, + "avg_response_length": 259.25, + "avg_student_mask_ratio": 0.4656533743953332, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2645333333333335, + "grad_norm": 0.1552734375, + "kd_loss": 0.14457174434564876, + "learning_rate": 3e-06, + "loss": 0.1245, + "masked_tokens": 124.0875, + "mean_t": 0.48194136443780733, + "step": 1060, + "student_masked_tokens": 124.0875 + }, + { + "avg_mask_ratio": 0.4821103142516222, + "avg_response_length": 265.35, + "avg_student_mask_ratio": 0.4821103142516222, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2858666666666667, + "grad_norm": 0.146484375, + "kd_loss": 0.1856000915321232, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 142.1875, + "mean_t": 0.5015889146190602, + "step": 1070, + "student_masked_tokens": 142.1875 + }, + { + "avg_mask_ratio": 0.461884257895872, + "avg_response_length": 232.4, + "avg_student_mask_ratio": 0.461884257895872, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.3072, + "grad_norm": 0.09033203125, + "kd_loss": 0.16016541003627935, + "learning_rate": 3e-06, + "loss": 0.1633, + "masked_tokens": 106.725, + "mean_t": 0.4983203248586506, + "step": 1080, + "student_masked_tokens": 106.725 + }, + { + "avg_mask_ratio": 0.4401097826776095, + "avg_response_length": 207.325, + "avg_student_mask_ratio": 0.4401097826776095, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.3285333333333336, + "grad_norm": 0.0908203125, + "kd_loss": 0.1443735401110633, + "learning_rate": 3e-06, + "loss": 0.1676, + "masked_tokens": 88.5375, + "mean_t": 0.47094749807147307, + "step": 1090, + "student_masked_tokens": 88.5375 + }, + { + "avg_mask_ratio": 0.5224281516042538, + "avg_response_length": 257.675, + "avg_student_mask_ratio": 0.5224281516042538, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.3498666666666668, + "grad_norm": 0.1953125, + "kd_loss": 0.18995505797054618, + "learning_rate": 3e-06, + "loss": 0.2181, + "masked_tokens": 142.15, + "mean_t": 0.5531192034482956, + "step": 1100, + "student_masked_tokens": 142.15 + }, + { + "avg_mask_ratio": 0.4417481235635933, + "avg_response_length": 225.2625, + "avg_student_mask_ratio": 0.4417481235635933, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.3712, + "grad_norm": 0.56640625, + "kd_loss": 0.14014514066888067, + "learning_rate": 3e-06, + "loss": 0.1626, + "masked_tokens": 101.725, + "mean_t": 0.4757364276825683, + "step": 1110, + "student_masked_tokens": 101.725 + }, + { + "avg_mask_ratio": 0.4719216389814392, + "avg_response_length": 245.7875, + "avg_student_mask_ratio": 0.4719216389814392, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.392533333333333, + "grad_norm": 0.2109375, + "kd_loss": 0.16249904381531924, + "learning_rate": 3e-06, + "loss": 0.1673, + "masked_tokens": 112.05, + "mean_t": 0.5013068238971755, + "step": 1120, + "student_masked_tokens": 112.05 + }, + { + "avg_mask_ratio": 0.5034189606667496, + "avg_response_length": 249.6625, + "avg_student_mask_ratio": 0.5034189606667496, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.413866666666667, + "grad_norm": 0.28125, + "kd_loss": 0.18220705530329723, + "learning_rate": 3e-06, + "loss": 0.1841, + "masked_tokens": 129.0125, + "mean_t": 0.5303254407714121, + "step": 1130, + "student_masked_tokens": 129.0125 + }, + { + "avg_mask_ratio": 0.4614338358165696, + "avg_response_length": 210.7125, + "avg_student_mask_ratio": 0.4614338358165696, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.4352, + "grad_norm": 0.41015625, + "kd_loss": 0.15378471011499642, + "learning_rate": 3e-06, + "loss": 0.1687, + "masked_tokens": 102.1875, + "mean_t": 0.4845335395424627, + "step": 1140, + "student_masked_tokens": 102.1875 + }, + { + "avg_mask_ratio": 0.5357818282442167, + "avg_response_length": 229.425, + "avg_student_mask_ratio": 0.5357818282442167, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.4565333333333332, + "grad_norm": 0.2197265625, + "kd_loss": 0.2261572911098483, + "learning_rate": 3e-06, + "loss": 0.2287, + "masked_tokens": 123.675, + "mean_t": 0.5690932425903157, + "step": 1150, + "student_masked_tokens": 123.675 + }, + { + "avg_mask_ratio": 0.47289583514211697, + "avg_response_length": 244.3, + "avg_student_mask_ratio": 0.47289583514211697, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.4778666666666664, + "grad_norm": 0.205078125, + "kd_loss": 0.1848451584734164, + "learning_rate": 3e-06, + "loss": 0.158, + "masked_tokens": 107.175, + "mean_t": 0.5040684466948733, + "step": 1160, + "student_masked_tokens": 107.175 + }, + { + "avg_mask_ratio": 0.48331039408221843, + "avg_response_length": 254.9, + "avg_student_mask_ratio": 0.48331039408221843, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.4992, + "grad_norm": 0.27734375, + "kd_loss": 0.18950699392153184, + "learning_rate": 3e-06, + "loss": 0.1895, + "masked_tokens": 120.575, + "mean_t": 0.5114516971167177, + "step": 1170, + "student_masked_tokens": 120.575 + }, + { + "avg_mask_ratio": 0.4191652584122494, + "avg_response_length": 214.3, + "avg_student_mask_ratio": 0.4191652584122494, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.5205333333333333, + "grad_norm": 0.16015625, + "kd_loss": 0.12068161078326511, + "learning_rate": 3e-06, + "loss": 0.143, + "masked_tokens": 86.8625, + "mean_t": 0.4491677140351385, + "step": 1180, + "student_masked_tokens": 86.8625 + }, + { + "avg_mask_ratio": 0.520819503068924, + "avg_response_length": 234.775, + "avg_student_mask_ratio": 0.520819503068924, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.5418666666666665, + "grad_norm": 0.271484375, + "kd_loss": 0.2008784212745013, + "learning_rate": 3e-06, + "loss": 0.2037, + "masked_tokens": 131.8, + "mean_t": 0.5590635397238657, + "step": 1190, + "student_masked_tokens": 131.8 + }, + { + "avg_mask_ratio": 0.4767530772660393, + "avg_response_length": 233.275, + "avg_student_mask_ratio": 0.4767530772660393, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.5632, + "grad_norm": 0.1396484375, + "kd_loss": 0.16206236403125446, + "learning_rate": 3e-06, + "loss": 0.1771, + "masked_tokens": 106.375, + "mean_t": 0.506370971655997, + "step": 1200, + "student_masked_tokens": 106.375 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/training_args.bin b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7e94a11824a7a1de5f3a0a00320426e3e4de0eff --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c347df37da6e348160afc3fbb65d12595e5064bd8e5bfd591004e5e86a703f42 +size 7992 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/README.md b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/adapter_config.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..aebf9ea6690ed4ecc23ae3af9402e39470fff9f3 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/adapter_model.safetensors b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cd8567a766d6035181ee71785c07fa507f2777c7 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6152a1f978ab9113396a93370fd3f8dff0188cda94938a210d3f30daf65456e +size 2406624648 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/optimizer.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..10d51e0dacc9af07792d2194802736facde1a408 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b98b21ae1fd37544eb0e71be8c56c48ef31d5d2dc1f526a72c69358de436c2b7 +size 671304442 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/rng_state_0.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..bcef25425b3ac75f5de5c05e282e6102c4db73a1 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:976572fc3dc5a068a2dabeece6d63c59a59ca329314c851e41f3c3eb08cfd244 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/rng_state_1.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..b8d56bcc74f1717d5097f548185f88d31688ccf1 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60246bba8f414ee9c88a354a9ea815a89b0ca4caa98573b704dc68fdff12dea3 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/scheduler.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..421781b8dda6971ad78c51f1dc130f1fff19ce51 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e3f0e18fd4ce38e61410a1f0e851c2762584e71a80ec7ce0bc5150325adcecc +size 1064 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/trainer_state.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..42c00cc3786d2acdc8c96031fabf8b3de089afd7 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/trainer_state.json @@ -0,0 +1,513 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4266666666666667, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4933756332669873, + "avg_response_length": 252.0625, + "avg_student_mask_ratio": 0.4933756332669873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1650390625, + "kd_loss": 0.24795629351958723, + "learning_rate": 3e-06, + "loss": 0.2758, + "masked_tokens": 120.975, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 120.975 + }, + { + "avg_mask_ratio": 0.41923869140446185, + "avg_response_length": 221.7125, + "avg_student_mask_ratio": 0.41923869140446185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.388671875, + "kd_loss": 0.21509853232191353, + "learning_rate": 3e-06, + "loss": 0.2046, + "masked_tokens": 86.1875, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 86.1875 + }, + { + "avg_mask_ratio": 0.4542569225654006, + "avg_response_length": 231.45, + "avg_student_mask_ratio": 0.4542569225654006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.52734375, + "kd_loss": 0.19756361616970963, + "learning_rate": 3e-06, + "loss": 0.1976, + "masked_tokens": 116.2875, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 116.2875 + }, + { + "avg_mask_ratio": 0.41855402445653456, + "avg_response_length": 214.125, + "avg_student_mask_ratio": 0.41855402445653456, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.263671875, + "kd_loss": 0.1576978968325534, + "learning_rate": 3e-06, + "loss": 0.1551, + "masked_tokens": 94.225, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 94.225 + }, + { + "avg_mask_ratio": 0.4331560767372139, + "avg_response_length": 222.225, + "avg_student_mask_ratio": 0.4331560767372139, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.125, + "kd_loss": 0.17712681048956255, + "learning_rate": 3e-06, + "loss": 0.1648, + "masked_tokens": 97.825, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 97.825 + }, + { + "avg_mask_ratio": 0.4547638618387282, + "avg_response_length": 242.9, + "avg_student_mask_ratio": 0.4547638618387282, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.130859375, + "kd_loss": 0.1745696667137963, + "learning_rate": 3e-06, + "loss": 0.1745, + "masked_tokens": 119.125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.125 + }, + { + "avg_mask_ratio": 0.5151988173020072, + "avg_response_length": 214.3375, + "avg_student_mask_ratio": 0.5151988173020072, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.17578125, + "kd_loss": 0.1657758254527316, + "learning_rate": 3e-06, + "loss": 0.1972, + "masked_tokens": 111.7875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 111.7875 + }, + { + "avg_mask_ratio": 0.37937068473547697, + "avg_response_length": 250.2, + "avg_student_mask_ratio": 0.37937068473547697, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.051513671875, + "kd_loss": 0.13968983994418097, + "learning_rate": 3e-06, + "loss": 0.1369, + "masked_tokens": 95.1875, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 95.1875 + }, + { + "avg_mask_ratio": 0.5006106478627771, + "avg_response_length": 242.1125, + "avg_student_mask_ratio": 0.5006106478627771, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.22265625, + "kd_loss": 0.20869405062871707, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 116.3875, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 116.3875 + }, + { + "avg_mask_ratio": 0.4596128500183113, + "avg_response_length": 229.0625, + "avg_student_mask_ratio": 0.4596128500183113, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.1865234375, + "kd_loss": 0.17640120884607313, + "learning_rate": 3e-06, + "loss": 0.1864, + "masked_tokens": 109.7125, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 109.7125 + }, + { + "avg_mask_ratio": 0.4920400592498481, + "avg_response_length": 229.9875, + "avg_student_mask_ratio": 0.4920400592498481, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.326171875, + "kd_loss": 0.2011610215539008, + "learning_rate": 3e-06, + "loss": 0.2334, + "masked_tokens": 109.4, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 109.4 + }, + { + "avg_mask_ratio": 0.450224511185661, + "avg_response_length": 229.225, + "avg_student_mask_ratio": 0.450224511185661, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.171875, + "kd_loss": 0.21365654302464918, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 102.6375, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 102.6375 + }, + { + "avg_mask_ratio": 0.4397759020910598, + "avg_response_length": 214.15, + "avg_student_mask_ratio": 0.4397759020910598, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.173828125, + "kd_loss": 0.13860440934267615, + "learning_rate": 3e-06, + "loss": 0.1362, + "masked_tokens": 98.4, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 98.4 + }, + { + "avg_mask_ratio": 0.4890626976499334, + "avg_response_length": 242.1625, + "avg_student_mask_ratio": 0.4890626976499334, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.27734375, + "kd_loss": 0.2106460814328841, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 129.725, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 129.725 + }, + { + "avg_mask_ratio": 0.47298577734036373, + "avg_response_length": 262.9875, + "avg_student_mask_ratio": 0.47298577734036373, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.0673828125, + "kd_loss": 0.2408599746217078, + "learning_rate": 3e-06, + "loss": 0.2276, + "masked_tokens": 128.375, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 128.375 + }, + { + "avg_mask_ratio": 0.5043223856599071, + "avg_response_length": 217.5, + "avg_student_mask_ratio": 0.5043223856599071, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.20484722793833043, + "learning_rate": 3e-06, + "loss": 0.2154, + "masked_tokens": 106.025, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 106.025 + }, + { + "avg_mask_ratio": 0.48419030708028005, + "avg_response_length": 196.2625, + "avg_student_mask_ratio": 0.48419030708028005, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.1611328125, + "kd_loss": 0.20407032655223248, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 96.2625, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 96.2625 + }, + { + "avg_mask_ratio": 0.4485494759515859, + "avg_response_length": 228.8625, + "avg_student_mask_ratio": 0.4485494759515859, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.14453125, + "kd_loss": 0.16957379382825621, + "learning_rate": 3e-06, + "loss": 0.1796, + "masked_tokens": 101.275, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 101.275 + }, + { + "avg_mask_ratio": 0.48665878190658984, + "avg_response_length": 230.5, + "avg_student_mask_ratio": 0.48665878190658984, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.3515625, + "kd_loss": 0.22313492714965832, + "learning_rate": 3e-06, + "loss": 0.2112, + "masked_tokens": 107.975, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 107.975 + }, + { + "avg_mask_ratio": 0.4670982737792656, + "avg_response_length": 210.9125, + "avg_student_mask_ratio": 0.4670982737792656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.142578125, + "kd_loss": 0.15220829088375468, + "learning_rate": 3e-06, + "loss": 0.1609, + "masked_tokens": 98.2125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 98.2125 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/training_args.bin b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7e94a11824a7a1de5f3a0a00320426e3e4de0eff --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c347df37da6e348160afc3fbb65d12595e5064bd8e5bfd591004e5e86a703f42 +size 7992 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/README.md b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/adapter_config.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..aebf9ea6690ed4ecc23ae3af9402e39470fff9f3 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/adapter_model.safetensors b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..56165457dbd433e5742b548ac442eeb56e3c43a1 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31580c01ba9f7e42b4e3a77ad2ffeffe859d26996818904ad40a43e9ade88006 +size 2406624648 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/optimizer.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8bfe43f075412338f322e52a3a94b69d520b56f --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64be610340e992bfcd9cce3b1bf82167c267b33aae29fe2cdc293f818f0d359 +size 671304442 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/rng_state_0.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..0a5f8e197a64f3ac262eba66651745eed3305ae9 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77ce3730be60ed30942bd0e98344fad78fcd0dc9cdda7aa8b73af31cde8eadfc +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/rng_state_1.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..25a24dd8c9ed527431a4aedb8d74bfb57d890632 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2603351afdd1223a7cc50d6c4ccf2255952011573dee7c7d36ae39bf32267736 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/scheduler.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b63622572f2f8fd0f5991a8ee55768496dcd77b8 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc305f845008b8f20405e65b1f962cf273957c5abdd0858e9cccb461f9b6d925 +size 1064 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/trainer_state.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..18b68ffcd5f0c1ce3f329d011d87a9bf49867a2a --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/trainer_state.json @@ -0,0 +1,753 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.64, + "eval_steps": 500, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4933756332669873, + "avg_response_length": 252.0625, + "avg_student_mask_ratio": 0.4933756332669873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1650390625, + "kd_loss": 0.24795629351958723, + "learning_rate": 3e-06, + "loss": 0.2758, + "masked_tokens": 120.975, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 120.975 + }, + { + "avg_mask_ratio": 0.41923869140446185, + "avg_response_length": 221.7125, + "avg_student_mask_ratio": 0.41923869140446185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.388671875, + "kd_loss": 0.21509853232191353, + "learning_rate": 3e-06, + "loss": 0.2046, + "masked_tokens": 86.1875, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 86.1875 + }, + { + "avg_mask_ratio": 0.4542569225654006, + "avg_response_length": 231.45, + "avg_student_mask_ratio": 0.4542569225654006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.52734375, + "kd_loss": 0.19756361616970963, + "learning_rate": 3e-06, + "loss": 0.1976, + "masked_tokens": 116.2875, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 116.2875 + }, + { + "avg_mask_ratio": 0.41855402445653456, + "avg_response_length": 214.125, + "avg_student_mask_ratio": 0.41855402445653456, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.263671875, + "kd_loss": 0.1576978968325534, + "learning_rate": 3e-06, + "loss": 0.1551, + "masked_tokens": 94.225, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 94.225 + }, + { + "avg_mask_ratio": 0.4331560767372139, + "avg_response_length": 222.225, + "avg_student_mask_ratio": 0.4331560767372139, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.125, + "kd_loss": 0.17712681048956255, + "learning_rate": 3e-06, + "loss": 0.1648, + "masked_tokens": 97.825, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 97.825 + }, + { + "avg_mask_ratio": 0.4547638618387282, + "avg_response_length": 242.9, + "avg_student_mask_ratio": 0.4547638618387282, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.130859375, + "kd_loss": 0.1745696667137963, + "learning_rate": 3e-06, + "loss": 0.1745, + "masked_tokens": 119.125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.125 + }, + { + "avg_mask_ratio": 0.5151988173020072, + "avg_response_length": 214.3375, + "avg_student_mask_ratio": 0.5151988173020072, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.17578125, + "kd_loss": 0.1657758254527316, + "learning_rate": 3e-06, + "loss": 0.1972, + "masked_tokens": 111.7875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 111.7875 + }, + { + "avg_mask_ratio": 0.37937068473547697, + "avg_response_length": 250.2, + "avg_student_mask_ratio": 0.37937068473547697, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.051513671875, + "kd_loss": 0.13968983994418097, + "learning_rate": 3e-06, + "loss": 0.1369, + "masked_tokens": 95.1875, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 95.1875 + }, + { + "avg_mask_ratio": 0.5006106478627771, + "avg_response_length": 242.1125, + "avg_student_mask_ratio": 0.5006106478627771, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.22265625, + "kd_loss": 0.20869405062871707, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 116.3875, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 116.3875 + }, + { + "avg_mask_ratio": 0.4596128500183113, + "avg_response_length": 229.0625, + "avg_student_mask_ratio": 0.4596128500183113, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.1865234375, + "kd_loss": 0.17640120884607313, + "learning_rate": 3e-06, + "loss": 0.1864, + "masked_tokens": 109.7125, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 109.7125 + }, + { + "avg_mask_ratio": 0.4920400592498481, + "avg_response_length": 229.9875, + "avg_student_mask_ratio": 0.4920400592498481, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.326171875, + "kd_loss": 0.2011610215539008, + "learning_rate": 3e-06, + "loss": 0.2334, + "masked_tokens": 109.4, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 109.4 + }, + { + "avg_mask_ratio": 0.450224511185661, + "avg_response_length": 229.225, + "avg_student_mask_ratio": 0.450224511185661, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.171875, + "kd_loss": 0.21365654302464918, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 102.6375, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 102.6375 + }, + { + "avg_mask_ratio": 0.4397759020910598, + "avg_response_length": 214.15, + "avg_student_mask_ratio": 0.4397759020910598, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.173828125, + "kd_loss": 0.13860440934267615, + "learning_rate": 3e-06, + "loss": 0.1362, + "masked_tokens": 98.4, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 98.4 + }, + { + "avg_mask_ratio": 0.4890626976499334, + "avg_response_length": 242.1625, + "avg_student_mask_ratio": 0.4890626976499334, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.27734375, + "kd_loss": 0.2106460814328841, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 129.725, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 129.725 + }, + { + "avg_mask_ratio": 0.47298577734036373, + "avg_response_length": 262.9875, + "avg_student_mask_ratio": 0.47298577734036373, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.0673828125, + "kd_loss": 0.2408599746217078, + "learning_rate": 3e-06, + "loss": 0.2276, + "masked_tokens": 128.375, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 128.375 + }, + { + "avg_mask_ratio": 0.5043223856599071, + "avg_response_length": 217.5, + "avg_student_mask_ratio": 0.5043223856599071, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.20484722793833043, + "learning_rate": 3e-06, + "loss": 0.2154, + "masked_tokens": 106.025, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 106.025 + }, + { + "avg_mask_ratio": 0.48419030708028005, + "avg_response_length": 196.2625, + "avg_student_mask_ratio": 0.48419030708028005, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.1611328125, + "kd_loss": 0.20407032655223248, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 96.2625, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 96.2625 + }, + { + "avg_mask_ratio": 0.4485494759515859, + "avg_response_length": 228.8625, + "avg_student_mask_ratio": 0.4485494759515859, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.14453125, + "kd_loss": 0.16957379382825621, + "learning_rate": 3e-06, + "loss": 0.1796, + "masked_tokens": 101.275, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 101.275 + }, + { + "avg_mask_ratio": 0.48665878190658984, + "avg_response_length": 230.5, + "avg_student_mask_ratio": 0.48665878190658984, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.3515625, + "kd_loss": 0.22313492714965832, + "learning_rate": 3e-06, + "loss": 0.2112, + "masked_tokens": 107.975, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 107.975 + }, + { + "avg_mask_ratio": 0.4670982737792656, + "avg_response_length": 210.9125, + "avg_student_mask_ratio": 0.4670982737792656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.142578125, + "kd_loss": 0.15220829088375468, + "learning_rate": 3e-06, + "loss": 0.1609, + "masked_tokens": 98.2125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 98.2125 + }, + { + "avg_mask_ratio": 0.4568137794849463, + "avg_response_length": 218.7375, + "avg_student_mask_ratio": 0.4568137794849463, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.1826171875, + "kd_loss": 0.1580252643583208, + "learning_rate": 3e-06, + "loss": 0.1798, + "masked_tokens": 99.15, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 99.15 + }, + { + "avg_mask_ratio": 0.5379857187625021, + "avg_response_length": 252.1375, + "avg_student_mask_ratio": 0.5379857187625021, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.3203125, + "kd_loss": 0.2511090909683844, + "learning_rate": 3e-06, + "loss": 0.2583, + "masked_tokens": 135.4, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 135.4 + }, + { + "avg_mask_ratio": 0.43395056116278286, + "avg_response_length": 245.2625, + "avg_student_mask_ratio": 0.43395056116278286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.1767578125, + "kd_loss": 0.14414861655371852, + "learning_rate": 3e-06, + "loss": 0.1964, + "masked_tokens": 102.5125, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 102.5125 + }, + { + "avg_mask_ratio": 0.46948411157354714, + "avg_response_length": 202.975, + "avg_student_mask_ratio": 0.46948411157354714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.1181640625, + "kd_loss": 0.2197965504183493, + "learning_rate": 3e-06, + "loss": 0.243, + "masked_tokens": 97.0625, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 97.0625 + }, + { + "avg_mask_ratio": 0.44631263689370826, + "avg_response_length": 243.425, + "avg_student_mask_ratio": 0.44631263689370826, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.1064453125, + "kd_loss": 0.2151024747882957, + "learning_rate": 3e-06, + "loss": 0.1892, + "masked_tokens": 107.4625, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4607314572727773, + "avg_response_length": 243.375, + "avg_student_mask_ratio": 0.4607314572727773, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.203125, + "kd_loss": 0.176242933875335, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 110.8875, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.8875 + }, + { + "avg_mask_ratio": 0.5309946444118395, + "avg_response_length": 231.6875, + "avg_student_mask_ratio": 0.5309946444118395, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.208984375, + "kd_loss": 0.26651088480309115, + "learning_rate": 3e-06, + "loss": 0.2828, + "masked_tokens": 123.2875, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 123.2875 + }, + { + "avg_mask_ratio": 0.45879559536697345, + "avg_response_length": 251.8, + "avg_student_mask_ratio": 0.45879559536697345, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.11474609375, + "kd_loss": 0.145786481295454, + "learning_rate": 3e-06, + "loss": 0.1439, + "masked_tokens": 125.425, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 125.425 + }, + { + "avg_mask_ratio": 0.3955249205115251, + "avg_response_length": 238.675, + "avg_student_mask_ratio": 0.3955249205115251, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.057373046875, + "kd_loss": 0.15104623195453543, + "learning_rate": 3e-06, + "loss": 0.1578, + "masked_tokens": 95.2125, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 95.2125 + }, + { + "avg_mask_ratio": 0.4504710016073659, + "avg_response_length": 202.575, + "avg_student_mask_ratio": 0.4504710016073659, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.150390625, + "kd_loss": 0.16011972622800386, + "learning_rate": 3e-06, + "loss": 0.179, + "masked_tokens": 90.3375, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 90.3375 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/training_args.bin b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7e94a11824a7a1de5f3a0a00320426e3e4de0eff --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c347df37da6e348160afc3fbb65d12595e5064bd8e5bfd591004e5e86a703f42 +size 7992 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/README.md b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/adapter_config.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..aebf9ea6690ed4ecc23ae3af9402e39470fff9f3 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/adapter_model.safetensors b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3f3724fcc5946d7def475533dd939eb00556a2cd --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1af9b45c1b19a7cbb44a9746d14388f74aba01f05d88d64c5ba4af9b48bb53f +size 2406624648 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/optimizer.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..232f1bfcfac4162224b4764435f5f38275b8af6e --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4dfb92077d26f0dc1b977cf6c7e9ddee427a0242517949014267ada41caf008 +size 671304442 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/rng_state_0.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c548c20836f8a33535de3d41d4d3652197dbc653 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52434163fd05e4e2013a934eb59dd2abc0d83b170663289db2ed3eaa4aaacc56 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/rng_state_1.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..4c41d8a306bfb256f3df37659ab78209d4d0afb1 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df079b5d90e785daa9524bb8f93da448a75defc3b5b809e34a3bcef79daf3e45 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/scheduler.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c306ef4bb7e6a16b41c020edcc557eaff2f11b3f --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9830a27ccf317f0eed7689e600baef1933674a8f45133ca57c902cf16747aad2 +size 1064 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/trainer_state.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..88b6015f94d5699a4caffeab368b54b3121a74db --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/trainer_state.json @@ -0,0 +1,993 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8533333333333334, + "eval_steps": 500, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4933756332669873, + "avg_response_length": 252.0625, + "avg_student_mask_ratio": 0.4933756332669873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1650390625, + "kd_loss": 0.24795629351958723, + "learning_rate": 3e-06, + "loss": 0.2758, + "masked_tokens": 120.975, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 120.975 + }, + { + "avg_mask_ratio": 0.41923869140446185, + "avg_response_length": 221.7125, + "avg_student_mask_ratio": 0.41923869140446185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.388671875, + "kd_loss": 0.21509853232191353, + "learning_rate": 3e-06, + "loss": 0.2046, + "masked_tokens": 86.1875, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 86.1875 + }, + { + "avg_mask_ratio": 0.4542569225654006, + "avg_response_length": 231.45, + "avg_student_mask_ratio": 0.4542569225654006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.52734375, + "kd_loss": 0.19756361616970963, + "learning_rate": 3e-06, + "loss": 0.1976, + "masked_tokens": 116.2875, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 116.2875 + }, + { + "avg_mask_ratio": 0.41855402445653456, + "avg_response_length": 214.125, + "avg_student_mask_ratio": 0.41855402445653456, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.263671875, + "kd_loss": 0.1576978968325534, + "learning_rate": 3e-06, + "loss": 0.1551, + "masked_tokens": 94.225, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 94.225 + }, + { + "avg_mask_ratio": 0.4331560767372139, + "avg_response_length": 222.225, + "avg_student_mask_ratio": 0.4331560767372139, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.125, + "kd_loss": 0.17712681048956255, + "learning_rate": 3e-06, + "loss": 0.1648, + "masked_tokens": 97.825, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 97.825 + }, + { + "avg_mask_ratio": 0.4547638618387282, + "avg_response_length": 242.9, + "avg_student_mask_ratio": 0.4547638618387282, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.130859375, + "kd_loss": 0.1745696667137963, + "learning_rate": 3e-06, + "loss": 0.1745, + "masked_tokens": 119.125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.125 + }, + { + "avg_mask_ratio": 0.5151988173020072, + "avg_response_length": 214.3375, + "avg_student_mask_ratio": 0.5151988173020072, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.17578125, + "kd_loss": 0.1657758254527316, + "learning_rate": 3e-06, + "loss": 0.1972, + "masked_tokens": 111.7875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 111.7875 + }, + { + "avg_mask_ratio": 0.37937068473547697, + "avg_response_length": 250.2, + "avg_student_mask_ratio": 0.37937068473547697, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.051513671875, + "kd_loss": 0.13968983994418097, + "learning_rate": 3e-06, + "loss": 0.1369, + "masked_tokens": 95.1875, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 95.1875 + }, + { + "avg_mask_ratio": 0.5006106478627771, + "avg_response_length": 242.1125, + "avg_student_mask_ratio": 0.5006106478627771, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.22265625, + "kd_loss": 0.20869405062871707, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 116.3875, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 116.3875 + }, + { + "avg_mask_ratio": 0.4596128500183113, + "avg_response_length": 229.0625, + "avg_student_mask_ratio": 0.4596128500183113, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.1865234375, + "kd_loss": 0.17640120884607313, + "learning_rate": 3e-06, + "loss": 0.1864, + "masked_tokens": 109.7125, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 109.7125 + }, + { + "avg_mask_ratio": 0.4920400592498481, + "avg_response_length": 229.9875, + "avg_student_mask_ratio": 0.4920400592498481, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.326171875, + "kd_loss": 0.2011610215539008, + "learning_rate": 3e-06, + "loss": 0.2334, + "masked_tokens": 109.4, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 109.4 + }, + { + "avg_mask_ratio": 0.450224511185661, + "avg_response_length": 229.225, + "avg_student_mask_ratio": 0.450224511185661, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.171875, + "kd_loss": 0.21365654302464918, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 102.6375, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 102.6375 + }, + { + "avg_mask_ratio": 0.4397759020910598, + "avg_response_length": 214.15, + "avg_student_mask_ratio": 0.4397759020910598, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.173828125, + "kd_loss": 0.13860440934267615, + "learning_rate": 3e-06, + "loss": 0.1362, + "masked_tokens": 98.4, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 98.4 + }, + { + "avg_mask_ratio": 0.4890626976499334, + "avg_response_length": 242.1625, + "avg_student_mask_ratio": 0.4890626976499334, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.27734375, + "kd_loss": 0.2106460814328841, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 129.725, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 129.725 + }, + { + "avg_mask_ratio": 0.47298577734036373, + "avg_response_length": 262.9875, + "avg_student_mask_ratio": 0.47298577734036373, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.0673828125, + "kd_loss": 0.2408599746217078, + "learning_rate": 3e-06, + "loss": 0.2276, + "masked_tokens": 128.375, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 128.375 + }, + { + "avg_mask_ratio": 0.5043223856599071, + "avg_response_length": 217.5, + "avg_student_mask_ratio": 0.5043223856599071, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.20484722793833043, + "learning_rate": 3e-06, + "loss": 0.2154, + "masked_tokens": 106.025, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 106.025 + }, + { + "avg_mask_ratio": 0.48419030708028005, + "avg_response_length": 196.2625, + "avg_student_mask_ratio": 0.48419030708028005, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.1611328125, + "kd_loss": 0.20407032655223248, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 96.2625, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 96.2625 + }, + { + "avg_mask_ratio": 0.4485494759515859, + "avg_response_length": 228.8625, + "avg_student_mask_ratio": 0.4485494759515859, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.14453125, + "kd_loss": 0.16957379382825621, + "learning_rate": 3e-06, + "loss": 0.1796, + "masked_tokens": 101.275, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 101.275 + }, + { + "avg_mask_ratio": 0.48665878190658984, + "avg_response_length": 230.5, + "avg_student_mask_ratio": 0.48665878190658984, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.3515625, + "kd_loss": 0.22313492714965832, + "learning_rate": 3e-06, + "loss": 0.2112, + "masked_tokens": 107.975, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 107.975 + }, + { + "avg_mask_ratio": 0.4670982737792656, + "avg_response_length": 210.9125, + "avg_student_mask_ratio": 0.4670982737792656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.142578125, + "kd_loss": 0.15220829088375468, + "learning_rate": 3e-06, + "loss": 0.1609, + "masked_tokens": 98.2125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 98.2125 + }, + { + "avg_mask_ratio": 0.4568137794849463, + "avg_response_length": 218.7375, + "avg_student_mask_ratio": 0.4568137794849463, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.1826171875, + "kd_loss": 0.1580252643583208, + "learning_rate": 3e-06, + "loss": 0.1798, + "masked_tokens": 99.15, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 99.15 + }, + { + "avg_mask_ratio": 0.5379857187625021, + "avg_response_length": 252.1375, + "avg_student_mask_ratio": 0.5379857187625021, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.3203125, + "kd_loss": 0.2511090909683844, + "learning_rate": 3e-06, + "loss": 0.2583, + "masked_tokens": 135.4, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 135.4 + }, + { + "avg_mask_ratio": 0.43395056116278286, + "avg_response_length": 245.2625, + "avg_student_mask_ratio": 0.43395056116278286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.1767578125, + "kd_loss": 0.14414861655371852, + "learning_rate": 3e-06, + "loss": 0.1964, + "masked_tokens": 102.5125, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 102.5125 + }, + { + "avg_mask_ratio": 0.46948411157354714, + "avg_response_length": 202.975, + "avg_student_mask_ratio": 0.46948411157354714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.1181640625, + "kd_loss": 0.2197965504183493, + "learning_rate": 3e-06, + "loss": 0.243, + "masked_tokens": 97.0625, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 97.0625 + }, + { + "avg_mask_ratio": 0.44631263689370826, + "avg_response_length": 243.425, + "avg_student_mask_ratio": 0.44631263689370826, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.1064453125, + "kd_loss": 0.2151024747882957, + "learning_rate": 3e-06, + "loss": 0.1892, + "masked_tokens": 107.4625, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4607314572727773, + "avg_response_length": 243.375, + "avg_student_mask_ratio": 0.4607314572727773, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.203125, + "kd_loss": 0.176242933875335, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 110.8875, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.8875 + }, + { + "avg_mask_ratio": 0.5309946444118395, + "avg_response_length": 231.6875, + "avg_student_mask_ratio": 0.5309946444118395, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.208984375, + "kd_loss": 0.26651088480309115, + "learning_rate": 3e-06, + "loss": 0.2828, + "masked_tokens": 123.2875, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 123.2875 + }, + { + "avg_mask_ratio": 0.45879559536697345, + "avg_response_length": 251.8, + "avg_student_mask_ratio": 0.45879559536697345, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.11474609375, + "kd_loss": 0.145786481295454, + "learning_rate": 3e-06, + "loss": 0.1439, + "masked_tokens": 125.425, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 125.425 + }, + { + "avg_mask_ratio": 0.3955249205115251, + "avg_response_length": 238.675, + "avg_student_mask_ratio": 0.3955249205115251, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.057373046875, + "kd_loss": 0.15104623195453543, + "learning_rate": 3e-06, + "loss": 0.1578, + "masked_tokens": 95.2125, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 95.2125 + }, + { + "avg_mask_ratio": 0.4504710016073659, + "avg_response_length": 202.575, + "avg_student_mask_ratio": 0.4504710016073659, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.150390625, + "kd_loss": 0.16011972622800386, + "learning_rate": 3e-06, + "loss": 0.179, + "masked_tokens": 90.3375, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 90.3375 + }, + { + "avg_mask_ratio": 0.4822002159198746, + "avg_response_length": 189.9875, + "avg_student_mask_ratio": 0.4822002159198746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.1630859375, + "kd_loss": 0.21744939284749734, + "learning_rate": 3e-06, + "loss": 0.201, + "masked_tokens": 92.7, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 92.7 + }, + { + "avg_mask_ratio": 0.5024422638700343, + "avg_response_length": 237.2625, + "avg_student_mask_ratio": 0.5024422638700343, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.1259765625, + "kd_loss": 0.21489343987664142, + "learning_rate": 3e-06, + "loss": 0.2268, + "masked_tokens": 123.8125, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 123.8125 + }, + { + "avg_mask_ratio": 0.510216062690597, + "avg_response_length": 257.525, + "avg_student_mask_ratio": 0.510216062690597, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.1337890625, + "kd_loss": 0.17950079924535203, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 142.8125, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 142.8125 + }, + { + "avg_mask_ratio": 0.45074162067612633, + "avg_response_length": 245.525, + "avg_student_mask_ratio": 0.45074162067612633, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.310546875, + "kd_loss": 0.14043198096701417, + "learning_rate": 3e-06, + "loss": 0.1669, + "masked_tokens": 116.0875, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 116.0875 + }, + { + "avg_mask_ratio": 0.4926802407717332, + "avg_response_length": 238.2, + "avg_student_mask_ratio": 0.4926802407717332, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.330078125, + "kd_loss": 0.21977804936059328, + "learning_rate": 3e-06, + "loss": 0.2497, + "masked_tokens": 127.4875, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 127.4875 + }, + { + "avg_mask_ratio": 0.4706261330051348, + "avg_response_length": 251.7375, + "avg_student_mask_ratio": 0.4706261330051348, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.056640625, + "kd_loss": 0.29630907970476983, + "learning_rate": 3e-06, + "loss": 0.2329, + "masked_tokens": 116.8625, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.8625 + }, + { + "avg_mask_ratio": 0.48096118308603764, + "avg_response_length": 262.25, + "avg_student_mask_ratio": 0.48096118308603764, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.12255859375, + "kd_loss": 0.20822112379132704, + "learning_rate": 3e-06, + "loss": 0.186, + "masked_tokens": 132.2, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 132.2 + }, + { + "avg_mask_ratio": 0.4433969090110622, + "avg_response_length": 209.7125, + "avg_student_mask_ratio": 0.4433969090110622, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.123046875, + "kd_loss": 0.14427866181035826, + "learning_rate": 3e-06, + "loss": 0.159, + "masked_tokens": 95.8625, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 95.8625 + }, + { + "avg_mask_ratio": 0.4764250977139454, + "avg_response_length": 226.3875, + "avg_student_mask_ratio": 0.4764250977139454, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.185546875, + "kd_loss": 0.18584371552193701, + "learning_rate": 3e-06, + "loss": 0.1823, + "masked_tokens": 113.95, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 113.95 + }, + { + "avg_mask_ratio": 0.47088071387261154, + "avg_response_length": 233.2125, + "avg_student_mask_ratio": 0.47088071387261154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.2734375, + "kd_loss": 0.22232839192147366, + "learning_rate": 3e-06, + "loss": 0.1961, + "masked_tokens": 116.675, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 116.675 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/training_args.bin b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7e94a11824a7a1de5f3a0a00320426e3e4de0eff --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c347df37da6e348160afc3fbb65d12595e5064bd8e5bfd591004e5e86a703f42 +size 7992 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/README.md b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/adapter_config.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..aebf9ea6690ed4ecc23ae3af9402e39470fff9f3 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/adapter_model.safetensors b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48818e14a4fa7790f7d916f8f74463d6e5576124 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fba914e5cd86d1b5e9ca40a5ab02bf19556b5c5f17e0bc04a81eaacb6168316 +size 2406624648 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/optimizer.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1307e01473c0eafe0807b7fb738f6dd6897bff8a --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e49df1f8da6085627cab9dc23adf7202bf01662fca88422933901398324e2bdc +size 671304442 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/rng_state_0.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8a51a92808c0bafffe1f3103c638d6c4b49c4aa7 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b799074ae77b70e942a30fe274b81c8cc846644aed57ad6cde070ee399f5a7f +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/rng_state_1.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d177ccc190d093556caf4b04f23861ed2e306a8 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fe45cdfafc4e908058327c173b9fb322e1810564dd510610209d2d2d72711bd +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/scheduler.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..931e4be8f6a79592ab2ba42943c7a73e26c7bc07 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a517e97adb4dde873654de5d66064258ac3222271d3ace011285ec503f6a5b2 +size 1064 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/trainer_state.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e386612ba459343777555309671f880980e9944f --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/trainer_state.json @@ -0,0 +1,1233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0682666666666667, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4933756332669873, + "avg_response_length": 252.0625, + "avg_student_mask_ratio": 0.4933756332669873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1650390625, + "kd_loss": 0.24795629351958723, + "learning_rate": 3e-06, + "loss": 0.2758, + "masked_tokens": 120.975, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 120.975 + }, + { + "avg_mask_ratio": 0.41923869140446185, + "avg_response_length": 221.7125, + "avg_student_mask_ratio": 0.41923869140446185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.388671875, + "kd_loss": 0.21509853232191353, + "learning_rate": 3e-06, + "loss": 0.2046, + "masked_tokens": 86.1875, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 86.1875 + }, + { + "avg_mask_ratio": 0.4542569225654006, + "avg_response_length": 231.45, + "avg_student_mask_ratio": 0.4542569225654006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.52734375, + "kd_loss": 0.19756361616970963, + "learning_rate": 3e-06, + "loss": 0.1976, + "masked_tokens": 116.2875, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 116.2875 + }, + { + "avg_mask_ratio": 0.41855402445653456, + "avg_response_length": 214.125, + "avg_student_mask_ratio": 0.41855402445653456, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.263671875, + "kd_loss": 0.1576978968325534, + "learning_rate": 3e-06, + "loss": 0.1551, + "masked_tokens": 94.225, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 94.225 + }, + { + "avg_mask_ratio": 0.4331560767372139, + "avg_response_length": 222.225, + "avg_student_mask_ratio": 0.4331560767372139, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.125, + "kd_loss": 0.17712681048956255, + "learning_rate": 3e-06, + "loss": 0.1648, + "masked_tokens": 97.825, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 97.825 + }, + { + "avg_mask_ratio": 0.4547638618387282, + "avg_response_length": 242.9, + "avg_student_mask_ratio": 0.4547638618387282, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.130859375, + "kd_loss": 0.1745696667137963, + "learning_rate": 3e-06, + "loss": 0.1745, + "masked_tokens": 119.125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.125 + }, + { + "avg_mask_ratio": 0.5151988173020072, + "avg_response_length": 214.3375, + "avg_student_mask_ratio": 0.5151988173020072, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.17578125, + "kd_loss": 0.1657758254527316, + "learning_rate": 3e-06, + "loss": 0.1972, + "masked_tokens": 111.7875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 111.7875 + }, + { + "avg_mask_ratio": 0.37937068473547697, + "avg_response_length": 250.2, + "avg_student_mask_ratio": 0.37937068473547697, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.051513671875, + "kd_loss": 0.13968983994418097, + "learning_rate": 3e-06, + "loss": 0.1369, + "masked_tokens": 95.1875, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 95.1875 + }, + { + "avg_mask_ratio": 0.5006106478627771, + "avg_response_length": 242.1125, + "avg_student_mask_ratio": 0.5006106478627771, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.22265625, + "kd_loss": 0.20869405062871707, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 116.3875, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 116.3875 + }, + { + "avg_mask_ratio": 0.4596128500183113, + "avg_response_length": 229.0625, + "avg_student_mask_ratio": 0.4596128500183113, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.1865234375, + "kd_loss": 0.17640120884607313, + "learning_rate": 3e-06, + "loss": 0.1864, + "masked_tokens": 109.7125, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 109.7125 + }, + { + "avg_mask_ratio": 0.4920400592498481, + "avg_response_length": 229.9875, + "avg_student_mask_ratio": 0.4920400592498481, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.326171875, + "kd_loss": 0.2011610215539008, + "learning_rate": 3e-06, + "loss": 0.2334, + "masked_tokens": 109.4, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 109.4 + }, + { + "avg_mask_ratio": 0.450224511185661, + "avg_response_length": 229.225, + "avg_student_mask_ratio": 0.450224511185661, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.171875, + "kd_loss": 0.21365654302464918, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 102.6375, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 102.6375 + }, + { + "avg_mask_ratio": 0.4397759020910598, + "avg_response_length": 214.15, + "avg_student_mask_ratio": 0.4397759020910598, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.173828125, + "kd_loss": 0.13860440934267615, + "learning_rate": 3e-06, + "loss": 0.1362, + "masked_tokens": 98.4, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 98.4 + }, + { + "avg_mask_ratio": 0.4890626976499334, + "avg_response_length": 242.1625, + "avg_student_mask_ratio": 0.4890626976499334, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.27734375, + "kd_loss": 0.2106460814328841, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 129.725, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 129.725 + }, + { + "avg_mask_ratio": 0.47298577734036373, + "avg_response_length": 262.9875, + "avg_student_mask_ratio": 0.47298577734036373, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.0673828125, + "kd_loss": 0.2408599746217078, + "learning_rate": 3e-06, + "loss": 0.2276, + "masked_tokens": 128.375, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 128.375 + }, + { + "avg_mask_ratio": 0.5043223856599071, + "avg_response_length": 217.5, + "avg_student_mask_ratio": 0.5043223856599071, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.20484722793833043, + "learning_rate": 3e-06, + "loss": 0.2154, + "masked_tokens": 106.025, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 106.025 + }, + { + "avg_mask_ratio": 0.48419030708028005, + "avg_response_length": 196.2625, + "avg_student_mask_ratio": 0.48419030708028005, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.1611328125, + "kd_loss": 0.20407032655223248, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 96.2625, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 96.2625 + }, + { + "avg_mask_ratio": 0.4485494759515859, + "avg_response_length": 228.8625, + "avg_student_mask_ratio": 0.4485494759515859, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.14453125, + "kd_loss": 0.16957379382825621, + "learning_rate": 3e-06, + "loss": 0.1796, + "masked_tokens": 101.275, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 101.275 + }, + { + "avg_mask_ratio": 0.48665878190658984, + "avg_response_length": 230.5, + "avg_student_mask_ratio": 0.48665878190658984, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.3515625, + "kd_loss": 0.22313492714965832, + "learning_rate": 3e-06, + "loss": 0.2112, + "masked_tokens": 107.975, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 107.975 + }, + { + "avg_mask_ratio": 0.4670982737792656, + "avg_response_length": 210.9125, + "avg_student_mask_ratio": 0.4670982737792656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.142578125, + "kd_loss": 0.15220829088375468, + "learning_rate": 3e-06, + "loss": 0.1609, + "masked_tokens": 98.2125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 98.2125 + }, + { + "avg_mask_ratio": 0.4568137794849463, + "avg_response_length": 218.7375, + "avg_student_mask_ratio": 0.4568137794849463, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.1826171875, + "kd_loss": 0.1580252643583208, + "learning_rate": 3e-06, + "loss": 0.1798, + "masked_tokens": 99.15, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 99.15 + }, + { + "avg_mask_ratio": 0.5379857187625021, + "avg_response_length": 252.1375, + "avg_student_mask_ratio": 0.5379857187625021, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.3203125, + "kd_loss": 0.2511090909683844, + "learning_rate": 3e-06, + "loss": 0.2583, + "masked_tokens": 135.4, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 135.4 + }, + { + "avg_mask_ratio": 0.43395056116278286, + "avg_response_length": 245.2625, + "avg_student_mask_ratio": 0.43395056116278286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.1767578125, + "kd_loss": 0.14414861655371852, + "learning_rate": 3e-06, + "loss": 0.1964, + "masked_tokens": 102.5125, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 102.5125 + }, + { + "avg_mask_ratio": 0.46948411157354714, + "avg_response_length": 202.975, + "avg_student_mask_ratio": 0.46948411157354714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.1181640625, + "kd_loss": 0.2197965504183493, + "learning_rate": 3e-06, + "loss": 0.243, + "masked_tokens": 97.0625, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 97.0625 + }, + { + "avg_mask_ratio": 0.44631263689370826, + "avg_response_length": 243.425, + "avg_student_mask_ratio": 0.44631263689370826, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.1064453125, + "kd_loss": 0.2151024747882957, + "learning_rate": 3e-06, + "loss": 0.1892, + "masked_tokens": 107.4625, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4607314572727773, + "avg_response_length": 243.375, + "avg_student_mask_ratio": 0.4607314572727773, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.203125, + "kd_loss": 0.176242933875335, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 110.8875, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.8875 + }, + { + "avg_mask_ratio": 0.5309946444118395, + "avg_response_length": 231.6875, + "avg_student_mask_ratio": 0.5309946444118395, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.208984375, + "kd_loss": 0.26651088480309115, + "learning_rate": 3e-06, + "loss": 0.2828, + "masked_tokens": 123.2875, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 123.2875 + }, + { + "avg_mask_ratio": 0.45879559536697345, + "avg_response_length": 251.8, + "avg_student_mask_ratio": 0.45879559536697345, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.11474609375, + "kd_loss": 0.145786481295454, + "learning_rate": 3e-06, + "loss": 0.1439, + "masked_tokens": 125.425, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 125.425 + }, + { + "avg_mask_ratio": 0.3955249205115251, + "avg_response_length": 238.675, + "avg_student_mask_ratio": 0.3955249205115251, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.057373046875, + "kd_loss": 0.15104623195453543, + "learning_rate": 3e-06, + "loss": 0.1578, + "masked_tokens": 95.2125, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 95.2125 + }, + { + "avg_mask_ratio": 0.4504710016073659, + "avg_response_length": 202.575, + "avg_student_mask_ratio": 0.4504710016073659, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.150390625, + "kd_loss": 0.16011972622800386, + "learning_rate": 3e-06, + "loss": 0.179, + "masked_tokens": 90.3375, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 90.3375 + }, + { + "avg_mask_ratio": 0.4822002159198746, + "avg_response_length": 189.9875, + "avg_student_mask_ratio": 0.4822002159198746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.1630859375, + "kd_loss": 0.21744939284749734, + "learning_rate": 3e-06, + "loss": 0.201, + "masked_tokens": 92.7, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 92.7 + }, + { + "avg_mask_ratio": 0.5024422638700343, + "avg_response_length": 237.2625, + "avg_student_mask_ratio": 0.5024422638700343, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.1259765625, + "kd_loss": 0.21489343987664142, + "learning_rate": 3e-06, + "loss": 0.2268, + "masked_tokens": 123.8125, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 123.8125 + }, + { + "avg_mask_ratio": 0.510216062690597, + "avg_response_length": 257.525, + "avg_student_mask_ratio": 0.510216062690597, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.1337890625, + "kd_loss": 0.17950079924535203, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 142.8125, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 142.8125 + }, + { + "avg_mask_ratio": 0.45074162067612633, + "avg_response_length": 245.525, + "avg_student_mask_ratio": 0.45074162067612633, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.310546875, + "kd_loss": 0.14043198096701417, + "learning_rate": 3e-06, + "loss": 0.1669, + "masked_tokens": 116.0875, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 116.0875 + }, + { + "avg_mask_ratio": 0.4926802407717332, + "avg_response_length": 238.2, + "avg_student_mask_ratio": 0.4926802407717332, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.330078125, + "kd_loss": 0.21977804936059328, + "learning_rate": 3e-06, + "loss": 0.2497, + "masked_tokens": 127.4875, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 127.4875 + }, + { + "avg_mask_ratio": 0.4706261330051348, + "avg_response_length": 251.7375, + "avg_student_mask_ratio": 0.4706261330051348, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.056640625, + "kd_loss": 0.29630907970476983, + "learning_rate": 3e-06, + "loss": 0.2329, + "masked_tokens": 116.8625, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.8625 + }, + { + "avg_mask_ratio": 0.48096118308603764, + "avg_response_length": 262.25, + "avg_student_mask_ratio": 0.48096118308603764, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.12255859375, + "kd_loss": 0.20822112379132704, + "learning_rate": 3e-06, + "loss": 0.186, + "masked_tokens": 132.2, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 132.2 + }, + { + "avg_mask_ratio": 0.4433969090110622, + "avg_response_length": 209.7125, + "avg_student_mask_ratio": 0.4433969090110622, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.123046875, + "kd_loss": 0.14427866181035826, + "learning_rate": 3e-06, + "loss": 0.159, + "masked_tokens": 95.8625, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 95.8625 + }, + { + "avg_mask_ratio": 0.4764250977139454, + "avg_response_length": 226.3875, + "avg_student_mask_ratio": 0.4764250977139454, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.185546875, + "kd_loss": 0.18584371552193701, + "learning_rate": 3e-06, + "loss": 0.1823, + "masked_tokens": 113.95, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 113.95 + }, + { + "avg_mask_ratio": 0.47088071387261154, + "avg_response_length": 233.2125, + "avg_student_mask_ratio": 0.47088071387261154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.2734375, + "kd_loss": 0.22232839192147366, + "learning_rate": 3e-06, + "loss": 0.1961, + "masked_tokens": 116.675, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 116.675 + }, + { + "avg_mask_ratio": 0.4870647343515884, + "avg_response_length": 228.3875, + "avg_student_mask_ratio": 0.4870647343515884, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.25390625, + "kd_loss": 0.25729746209006665, + "learning_rate": 3e-06, + "loss": 0.2306, + "masked_tokens": 114.3625, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 114.3625 + }, + { + "avg_mask_ratio": 0.440834702400025, + "avg_response_length": 209.85, + "avg_student_mask_ratio": 0.440834702400025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.2275390625, + "kd_loss": 0.15747290870124503, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 87.575, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 87.575 + }, + { + "avg_mask_ratio": 0.4660509963519871, + "avg_response_length": 250.9125, + "avg_student_mask_ratio": 0.4660509963519871, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.392578125, + "kd_loss": 0.17299866449352522, + "learning_rate": 3e-06, + "loss": 0.178, + "masked_tokens": 109.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 109.675 + }, + { + "avg_mask_ratio": 0.4507861359976232, + "avg_response_length": 235.875, + "avg_student_mask_ratio": 0.4507861359976232, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2113636662043291, + "learning_rate": 3e-06, + "loss": 0.1795, + "masked_tokens": 106.95, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 106.95 + }, + { + "avg_mask_ratio": 0.4283985076006502, + "avg_response_length": 230.95, + "avg_student_mask_ratio": 0.4283985076006502, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.16015625, + "kd_loss": 0.15304818171161969, + "learning_rate": 3e-06, + "loss": 0.1724, + "masked_tokens": 101.15, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 101.15 + }, + { + "avg_mask_ratio": 0.47474822774529457, + "avg_response_length": 233.1, + "avg_student_mask_ratio": 0.47474822774529457, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.06689453125, + "kd_loss": 0.1363761811357108, + "learning_rate": 3e-06, + "loss": 0.171, + "masked_tokens": 112.725, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 112.725 + }, + { + "avg_mask_ratio": 0.4808142688901474, + "avg_response_length": 238.54761904761904, + "avg_student_mask_ratio": 0.4808142688901474, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.22379483340171732, + "learning_rate": 3e-06, + "loss": 0.2466, + "masked_tokens": 113.67857142857143, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 113.67857142857143 + }, + { + "avg_mask_ratio": 0.47175657459301873, + "avg_response_length": 249.9625, + "avg_student_mask_ratio": 0.47175657459301873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.166015625, + "kd_loss": 0.17491777570117745, + "learning_rate": 3e-06, + "loss": 0.2029, + "masked_tokens": 119.4625, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 119.4625 + }, + { + "avg_mask_ratio": 0.4564988439786248, + "avg_response_length": 238.8875, + "avg_student_mask_ratio": 0.4564988439786248, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.1279296875, + "kd_loss": 0.12884608846077866, + "learning_rate": 3e-06, + "loss": 0.1536, + "masked_tokens": 104.0, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 104.0 + }, + { + "avg_mask_ratio": 0.5083174118888565, + "avg_response_length": 258.1375, + "avg_student_mask_ratio": 0.5083174118888565, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.1357421875, + "kd_loss": 0.18128383785248586, + "learning_rate": 3e-06, + "loss": 0.1811, + "masked_tokens": 133.7125, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 133.7125 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/training_args.bin b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7e94a11824a7a1de5f3a0a00320426e3e4de0eff --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c347df37da6e348160afc3fbb65d12595e5064bd8e5bfd591004e5e86a703f42 +size 7992 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/README.md b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/adapter_config.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..aebf9ea6690ed4ecc23ae3af9402e39470fff9f3 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/adapter_model.safetensors b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a32ab6ef85c8514c609968868d30e413dc8b534b --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78dea922826171cb2d6368b26692476d9523cb5226ba41d1162c7fcd68ef0b62 +size 2406624648 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/optimizer.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..80a60a2b2fb1288ba9f1189825a21da084828d27 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ada4b37e9faa331c5d02dd558de323b87bbb65a9e5449b71d50658a173bec864 +size 671304442 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/rng_state_0.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ab65ae132e72f77ce216a2787647b2bbd1e97dab --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a970fe99df6f0c8b67092e1f3bf38318b980159b8db0530227e6de94f9f6ef38 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/rng_state_1.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..049b86aa7288758d8e05046f2b293ba4fff46557 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0310c064726303c025bf821c32e2fe4b4e8aa616629cb27365f32b9397fced0a +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/scheduler.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b530cce6692c8b72c51afea911741a4a11eef386 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f247b9a9f5a42bb05b5f94047806ee145b80e59e6134cfbac5720987816b080b +size 1064 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/trainer_state.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c6846a45f3aa00e9f98c402c48fff0cce10287ec --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/trainer_state.json @@ -0,0 +1,1473 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2816, + "eval_steps": 500, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4933756332669873, + "avg_response_length": 252.0625, + "avg_student_mask_ratio": 0.4933756332669873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1650390625, + "kd_loss": 0.24795629351958723, + "learning_rate": 3e-06, + "loss": 0.2758, + "masked_tokens": 120.975, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 120.975 + }, + { + "avg_mask_ratio": 0.41923869140446185, + "avg_response_length": 221.7125, + "avg_student_mask_ratio": 0.41923869140446185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.388671875, + "kd_loss": 0.21509853232191353, + "learning_rate": 3e-06, + "loss": 0.2046, + "masked_tokens": 86.1875, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 86.1875 + }, + { + "avg_mask_ratio": 0.4542569225654006, + "avg_response_length": 231.45, + "avg_student_mask_ratio": 0.4542569225654006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.52734375, + "kd_loss": 0.19756361616970963, + "learning_rate": 3e-06, + "loss": 0.1976, + "masked_tokens": 116.2875, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 116.2875 + }, + { + "avg_mask_ratio": 0.41855402445653456, + "avg_response_length": 214.125, + "avg_student_mask_ratio": 0.41855402445653456, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.263671875, + "kd_loss": 0.1576978968325534, + "learning_rate": 3e-06, + "loss": 0.1551, + "masked_tokens": 94.225, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 94.225 + }, + { + "avg_mask_ratio": 0.4331560767372139, + "avg_response_length": 222.225, + "avg_student_mask_ratio": 0.4331560767372139, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.125, + "kd_loss": 0.17712681048956255, + "learning_rate": 3e-06, + "loss": 0.1648, + "masked_tokens": 97.825, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 97.825 + }, + { + "avg_mask_ratio": 0.4547638618387282, + "avg_response_length": 242.9, + "avg_student_mask_ratio": 0.4547638618387282, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.130859375, + "kd_loss": 0.1745696667137963, + "learning_rate": 3e-06, + "loss": 0.1745, + "masked_tokens": 119.125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.125 + }, + { + "avg_mask_ratio": 0.5151988173020072, + "avg_response_length": 214.3375, + "avg_student_mask_ratio": 0.5151988173020072, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.17578125, + "kd_loss": 0.1657758254527316, + "learning_rate": 3e-06, + "loss": 0.1972, + "masked_tokens": 111.7875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 111.7875 + }, + { + "avg_mask_ratio": 0.37937068473547697, + "avg_response_length": 250.2, + "avg_student_mask_ratio": 0.37937068473547697, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.051513671875, + "kd_loss": 0.13968983994418097, + "learning_rate": 3e-06, + "loss": 0.1369, + "masked_tokens": 95.1875, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 95.1875 + }, + { + "avg_mask_ratio": 0.5006106478627771, + "avg_response_length": 242.1125, + "avg_student_mask_ratio": 0.5006106478627771, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.22265625, + "kd_loss": 0.20869405062871707, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 116.3875, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 116.3875 + }, + { + "avg_mask_ratio": 0.4596128500183113, + "avg_response_length": 229.0625, + "avg_student_mask_ratio": 0.4596128500183113, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.1865234375, + "kd_loss": 0.17640120884607313, + "learning_rate": 3e-06, + "loss": 0.1864, + "masked_tokens": 109.7125, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 109.7125 + }, + { + "avg_mask_ratio": 0.4920400592498481, + "avg_response_length": 229.9875, + "avg_student_mask_ratio": 0.4920400592498481, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.326171875, + "kd_loss": 0.2011610215539008, + "learning_rate": 3e-06, + "loss": 0.2334, + "masked_tokens": 109.4, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 109.4 + }, + { + "avg_mask_ratio": 0.450224511185661, + "avg_response_length": 229.225, + "avg_student_mask_ratio": 0.450224511185661, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.171875, + "kd_loss": 0.21365654302464918, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 102.6375, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 102.6375 + }, + { + "avg_mask_ratio": 0.4397759020910598, + "avg_response_length": 214.15, + "avg_student_mask_ratio": 0.4397759020910598, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.173828125, + "kd_loss": 0.13860440934267615, + "learning_rate": 3e-06, + "loss": 0.1362, + "masked_tokens": 98.4, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 98.4 + }, + { + "avg_mask_ratio": 0.4890626976499334, + "avg_response_length": 242.1625, + "avg_student_mask_ratio": 0.4890626976499334, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.27734375, + "kd_loss": 0.2106460814328841, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 129.725, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 129.725 + }, + { + "avg_mask_ratio": 0.47298577734036373, + "avg_response_length": 262.9875, + "avg_student_mask_ratio": 0.47298577734036373, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.0673828125, + "kd_loss": 0.2408599746217078, + "learning_rate": 3e-06, + "loss": 0.2276, + "masked_tokens": 128.375, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 128.375 + }, + { + "avg_mask_ratio": 0.5043223856599071, + "avg_response_length": 217.5, + "avg_student_mask_ratio": 0.5043223856599071, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.20484722793833043, + "learning_rate": 3e-06, + "loss": 0.2154, + "masked_tokens": 106.025, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 106.025 + }, + { + "avg_mask_ratio": 0.48419030708028005, + "avg_response_length": 196.2625, + "avg_student_mask_ratio": 0.48419030708028005, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.1611328125, + "kd_loss": 0.20407032655223248, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 96.2625, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 96.2625 + }, + { + "avg_mask_ratio": 0.4485494759515859, + "avg_response_length": 228.8625, + "avg_student_mask_ratio": 0.4485494759515859, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.14453125, + "kd_loss": 0.16957379382825621, + "learning_rate": 3e-06, + "loss": 0.1796, + "masked_tokens": 101.275, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 101.275 + }, + { + "avg_mask_ratio": 0.48665878190658984, + "avg_response_length": 230.5, + "avg_student_mask_ratio": 0.48665878190658984, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.3515625, + "kd_loss": 0.22313492714965832, + "learning_rate": 3e-06, + "loss": 0.2112, + "masked_tokens": 107.975, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 107.975 + }, + { + "avg_mask_ratio": 0.4670982737792656, + "avg_response_length": 210.9125, + "avg_student_mask_ratio": 0.4670982737792656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.142578125, + "kd_loss": 0.15220829088375468, + "learning_rate": 3e-06, + "loss": 0.1609, + "masked_tokens": 98.2125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 98.2125 + }, + { + "avg_mask_ratio": 0.4568137794849463, + "avg_response_length": 218.7375, + "avg_student_mask_ratio": 0.4568137794849463, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.1826171875, + "kd_loss": 0.1580252643583208, + "learning_rate": 3e-06, + "loss": 0.1798, + "masked_tokens": 99.15, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 99.15 + }, + { + "avg_mask_ratio": 0.5379857187625021, + "avg_response_length": 252.1375, + "avg_student_mask_ratio": 0.5379857187625021, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.3203125, + "kd_loss": 0.2511090909683844, + "learning_rate": 3e-06, + "loss": 0.2583, + "masked_tokens": 135.4, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 135.4 + }, + { + "avg_mask_ratio": 0.43395056116278286, + "avg_response_length": 245.2625, + "avg_student_mask_ratio": 0.43395056116278286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.1767578125, + "kd_loss": 0.14414861655371852, + "learning_rate": 3e-06, + "loss": 0.1964, + "masked_tokens": 102.5125, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 102.5125 + }, + { + "avg_mask_ratio": 0.46948411157354714, + "avg_response_length": 202.975, + "avg_student_mask_ratio": 0.46948411157354714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.1181640625, + "kd_loss": 0.2197965504183493, + "learning_rate": 3e-06, + "loss": 0.243, + "masked_tokens": 97.0625, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 97.0625 + }, + { + "avg_mask_ratio": 0.44631263689370826, + "avg_response_length": 243.425, + "avg_student_mask_ratio": 0.44631263689370826, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.1064453125, + "kd_loss": 0.2151024747882957, + "learning_rate": 3e-06, + "loss": 0.1892, + "masked_tokens": 107.4625, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4607314572727773, + "avg_response_length": 243.375, + "avg_student_mask_ratio": 0.4607314572727773, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.203125, + "kd_loss": 0.176242933875335, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 110.8875, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.8875 + }, + { + "avg_mask_ratio": 0.5309946444118395, + "avg_response_length": 231.6875, + "avg_student_mask_ratio": 0.5309946444118395, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.208984375, + "kd_loss": 0.26651088480309115, + "learning_rate": 3e-06, + "loss": 0.2828, + "masked_tokens": 123.2875, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 123.2875 + }, + { + "avg_mask_ratio": 0.45879559536697345, + "avg_response_length": 251.8, + "avg_student_mask_ratio": 0.45879559536697345, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.11474609375, + "kd_loss": 0.145786481295454, + "learning_rate": 3e-06, + "loss": 0.1439, + "masked_tokens": 125.425, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 125.425 + }, + { + "avg_mask_ratio": 0.3955249205115251, + "avg_response_length": 238.675, + "avg_student_mask_ratio": 0.3955249205115251, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.057373046875, + "kd_loss": 0.15104623195453543, + "learning_rate": 3e-06, + "loss": 0.1578, + "masked_tokens": 95.2125, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 95.2125 + }, + { + "avg_mask_ratio": 0.4504710016073659, + "avg_response_length": 202.575, + "avg_student_mask_ratio": 0.4504710016073659, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.150390625, + "kd_loss": 0.16011972622800386, + "learning_rate": 3e-06, + "loss": 0.179, + "masked_tokens": 90.3375, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 90.3375 + }, + { + "avg_mask_ratio": 0.4822002159198746, + "avg_response_length": 189.9875, + "avg_student_mask_ratio": 0.4822002159198746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.1630859375, + "kd_loss": 0.21744939284749734, + "learning_rate": 3e-06, + "loss": 0.201, + "masked_tokens": 92.7, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 92.7 + }, + { + "avg_mask_ratio": 0.5024422638700343, + "avg_response_length": 237.2625, + "avg_student_mask_ratio": 0.5024422638700343, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.1259765625, + "kd_loss": 0.21489343987664142, + "learning_rate": 3e-06, + "loss": 0.2268, + "masked_tokens": 123.8125, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 123.8125 + }, + { + "avg_mask_ratio": 0.510216062690597, + "avg_response_length": 257.525, + "avg_student_mask_ratio": 0.510216062690597, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.1337890625, + "kd_loss": 0.17950079924535203, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 142.8125, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 142.8125 + }, + { + "avg_mask_ratio": 0.45074162067612633, + "avg_response_length": 245.525, + "avg_student_mask_ratio": 0.45074162067612633, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.310546875, + "kd_loss": 0.14043198096701417, + "learning_rate": 3e-06, + "loss": 0.1669, + "masked_tokens": 116.0875, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 116.0875 + }, + { + "avg_mask_ratio": 0.4926802407717332, + "avg_response_length": 238.2, + "avg_student_mask_ratio": 0.4926802407717332, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.330078125, + "kd_loss": 0.21977804936059328, + "learning_rate": 3e-06, + "loss": 0.2497, + "masked_tokens": 127.4875, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 127.4875 + }, + { + "avg_mask_ratio": 0.4706261330051348, + "avg_response_length": 251.7375, + "avg_student_mask_ratio": 0.4706261330051348, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.056640625, + "kd_loss": 0.29630907970476983, + "learning_rate": 3e-06, + "loss": 0.2329, + "masked_tokens": 116.8625, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.8625 + }, + { + "avg_mask_ratio": 0.48096118308603764, + "avg_response_length": 262.25, + "avg_student_mask_ratio": 0.48096118308603764, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.12255859375, + "kd_loss": 0.20822112379132704, + "learning_rate": 3e-06, + "loss": 0.186, + "masked_tokens": 132.2, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 132.2 + }, + { + "avg_mask_ratio": 0.4433969090110622, + "avg_response_length": 209.7125, + "avg_student_mask_ratio": 0.4433969090110622, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.123046875, + "kd_loss": 0.14427866181035826, + "learning_rate": 3e-06, + "loss": 0.159, + "masked_tokens": 95.8625, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 95.8625 + }, + { + "avg_mask_ratio": 0.4764250977139454, + "avg_response_length": 226.3875, + "avg_student_mask_ratio": 0.4764250977139454, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.185546875, + "kd_loss": 0.18584371552193701, + "learning_rate": 3e-06, + "loss": 0.1823, + "masked_tokens": 113.95, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 113.95 + }, + { + "avg_mask_ratio": 0.47088071387261154, + "avg_response_length": 233.2125, + "avg_student_mask_ratio": 0.47088071387261154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.2734375, + "kd_loss": 0.22232839192147366, + "learning_rate": 3e-06, + "loss": 0.1961, + "masked_tokens": 116.675, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 116.675 + }, + { + "avg_mask_ratio": 0.4870647343515884, + "avg_response_length": 228.3875, + "avg_student_mask_ratio": 0.4870647343515884, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.25390625, + "kd_loss": 0.25729746209006665, + "learning_rate": 3e-06, + "loss": 0.2306, + "masked_tokens": 114.3625, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 114.3625 + }, + { + "avg_mask_ratio": 0.440834702400025, + "avg_response_length": 209.85, + "avg_student_mask_ratio": 0.440834702400025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.2275390625, + "kd_loss": 0.15747290870124503, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 87.575, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 87.575 + }, + { + "avg_mask_ratio": 0.4660509963519871, + "avg_response_length": 250.9125, + "avg_student_mask_ratio": 0.4660509963519871, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.392578125, + "kd_loss": 0.17299866449352522, + "learning_rate": 3e-06, + "loss": 0.178, + "masked_tokens": 109.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 109.675 + }, + { + "avg_mask_ratio": 0.4507861359976232, + "avg_response_length": 235.875, + "avg_student_mask_ratio": 0.4507861359976232, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2113636662043291, + "learning_rate": 3e-06, + "loss": 0.1795, + "masked_tokens": 106.95, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 106.95 + }, + { + "avg_mask_ratio": 0.4283985076006502, + "avg_response_length": 230.95, + "avg_student_mask_ratio": 0.4283985076006502, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.16015625, + "kd_loss": 0.15304818171161969, + "learning_rate": 3e-06, + "loss": 0.1724, + "masked_tokens": 101.15, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 101.15 + }, + { + "avg_mask_ratio": 0.47474822774529457, + "avg_response_length": 233.1, + "avg_student_mask_ratio": 0.47474822774529457, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.06689453125, + "kd_loss": 0.1363761811357108, + "learning_rate": 3e-06, + "loss": 0.171, + "masked_tokens": 112.725, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 112.725 + }, + { + "avg_mask_ratio": 0.4808142688901474, + "avg_response_length": 238.54761904761904, + "avg_student_mask_ratio": 0.4808142688901474, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.22379483340171732, + "learning_rate": 3e-06, + "loss": 0.2466, + "masked_tokens": 113.67857142857143, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 113.67857142857143 + }, + { + "avg_mask_ratio": 0.47175657459301873, + "avg_response_length": 249.9625, + "avg_student_mask_ratio": 0.47175657459301873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.166015625, + "kd_loss": 0.17491777570117745, + "learning_rate": 3e-06, + "loss": 0.2029, + "masked_tokens": 119.4625, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 119.4625 + }, + { + "avg_mask_ratio": 0.4564988439786248, + "avg_response_length": 238.8875, + "avg_student_mask_ratio": 0.4564988439786248, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.1279296875, + "kd_loss": 0.12884608846077866, + "learning_rate": 3e-06, + "loss": 0.1536, + "masked_tokens": 104.0, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 104.0 + }, + { + "avg_mask_ratio": 0.5083174118888565, + "avg_response_length": 258.1375, + "avg_student_mask_ratio": 0.5083174118888565, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.1357421875, + "kd_loss": 0.18128383785248586, + "learning_rate": 3e-06, + "loss": 0.1811, + "masked_tokens": 133.7125, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 133.7125 + }, + { + "avg_mask_ratio": 0.5130103683215566, + "avg_response_length": 246.4875, + "avg_student_mask_ratio": 0.5130103683215566, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0896, + "grad_norm": 0.11181640625, + "kd_loss": 0.23847924997493805, + "learning_rate": 3e-06, + "loss": 0.2289, + "masked_tokens": 132.5625, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 132.5625 + }, + { + "avg_mask_ratio": 0.4543681625276804, + "avg_response_length": 199.65, + "avg_student_mask_ratio": 0.4543681625276804, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1109333333333333, + "grad_norm": 0.1533203125, + "kd_loss": 0.1353249137787543, + "learning_rate": 3e-06, + "loss": 0.1403, + "masked_tokens": 87.9875, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 87.9875 + }, + { + "avg_mask_ratio": 0.46718079667771234, + "avg_response_length": 218.0875, + "avg_student_mask_ratio": 0.46718079667771234, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1322666666666668, + "grad_norm": 0.2109375, + "kd_loss": 0.15268151032492625, + "learning_rate": 3e-06, + "loss": 0.1789, + "masked_tokens": 101.3875, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 101.3875 + }, + { + "avg_mask_ratio": 0.4890203754650429, + "avg_response_length": 244.3875, + "avg_student_mask_ratio": 0.4890203754650429, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1536, + "grad_norm": 0.27734375, + "kd_loss": 0.17835129436630268, + "learning_rate": 3e-06, + "loss": 0.2173, + "masked_tokens": 116.175, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 116.175 + }, + { + "avg_mask_ratio": 0.45064474650425834, + "avg_response_length": 217.8375, + "avg_student_mask_ratio": 0.45064474650425834, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1749333333333334, + "grad_norm": 0.17578125, + "kd_loss": 0.16049880692362706, + "learning_rate": 3e-06, + "loss": 0.1855, + "masked_tokens": 98.6375, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 98.6375 + }, + { + "avg_mask_ratio": 0.3844255942822201, + "avg_response_length": 240.825, + "avg_student_mask_ratio": 0.3844255942822201, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1962666666666666, + "grad_norm": 0.5859375, + "kd_loss": 0.17605857607457268, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 96.85, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 96.85 + }, + { + "avg_mask_ratio": 0.45103558609262107, + "avg_response_length": 231.025, + "avg_student_mask_ratio": 0.45103558609262107, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2176, + "grad_norm": 0.234375, + "kd_loss": 0.1386162672638477, + "learning_rate": 3e-06, + "loss": 0.1681, + "masked_tokens": 105.6875, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 105.6875 + }, + { + "avg_mask_ratio": 0.47033366551622746, + "avg_response_length": 248.3875, + "avg_student_mask_ratio": 0.47033366551622746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2389333333333332, + "grad_norm": 0.26953125, + "kd_loss": 0.17702910760917803, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 125.05, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 125.05 + }, + { + "avg_mask_ratio": 0.5230229062028229, + "avg_response_length": 241.8125, + "avg_student_mask_ratio": 0.5230229062028229, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2602666666666666, + "grad_norm": 0.20703125, + "kd_loss": 0.22054996666956866, + "learning_rate": 3e-06, + "loss": 0.2233, + "masked_tokens": 129.0, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 129.0 + }, + { + "avg_mask_ratio": 0.44929012526990847, + "avg_response_length": 246.7375, + "avg_student_mask_ratio": 0.44929012526990847, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2816, + "grad_norm": 0.212890625, + "kd_loss": 0.15257543138572202, + "learning_rate": 3e-06, + "loss": 0.1581, + "masked_tokens": 113.9375, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 113.9375 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/training_args.bin b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7e94a11824a7a1de5f3a0a00320426e3e4de0eff --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c347df37da6e348160afc3fbb65d12595e5064bd8e5bfd591004e5e86a703f42 +size 7992 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/README.md b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/adapter_config.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..aebf9ea6690ed4ecc23ae3af9402e39470fff9f3 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/adapter_model.safetensors b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a9ad01579d6f906aedf5f1c87412daa669ab5483 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e06651ad1770beb31366fe38148bd3e88c62dc705a72765d7f2bb8648b07293 +size 2406624648 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/optimizer.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1398e5b9b205c77bcd7cc9c27c90996dd2f4a615 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66c1da3d1c0141f91dac2024043e873aa480a95f6c041d3718dc432d6539f8a3 +size 671304442 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/rng_state_0.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..500e5ec920565aa1de527248cebefade3800997c --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71255ed6179013d58ea121d2387431fd41bdf5b5e2ca8cd71dccc5054540d2bc +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/rng_state_1.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6e2312a4da7e0ab07379364bbf918ec768f2544b --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eccbd02c1e7b3be23b7275b76ee3012efdea63049a29d59e1c12af9478ab4b0b +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/scheduler.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..45cb6d6e9c58698e39624654c64f68865acc1e8c --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d31fc166825c283cd6e21942858b480fed83fd7716de86c3ed00fd14e8e22122 +size 1064 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/trainer_state.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f9c8a72420d8df797cc1fe14027970048796d496 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/trainer_state.json @@ -0,0 +1,1713 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4949333333333334, + "eval_steps": 500, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4933756332669873, + "avg_response_length": 252.0625, + "avg_student_mask_ratio": 0.4933756332669873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1650390625, + "kd_loss": 0.24795629351958723, + "learning_rate": 3e-06, + "loss": 0.2758, + "masked_tokens": 120.975, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 120.975 + }, + { + "avg_mask_ratio": 0.41923869140446185, + "avg_response_length": 221.7125, + "avg_student_mask_ratio": 0.41923869140446185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.388671875, + "kd_loss": 0.21509853232191353, + "learning_rate": 3e-06, + "loss": 0.2046, + "masked_tokens": 86.1875, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 86.1875 + }, + { + "avg_mask_ratio": 0.4542569225654006, + "avg_response_length": 231.45, + "avg_student_mask_ratio": 0.4542569225654006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.52734375, + "kd_loss": 0.19756361616970963, + "learning_rate": 3e-06, + "loss": 0.1976, + "masked_tokens": 116.2875, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 116.2875 + }, + { + "avg_mask_ratio": 0.41855402445653456, + "avg_response_length": 214.125, + "avg_student_mask_ratio": 0.41855402445653456, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.263671875, + "kd_loss": 0.1576978968325534, + "learning_rate": 3e-06, + "loss": 0.1551, + "masked_tokens": 94.225, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 94.225 + }, + { + "avg_mask_ratio": 0.4331560767372139, + "avg_response_length": 222.225, + "avg_student_mask_ratio": 0.4331560767372139, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.125, + "kd_loss": 0.17712681048956255, + "learning_rate": 3e-06, + "loss": 0.1648, + "masked_tokens": 97.825, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 97.825 + }, + { + "avg_mask_ratio": 0.4547638618387282, + "avg_response_length": 242.9, + "avg_student_mask_ratio": 0.4547638618387282, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.130859375, + "kd_loss": 0.1745696667137963, + "learning_rate": 3e-06, + "loss": 0.1745, + "masked_tokens": 119.125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.125 + }, + { + "avg_mask_ratio": 0.5151988173020072, + "avg_response_length": 214.3375, + "avg_student_mask_ratio": 0.5151988173020072, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.17578125, + "kd_loss": 0.1657758254527316, + "learning_rate": 3e-06, + "loss": 0.1972, + "masked_tokens": 111.7875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 111.7875 + }, + { + "avg_mask_ratio": 0.37937068473547697, + "avg_response_length": 250.2, + "avg_student_mask_ratio": 0.37937068473547697, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.051513671875, + "kd_loss": 0.13968983994418097, + "learning_rate": 3e-06, + "loss": 0.1369, + "masked_tokens": 95.1875, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 95.1875 + }, + { + "avg_mask_ratio": 0.5006106478627771, + "avg_response_length": 242.1125, + "avg_student_mask_ratio": 0.5006106478627771, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.22265625, + "kd_loss": 0.20869405062871707, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 116.3875, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 116.3875 + }, + { + "avg_mask_ratio": 0.4596128500183113, + "avg_response_length": 229.0625, + "avg_student_mask_ratio": 0.4596128500183113, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.1865234375, + "kd_loss": 0.17640120884607313, + "learning_rate": 3e-06, + "loss": 0.1864, + "masked_tokens": 109.7125, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 109.7125 + }, + { + "avg_mask_ratio": 0.4920400592498481, + "avg_response_length": 229.9875, + "avg_student_mask_ratio": 0.4920400592498481, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.326171875, + "kd_loss": 0.2011610215539008, + "learning_rate": 3e-06, + "loss": 0.2334, + "masked_tokens": 109.4, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 109.4 + }, + { + "avg_mask_ratio": 0.450224511185661, + "avg_response_length": 229.225, + "avg_student_mask_ratio": 0.450224511185661, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.171875, + "kd_loss": 0.21365654302464918, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 102.6375, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 102.6375 + }, + { + "avg_mask_ratio": 0.4397759020910598, + "avg_response_length": 214.15, + "avg_student_mask_ratio": 0.4397759020910598, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.173828125, + "kd_loss": 0.13860440934267615, + "learning_rate": 3e-06, + "loss": 0.1362, + "masked_tokens": 98.4, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 98.4 + }, + { + "avg_mask_ratio": 0.4890626976499334, + "avg_response_length": 242.1625, + "avg_student_mask_ratio": 0.4890626976499334, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.27734375, + "kd_loss": 0.2106460814328841, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 129.725, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 129.725 + }, + { + "avg_mask_ratio": 0.47298577734036373, + "avg_response_length": 262.9875, + "avg_student_mask_ratio": 0.47298577734036373, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.0673828125, + "kd_loss": 0.2408599746217078, + "learning_rate": 3e-06, + "loss": 0.2276, + "masked_tokens": 128.375, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 128.375 + }, + { + "avg_mask_ratio": 0.5043223856599071, + "avg_response_length": 217.5, + "avg_student_mask_ratio": 0.5043223856599071, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.20484722793833043, + "learning_rate": 3e-06, + "loss": 0.2154, + "masked_tokens": 106.025, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 106.025 + }, + { + "avg_mask_ratio": 0.48419030708028005, + "avg_response_length": 196.2625, + "avg_student_mask_ratio": 0.48419030708028005, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.1611328125, + "kd_loss": 0.20407032655223248, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 96.2625, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 96.2625 + }, + { + "avg_mask_ratio": 0.4485494759515859, + "avg_response_length": 228.8625, + "avg_student_mask_ratio": 0.4485494759515859, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.14453125, + "kd_loss": 0.16957379382825621, + "learning_rate": 3e-06, + "loss": 0.1796, + "masked_tokens": 101.275, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 101.275 + }, + { + "avg_mask_ratio": 0.48665878190658984, + "avg_response_length": 230.5, + "avg_student_mask_ratio": 0.48665878190658984, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.3515625, + "kd_loss": 0.22313492714965832, + "learning_rate": 3e-06, + "loss": 0.2112, + "masked_tokens": 107.975, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 107.975 + }, + { + "avg_mask_ratio": 0.4670982737792656, + "avg_response_length": 210.9125, + "avg_student_mask_ratio": 0.4670982737792656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.142578125, + "kd_loss": 0.15220829088375468, + "learning_rate": 3e-06, + "loss": 0.1609, + "masked_tokens": 98.2125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 98.2125 + }, + { + "avg_mask_ratio": 0.4568137794849463, + "avg_response_length": 218.7375, + "avg_student_mask_ratio": 0.4568137794849463, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.1826171875, + "kd_loss": 0.1580252643583208, + "learning_rate": 3e-06, + "loss": 0.1798, + "masked_tokens": 99.15, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 99.15 + }, + { + "avg_mask_ratio": 0.5379857187625021, + "avg_response_length": 252.1375, + "avg_student_mask_ratio": 0.5379857187625021, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.3203125, + "kd_loss": 0.2511090909683844, + "learning_rate": 3e-06, + "loss": 0.2583, + "masked_tokens": 135.4, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 135.4 + }, + { + "avg_mask_ratio": 0.43395056116278286, + "avg_response_length": 245.2625, + "avg_student_mask_ratio": 0.43395056116278286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.1767578125, + "kd_loss": 0.14414861655371852, + "learning_rate": 3e-06, + "loss": 0.1964, + "masked_tokens": 102.5125, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 102.5125 + }, + { + "avg_mask_ratio": 0.46948411157354714, + "avg_response_length": 202.975, + "avg_student_mask_ratio": 0.46948411157354714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.1181640625, + "kd_loss": 0.2197965504183493, + "learning_rate": 3e-06, + "loss": 0.243, + "masked_tokens": 97.0625, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 97.0625 + }, + { + "avg_mask_ratio": 0.44631263689370826, + "avg_response_length": 243.425, + "avg_student_mask_ratio": 0.44631263689370826, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.1064453125, + "kd_loss": 0.2151024747882957, + "learning_rate": 3e-06, + "loss": 0.1892, + "masked_tokens": 107.4625, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4607314572727773, + "avg_response_length": 243.375, + "avg_student_mask_ratio": 0.4607314572727773, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.203125, + "kd_loss": 0.176242933875335, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 110.8875, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.8875 + }, + { + "avg_mask_ratio": 0.5309946444118395, + "avg_response_length": 231.6875, + "avg_student_mask_ratio": 0.5309946444118395, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.208984375, + "kd_loss": 0.26651088480309115, + "learning_rate": 3e-06, + "loss": 0.2828, + "masked_tokens": 123.2875, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 123.2875 + }, + { + "avg_mask_ratio": 0.45879559536697345, + "avg_response_length": 251.8, + "avg_student_mask_ratio": 0.45879559536697345, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.11474609375, + "kd_loss": 0.145786481295454, + "learning_rate": 3e-06, + "loss": 0.1439, + "masked_tokens": 125.425, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 125.425 + }, + { + "avg_mask_ratio": 0.3955249205115251, + "avg_response_length": 238.675, + "avg_student_mask_ratio": 0.3955249205115251, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.057373046875, + "kd_loss": 0.15104623195453543, + "learning_rate": 3e-06, + "loss": 0.1578, + "masked_tokens": 95.2125, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 95.2125 + }, + { + "avg_mask_ratio": 0.4504710016073659, + "avg_response_length": 202.575, + "avg_student_mask_ratio": 0.4504710016073659, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.150390625, + "kd_loss": 0.16011972622800386, + "learning_rate": 3e-06, + "loss": 0.179, + "masked_tokens": 90.3375, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 90.3375 + }, + { + "avg_mask_ratio": 0.4822002159198746, + "avg_response_length": 189.9875, + "avg_student_mask_ratio": 0.4822002159198746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.1630859375, + "kd_loss": 0.21744939284749734, + "learning_rate": 3e-06, + "loss": 0.201, + "masked_tokens": 92.7, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 92.7 + }, + { + "avg_mask_ratio": 0.5024422638700343, + "avg_response_length": 237.2625, + "avg_student_mask_ratio": 0.5024422638700343, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.1259765625, + "kd_loss": 0.21489343987664142, + "learning_rate": 3e-06, + "loss": 0.2268, + "masked_tokens": 123.8125, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 123.8125 + }, + { + "avg_mask_ratio": 0.510216062690597, + "avg_response_length": 257.525, + "avg_student_mask_ratio": 0.510216062690597, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.1337890625, + "kd_loss": 0.17950079924535203, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 142.8125, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 142.8125 + }, + { + "avg_mask_ratio": 0.45074162067612633, + "avg_response_length": 245.525, + "avg_student_mask_ratio": 0.45074162067612633, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.310546875, + "kd_loss": 0.14043198096701417, + "learning_rate": 3e-06, + "loss": 0.1669, + "masked_tokens": 116.0875, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 116.0875 + }, + { + "avg_mask_ratio": 0.4926802407717332, + "avg_response_length": 238.2, + "avg_student_mask_ratio": 0.4926802407717332, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.330078125, + "kd_loss": 0.21977804936059328, + "learning_rate": 3e-06, + "loss": 0.2497, + "masked_tokens": 127.4875, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 127.4875 + }, + { + "avg_mask_ratio": 0.4706261330051348, + "avg_response_length": 251.7375, + "avg_student_mask_ratio": 0.4706261330051348, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.056640625, + "kd_loss": 0.29630907970476983, + "learning_rate": 3e-06, + "loss": 0.2329, + "masked_tokens": 116.8625, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.8625 + }, + { + "avg_mask_ratio": 0.48096118308603764, + "avg_response_length": 262.25, + "avg_student_mask_ratio": 0.48096118308603764, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.12255859375, + "kd_loss": 0.20822112379132704, + "learning_rate": 3e-06, + "loss": 0.186, + "masked_tokens": 132.2, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 132.2 + }, + { + "avg_mask_ratio": 0.4433969090110622, + "avg_response_length": 209.7125, + "avg_student_mask_ratio": 0.4433969090110622, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.123046875, + "kd_loss": 0.14427866181035826, + "learning_rate": 3e-06, + "loss": 0.159, + "masked_tokens": 95.8625, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 95.8625 + }, + { + "avg_mask_ratio": 0.4764250977139454, + "avg_response_length": 226.3875, + "avg_student_mask_ratio": 0.4764250977139454, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.185546875, + "kd_loss": 0.18584371552193701, + "learning_rate": 3e-06, + "loss": 0.1823, + "masked_tokens": 113.95, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 113.95 + }, + { + "avg_mask_ratio": 0.47088071387261154, + "avg_response_length": 233.2125, + "avg_student_mask_ratio": 0.47088071387261154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.2734375, + "kd_loss": 0.22232839192147366, + "learning_rate": 3e-06, + "loss": 0.1961, + "masked_tokens": 116.675, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 116.675 + }, + { + "avg_mask_ratio": 0.4870647343515884, + "avg_response_length": 228.3875, + "avg_student_mask_ratio": 0.4870647343515884, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.25390625, + "kd_loss": 0.25729746209006665, + "learning_rate": 3e-06, + "loss": 0.2306, + "masked_tokens": 114.3625, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 114.3625 + }, + { + "avg_mask_ratio": 0.440834702400025, + "avg_response_length": 209.85, + "avg_student_mask_ratio": 0.440834702400025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.2275390625, + "kd_loss": 0.15747290870124503, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 87.575, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 87.575 + }, + { + "avg_mask_ratio": 0.4660509963519871, + "avg_response_length": 250.9125, + "avg_student_mask_ratio": 0.4660509963519871, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.392578125, + "kd_loss": 0.17299866449352522, + "learning_rate": 3e-06, + "loss": 0.178, + "masked_tokens": 109.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 109.675 + }, + { + "avg_mask_ratio": 0.4507861359976232, + "avg_response_length": 235.875, + "avg_student_mask_ratio": 0.4507861359976232, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2113636662043291, + "learning_rate": 3e-06, + "loss": 0.1795, + "masked_tokens": 106.95, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 106.95 + }, + { + "avg_mask_ratio": 0.4283985076006502, + "avg_response_length": 230.95, + "avg_student_mask_ratio": 0.4283985076006502, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.16015625, + "kd_loss": 0.15304818171161969, + "learning_rate": 3e-06, + "loss": 0.1724, + "masked_tokens": 101.15, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 101.15 + }, + { + "avg_mask_ratio": 0.47474822774529457, + "avg_response_length": 233.1, + "avg_student_mask_ratio": 0.47474822774529457, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.06689453125, + "kd_loss": 0.1363761811357108, + "learning_rate": 3e-06, + "loss": 0.171, + "masked_tokens": 112.725, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 112.725 + }, + { + "avg_mask_ratio": 0.4808142688901474, + "avg_response_length": 238.54761904761904, + "avg_student_mask_ratio": 0.4808142688901474, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.22379483340171732, + "learning_rate": 3e-06, + "loss": 0.2466, + "masked_tokens": 113.67857142857143, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 113.67857142857143 + }, + { + "avg_mask_ratio": 0.47175657459301873, + "avg_response_length": 249.9625, + "avg_student_mask_ratio": 0.47175657459301873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.166015625, + "kd_loss": 0.17491777570117745, + "learning_rate": 3e-06, + "loss": 0.2029, + "masked_tokens": 119.4625, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 119.4625 + }, + { + "avg_mask_ratio": 0.4564988439786248, + "avg_response_length": 238.8875, + "avg_student_mask_ratio": 0.4564988439786248, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.1279296875, + "kd_loss": 0.12884608846077866, + "learning_rate": 3e-06, + "loss": 0.1536, + "masked_tokens": 104.0, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 104.0 + }, + { + "avg_mask_ratio": 0.5083174118888565, + "avg_response_length": 258.1375, + "avg_student_mask_ratio": 0.5083174118888565, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.1357421875, + "kd_loss": 0.18128383785248586, + "learning_rate": 3e-06, + "loss": 0.1811, + "masked_tokens": 133.7125, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 133.7125 + }, + { + "avg_mask_ratio": 0.5130103683215566, + "avg_response_length": 246.4875, + "avg_student_mask_ratio": 0.5130103683215566, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0896, + "grad_norm": 0.11181640625, + "kd_loss": 0.23847924997493805, + "learning_rate": 3e-06, + "loss": 0.2289, + "masked_tokens": 132.5625, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 132.5625 + }, + { + "avg_mask_ratio": 0.4543681625276804, + "avg_response_length": 199.65, + "avg_student_mask_ratio": 0.4543681625276804, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1109333333333333, + "grad_norm": 0.1533203125, + "kd_loss": 0.1353249137787543, + "learning_rate": 3e-06, + "loss": 0.1403, + "masked_tokens": 87.9875, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 87.9875 + }, + { + "avg_mask_ratio": 0.46718079667771234, + "avg_response_length": 218.0875, + "avg_student_mask_ratio": 0.46718079667771234, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1322666666666668, + "grad_norm": 0.2109375, + "kd_loss": 0.15268151032492625, + "learning_rate": 3e-06, + "loss": 0.1789, + "masked_tokens": 101.3875, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 101.3875 + }, + { + "avg_mask_ratio": 0.4890203754650429, + "avg_response_length": 244.3875, + "avg_student_mask_ratio": 0.4890203754650429, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1536, + "grad_norm": 0.27734375, + "kd_loss": 0.17835129436630268, + "learning_rate": 3e-06, + "loss": 0.2173, + "masked_tokens": 116.175, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 116.175 + }, + { + "avg_mask_ratio": 0.45064474650425834, + "avg_response_length": 217.8375, + "avg_student_mask_ratio": 0.45064474650425834, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1749333333333334, + "grad_norm": 0.17578125, + "kd_loss": 0.16049880692362706, + "learning_rate": 3e-06, + "loss": 0.1855, + "masked_tokens": 98.6375, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 98.6375 + }, + { + "avg_mask_ratio": 0.3844255942822201, + "avg_response_length": 240.825, + "avg_student_mask_ratio": 0.3844255942822201, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1962666666666666, + "grad_norm": 0.5859375, + "kd_loss": 0.17605857607457268, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 96.85, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 96.85 + }, + { + "avg_mask_ratio": 0.45103558609262107, + "avg_response_length": 231.025, + "avg_student_mask_ratio": 0.45103558609262107, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2176, + "grad_norm": 0.234375, + "kd_loss": 0.1386162672638477, + "learning_rate": 3e-06, + "loss": 0.1681, + "masked_tokens": 105.6875, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 105.6875 + }, + { + "avg_mask_ratio": 0.47033366551622746, + "avg_response_length": 248.3875, + "avg_student_mask_ratio": 0.47033366551622746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2389333333333332, + "grad_norm": 0.26953125, + "kd_loss": 0.17702910760917803, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 125.05, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 125.05 + }, + { + "avg_mask_ratio": 0.5230229062028229, + "avg_response_length": 241.8125, + "avg_student_mask_ratio": 0.5230229062028229, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2602666666666666, + "grad_norm": 0.20703125, + "kd_loss": 0.22054996666956866, + "learning_rate": 3e-06, + "loss": 0.2233, + "masked_tokens": 129.0, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 129.0 + }, + { + "avg_mask_ratio": 0.44929012526990847, + "avg_response_length": 246.7375, + "avg_student_mask_ratio": 0.44929012526990847, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2816, + "grad_norm": 0.212890625, + "kd_loss": 0.15257543138572202, + "learning_rate": 3e-06, + "loss": 0.1581, + "masked_tokens": 113.9375, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 113.9375 + }, + { + "avg_mask_ratio": 0.5061312943696976, + "avg_response_length": 237.175, + "avg_student_mask_ratio": 0.5061312943696976, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3029333333333333, + "grad_norm": 0.06884765625, + "kd_loss": 0.21158275993875578, + "learning_rate": 3e-06, + "loss": 0.1942, + "masked_tokens": 116.675, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 116.675 + }, + { + "avg_mask_ratio": 0.4925117701757699, + "avg_response_length": 233.2625, + "avg_student_mask_ratio": 0.4925117701757699, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3242666666666667, + "grad_norm": 0.1787109375, + "kd_loss": 0.1805886138310143, + "learning_rate": 3e-06, + "loss": 0.1744, + "masked_tokens": 111.6875, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 111.6875 + }, + { + "avg_mask_ratio": 0.5307835865532979, + "avg_response_length": 249.65, + "avg_student_mask_ratio": 0.5307835865532979, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3456000000000001, + "grad_norm": 0.28125, + "kd_loss": 0.25403604302136956, + "learning_rate": 3e-06, + "loss": 0.2627, + "masked_tokens": 126.6875, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 126.6875 + }, + { + "avg_mask_ratio": 0.48631439574528484, + "avg_response_length": 241.475, + "avg_student_mask_ratio": 0.48631439574528484, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3669333333333333, + "grad_norm": 0.2412109375, + "kd_loss": 0.1642333460577902, + "learning_rate": 3e-06, + "loss": 0.1731, + "masked_tokens": 122.1125, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 122.1125 + }, + { + "avg_mask_ratio": 0.5248487972887232, + "avg_response_length": 231.5375, + "avg_student_mask_ratio": 0.5248487972887232, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3882666666666665, + "grad_norm": 0.193359375, + "kd_loss": 0.2508082524812494, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 127.175, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 127.175 + }, + { + "avg_mask_ratio": 0.48489007767057046, + "avg_response_length": 246.0125, + "avg_student_mask_ratio": 0.48489007767057046, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4096, + "grad_norm": 0.28125, + "kd_loss": 0.18781521328146483, + "learning_rate": 3e-06, + "loss": 0.2045, + "masked_tokens": 125.225, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 125.225 + }, + { + "avg_mask_ratio": 0.4605832444969565, + "avg_response_length": 244.5, + "avg_student_mask_ratio": 0.4605832444969565, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4309333333333334, + "grad_norm": 0.19140625, + "kd_loss": 0.1806626110754223, + "learning_rate": 3e-06, + "loss": 0.1702, + "masked_tokens": 120.825, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 120.825 + }, + { + "avg_mask_ratio": 0.4662455078505445, + "avg_response_length": 244.6125, + "avg_student_mask_ratio": 0.4662455078505445, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4522666666666666, + "grad_norm": 0.16796875, + "kd_loss": 0.20038694294766798, + "learning_rate": 3e-06, + "loss": 0.1824, + "masked_tokens": 114.2, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 114.2 + }, + { + "avg_mask_ratio": 0.4820630593923852, + "avg_response_length": 217.3, + "avg_student_mask_ratio": 0.4820630593923852, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4736, + "grad_norm": 0.11279296875, + "kd_loss": 0.16563009086588637, + "learning_rate": 3e-06, + "loss": 0.17, + "masked_tokens": 118.4875, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 118.4875 + }, + { + "avg_mask_ratio": 0.5206489040749147, + "avg_response_length": 216.45, + "avg_student_mask_ratio": 0.5206489040749147, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4949333333333334, + "grad_norm": 0.236328125, + "kd_loss": 0.23649522811845144, + "learning_rate": 3e-06, + "loss": 0.2241, + "masked_tokens": 118.4375, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 118.4375 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/training_args.bin b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7e94a11824a7a1de5f3a0a00320426e3e4de0eff --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c347df37da6e348160afc3fbb65d12595e5064bd8e5bfd591004e5e86a703f42 +size 7992 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/README.md b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/adapter_config.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..aebf9ea6690ed4ecc23ae3af9402e39470fff9f3 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/adapter_model.safetensors b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fc5a03fa92e079f92094751d87c8b6b1586abbc1 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52cbe7240faf5a4ab64813caf3501946c3a95af79acb14f5305ec5a1d7455325 +size 2406624648 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/optimizer.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..944cd57dd4b941a431739efd8a251fa8caf00466 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4edaec65ff0fddeec3263d21d43a6fe54d9e2bf81a580c81261c69708f75f31a +size 671304442 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/rng_state_0.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9394ee2a94b35d195acd35598a6bc497e61404e5 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:807e3adbf353e56ff37441e4e9da7eb93fc395ce183c435891d7e078922b93b2 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/rng_state_1.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..b4bae909d0195ffe9da4137919aa3ba9d0ea681f --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:228abd63367b4b27a2479d78f4f23e8d099e13995ba0d786a790dfc1184cf284 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/scheduler.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..de962ad5d0694c2759ebc84569f3ce66309888ee --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae0971d510e1111e0fef1ce3a2af63a62a1fc1c7d7b17a17e0c2de3f5ab7c9d0 +size 1064 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/trainer_state.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4625ff27a439a8b1bff7d5ec894c590dfd2a2f6f --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/trainer_state.json @@ -0,0 +1,1953 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7082666666666668, + "eval_steps": 500, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4933756332669873, + "avg_response_length": 252.0625, + "avg_student_mask_ratio": 0.4933756332669873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1650390625, + "kd_loss": 0.24795629351958723, + "learning_rate": 3e-06, + "loss": 0.2758, + "masked_tokens": 120.975, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 120.975 + }, + { + "avg_mask_ratio": 0.41923869140446185, + "avg_response_length": 221.7125, + "avg_student_mask_ratio": 0.41923869140446185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.388671875, + "kd_loss": 0.21509853232191353, + "learning_rate": 3e-06, + "loss": 0.2046, + "masked_tokens": 86.1875, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 86.1875 + }, + { + "avg_mask_ratio": 0.4542569225654006, + "avg_response_length": 231.45, + "avg_student_mask_ratio": 0.4542569225654006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.52734375, + "kd_loss": 0.19756361616970963, + "learning_rate": 3e-06, + "loss": 0.1976, + "masked_tokens": 116.2875, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 116.2875 + }, + { + "avg_mask_ratio": 0.41855402445653456, + "avg_response_length": 214.125, + "avg_student_mask_ratio": 0.41855402445653456, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.263671875, + "kd_loss": 0.1576978968325534, + "learning_rate": 3e-06, + "loss": 0.1551, + "masked_tokens": 94.225, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 94.225 + }, + { + "avg_mask_ratio": 0.4331560767372139, + "avg_response_length": 222.225, + "avg_student_mask_ratio": 0.4331560767372139, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.125, + "kd_loss": 0.17712681048956255, + "learning_rate": 3e-06, + "loss": 0.1648, + "masked_tokens": 97.825, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 97.825 + }, + { + "avg_mask_ratio": 0.4547638618387282, + "avg_response_length": 242.9, + "avg_student_mask_ratio": 0.4547638618387282, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.130859375, + "kd_loss": 0.1745696667137963, + "learning_rate": 3e-06, + "loss": 0.1745, + "masked_tokens": 119.125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.125 + }, + { + "avg_mask_ratio": 0.5151988173020072, + "avg_response_length": 214.3375, + "avg_student_mask_ratio": 0.5151988173020072, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.17578125, + "kd_loss": 0.1657758254527316, + "learning_rate": 3e-06, + "loss": 0.1972, + "masked_tokens": 111.7875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 111.7875 + }, + { + "avg_mask_ratio": 0.37937068473547697, + "avg_response_length": 250.2, + "avg_student_mask_ratio": 0.37937068473547697, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.051513671875, + "kd_loss": 0.13968983994418097, + "learning_rate": 3e-06, + "loss": 0.1369, + "masked_tokens": 95.1875, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 95.1875 + }, + { + "avg_mask_ratio": 0.5006106478627771, + "avg_response_length": 242.1125, + "avg_student_mask_ratio": 0.5006106478627771, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.22265625, + "kd_loss": 0.20869405062871707, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 116.3875, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 116.3875 + }, + { + "avg_mask_ratio": 0.4596128500183113, + "avg_response_length": 229.0625, + "avg_student_mask_ratio": 0.4596128500183113, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.1865234375, + "kd_loss": 0.17640120884607313, + "learning_rate": 3e-06, + "loss": 0.1864, + "masked_tokens": 109.7125, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 109.7125 + }, + { + "avg_mask_ratio": 0.4920400592498481, + "avg_response_length": 229.9875, + "avg_student_mask_ratio": 0.4920400592498481, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.326171875, + "kd_loss": 0.2011610215539008, + "learning_rate": 3e-06, + "loss": 0.2334, + "masked_tokens": 109.4, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 109.4 + }, + { + "avg_mask_ratio": 0.450224511185661, + "avg_response_length": 229.225, + "avg_student_mask_ratio": 0.450224511185661, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.171875, + "kd_loss": 0.21365654302464918, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 102.6375, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 102.6375 + }, + { + "avg_mask_ratio": 0.4397759020910598, + "avg_response_length": 214.15, + "avg_student_mask_ratio": 0.4397759020910598, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.173828125, + "kd_loss": 0.13860440934267615, + "learning_rate": 3e-06, + "loss": 0.1362, + "masked_tokens": 98.4, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 98.4 + }, + { + "avg_mask_ratio": 0.4890626976499334, + "avg_response_length": 242.1625, + "avg_student_mask_ratio": 0.4890626976499334, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.27734375, + "kd_loss": 0.2106460814328841, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 129.725, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 129.725 + }, + { + "avg_mask_ratio": 0.47298577734036373, + "avg_response_length": 262.9875, + "avg_student_mask_ratio": 0.47298577734036373, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.0673828125, + "kd_loss": 0.2408599746217078, + "learning_rate": 3e-06, + "loss": 0.2276, + "masked_tokens": 128.375, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 128.375 + }, + { + "avg_mask_ratio": 0.5043223856599071, + "avg_response_length": 217.5, + "avg_student_mask_ratio": 0.5043223856599071, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.20484722793833043, + "learning_rate": 3e-06, + "loss": 0.2154, + "masked_tokens": 106.025, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 106.025 + }, + { + "avg_mask_ratio": 0.48419030708028005, + "avg_response_length": 196.2625, + "avg_student_mask_ratio": 0.48419030708028005, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.1611328125, + "kd_loss": 0.20407032655223248, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 96.2625, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 96.2625 + }, + { + "avg_mask_ratio": 0.4485494759515859, + "avg_response_length": 228.8625, + "avg_student_mask_ratio": 0.4485494759515859, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.14453125, + "kd_loss": 0.16957379382825621, + "learning_rate": 3e-06, + "loss": 0.1796, + "masked_tokens": 101.275, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 101.275 + }, + { + "avg_mask_ratio": 0.48665878190658984, + "avg_response_length": 230.5, + "avg_student_mask_ratio": 0.48665878190658984, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.3515625, + "kd_loss": 0.22313492714965832, + "learning_rate": 3e-06, + "loss": 0.2112, + "masked_tokens": 107.975, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 107.975 + }, + { + "avg_mask_ratio": 0.4670982737792656, + "avg_response_length": 210.9125, + "avg_student_mask_ratio": 0.4670982737792656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.142578125, + "kd_loss": 0.15220829088375468, + "learning_rate": 3e-06, + "loss": 0.1609, + "masked_tokens": 98.2125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 98.2125 + }, + { + "avg_mask_ratio": 0.4568137794849463, + "avg_response_length": 218.7375, + "avg_student_mask_ratio": 0.4568137794849463, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.1826171875, + "kd_loss": 0.1580252643583208, + "learning_rate": 3e-06, + "loss": 0.1798, + "masked_tokens": 99.15, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 99.15 + }, + { + "avg_mask_ratio": 0.5379857187625021, + "avg_response_length": 252.1375, + "avg_student_mask_ratio": 0.5379857187625021, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.3203125, + "kd_loss": 0.2511090909683844, + "learning_rate": 3e-06, + "loss": 0.2583, + "masked_tokens": 135.4, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 135.4 + }, + { + "avg_mask_ratio": 0.43395056116278286, + "avg_response_length": 245.2625, + "avg_student_mask_ratio": 0.43395056116278286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.1767578125, + "kd_loss": 0.14414861655371852, + "learning_rate": 3e-06, + "loss": 0.1964, + "masked_tokens": 102.5125, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 102.5125 + }, + { + "avg_mask_ratio": 0.46948411157354714, + "avg_response_length": 202.975, + "avg_student_mask_ratio": 0.46948411157354714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.1181640625, + "kd_loss": 0.2197965504183493, + "learning_rate": 3e-06, + "loss": 0.243, + "masked_tokens": 97.0625, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 97.0625 + }, + { + "avg_mask_ratio": 0.44631263689370826, + "avg_response_length": 243.425, + "avg_student_mask_ratio": 0.44631263689370826, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.1064453125, + "kd_loss": 0.2151024747882957, + "learning_rate": 3e-06, + "loss": 0.1892, + "masked_tokens": 107.4625, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4607314572727773, + "avg_response_length": 243.375, + "avg_student_mask_ratio": 0.4607314572727773, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.203125, + "kd_loss": 0.176242933875335, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 110.8875, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.8875 + }, + { + "avg_mask_ratio": 0.5309946444118395, + "avg_response_length": 231.6875, + "avg_student_mask_ratio": 0.5309946444118395, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.208984375, + "kd_loss": 0.26651088480309115, + "learning_rate": 3e-06, + "loss": 0.2828, + "masked_tokens": 123.2875, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 123.2875 + }, + { + "avg_mask_ratio": 0.45879559536697345, + "avg_response_length": 251.8, + "avg_student_mask_ratio": 0.45879559536697345, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.11474609375, + "kd_loss": 0.145786481295454, + "learning_rate": 3e-06, + "loss": 0.1439, + "masked_tokens": 125.425, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 125.425 + }, + { + "avg_mask_ratio": 0.3955249205115251, + "avg_response_length": 238.675, + "avg_student_mask_ratio": 0.3955249205115251, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.057373046875, + "kd_loss": 0.15104623195453543, + "learning_rate": 3e-06, + "loss": 0.1578, + "masked_tokens": 95.2125, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 95.2125 + }, + { + "avg_mask_ratio": 0.4504710016073659, + "avg_response_length": 202.575, + "avg_student_mask_ratio": 0.4504710016073659, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.150390625, + "kd_loss": 0.16011972622800386, + "learning_rate": 3e-06, + "loss": 0.179, + "masked_tokens": 90.3375, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 90.3375 + }, + { + "avg_mask_ratio": 0.4822002159198746, + "avg_response_length": 189.9875, + "avg_student_mask_ratio": 0.4822002159198746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.1630859375, + "kd_loss": 0.21744939284749734, + "learning_rate": 3e-06, + "loss": 0.201, + "masked_tokens": 92.7, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 92.7 + }, + { + "avg_mask_ratio": 0.5024422638700343, + "avg_response_length": 237.2625, + "avg_student_mask_ratio": 0.5024422638700343, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.1259765625, + "kd_loss": 0.21489343987664142, + "learning_rate": 3e-06, + "loss": 0.2268, + "masked_tokens": 123.8125, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 123.8125 + }, + { + "avg_mask_ratio": 0.510216062690597, + "avg_response_length": 257.525, + "avg_student_mask_ratio": 0.510216062690597, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.1337890625, + "kd_loss": 0.17950079924535203, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 142.8125, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 142.8125 + }, + { + "avg_mask_ratio": 0.45074162067612633, + "avg_response_length": 245.525, + "avg_student_mask_ratio": 0.45074162067612633, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.310546875, + "kd_loss": 0.14043198096701417, + "learning_rate": 3e-06, + "loss": 0.1669, + "masked_tokens": 116.0875, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 116.0875 + }, + { + "avg_mask_ratio": 0.4926802407717332, + "avg_response_length": 238.2, + "avg_student_mask_ratio": 0.4926802407717332, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.330078125, + "kd_loss": 0.21977804936059328, + "learning_rate": 3e-06, + "loss": 0.2497, + "masked_tokens": 127.4875, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 127.4875 + }, + { + "avg_mask_ratio": 0.4706261330051348, + "avg_response_length": 251.7375, + "avg_student_mask_ratio": 0.4706261330051348, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.056640625, + "kd_loss": 0.29630907970476983, + "learning_rate": 3e-06, + "loss": 0.2329, + "masked_tokens": 116.8625, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.8625 + }, + { + "avg_mask_ratio": 0.48096118308603764, + "avg_response_length": 262.25, + "avg_student_mask_ratio": 0.48096118308603764, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.12255859375, + "kd_loss": 0.20822112379132704, + "learning_rate": 3e-06, + "loss": 0.186, + "masked_tokens": 132.2, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 132.2 + }, + { + "avg_mask_ratio": 0.4433969090110622, + "avg_response_length": 209.7125, + "avg_student_mask_ratio": 0.4433969090110622, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.123046875, + "kd_loss": 0.14427866181035826, + "learning_rate": 3e-06, + "loss": 0.159, + "masked_tokens": 95.8625, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 95.8625 + }, + { + "avg_mask_ratio": 0.4764250977139454, + "avg_response_length": 226.3875, + "avg_student_mask_ratio": 0.4764250977139454, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.185546875, + "kd_loss": 0.18584371552193701, + "learning_rate": 3e-06, + "loss": 0.1823, + "masked_tokens": 113.95, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 113.95 + }, + { + "avg_mask_ratio": 0.47088071387261154, + "avg_response_length": 233.2125, + "avg_student_mask_ratio": 0.47088071387261154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.2734375, + "kd_loss": 0.22232839192147366, + "learning_rate": 3e-06, + "loss": 0.1961, + "masked_tokens": 116.675, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 116.675 + }, + { + "avg_mask_ratio": 0.4870647343515884, + "avg_response_length": 228.3875, + "avg_student_mask_ratio": 0.4870647343515884, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.25390625, + "kd_loss": 0.25729746209006665, + "learning_rate": 3e-06, + "loss": 0.2306, + "masked_tokens": 114.3625, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 114.3625 + }, + { + "avg_mask_ratio": 0.440834702400025, + "avg_response_length": 209.85, + "avg_student_mask_ratio": 0.440834702400025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.2275390625, + "kd_loss": 0.15747290870124503, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 87.575, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 87.575 + }, + { + "avg_mask_ratio": 0.4660509963519871, + "avg_response_length": 250.9125, + "avg_student_mask_ratio": 0.4660509963519871, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.392578125, + "kd_loss": 0.17299866449352522, + "learning_rate": 3e-06, + "loss": 0.178, + "masked_tokens": 109.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 109.675 + }, + { + "avg_mask_ratio": 0.4507861359976232, + "avg_response_length": 235.875, + "avg_student_mask_ratio": 0.4507861359976232, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2113636662043291, + "learning_rate": 3e-06, + "loss": 0.1795, + "masked_tokens": 106.95, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 106.95 + }, + { + "avg_mask_ratio": 0.4283985076006502, + "avg_response_length": 230.95, + "avg_student_mask_ratio": 0.4283985076006502, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.16015625, + "kd_loss": 0.15304818171161969, + "learning_rate": 3e-06, + "loss": 0.1724, + "masked_tokens": 101.15, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 101.15 + }, + { + "avg_mask_ratio": 0.47474822774529457, + "avg_response_length": 233.1, + "avg_student_mask_ratio": 0.47474822774529457, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.06689453125, + "kd_loss": 0.1363761811357108, + "learning_rate": 3e-06, + "loss": 0.171, + "masked_tokens": 112.725, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 112.725 + }, + { + "avg_mask_ratio": 0.4808142688901474, + "avg_response_length": 238.54761904761904, + "avg_student_mask_ratio": 0.4808142688901474, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.22379483340171732, + "learning_rate": 3e-06, + "loss": 0.2466, + "masked_tokens": 113.67857142857143, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 113.67857142857143 + }, + { + "avg_mask_ratio": 0.47175657459301873, + "avg_response_length": 249.9625, + "avg_student_mask_ratio": 0.47175657459301873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.166015625, + "kd_loss": 0.17491777570117745, + "learning_rate": 3e-06, + "loss": 0.2029, + "masked_tokens": 119.4625, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 119.4625 + }, + { + "avg_mask_ratio": 0.4564988439786248, + "avg_response_length": 238.8875, + "avg_student_mask_ratio": 0.4564988439786248, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.1279296875, + "kd_loss": 0.12884608846077866, + "learning_rate": 3e-06, + "loss": 0.1536, + "masked_tokens": 104.0, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 104.0 + }, + { + "avg_mask_ratio": 0.5083174118888565, + "avg_response_length": 258.1375, + "avg_student_mask_ratio": 0.5083174118888565, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.1357421875, + "kd_loss": 0.18128383785248586, + "learning_rate": 3e-06, + "loss": 0.1811, + "masked_tokens": 133.7125, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 133.7125 + }, + { + "avg_mask_ratio": 0.5130103683215566, + "avg_response_length": 246.4875, + "avg_student_mask_ratio": 0.5130103683215566, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0896, + "grad_norm": 0.11181640625, + "kd_loss": 0.23847924997493805, + "learning_rate": 3e-06, + "loss": 0.2289, + "masked_tokens": 132.5625, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 132.5625 + }, + { + "avg_mask_ratio": 0.4543681625276804, + "avg_response_length": 199.65, + "avg_student_mask_ratio": 0.4543681625276804, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1109333333333333, + "grad_norm": 0.1533203125, + "kd_loss": 0.1353249137787543, + "learning_rate": 3e-06, + "loss": 0.1403, + "masked_tokens": 87.9875, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 87.9875 + }, + { + "avg_mask_ratio": 0.46718079667771234, + "avg_response_length": 218.0875, + "avg_student_mask_ratio": 0.46718079667771234, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1322666666666668, + "grad_norm": 0.2109375, + "kd_loss": 0.15268151032492625, + "learning_rate": 3e-06, + "loss": 0.1789, + "masked_tokens": 101.3875, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 101.3875 + }, + { + "avg_mask_ratio": 0.4890203754650429, + "avg_response_length": 244.3875, + "avg_student_mask_ratio": 0.4890203754650429, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1536, + "grad_norm": 0.27734375, + "kd_loss": 0.17835129436630268, + "learning_rate": 3e-06, + "loss": 0.2173, + "masked_tokens": 116.175, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 116.175 + }, + { + "avg_mask_ratio": 0.45064474650425834, + "avg_response_length": 217.8375, + "avg_student_mask_ratio": 0.45064474650425834, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1749333333333334, + "grad_norm": 0.17578125, + "kd_loss": 0.16049880692362706, + "learning_rate": 3e-06, + "loss": 0.1855, + "masked_tokens": 98.6375, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 98.6375 + }, + { + "avg_mask_ratio": 0.3844255942822201, + "avg_response_length": 240.825, + "avg_student_mask_ratio": 0.3844255942822201, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1962666666666666, + "grad_norm": 0.5859375, + "kd_loss": 0.17605857607457268, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 96.85, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 96.85 + }, + { + "avg_mask_ratio": 0.45103558609262107, + "avg_response_length": 231.025, + "avg_student_mask_ratio": 0.45103558609262107, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2176, + "grad_norm": 0.234375, + "kd_loss": 0.1386162672638477, + "learning_rate": 3e-06, + "loss": 0.1681, + "masked_tokens": 105.6875, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 105.6875 + }, + { + "avg_mask_ratio": 0.47033366551622746, + "avg_response_length": 248.3875, + "avg_student_mask_ratio": 0.47033366551622746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2389333333333332, + "grad_norm": 0.26953125, + "kd_loss": 0.17702910760917803, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 125.05, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 125.05 + }, + { + "avg_mask_ratio": 0.5230229062028229, + "avg_response_length": 241.8125, + "avg_student_mask_ratio": 0.5230229062028229, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2602666666666666, + "grad_norm": 0.20703125, + "kd_loss": 0.22054996666956866, + "learning_rate": 3e-06, + "loss": 0.2233, + "masked_tokens": 129.0, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 129.0 + }, + { + "avg_mask_ratio": 0.44929012526990847, + "avg_response_length": 246.7375, + "avg_student_mask_ratio": 0.44929012526990847, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2816, + "grad_norm": 0.212890625, + "kd_loss": 0.15257543138572202, + "learning_rate": 3e-06, + "loss": 0.1581, + "masked_tokens": 113.9375, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 113.9375 + }, + { + "avg_mask_ratio": 0.5061312943696976, + "avg_response_length": 237.175, + "avg_student_mask_ratio": 0.5061312943696976, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3029333333333333, + "grad_norm": 0.06884765625, + "kd_loss": 0.21158275993875578, + "learning_rate": 3e-06, + "loss": 0.1942, + "masked_tokens": 116.675, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 116.675 + }, + { + "avg_mask_ratio": 0.4925117701757699, + "avg_response_length": 233.2625, + "avg_student_mask_ratio": 0.4925117701757699, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3242666666666667, + "grad_norm": 0.1787109375, + "kd_loss": 0.1805886138310143, + "learning_rate": 3e-06, + "loss": 0.1744, + "masked_tokens": 111.6875, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 111.6875 + }, + { + "avg_mask_ratio": 0.5307835865532979, + "avg_response_length": 249.65, + "avg_student_mask_ratio": 0.5307835865532979, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3456000000000001, + "grad_norm": 0.28125, + "kd_loss": 0.25403604302136956, + "learning_rate": 3e-06, + "loss": 0.2627, + "masked_tokens": 126.6875, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 126.6875 + }, + { + "avg_mask_ratio": 0.48631439574528484, + "avg_response_length": 241.475, + "avg_student_mask_ratio": 0.48631439574528484, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3669333333333333, + "grad_norm": 0.2412109375, + "kd_loss": 0.1642333460577902, + "learning_rate": 3e-06, + "loss": 0.1731, + "masked_tokens": 122.1125, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 122.1125 + }, + { + "avg_mask_ratio": 0.5248487972887232, + "avg_response_length": 231.5375, + "avg_student_mask_ratio": 0.5248487972887232, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3882666666666665, + "grad_norm": 0.193359375, + "kd_loss": 0.2508082524812494, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 127.175, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 127.175 + }, + { + "avg_mask_ratio": 0.48489007767057046, + "avg_response_length": 246.0125, + "avg_student_mask_ratio": 0.48489007767057046, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4096, + "grad_norm": 0.28125, + "kd_loss": 0.18781521328146483, + "learning_rate": 3e-06, + "loss": 0.2045, + "masked_tokens": 125.225, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 125.225 + }, + { + "avg_mask_ratio": 0.4605832444969565, + "avg_response_length": 244.5, + "avg_student_mask_ratio": 0.4605832444969565, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4309333333333334, + "grad_norm": 0.19140625, + "kd_loss": 0.1806626110754223, + "learning_rate": 3e-06, + "loss": 0.1702, + "masked_tokens": 120.825, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 120.825 + }, + { + "avg_mask_ratio": 0.4662455078505445, + "avg_response_length": 244.6125, + "avg_student_mask_ratio": 0.4662455078505445, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4522666666666666, + "grad_norm": 0.16796875, + "kd_loss": 0.20038694294766798, + "learning_rate": 3e-06, + "loss": 0.1824, + "masked_tokens": 114.2, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 114.2 + }, + { + "avg_mask_ratio": 0.4820630593923852, + "avg_response_length": 217.3, + "avg_student_mask_ratio": 0.4820630593923852, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4736, + "grad_norm": 0.11279296875, + "kd_loss": 0.16563009086588637, + "learning_rate": 3e-06, + "loss": 0.17, + "masked_tokens": 118.4875, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 118.4875 + }, + { + "avg_mask_ratio": 0.5206489040749147, + "avg_response_length": 216.45, + "avg_student_mask_ratio": 0.5206489040749147, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4949333333333334, + "grad_norm": 0.236328125, + "kd_loss": 0.23649522811845144, + "learning_rate": 3e-06, + "loss": 0.2241, + "masked_tokens": 118.4375, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 118.4375 + }, + { + "avg_mask_ratio": 0.4955552547937259, + "avg_response_length": 211.175, + "avg_student_mask_ratio": 0.4955552547937259, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5162666666666667, + "grad_norm": 0.275390625, + "kd_loss": 0.23970817765721222, + "learning_rate": 3e-06, + "loss": 0.2184, + "masked_tokens": 112.3375, + "mean_t": 0.5135623761918395, + "step": 710, + "student_masked_tokens": 112.3375 + }, + { + "avg_mask_ratio": 0.4820543818641454, + "avg_response_length": 229.1375, + "avg_student_mask_ratio": 0.4820543818641454, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5375999999999999, + "grad_norm": 0.435546875, + "kd_loss": 0.18955910701470202, + "learning_rate": 3e-06, + "loss": 0.1978, + "masked_tokens": 114.55, + "mean_t": 0.5117021896177902, + "step": 720, + "student_masked_tokens": 114.55 + }, + { + "avg_mask_ratio": 0.4605119539948646, + "avg_response_length": 245.5625, + "avg_student_mask_ratio": 0.4605119539948646, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5589333333333333, + "grad_norm": 0.1103515625, + "kd_loss": 0.16859328244926958, + "learning_rate": 3e-06, + "loss": 0.1779, + "masked_tokens": 113.85, + "mean_t": 0.49050743713742123, + "step": 730, + "student_masked_tokens": 113.85 + }, + { + "avg_mask_ratio": 0.5134038798511028, + "avg_response_length": 194.125, + "avg_student_mask_ratio": 0.5134038798511028, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5802666666666667, + "grad_norm": 0.0830078125, + "kd_loss": 0.17122714665274544, + "learning_rate": 3e-06, + "loss": 0.2018, + "masked_tokens": 102.9375, + "mean_t": 0.5505168779753149, + "step": 740, + "student_masked_tokens": 102.9375 + }, + { + "avg_mask_ratio": 0.4201362137740944, + "avg_response_length": 223.55, + "avg_student_mask_ratio": 0.4201362137740944, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6016, + "grad_norm": 0.16796875, + "kd_loss": 0.19197621339357057, + "learning_rate": 3e-06, + "loss": 0.1792, + "masked_tokens": 94.7125, + "mean_t": 0.4437690361432033, + "step": 750, + "student_masked_tokens": 94.7125 + }, + { + "avg_mask_ratio": 0.46924527404480615, + "avg_response_length": 227.35, + "avg_student_mask_ratio": 0.46924527404480615, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6229333333333333, + "grad_norm": 0.8125, + "kd_loss": 0.24938117066874382, + "learning_rate": 3e-06, + "loss": 0.2591, + "masked_tokens": 107.5375, + "mean_t": 0.49132869170280175, + "step": 760, + "student_masked_tokens": 107.5375 + }, + { + "avg_mask_ratio": 0.445551612455165, + "avg_response_length": 268.6625, + "avg_student_mask_ratio": 0.445551612455165, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6442666666666668, + "grad_norm": 0.1435546875, + "kd_loss": 0.1928954417056957, + "learning_rate": 3e-06, + "loss": 0.1563, + "masked_tokens": 124.0875, + "mean_t": 0.4667695587326307, + "step": 770, + "student_masked_tokens": 124.0875 + }, + { + "avg_mask_ratio": 0.4321410794305848, + "avg_response_length": 256.1625, + "avg_student_mask_ratio": 0.4321410794305848, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6656, + "grad_norm": 0.2216796875, + "kd_loss": 0.15059620087446887, + "learning_rate": 3e-06, + "loss": 0.1534, + "masked_tokens": 117.7, + "mean_t": 0.44942845597106496, + "step": 780, + "student_masked_tokens": 117.7 + }, + { + "avg_mask_ratio": 0.4697655299096368, + "avg_response_length": 240.6125, + "avg_student_mask_ratio": 0.4697655299096368, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6869333333333332, + "grad_norm": 0.255859375, + "kd_loss": 0.16427693545232777, + "learning_rate": 3e-06, + "loss": 0.1489, + "masked_tokens": 113.5375, + "mean_t": 0.4993515375303105, + "step": 790, + "student_masked_tokens": 113.5375 + }, + { + "avg_mask_ratio": 0.4635992758907378, + "avg_response_length": 224.15, + "avg_student_mask_ratio": 0.4635992758907378, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7082666666666668, + "grad_norm": 0.197265625, + "kd_loss": 0.15171801659575976, + "learning_rate": 3e-06, + "loss": 0.1526, + "masked_tokens": 107.5, + "mean_t": 0.500370389316231, + "step": 800, + "student_masked_tokens": 107.5 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/training_args.bin b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7e94a11824a7a1de5f3a0a00320426e3e4de0eff --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c347df37da6e348160afc3fbb65d12595e5064bd8e5bfd591004e5e86a703f42 +size 7992 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/README.md b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/adapter_config.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..aebf9ea6690ed4ecc23ae3af9402e39470fff9f3 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/adapter_model.safetensors b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c151016404e49b096553e25e46d25e288c143681 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edebe98f9482a1dd729bcb486b0f6ba2164c764150acdf45cc4e6b08d270767f +size 2406624648 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/optimizer.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8ed2883c04cb939111d757d9426b2db5e621a62 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef3d5000fe4ba571bfe0678dce35391d7caa349f3fada3873d75911f2b18ff47 +size 671304442 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/rng_state_0.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5a83b9776b9d15d5845dd4fd0c47a71f865302cd --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:912eb5b73ad8be816bc34c3e23e2525e429684d31fd0f297df673d0b4d76559f +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/rng_state_1.pth b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5c291e6ebbe163eb09404b9cd50d9ccbab2705ac --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d4bfd20cfca4b11465f228f1931205631b5df1ab98a06b82be9c5bea11045a7 +size 14512 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/scheduler.pt b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..186dc458d73d11481b005defcffdd17b8b9b8a93 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf0430fbf8ed72ad90ba29a6f885082e3cf20a4095c07f619baeb5e62ae385d +size 1064 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/trainer_state.json b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7ace95bb45947ff5cb71af0467095dca738b3857 --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/trainer_state.json @@ -0,0 +1,2193 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9216, + "eval_steps": 500, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4933756332669873, + "avg_response_length": 252.0625, + "avg_student_mask_ratio": 0.4933756332669873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1650390625, + "kd_loss": 0.24795629351958723, + "learning_rate": 3e-06, + "loss": 0.2758, + "masked_tokens": 120.975, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 120.975 + }, + { + "avg_mask_ratio": 0.41923869140446185, + "avg_response_length": 221.7125, + "avg_student_mask_ratio": 0.41923869140446185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.388671875, + "kd_loss": 0.21509853232191353, + "learning_rate": 3e-06, + "loss": 0.2046, + "masked_tokens": 86.1875, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 86.1875 + }, + { + "avg_mask_ratio": 0.4542569225654006, + "avg_response_length": 231.45, + "avg_student_mask_ratio": 0.4542569225654006, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.52734375, + "kd_loss": 0.19756361616970963, + "learning_rate": 3e-06, + "loss": 0.1976, + "masked_tokens": 116.2875, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 116.2875 + }, + { + "avg_mask_ratio": 0.41855402445653456, + "avg_response_length": 214.125, + "avg_student_mask_ratio": 0.41855402445653456, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.263671875, + "kd_loss": 0.1576978968325534, + "learning_rate": 3e-06, + "loss": 0.1551, + "masked_tokens": 94.225, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 94.225 + }, + { + "avg_mask_ratio": 0.4331560767372139, + "avg_response_length": 222.225, + "avg_student_mask_ratio": 0.4331560767372139, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.125, + "kd_loss": 0.17712681048956255, + "learning_rate": 3e-06, + "loss": 0.1648, + "masked_tokens": 97.825, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 97.825 + }, + { + "avg_mask_ratio": 0.4547638618387282, + "avg_response_length": 242.9, + "avg_student_mask_ratio": 0.4547638618387282, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.130859375, + "kd_loss": 0.1745696667137963, + "learning_rate": 3e-06, + "loss": 0.1745, + "masked_tokens": 119.125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.125 + }, + { + "avg_mask_ratio": 0.5151988173020072, + "avg_response_length": 214.3375, + "avg_student_mask_ratio": 0.5151988173020072, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.17578125, + "kd_loss": 0.1657758254527316, + "learning_rate": 3e-06, + "loss": 0.1972, + "masked_tokens": 111.7875, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 111.7875 + }, + { + "avg_mask_ratio": 0.37937068473547697, + "avg_response_length": 250.2, + "avg_student_mask_ratio": 0.37937068473547697, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.051513671875, + "kd_loss": 0.13968983994418097, + "learning_rate": 3e-06, + "loss": 0.1369, + "masked_tokens": 95.1875, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 95.1875 + }, + { + "avg_mask_ratio": 0.5006106478627771, + "avg_response_length": 242.1125, + "avg_student_mask_ratio": 0.5006106478627771, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.22265625, + "kd_loss": 0.20869405062871707, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 116.3875, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 116.3875 + }, + { + "avg_mask_ratio": 0.4596128500183113, + "avg_response_length": 229.0625, + "avg_student_mask_ratio": 0.4596128500183113, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.1865234375, + "kd_loss": 0.17640120884607313, + "learning_rate": 3e-06, + "loss": 0.1864, + "masked_tokens": 109.7125, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 109.7125 + }, + { + "avg_mask_ratio": 0.4920400592498481, + "avg_response_length": 229.9875, + "avg_student_mask_ratio": 0.4920400592498481, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.326171875, + "kd_loss": 0.2011610215539008, + "learning_rate": 3e-06, + "loss": 0.2334, + "masked_tokens": 109.4, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 109.4 + }, + { + "avg_mask_ratio": 0.450224511185661, + "avg_response_length": 229.225, + "avg_student_mask_ratio": 0.450224511185661, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.171875, + "kd_loss": 0.21365654302464918, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 102.6375, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 102.6375 + }, + { + "avg_mask_ratio": 0.4397759020910598, + "avg_response_length": 214.15, + "avg_student_mask_ratio": 0.4397759020910598, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.173828125, + "kd_loss": 0.13860440934267615, + "learning_rate": 3e-06, + "loss": 0.1362, + "masked_tokens": 98.4, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 98.4 + }, + { + "avg_mask_ratio": 0.4890626976499334, + "avg_response_length": 242.1625, + "avg_student_mask_ratio": 0.4890626976499334, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.27734375, + "kd_loss": 0.2106460814328841, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 129.725, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 129.725 + }, + { + "avg_mask_ratio": 0.47298577734036373, + "avg_response_length": 262.9875, + "avg_student_mask_ratio": 0.47298577734036373, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.0673828125, + "kd_loss": 0.2408599746217078, + "learning_rate": 3e-06, + "loss": 0.2276, + "masked_tokens": 128.375, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 128.375 + }, + { + "avg_mask_ratio": 0.5043223856599071, + "avg_response_length": 217.5, + "avg_student_mask_ratio": 0.5043223856599071, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.20484722793833043, + "learning_rate": 3e-06, + "loss": 0.2154, + "masked_tokens": 106.025, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 106.025 + }, + { + "avg_mask_ratio": 0.48419030708028005, + "avg_response_length": 196.2625, + "avg_student_mask_ratio": 0.48419030708028005, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.1611328125, + "kd_loss": 0.20407032655223248, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 96.2625, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 96.2625 + }, + { + "avg_mask_ratio": 0.4485494759515859, + "avg_response_length": 228.8625, + "avg_student_mask_ratio": 0.4485494759515859, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.14453125, + "kd_loss": 0.16957379382825621, + "learning_rate": 3e-06, + "loss": 0.1796, + "masked_tokens": 101.275, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 101.275 + }, + { + "avg_mask_ratio": 0.48665878190658984, + "avg_response_length": 230.5, + "avg_student_mask_ratio": 0.48665878190658984, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.3515625, + "kd_loss": 0.22313492714965832, + "learning_rate": 3e-06, + "loss": 0.2112, + "masked_tokens": 107.975, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 107.975 + }, + { + "avg_mask_ratio": 0.4670982737792656, + "avg_response_length": 210.9125, + "avg_student_mask_ratio": 0.4670982737792656, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.142578125, + "kd_loss": 0.15220829088375468, + "learning_rate": 3e-06, + "loss": 0.1609, + "masked_tokens": 98.2125, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 98.2125 + }, + { + "avg_mask_ratio": 0.4568137794849463, + "avg_response_length": 218.7375, + "avg_student_mask_ratio": 0.4568137794849463, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.1826171875, + "kd_loss": 0.1580252643583208, + "learning_rate": 3e-06, + "loss": 0.1798, + "masked_tokens": 99.15, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 99.15 + }, + { + "avg_mask_ratio": 0.5379857187625021, + "avg_response_length": 252.1375, + "avg_student_mask_ratio": 0.5379857187625021, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.3203125, + "kd_loss": 0.2511090909683844, + "learning_rate": 3e-06, + "loss": 0.2583, + "masked_tokens": 135.4, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 135.4 + }, + { + "avg_mask_ratio": 0.43395056116278286, + "avg_response_length": 245.2625, + "avg_student_mask_ratio": 0.43395056116278286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.1767578125, + "kd_loss": 0.14414861655371852, + "learning_rate": 3e-06, + "loss": 0.1964, + "masked_tokens": 102.5125, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 102.5125 + }, + { + "avg_mask_ratio": 0.46948411157354714, + "avg_response_length": 202.975, + "avg_student_mask_ratio": 0.46948411157354714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.1181640625, + "kd_loss": 0.2197965504183493, + "learning_rate": 3e-06, + "loss": 0.243, + "masked_tokens": 97.0625, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 97.0625 + }, + { + "avg_mask_ratio": 0.44631263689370826, + "avg_response_length": 243.425, + "avg_student_mask_ratio": 0.44631263689370826, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.1064453125, + "kd_loss": 0.2151024747882957, + "learning_rate": 3e-06, + "loss": 0.1892, + "masked_tokens": 107.4625, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 107.4625 + }, + { + "avg_mask_ratio": 0.4607314572727773, + "avg_response_length": 243.375, + "avg_student_mask_ratio": 0.4607314572727773, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.203125, + "kd_loss": 0.176242933875335, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 110.8875, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.8875 + }, + { + "avg_mask_ratio": 0.5309946444118395, + "avg_response_length": 231.6875, + "avg_student_mask_ratio": 0.5309946444118395, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.208984375, + "kd_loss": 0.26651088480309115, + "learning_rate": 3e-06, + "loss": 0.2828, + "masked_tokens": 123.2875, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 123.2875 + }, + { + "avg_mask_ratio": 0.45879559536697345, + "avg_response_length": 251.8, + "avg_student_mask_ratio": 0.45879559536697345, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.11474609375, + "kd_loss": 0.145786481295454, + "learning_rate": 3e-06, + "loss": 0.1439, + "masked_tokens": 125.425, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 125.425 + }, + { + "avg_mask_ratio": 0.3955249205115251, + "avg_response_length": 238.675, + "avg_student_mask_ratio": 0.3955249205115251, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.057373046875, + "kd_loss": 0.15104623195453543, + "learning_rate": 3e-06, + "loss": 0.1578, + "masked_tokens": 95.2125, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 95.2125 + }, + { + "avg_mask_ratio": 0.4504710016073659, + "avg_response_length": 202.575, + "avg_student_mask_ratio": 0.4504710016073659, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.150390625, + "kd_loss": 0.16011972622800386, + "learning_rate": 3e-06, + "loss": 0.179, + "masked_tokens": 90.3375, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 90.3375 + }, + { + "avg_mask_ratio": 0.4822002159198746, + "avg_response_length": 189.9875, + "avg_student_mask_ratio": 0.4822002159198746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.1630859375, + "kd_loss": 0.21744939284749734, + "learning_rate": 3e-06, + "loss": 0.201, + "masked_tokens": 92.7, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 92.7 + }, + { + "avg_mask_ratio": 0.5024422638700343, + "avg_response_length": 237.2625, + "avg_student_mask_ratio": 0.5024422638700343, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.1259765625, + "kd_loss": 0.21489343987664142, + "learning_rate": 3e-06, + "loss": 0.2268, + "masked_tokens": 123.8125, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 123.8125 + }, + { + "avg_mask_ratio": 0.510216062690597, + "avg_response_length": 257.525, + "avg_student_mask_ratio": 0.510216062690597, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.1337890625, + "kd_loss": 0.17950079924535203, + "learning_rate": 3e-06, + "loss": 0.2007, + "masked_tokens": 142.8125, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 142.8125 + }, + { + "avg_mask_ratio": 0.45074162067612633, + "avg_response_length": 245.525, + "avg_student_mask_ratio": 0.45074162067612633, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.310546875, + "kd_loss": 0.14043198096701417, + "learning_rate": 3e-06, + "loss": 0.1669, + "masked_tokens": 116.0875, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 116.0875 + }, + { + "avg_mask_ratio": 0.4926802407717332, + "avg_response_length": 238.2, + "avg_student_mask_ratio": 0.4926802407717332, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.330078125, + "kd_loss": 0.21977804936059328, + "learning_rate": 3e-06, + "loss": 0.2497, + "masked_tokens": 127.4875, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 127.4875 + }, + { + "avg_mask_ratio": 0.4706261330051348, + "avg_response_length": 251.7375, + "avg_student_mask_ratio": 0.4706261330051348, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.056640625, + "kd_loss": 0.29630907970476983, + "learning_rate": 3e-06, + "loss": 0.2329, + "masked_tokens": 116.8625, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 116.8625 + }, + { + "avg_mask_ratio": 0.48096118308603764, + "avg_response_length": 262.25, + "avg_student_mask_ratio": 0.48096118308603764, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.12255859375, + "kd_loss": 0.20822112379132704, + "learning_rate": 3e-06, + "loss": 0.186, + "masked_tokens": 132.2, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 132.2 + }, + { + "avg_mask_ratio": 0.4433969090110622, + "avg_response_length": 209.7125, + "avg_student_mask_ratio": 0.4433969090110622, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.123046875, + "kd_loss": 0.14427866181035826, + "learning_rate": 3e-06, + "loss": 0.159, + "masked_tokens": 95.8625, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 95.8625 + }, + { + "avg_mask_ratio": 0.4764250977139454, + "avg_response_length": 226.3875, + "avg_student_mask_ratio": 0.4764250977139454, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.185546875, + "kd_loss": 0.18584371552193701, + "learning_rate": 3e-06, + "loss": 0.1823, + "masked_tokens": 113.95, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 113.95 + }, + { + "avg_mask_ratio": 0.47088071387261154, + "avg_response_length": 233.2125, + "avg_student_mask_ratio": 0.47088071387261154, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.2734375, + "kd_loss": 0.22232839192147366, + "learning_rate": 3e-06, + "loss": 0.1961, + "masked_tokens": 116.675, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 116.675 + }, + { + "avg_mask_ratio": 0.4870647343515884, + "avg_response_length": 228.3875, + "avg_student_mask_ratio": 0.4870647343515884, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.25390625, + "kd_loss": 0.25729746209006665, + "learning_rate": 3e-06, + "loss": 0.2306, + "masked_tokens": 114.3625, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 114.3625 + }, + { + "avg_mask_ratio": 0.440834702400025, + "avg_response_length": 209.85, + "avg_student_mask_ratio": 0.440834702400025, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.2275390625, + "kd_loss": 0.15747290870124503, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 87.575, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 87.575 + }, + { + "avg_mask_ratio": 0.4660509963519871, + "avg_response_length": 250.9125, + "avg_student_mask_ratio": 0.4660509963519871, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.392578125, + "kd_loss": 0.17299866449352522, + "learning_rate": 3e-06, + "loss": 0.178, + "masked_tokens": 109.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 109.675 + }, + { + "avg_mask_ratio": 0.4507861359976232, + "avg_response_length": 235.875, + "avg_student_mask_ratio": 0.4507861359976232, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2113636662043291, + "learning_rate": 3e-06, + "loss": 0.1795, + "masked_tokens": 106.95, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 106.95 + }, + { + "avg_mask_ratio": 0.4283985076006502, + "avg_response_length": 230.95, + "avg_student_mask_ratio": 0.4283985076006502, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.16015625, + "kd_loss": 0.15304818171161969, + "learning_rate": 3e-06, + "loss": 0.1724, + "masked_tokens": 101.15, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 101.15 + }, + { + "avg_mask_ratio": 0.47474822774529457, + "avg_response_length": 233.1, + "avg_student_mask_ratio": 0.47474822774529457, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.06689453125, + "kd_loss": 0.1363761811357108, + "learning_rate": 3e-06, + "loss": 0.171, + "masked_tokens": 112.725, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 112.725 + }, + { + "avg_mask_ratio": 0.4808142688901474, + "avg_response_length": 238.54761904761904, + "avg_student_mask_ratio": 0.4808142688901474, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.22379483340171732, + "learning_rate": 3e-06, + "loss": 0.2466, + "masked_tokens": 113.67857142857143, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 113.67857142857143 + }, + { + "avg_mask_ratio": 0.47175657459301873, + "avg_response_length": 249.9625, + "avg_student_mask_ratio": 0.47175657459301873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.166015625, + "kd_loss": 0.17491777570117745, + "learning_rate": 3e-06, + "loss": 0.2029, + "masked_tokens": 119.4625, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 119.4625 + }, + { + "avg_mask_ratio": 0.4564988439786248, + "avg_response_length": 238.8875, + "avg_student_mask_ratio": 0.4564988439786248, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.1279296875, + "kd_loss": 0.12884608846077866, + "learning_rate": 3e-06, + "loss": 0.1536, + "masked_tokens": 104.0, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 104.0 + }, + { + "avg_mask_ratio": 0.5083174118888565, + "avg_response_length": 258.1375, + "avg_student_mask_ratio": 0.5083174118888565, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.1357421875, + "kd_loss": 0.18128383785248586, + "learning_rate": 3e-06, + "loss": 0.1811, + "masked_tokens": 133.7125, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 133.7125 + }, + { + "avg_mask_ratio": 0.5130103683215566, + "avg_response_length": 246.4875, + "avg_student_mask_ratio": 0.5130103683215566, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0896, + "grad_norm": 0.11181640625, + "kd_loss": 0.23847924997493805, + "learning_rate": 3e-06, + "loss": 0.2289, + "masked_tokens": 132.5625, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 132.5625 + }, + { + "avg_mask_ratio": 0.4543681625276804, + "avg_response_length": 199.65, + "avg_student_mask_ratio": 0.4543681625276804, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1109333333333333, + "grad_norm": 0.1533203125, + "kd_loss": 0.1353249137787543, + "learning_rate": 3e-06, + "loss": 0.1403, + "masked_tokens": 87.9875, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 87.9875 + }, + { + "avg_mask_ratio": 0.46718079667771234, + "avg_response_length": 218.0875, + "avg_student_mask_ratio": 0.46718079667771234, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1322666666666668, + "grad_norm": 0.2109375, + "kd_loss": 0.15268151032492625, + "learning_rate": 3e-06, + "loss": 0.1789, + "masked_tokens": 101.3875, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 101.3875 + }, + { + "avg_mask_ratio": 0.4890203754650429, + "avg_response_length": 244.3875, + "avg_student_mask_ratio": 0.4890203754650429, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1536, + "grad_norm": 0.27734375, + "kd_loss": 0.17835129436630268, + "learning_rate": 3e-06, + "loss": 0.2173, + "masked_tokens": 116.175, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 116.175 + }, + { + "avg_mask_ratio": 0.45064474650425834, + "avg_response_length": 217.8375, + "avg_student_mask_ratio": 0.45064474650425834, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1749333333333334, + "grad_norm": 0.17578125, + "kd_loss": 0.16049880692362706, + "learning_rate": 3e-06, + "loss": 0.1855, + "masked_tokens": 98.6375, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 98.6375 + }, + { + "avg_mask_ratio": 0.3844255942822201, + "avg_response_length": 240.825, + "avg_student_mask_ratio": 0.3844255942822201, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1962666666666666, + "grad_norm": 0.5859375, + "kd_loss": 0.17605857607457268, + "learning_rate": 3e-06, + "loss": 0.1885, + "masked_tokens": 96.85, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 96.85 + }, + { + "avg_mask_ratio": 0.45103558609262107, + "avg_response_length": 231.025, + "avg_student_mask_ratio": 0.45103558609262107, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2176, + "grad_norm": 0.234375, + "kd_loss": 0.1386162672638477, + "learning_rate": 3e-06, + "loss": 0.1681, + "masked_tokens": 105.6875, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 105.6875 + }, + { + "avg_mask_ratio": 0.47033366551622746, + "avg_response_length": 248.3875, + "avg_student_mask_ratio": 0.47033366551622746, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2389333333333332, + "grad_norm": 0.26953125, + "kd_loss": 0.17702910760917803, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 125.05, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 125.05 + }, + { + "avg_mask_ratio": 0.5230229062028229, + "avg_response_length": 241.8125, + "avg_student_mask_ratio": 0.5230229062028229, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2602666666666666, + "grad_norm": 0.20703125, + "kd_loss": 0.22054996666956866, + "learning_rate": 3e-06, + "loss": 0.2233, + "masked_tokens": 129.0, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 129.0 + }, + { + "avg_mask_ratio": 0.44929012526990847, + "avg_response_length": 246.7375, + "avg_student_mask_ratio": 0.44929012526990847, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2816, + "grad_norm": 0.212890625, + "kd_loss": 0.15257543138572202, + "learning_rate": 3e-06, + "loss": 0.1581, + "masked_tokens": 113.9375, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 113.9375 + }, + { + "avg_mask_ratio": 0.5061312943696976, + "avg_response_length": 237.175, + "avg_student_mask_ratio": 0.5061312943696976, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3029333333333333, + "grad_norm": 0.06884765625, + "kd_loss": 0.21158275993875578, + "learning_rate": 3e-06, + "loss": 0.1942, + "masked_tokens": 116.675, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 116.675 + }, + { + "avg_mask_ratio": 0.4925117701757699, + "avg_response_length": 233.2625, + "avg_student_mask_ratio": 0.4925117701757699, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3242666666666667, + "grad_norm": 0.1787109375, + "kd_loss": 0.1805886138310143, + "learning_rate": 3e-06, + "loss": 0.1744, + "masked_tokens": 111.6875, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 111.6875 + }, + { + "avg_mask_ratio": 0.5307835865532979, + "avg_response_length": 249.65, + "avg_student_mask_ratio": 0.5307835865532979, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3456000000000001, + "grad_norm": 0.28125, + "kd_loss": 0.25403604302136956, + "learning_rate": 3e-06, + "loss": 0.2627, + "masked_tokens": 126.6875, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 126.6875 + }, + { + "avg_mask_ratio": 0.48631439574528484, + "avg_response_length": 241.475, + "avg_student_mask_ratio": 0.48631439574528484, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3669333333333333, + "grad_norm": 0.2412109375, + "kd_loss": 0.1642333460577902, + "learning_rate": 3e-06, + "loss": 0.1731, + "masked_tokens": 122.1125, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 122.1125 + }, + { + "avg_mask_ratio": 0.5248487972887232, + "avg_response_length": 231.5375, + "avg_student_mask_ratio": 0.5248487972887232, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3882666666666665, + "grad_norm": 0.193359375, + "kd_loss": 0.2508082524812494, + "learning_rate": 3e-06, + "loss": 0.2054, + "masked_tokens": 127.175, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 127.175 + }, + { + "avg_mask_ratio": 0.48489007767057046, + "avg_response_length": 246.0125, + "avg_student_mask_ratio": 0.48489007767057046, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4096, + "grad_norm": 0.28125, + "kd_loss": 0.18781521328146483, + "learning_rate": 3e-06, + "loss": 0.2045, + "masked_tokens": 125.225, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 125.225 + }, + { + "avg_mask_ratio": 0.4605832444969565, + "avg_response_length": 244.5, + "avg_student_mask_ratio": 0.4605832444969565, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4309333333333334, + "grad_norm": 0.19140625, + "kd_loss": 0.1806626110754223, + "learning_rate": 3e-06, + "loss": 0.1702, + "masked_tokens": 120.825, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 120.825 + }, + { + "avg_mask_ratio": 0.4662455078505445, + "avg_response_length": 244.6125, + "avg_student_mask_ratio": 0.4662455078505445, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4522666666666666, + "grad_norm": 0.16796875, + "kd_loss": 0.20038694294766798, + "learning_rate": 3e-06, + "loss": 0.1824, + "masked_tokens": 114.2, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 114.2 + }, + { + "avg_mask_ratio": 0.4820630593923852, + "avg_response_length": 217.3, + "avg_student_mask_ratio": 0.4820630593923852, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4736, + "grad_norm": 0.11279296875, + "kd_loss": 0.16563009086588637, + "learning_rate": 3e-06, + "loss": 0.17, + "masked_tokens": 118.4875, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 118.4875 + }, + { + "avg_mask_ratio": 0.5206489040749147, + "avg_response_length": 216.45, + "avg_student_mask_ratio": 0.5206489040749147, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4949333333333334, + "grad_norm": 0.236328125, + "kd_loss": 0.23649522811845144, + "learning_rate": 3e-06, + "loss": 0.2241, + "masked_tokens": 118.4375, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 118.4375 + }, + { + "avg_mask_ratio": 0.4955552547937259, + "avg_response_length": 211.175, + "avg_student_mask_ratio": 0.4955552547937259, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5162666666666667, + "grad_norm": 0.275390625, + "kd_loss": 0.23970817765721222, + "learning_rate": 3e-06, + "loss": 0.2184, + "masked_tokens": 112.3375, + "mean_t": 0.5135623761918395, + "step": 710, + "student_masked_tokens": 112.3375 + }, + { + "avg_mask_ratio": 0.4820543818641454, + "avg_response_length": 229.1375, + "avg_student_mask_ratio": 0.4820543818641454, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5375999999999999, + "grad_norm": 0.435546875, + "kd_loss": 0.18955910701470202, + "learning_rate": 3e-06, + "loss": 0.1978, + "masked_tokens": 114.55, + "mean_t": 0.5117021896177902, + "step": 720, + "student_masked_tokens": 114.55 + }, + { + "avg_mask_ratio": 0.4605119539948646, + "avg_response_length": 245.5625, + "avg_student_mask_ratio": 0.4605119539948646, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5589333333333333, + "grad_norm": 0.1103515625, + "kd_loss": 0.16859328244926958, + "learning_rate": 3e-06, + "loss": 0.1779, + "masked_tokens": 113.85, + "mean_t": 0.49050743713742123, + "step": 730, + "student_masked_tokens": 113.85 + }, + { + "avg_mask_ratio": 0.5134038798511028, + "avg_response_length": 194.125, + "avg_student_mask_ratio": 0.5134038798511028, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5802666666666667, + "grad_norm": 0.0830078125, + "kd_loss": 0.17122714665274544, + "learning_rate": 3e-06, + "loss": 0.2018, + "masked_tokens": 102.9375, + "mean_t": 0.5505168779753149, + "step": 740, + "student_masked_tokens": 102.9375 + }, + { + "avg_mask_ratio": 0.4201362137740944, + "avg_response_length": 223.55, + "avg_student_mask_ratio": 0.4201362137740944, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6016, + "grad_norm": 0.16796875, + "kd_loss": 0.19197621339357057, + "learning_rate": 3e-06, + "loss": 0.1792, + "masked_tokens": 94.7125, + "mean_t": 0.4437690361432033, + "step": 750, + "student_masked_tokens": 94.7125 + }, + { + "avg_mask_ratio": 0.46924527404480615, + "avg_response_length": 227.35, + "avg_student_mask_ratio": 0.46924527404480615, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6229333333333333, + "grad_norm": 0.8125, + "kd_loss": 0.24938117066874382, + "learning_rate": 3e-06, + "loss": 0.2591, + "masked_tokens": 107.5375, + "mean_t": 0.49132869170280175, + "step": 760, + "student_masked_tokens": 107.5375 + }, + { + "avg_mask_ratio": 0.445551612455165, + "avg_response_length": 268.6625, + "avg_student_mask_ratio": 0.445551612455165, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6442666666666668, + "grad_norm": 0.1435546875, + "kd_loss": 0.1928954417056957, + "learning_rate": 3e-06, + "loss": 0.1563, + "masked_tokens": 124.0875, + "mean_t": 0.4667695587326307, + "step": 770, + "student_masked_tokens": 124.0875 + }, + { + "avg_mask_ratio": 0.4321410794305848, + "avg_response_length": 256.1625, + "avg_student_mask_ratio": 0.4321410794305848, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6656, + "grad_norm": 0.2216796875, + "kd_loss": 0.15059620087446887, + "learning_rate": 3e-06, + "loss": 0.1534, + "masked_tokens": 117.7, + "mean_t": 0.44942845597106496, + "step": 780, + "student_masked_tokens": 117.7 + }, + { + "avg_mask_ratio": 0.4697655299096368, + "avg_response_length": 240.6125, + "avg_student_mask_ratio": 0.4697655299096368, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6869333333333332, + "grad_norm": 0.255859375, + "kd_loss": 0.16427693545232777, + "learning_rate": 3e-06, + "loss": 0.1489, + "masked_tokens": 113.5375, + "mean_t": 0.4993515375303105, + "step": 790, + "student_masked_tokens": 113.5375 + }, + { + "avg_mask_ratio": 0.4635992758907378, + "avg_response_length": 224.15, + "avg_student_mask_ratio": 0.4635992758907378, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7082666666666668, + "grad_norm": 0.197265625, + "kd_loss": 0.15171801659575976, + "learning_rate": 3e-06, + "loss": 0.1526, + "masked_tokens": 107.5, + "mean_t": 0.500370389316231, + "step": 800, + "student_masked_tokens": 107.5 + }, + { + "avg_mask_ratio": 0.5018501321552321, + "avg_response_length": 235.1625, + "avg_student_mask_ratio": 0.5018501321552321, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7296, + "grad_norm": 0.1630859375, + "kd_loss": 0.18931926304685476, + "learning_rate": 3e-06, + "loss": 0.2031, + "masked_tokens": 116.6125, + "mean_t": 0.5317009104182944, + "step": 810, + "student_masked_tokens": 116.6125 + }, + { + "avg_mask_ratio": 0.5050785383209586, + "avg_response_length": 207.6875, + "avg_student_mask_ratio": 0.5050785383209586, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7509333333333332, + "grad_norm": 0.5546875, + "kd_loss": 0.24824176937403308, + "learning_rate": 3e-06, + "loss": 0.2566, + "masked_tokens": 119.175, + "mean_t": 0.5392061032878701, + "step": 820, + "student_masked_tokens": 119.175 + }, + { + "avg_mask_ratio": 0.4980328972451389, + "avg_response_length": 270.325, + "avg_student_mask_ratio": 0.4980328972451389, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7722666666666667, + "grad_norm": 0.09912109375, + "kd_loss": 0.1924194690429431, + "learning_rate": 3e-06, + "loss": 0.2006, + "masked_tokens": 141.925, + "mean_t": 0.5238314627087675, + "step": 830, + "student_masked_tokens": 141.925 + }, + { + "avg_mask_ratio": 0.493249478796497, + "avg_response_length": 226.025, + "avg_student_mask_ratio": 0.493249478796497, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7936, + "grad_norm": 0.158203125, + "kd_loss": 0.15751813794203606, + "learning_rate": 3e-06, + "loss": 0.1624, + "masked_tokens": 118.825, + "mean_t": 0.5301066277665086, + "step": 840, + "student_masked_tokens": 118.825 + }, + { + "avg_mask_ratio": 0.5009000841644593, + "avg_response_length": 233.025, + "avg_student_mask_ratio": 0.5009000841644593, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8149333333333333, + "grad_norm": 0.10205078125, + "kd_loss": 0.1860738446495816, + "learning_rate": 3e-06, + "loss": 0.1737, + "masked_tokens": 117.3125, + "mean_t": 0.5343429344706238, + "step": 850, + "student_masked_tokens": 117.3125 + }, + { + "avg_mask_ratio": 0.46293387678451836, + "avg_response_length": 231.3625, + "avg_student_mask_ratio": 0.46293387678451836, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8362666666666667, + "grad_norm": 0.1103515625, + "kd_loss": 0.19740513321539765, + "learning_rate": 3e-06, + "loss": 0.1841, + "masked_tokens": 110.5625, + "mean_t": 0.4791536889737472, + "step": 860, + "student_masked_tokens": 110.5625 + }, + { + "avg_mask_ratio": 0.4846805231412873, + "avg_response_length": 220.7375, + "avg_student_mask_ratio": 0.4846805231412873, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8576000000000001, + "grad_norm": 0.228515625, + "kd_loss": 0.19436422403705364, + "learning_rate": 3e-06, + "loss": 0.2012, + "masked_tokens": 113.7625, + "mean_t": 0.5203817339061061, + "step": 870, + "student_masked_tokens": 113.7625 + }, + { + "avg_mask_ratio": 0.4508363194297999, + "avg_response_length": 203.2875, + "avg_student_mask_ratio": 0.4508363194297999, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8789333333333333, + "grad_norm": 0.1962890625, + "kd_loss": 0.16288868402702406, + "learning_rate": 3e-06, + "loss": 0.1845, + "masked_tokens": 95.3875, + "mean_t": 0.4875184997683391, + "step": 880, + "student_masked_tokens": 95.3875 + }, + { + "avg_mask_ratio": 0.43862658384023234, + "avg_response_length": 227.9375, + "avg_student_mask_ratio": 0.43862658384023234, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9002666666666665, + "grad_norm": 0.08251953125, + "kd_loss": 0.11281866748422545, + "learning_rate": 3e-06, + "loss": 0.142, + "masked_tokens": 101.2625, + "mean_t": 0.4766692223958671, + "step": 890, + "student_masked_tokens": 101.2625 + }, + { + "avg_mask_ratio": 0.44909207145101393, + "avg_response_length": 237.2375, + "avg_student_mask_ratio": 0.44909207145101393, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 1.0, + "batch_inp_par_reverse_frac": 0.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9216, + "grad_norm": 0.1591796875, + "kd_loss": 0.15684176656744747, + "learning_rate": 3e-06, + "loss": 0.1737, + "masked_tokens": 103.7, + "mean_t": 0.487134758150205, + "step": 900, + "student_masked_tokens": 103.7 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/training_args.bin b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7e94a11824a7a1de5f3a0a00320426e3e4de0eff --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/checkpoint-900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c347df37da6e348160afc3fbb65d12595e5064bd8e5bfd591004e5e86a703f42 +size 7992 diff --git a/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/debug_training_examples.jsonl b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/debug_training_examples.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dcecb762b680fb786d1e2df7934f055aecaa042a --- /dev/null +++ b/math/INP-PAR/unmask_tags_gold1_target1_ce0.0/debug_training_examples.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eb3f17c41e3a38e7ca5fd1c5754e5d571dffd205c3299471ac5e06f00425071 +size 1425793329