diff --git a/.gitattributes b/.gitattributes index 28c2218b138fe04158b86448451600e913f85856..89138329e0953734b768a5a21bdb77b3ef90d228 100644 --- a/.gitattributes +++ b/.gitattributes @@ -39,3 +39,4 @@ math/inp/SD-INP/math_self_distill_INP_u0.6-1.0_gold1_target1_ce0.5/debug_trainin math/inp/SD-INP/math_self_distill_INP_u0.8-1.0_gold1_target1_ce0.5/debug_training_examples.jsonl filter=lfs diff=lfs merge=lfs -text math/SFT/inp-onehot_gold1_target1_ce0.5/debug_training_examples.jsonl filter=lfs diff=lfs merge=lfs -text math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.5/debug_training_examples.jsonl filter=lfs diff=lfs merge=lfs -text +math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/debug_training_examples.jsonl filter=lfs diff=lfs merge=lfs -text diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/README.md b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/adapter_config.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6edd4f943cffd057b1c0513e71fc0baccaee758 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/adapter_model.safetensors b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..93600cc168d04f7b9e11aaa5c4ef5c2f7102e776 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90b9999e8af4b990d1e83bb8ae6e9a2873050ab44656444d2f06dc1ac116026d +size 2406624648 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/optimizer.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bdc3c94b0854113608e4ce30b10e17265849d745 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fba0cf7ab58f9a90dfce592b8cdc204e3d1d32dda73db3c25daa0a896f2e25fc +size 671304442 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/rng_state_0.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5106c75aada198e94c487edc3d897241d65996e0 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cedbb24eb08e412f2b6567529f919723c479356a0b4861fb1f0133d92b4e4aa +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/rng_state_1.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..bebd6f8aea042602cdbea7c81b9f67d21dc1bb50 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27f1021fb57112918a3a6091b09b0ccd50cb071a2324c12ae9afcc9851ee8bd3 +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/scheduler.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e899126a25538ff85c74c1e363ffbd951d4dda1e --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d8cdfceac9f7917b978dca661a3b8e04187faea5d5f6bd7b462d61d8234d57f +size 1064 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/trainer_state.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0e0061eb4b563daf29bb20b3fded67183519e302 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/trainer_state.json @@ -0,0 +1,273 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.21333333333333335, + "eval_steps": 500, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4930951670394279, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4930951670394279, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1494140625, + "kd_loss": 0.25194341027386147, + "learning_rate": 3e-06, + "loss": 0.2396, + "masked_tokens": 110.925, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 110.925 + }, + { + "avg_mask_ratio": 0.4127206720062532, + "avg_response_length": 277.15, + "avg_student_mask_ratio": 0.4127206720062532, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.2138671875, + "kd_loss": 0.14083908485238297, + "learning_rate": 3e-06, + "loss": 0.1768, + "masked_tokens": 108.8625, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 108.8625 + }, + { + "avg_mask_ratio": 0.4616696212324314, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4616696212324314, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.177734375, + "kd_loss": 0.19280819811582842, + "learning_rate": 3e-06, + "loss": 0.1837, + "masked_tokens": 111.375, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 111.375 + }, + { + "avg_mask_ratio": 0.42360913623997476, + "avg_response_length": 224.6125, + "avg_student_mask_ratio": 0.42360913623997476, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.205078125, + "kd_loss": 0.15898024347496859, + "learning_rate": 3e-06, + "loss": 0.1597, + "masked_tokens": 98.3, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.3 + }, + { + "avg_mask_ratio": 0.4330951495358022, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.4330951495358022, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.10693359375, + "kd_loss": 0.15454275260567557, + "learning_rate": 3e-06, + "loss": 0.1595, + "masked_tokens": 85.075, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.075 + }, + { + "avg_mask_ratio": 0.4555395155097358, + "avg_response_length": 254.2125, + "avg_student_mask_ratio": 0.4555395155097358, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.09375, + "kd_loss": 0.19431558840633442, + "learning_rate": 3e-06, + "loss": 0.1967, + "masked_tokens": 119.1125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.1125 + }, + { + "avg_mask_ratio": 0.5148372989846394, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.5148372989846394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.2421875, + "kd_loss": 0.17586028346822785, + "learning_rate": 3e-06, + "loss": 0.2039, + "masked_tokens": 105.45, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 105.45 + }, + { + "avg_mask_ratio": 0.3827478863298893, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.3827478863298893, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.13183402672058264, + "learning_rate": 3e-06, + "loss": 0.1337, + "masked_tokens": 86.675, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 86.675 + }, + { + "avg_mask_ratio": 0.5017695252550766, + "avg_response_length": 234.25, + "avg_student_mask_ratio": 0.5017695252550766, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.287109375, + "kd_loss": 0.23756451243028592, + "learning_rate": 3e-06, + "loss": 0.2228, + "masked_tokens": 108.4125, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 108.4125 + }, + { + "avg_mask_ratio": 0.4637213449750561, + "avg_response_length": 210.175, + "avg_student_mask_ratio": 0.4637213449750561, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.2236328125, + "kd_loss": 0.17453956390508713, + "learning_rate": 3e-06, + "loss": 0.1847, + "masked_tokens": 107.375, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 107.375 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/training_args.bin b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..909c70530eafaa4be935d43ab877dad53e48f376 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c7c0f34b9d70dea72cbe8ab046b4e4dbf9290e9a199291cca7df91b67e9e4a +size 8120 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/README.md b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/adapter_config.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6edd4f943cffd057b1c0513e71fc0baccaee758 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/adapter_model.safetensors b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..38e14085aa02dbd7431052d41e49bcce663ce73f --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce2ab96de657affb59dc5ce9c0d71ffeb27e35d07f8e3ac6368b3523e179b5ed +size 2406624648 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/optimizer.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddd85e922738571b548a8a73d8a4e621e8c5a8c8 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:152dbdb4d1095d325de67b9aff1e47c443bfc87cca8d5deddbd8616e8b888225 +size 671304442 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/rng_state_0.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..69b02098a627695df3c67c7728c514598ecdbd30 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2658504939ac737621bd8a2b154858cd4e206cff2216ea3f98af5ed6e476c37 +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/rng_state_1.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..8e9bbb4a191b64b9cd4201915ad6492457c864eb --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ccd67cb51eba4d5d9beb26859701364d21cc26cf86c5a531d92547a5efdbdcb +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/scheduler.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f90e16891856bcfb31d679597efff574807cb3ce --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29d9aa99505fc60c0db1b9cdacaa08b06e8a85c8aaaab4e389667a719fafb9bf +size 1064 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/trainer_state.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..43e6c3022586c3e95fb0fdedbad294f95e0dcc55 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/trainer_state.json @@ -0,0 +1,2433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.1365333333333334, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4930951670394279, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4930951670394279, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1494140625, + "kd_loss": 0.25194341027386147, + "learning_rate": 3e-06, + "loss": 0.2396, + "masked_tokens": 110.925, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 110.925 + }, + { + "avg_mask_ratio": 0.4127206720062532, + "avg_response_length": 277.15, + "avg_student_mask_ratio": 0.4127206720062532, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.2138671875, + "kd_loss": 0.14083908485238297, + "learning_rate": 3e-06, + "loss": 0.1768, + "masked_tokens": 108.8625, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 108.8625 + }, + { + "avg_mask_ratio": 0.4616696212324314, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4616696212324314, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.177734375, + "kd_loss": 0.19280819811582842, + "learning_rate": 3e-06, + "loss": 0.1837, + "masked_tokens": 111.375, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 111.375 + }, + { + "avg_mask_ratio": 0.42360913623997476, + "avg_response_length": 224.6125, + "avg_student_mask_ratio": 0.42360913623997476, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.205078125, + "kd_loss": 0.15898024347496859, + "learning_rate": 3e-06, + "loss": 0.1597, + "masked_tokens": 98.3, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.3 + }, + { + "avg_mask_ratio": 0.4330951495358022, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.4330951495358022, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.10693359375, + "kd_loss": 0.15454275260567557, + "learning_rate": 3e-06, + "loss": 0.1595, + "masked_tokens": 85.075, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.075 + }, + { + "avg_mask_ratio": 0.4555395155097358, + "avg_response_length": 254.2125, + "avg_student_mask_ratio": 0.4555395155097358, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.09375, + "kd_loss": 0.19431558840633442, + "learning_rate": 3e-06, + "loss": 0.1967, + "masked_tokens": 119.1125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.1125 + }, + { + "avg_mask_ratio": 0.5148372989846394, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.5148372989846394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.2421875, + "kd_loss": 0.17586028346822785, + "learning_rate": 3e-06, + "loss": 0.2039, + "masked_tokens": 105.45, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 105.45 + }, + { + "avg_mask_ratio": 0.3827478863298893, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.3827478863298893, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.13183402672058264, + "learning_rate": 3e-06, + "loss": 0.1337, + "masked_tokens": 86.675, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 86.675 + }, + { + "avg_mask_ratio": 0.5017695252550766, + "avg_response_length": 234.25, + "avg_student_mask_ratio": 0.5017695252550766, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.287109375, + "kd_loss": 0.23756451243028592, + "learning_rate": 3e-06, + "loss": 0.2228, + "masked_tokens": 108.4125, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 108.4125 + }, + { + "avg_mask_ratio": 0.4637213449750561, + "avg_response_length": 210.175, + "avg_student_mask_ratio": 0.4637213449750561, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.2236328125, + "kd_loss": 0.17453956390508713, + "learning_rate": 3e-06, + "loss": 0.1847, + "masked_tokens": 107.375, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.48738867897773164, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.48738867897773164, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.259765625, + "kd_loss": 0.21283352600622152, + "learning_rate": 3e-06, + "loss": 0.1975, + "masked_tokens": 101.7875, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 101.7875 + }, + { + "avg_mask_ratio": 0.4451883323024958, + "avg_response_length": 232.3, + "avg_student_mask_ratio": 0.4451883323024958, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.1328125, + "kd_loss": 0.23480740101426817, + "learning_rate": 3e-06, + "loss": 0.2005, + "masked_tokens": 107.7, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.43939279407495635, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.43939279407495635, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.1453842066344862, + "learning_rate": 3e-06, + "loss": 0.1366, + "masked_tokens": 89.95, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.95 + }, + { + "avg_mask_ratio": 0.4922366282902658, + "avg_response_length": 264.5375, + "avg_student_mask_ratio": 0.4922366282902658, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.294921875, + "kd_loss": 0.1732477028232097, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 144.9, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 144.9 + }, + { + "avg_mask_ratio": 0.4724786171223968, + "avg_response_length": 258.1125, + "avg_student_mask_ratio": 0.4724786171223968, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.04443359375, + "kd_loss": 0.2384887565949157, + "learning_rate": 3e-06, + "loss": 0.2151, + "masked_tokens": 127.4125, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 127.4125 + }, + { + "avg_mask_ratio": 0.49717973986989816, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.49717973986989816, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2255859375, + "kd_loss": 0.2190230320150704, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 98.4875, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 98.4875 + }, + { + "avg_mask_ratio": 0.48284467663615943, + "avg_response_length": 188.65, + "avg_student_mask_ratio": 0.48284467663615943, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.181640625, + "kd_loss": 0.198812551523406, + "learning_rate": 3e-06, + "loss": 0.1911, + "masked_tokens": 89.3125, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.3125 + }, + { + "avg_mask_ratio": 0.44864035704231353, + "avg_response_length": 246.6875, + "avg_student_mask_ratio": 0.44864035704231353, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.09716796875, + "kd_loss": 0.17860529323728117, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 110.0125, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.0125 + }, + { + "avg_mask_ratio": 0.47850618849042803, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.47850618849042803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.337890625, + "kd_loss": 0.19919134742667666, + "learning_rate": 3e-06, + "loss": 0.1932, + "masked_tokens": 109.575, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.575 + }, + { + "avg_mask_ratio": 0.4662990250624716, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4662990250624716, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.1259765625, + "kd_loss": 0.11774875816399799, + "learning_rate": 3e-06, + "loss": 0.1286, + "masked_tokens": 97.5, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.5 + }, + { + "avg_mask_ratio": 0.451080821454525, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.451080821454525, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.134765625, + "kd_loss": 0.15073641720641717, + "learning_rate": 3e-06, + "loss": 0.1577, + "masked_tokens": 96.6375, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.6375 + }, + { + "avg_mask_ratio": 0.5438536155037582, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5438536155037582, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.298828125, + "kd_loss": 0.24193658930453238, + "learning_rate": 3e-06, + "loss": 0.248, + "masked_tokens": 126.4375, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 126.4375 + }, + { + "avg_mask_ratio": 0.43992503273766487, + "avg_response_length": 255.875, + "avg_student_mask_ratio": 0.43992503273766487, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.142578125, + "kd_loss": 0.14758750247131047, + "learning_rate": 3e-06, + "loss": 0.1703, + "masked_tokens": 107.3875, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.3875 + }, + { + "avg_mask_ratio": 0.46683448635449165, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.46683448635449165, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.35546875, + "kd_loss": 0.2131086487675077, + "learning_rate": 3e-06, + "loss": 0.196, + "masked_tokens": 110.2875, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 110.2875 + }, + { + "avg_mask_ratio": 0.4476269483449869, + "avg_response_length": 243.2375, + "avg_student_mask_ratio": 0.4476269483449869, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.17319737961729237, + "learning_rate": 3e-06, + "loss": 0.1469, + "masked_tokens": 112.6375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.6375 + }, + { + "avg_mask_ratio": 0.45657019784557634, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.45657019784557634, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.1728515625, + "kd_loss": 0.15818778217344515, + "learning_rate": 3e-06, + "loss": 0.1487, + "masked_tokens": 110.0375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.0375 + }, + { + "avg_mask_ratio": 0.5293830037582665, + "avg_response_length": 223.975, + "avg_student_mask_ratio": 0.5293830037582665, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.5, + "kd_loss": 0.24745769179717172, + "learning_rate": 3e-06, + "loss": 0.2709, + "masked_tokens": 119.6, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.4577330934116617, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.4577330934116617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.2216796875, + "kd_loss": 0.18448232172211476, + "learning_rate": 3e-06, + "loss": 0.1662, + "masked_tokens": 130.475, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 130.475 + }, + { + "avg_mask_ratio": 0.39295024327002465, + "avg_response_length": 246.6375, + "avg_student_mask_ratio": 0.39295024327002465, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.047119140625, + "kd_loss": 0.1050827642444176, + "learning_rate": 3e-06, + "loss": 0.1353, + "masked_tokens": 100.9, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 100.9 + }, + { + "avg_mask_ratio": 0.4409991275751963, + "avg_response_length": 217.9125, + "avg_student_mask_ratio": 0.4409991275751963, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.1513671875, + "kd_loss": 0.13134403475523868, + "learning_rate": 3e-06, + "loss": 0.1629, + "masked_tokens": 106.925, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 106.925 + }, + { + "avg_mask_ratio": 0.47207197032403203, + "avg_response_length": 188.9125, + "avg_student_mask_ratio": 0.47207197032403203, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.2314453125, + "kd_loss": 0.19167470987705998, + "learning_rate": 3e-06, + "loss": 0.2063, + "masked_tokens": 85.125, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 85.125 + }, + { + "avg_mask_ratio": 0.4926959708333015, + "avg_response_length": 248.4, + "avg_student_mask_ratio": 0.4926959708333015, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2633828842135699, + "learning_rate": 3e-06, + "loss": 0.2589, + "masked_tokens": 124.5625, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 124.5625 + }, + { + "avg_mask_ratio": 0.5075328870676458, + "avg_response_length": 235.075, + "avg_student_mask_ratio": 0.5075328870676458, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.2197265625, + "kd_loss": 0.21129831432894547, + "learning_rate": 3e-06, + "loss": 0.2103, + "masked_tokens": 127.9, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 127.9 + }, + { + "avg_mask_ratio": 0.44940012450679206, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44940012450679206, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.40625, + "kd_loss": 0.18290909784880824, + "learning_rate": 3e-06, + "loss": 0.1801, + "masked_tokens": 110.15, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 110.15 + }, + { + "avg_mask_ratio": 0.4945301389612723, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.4945301389612723, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.361328125, + "kd_loss": 0.2113740879778227, + "learning_rate": 3e-06, + "loss": 0.2186, + "masked_tokens": 125.175, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.175 + }, + { + "avg_mask_ratio": 0.4749954905593768, + "avg_response_length": 243.575, + "avg_student_mask_ratio": 0.4749954905593768, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.1298828125, + "kd_loss": 0.16429275130377619, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 115.3875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 115.3875 + }, + { + "avg_mask_ratio": 0.47621052770409733, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.47621052770409733, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.1083984375, + "kd_loss": 0.2089853325122931, + "learning_rate": 3e-06, + "loss": 0.192, + "masked_tokens": 126.85, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 126.85 + }, + { + "avg_mask_ratio": 0.4449806016869843, + "avg_response_length": 226.3625, + "avg_student_mask_ratio": 0.4449806016869843, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.09423828125, + "kd_loss": 0.13386085629390437, + "learning_rate": 3e-06, + "loss": 0.132, + "masked_tokens": 109.35, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 109.35 + }, + { + "avg_mask_ratio": 0.47845896739745514, + "avg_response_length": 218.1125, + "avg_student_mask_ratio": 0.47845896739745514, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.142578125, + "kd_loss": 0.1445786759162107, + "learning_rate": 3e-06, + "loss": 0.1766, + "masked_tokens": 111.85, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 111.85 + }, + { + "avg_mask_ratio": 0.4727763219270855, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.4727763219270855, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.380859375, + "kd_loss": 0.20748561576523344, + "learning_rate": 3e-06, + "loss": 0.1934, + "masked_tokens": 119.775, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 119.775 + }, + { + "avg_mask_ratio": 0.4756184325611684, + "avg_response_length": 239.5375, + "avg_student_mask_ratio": 0.4756184325611684, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.10791015625, + "kd_loss": 0.2029281118774257, + "learning_rate": 3e-06, + "loss": 0.2012, + "masked_tokens": 122.1875, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 122.1875 + }, + { + "avg_mask_ratio": 0.4428858984610997, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4428858984610997, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.1826171875, + "kd_loss": 0.14211951963759475, + "learning_rate": 3e-06, + "loss": 0.1365, + "masked_tokens": 86.0125, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 86.0125 + }, + { + "avg_mask_ratio": 0.4625907339621335, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4625907339621335, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.1474609375, + "kd_loss": 0.1504247854208188, + "learning_rate": 3e-06, + "loss": 0.1743, + "masked_tokens": 103.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 103.675 + }, + { + "avg_mask_ratio": 0.4465438393759541, + "avg_response_length": 241.9625, + "avg_student_mask_ratio": 0.4465438393759541, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.142578125, + "kd_loss": 0.18699200686958192, + "learning_rate": 3e-06, + "loss": 0.1806, + "masked_tokens": 114.9125, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 114.9125 + }, + { + "avg_mask_ratio": 0.42805201532319187, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42805201532319187, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.193359375, + "kd_loss": 0.15030699897054092, + "learning_rate": 3e-06, + "loss": 0.1582, + "masked_tokens": 103.875, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.875 + }, + { + "avg_mask_ratio": 0.4651826085988432, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.4651826085988432, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.15806215325555967, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 104.125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 104.125 + }, + { + "avg_mask_ratio": 0.47693050167124185, + "avg_response_length": 226.16666666666666, + "avg_student_mask_ratio": 0.47693050167124185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.2333984375, + "kd_loss": 0.19203724756923315, + "learning_rate": 3e-06, + "loss": 0.2197, + "masked_tokens": 109.10714285714286, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 109.10714285714286 + }, + { + "avg_mask_ratio": 0.47416816898621617, + "avg_response_length": 250.25, + "avg_student_mask_ratio": 0.47416816898621617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.248046875, + "kd_loss": 0.21296195546548802, + "learning_rate": 3e-06, + "loss": 0.229, + "masked_tokens": 117.9125, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 117.9125 + }, + { + "avg_mask_ratio": 0.45927587888436394, + "avg_response_length": 233.05, + "avg_student_mask_ratio": 0.45927587888436394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.16796875, + "kd_loss": 0.12627680183309167, + "learning_rate": 3e-06, + "loss": 0.1626, + "masked_tokens": 98.8375, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.8375 + }, + { + "avg_mask_ratio": 0.5047377114649862, + "avg_response_length": 260.225, + "avg_student_mask_ratio": 0.5047377114649862, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.193359375, + "kd_loss": 0.15308890046544832, + "learning_rate": 3e-06, + "loss": 0.1508, + "masked_tokens": 127.4, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 127.4 + }, + { + "avg_mask_ratio": 0.5005011082510464, + "avg_response_length": 252.05, + "avg_student_mask_ratio": 0.5005011082510464, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0896, + "grad_norm": 0.40234375, + "kd_loss": 0.20784167646600055, + "learning_rate": 3e-06, + "loss": 0.2048, + "masked_tokens": 133.5, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 133.5 + }, + { + "avg_mask_ratio": 0.4552151845302433, + "avg_response_length": 200.7625, + "avg_student_mask_ratio": 0.4552151845302433, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1109333333333333, + "grad_norm": 0.396484375, + "kd_loss": 0.14625247523838425, + "learning_rate": 3e-06, + "loss": 0.1641, + "masked_tokens": 86.475, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 86.475 + }, + { + "avg_mask_ratio": 0.46727682640776036, + "avg_response_length": 214.5375, + "avg_student_mask_ratio": 0.46727682640776036, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1322666666666668, + "grad_norm": 0.30859375, + "kd_loss": 0.19772737846966032, + "learning_rate": 3e-06, + "loss": 0.2219, + "masked_tokens": 99.8375, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 99.8375 + }, + { + "avg_mask_ratio": 0.48785575344227256, + "avg_response_length": 231.3125, + "avg_student_mask_ratio": 0.48785575344227256, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1536, + "grad_norm": 0.216796875, + "kd_loss": 0.200824987803162, + "learning_rate": 3e-06, + "loss": 0.2346, + "masked_tokens": 106.625, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 106.625 + }, + { + "avg_mask_ratio": 0.4477671392261982, + "avg_response_length": 213.525, + "avg_student_mask_ratio": 0.4477671392261982, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1749333333333334, + "grad_norm": 0.169921875, + "kd_loss": 0.16798589587615426, + "learning_rate": 3e-06, + "loss": 0.1849, + "masked_tokens": 97.7, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 97.7 + }, + { + "avg_mask_ratio": 0.3861591775319539, + "avg_response_length": 238.5375, + "avg_student_mask_ratio": 0.3861591775319539, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1962666666666666, + "grad_norm": 0.236328125, + "kd_loss": 0.19300692316061543, + "learning_rate": 3e-06, + "loss": 0.1797, + "masked_tokens": 99.625, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 99.625 + }, + { + "avg_mask_ratio": 0.44424078196752814, + "avg_response_length": 230.1625, + "avg_student_mask_ratio": 0.44424078196752814, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2176, + "grad_norm": 0.220703125, + "kd_loss": 0.16140609600133757, + "learning_rate": 3e-06, + "loss": 0.1755, + "masked_tokens": 108.0125, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 108.0125 + }, + { + "avg_mask_ratio": 0.4715048542013392, + "avg_response_length": 269.1375, + "avg_student_mask_ratio": 0.4715048542013392, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2389333333333332, + "grad_norm": 0.388671875, + "kd_loss": 0.2032364588788596, + "learning_rate": 3e-06, + "loss": 0.1897, + "masked_tokens": 129.4375, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 129.4375 + }, + { + "avg_mask_ratio": 0.520072100055404, + "avg_response_length": 228.2875, + "avg_student_mask_ratio": 0.520072100055404, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2602666666666666, + "grad_norm": 0.46875, + "kd_loss": 0.23483261663386656, + "learning_rate": 3e-06, + "loss": 0.2578, + "masked_tokens": 121.1625, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 121.1625 + }, + { + "avg_mask_ratio": 0.4505112706683576, + "avg_response_length": 237.1625, + "avg_student_mask_ratio": 0.4505112706683576, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2816, + "grad_norm": 0.2109375, + "kd_loss": 0.16831563824014067, + "learning_rate": 3e-06, + "loss": 0.1749, + "masked_tokens": 110.0875, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 110.0875 + }, + { + "avg_mask_ratio": 0.5064190638251602, + "avg_response_length": 244.3, + "avg_student_mask_ratio": 0.5064190638251602, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3029333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.1837224111601472, + "learning_rate": 3e-06, + "loss": 0.1742, + "masked_tokens": 119.825, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 119.825 + }, + { + "avg_mask_ratio": 0.4891548154759221, + "avg_response_length": 234.6625, + "avg_student_mask_ratio": 0.4891548154759221, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3242666666666667, + "grad_norm": 0.1337890625, + "kd_loss": 0.16418851822023725, + "learning_rate": 3e-06, + "loss": 0.1679, + "masked_tokens": 110.5625, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 110.5625 + }, + { + "avg_mask_ratio": 0.5235460251918994, + "avg_response_length": 258.0875, + "avg_student_mask_ratio": 0.5235460251918994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3456000000000001, + "grad_norm": 0.365234375, + "kd_loss": 0.21764025418508198, + "learning_rate": 3e-06, + "loss": 0.2414, + "masked_tokens": 130.725, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 130.725 + }, + { + "avg_mask_ratio": 0.4871393243782222, + "avg_response_length": 237.7125, + "avg_student_mask_ratio": 0.4871393243782222, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3669333333333333, + "grad_norm": 0.1396484375, + "kd_loss": 0.17638994189817367, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 121.4625, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 121.4625 + }, + { + "avg_mask_ratio": 0.5254402696969919, + "avg_response_length": 240.575, + "avg_student_mask_ratio": 0.5254402696969919, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3882666666666665, + "grad_norm": 0.123046875, + "kd_loss": 0.19458269486664026, + "learning_rate": 3e-06, + "loss": 0.1665, + "masked_tokens": 133.725, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 133.725 + }, + { + "avg_mask_ratio": 0.48242234602803363, + "avg_response_length": 247.775, + "avg_student_mask_ratio": 0.48242234602803363, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4096, + "grad_norm": 0.31640625, + "kd_loss": 0.18161650695474235, + "learning_rate": 3e-06, + "loss": 0.1935, + "masked_tokens": 128.4625, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 128.4625 + }, + { + "avg_mask_ratio": 0.4551548367831856, + "avg_response_length": 247.3, + "avg_student_mask_ratio": 0.4551548367831856, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4309333333333334, + "grad_norm": 0.359375, + "kd_loss": 0.18432183493453067, + "learning_rate": 3e-06, + "loss": 0.1761, + "masked_tokens": 127.125, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 127.125 + }, + { + "avg_mask_ratio": 0.4658544249658007, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4658544249658007, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4522666666666666, + "grad_norm": 0.28125, + "kd_loss": 0.2166073639286054, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 107.325, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.4795732157537714, + "avg_response_length": 200.975, + "avg_student_mask_ratio": 0.4795732157537714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4736, + "grad_norm": 0.0673828125, + "kd_loss": 0.18191290805701782, + "learning_rate": 3e-06, + "loss": 0.1771, + "masked_tokens": 110.9375, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 110.9375 + }, + { + "avg_mask_ratio": 0.5250519359949977, + "avg_response_length": 228.125, + "avg_student_mask_ratio": 0.5250519359949977, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4949333333333334, + "grad_norm": 0.166015625, + "kd_loss": 0.21970896905950213, + "learning_rate": 3e-06, + "loss": 0.206, + "masked_tokens": 132.55, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 132.55 + }, + { + "avg_mask_ratio": 0.4788092178525403, + "avg_response_length": 215.325, + "avg_student_mask_ratio": 0.4788092178525403, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5162666666666667, + "grad_norm": 0.1640625, + "kd_loss": 0.17339815042200826, + "learning_rate": 3e-06, + "loss": 0.1787, + "masked_tokens": 108.7125, + "mean_t": 0.5135623761918395, + "step": 710, + "student_masked_tokens": 108.7125 + }, + { + "avg_mask_ratio": 0.47950589570682495, + "avg_response_length": 227.075, + "avg_student_mask_ratio": 0.47950589570682495, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5375999999999999, + "grad_norm": 0.33203125, + "kd_loss": 0.21160616380475403, + "learning_rate": 3e-06, + "loss": 0.2144, + "masked_tokens": 110.8, + "mean_t": 0.5117021896177902, + "step": 720, + "student_masked_tokens": 110.8 + }, + { + "avg_mask_ratio": 0.4604924251558259, + "avg_response_length": 232.925, + "avg_student_mask_ratio": 0.4604924251558259, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5589333333333333, + "grad_norm": 0.1689453125, + "kd_loss": 0.17100098597317698, + "learning_rate": 3e-06, + "loss": 0.173, + "masked_tokens": 104.9625, + "mean_t": 0.49050743713742123, + "step": 730, + "student_masked_tokens": 104.9625 + }, + { + "avg_mask_ratio": 0.5185885130194947, + "avg_response_length": 183.325, + "avg_student_mask_ratio": 0.5185885130194947, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5802666666666667, + "grad_norm": 0.09326171875, + "kd_loss": 0.19763285817334691, + "learning_rate": 3e-06, + "loss": 0.2275, + "masked_tokens": 97.125, + "mean_t": 0.5505168779753149, + "step": 740, + "student_masked_tokens": 97.125 + }, + { + "avg_mask_ratio": 0.4191439319110941, + "avg_response_length": 223.65, + "avg_student_mask_ratio": 0.4191439319110941, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6016, + "grad_norm": 0.15625, + "kd_loss": 0.1989821564191459, + "learning_rate": 3e-06, + "loss": 0.1661, + "masked_tokens": 95.3125, + "mean_t": 0.4437690361432033, + "step": 750, + "student_masked_tokens": 95.3125 + }, + { + "avg_mask_ratio": 0.46666589792585, + "avg_response_length": 228.5125, + "avg_student_mask_ratio": 0.46666589792585, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6229333333333333, + "grad_norm": 0.146484375, + "kd_loss": 0.2019220097962581, + "learning_rate": 3e-06, + "loss": 0.1894, + "masked_tokens": 117.2625, + "mean_t": 0.49132869170280175, + "step": 760, + "student_masked_tokens": 117.2625 + }, + { + "avg_mask_ratio": 0.4440126782981679, + "avg_response_length": 259.675, + "avg_student_mask_ratio": 0.4440126782981679, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6442666666666668, + "grad_norm": 0.103515625, + "kd_loss": 0.14956598446678698, + "learning_rate": 3e-06, + "loss": 0.1431, + "masked_tokens": 117.8, + "mean_t": 0.4667695587326307, + "step": 770, + "student_masked_tokens": 117.8 + }, + { + "avg_mask_ratio": 0.42723098206624854, + "avg_response_length": 258.0125, + "avg_student_mask_ratio": 0.42723098206624854, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6656, + "grad_norm": 0.1298828125, + "kd_loss": 0.12485562007910005, + "learning_rate": 3e-06, + "loss": 0.1494, + "masked_tokens": 118.575, + "mean_t": 0.44942845597106496, + "step": 780, + "student_masked_tokens": 118.575 + }, + { + "avg_mask_ratio": 0.46588709874195045, + "avg_response_length": 220.7, + "avg_student_mask_ratio": 0.46588709874195045, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6869333333333332, + "grad_norm": 0.2353515625, + "kd_loss": 0.1650387186985956, + "learning_rate": 3e-06, + "loss": 0.151, + "masked_tokens": 102.7625, + "mean_t": 0.4993515375303105, + "step": 790, + "student_masked_tokens": 102.7625 + }, + { + "avg_mask_ratio": 0.46366424662992356, + "avg_response_length": 219.6875, + "avg_student_mask_ratio": 0.46366424662992356, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7082666666666668, + "grad_norm": 0.19140625, + "kd_loss": 0.13447051951316097, + "learning_rate": 3e-06, + "loss": 0.139, + "masked_tokens": 104.5, + "mean_t": 0.500370389316231, + "step": 800, + "student_masked_tokens": 104.5 + }, + { + "avg_mask_ratio": 0.5037169002695009, + "avg_response_length": 250.2875, + "avg_student_mask_ratio": 0.5037169002695009, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7296, + "grad_norm": 0.255859375, + "kd_loss": 0.18524283417677906, + "learning_rate": 3e-06, + "loss": 0.1925, + "masked_tokens": 119.575, + "mean_t": 0.5317009104182944, + "step": 810, + "student_masked_tokens": 119.575 + }, + { + "avg_mask_ratio": 0.5109186505898833, + "avg_response_length": 225.95, + "avg_student_mask_ratio": 0.5109186505898833, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7509333333333332, + "grad_norm": 0.255859375, + "kd_loss": 0.22792843131458085, + "learning_rate": 3e-06, + "loss": 0.2303, + "masked_tokens": 129.25, + "mean_t": 0.5392061032878701, + "step": 820, + "student_masked_tokens": 129.25 + }, + { + "avg_mask_ratio": 0.4988811274059117, + "avg_response_length": 263.7875, + "avg_student_mask_ratio": 0.4988811274059117, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7722666666666667, + "grad_norm": 0.08349609375, + "kd_loss": 0.19122445829223125, + "learning_rate": 3e-06, + "loss": 0.1808, + "masked_tokens": 137.0, + "mean_t": 0.5238314627087675, + "step": 830, + "student_masked_tokens": 137.0 + }, + { + "avg_mask_ratio": 0.4997270987310912, + "avg_response_length": 221.9, + "avg_student_mask_ratio": 0.4997270987310912, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7936, + "grad_norm": 0.158203125, + "kd_loss": 0.17390446548149613, + "learning_rate": 3e-06, + "loss": 0.1818, + "masked_tokens": 114.525, + "mean_t": 0.5301066277665086, + "step": 840, + "student_masked_tokens": 114.525 + }, + { + "avg_mask_ratio": 0.4988076956477016, + "avg_response_length": 225.5, + "avg_student_mask_ratio": 0.4988076956477016, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8149333333333333, + "grad_norm": 0.130859375, + "kd_loss": 0.17035312611951667, + "learning_rate": 3e-06, + "loss": 0.1629, + "masked_tokens": 116.9125, + "mean_t": 0.5343429344706238, + "step": 850, + "student_masked_tokens": 116.9125 + }, + { + "avg_mask_ratio": 0.4497753610135987, + "avg_response_length": 260.15, + "avg_student_mask_ratio": 0.4497753610135987, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8362666666666667, + "grad_norm": 0.06591796875, + "kd_loss": 0.2005822003855428, + "learning_rate": 3e-06, + "loss": 0.1598, + "masked_tokens": 121.7625, + "mean_t": 0.4791536889737472, + "step": 860, + "student_masked_tokens": 121.7625 + }, + { + "avg_mask_ratio": 0.48591957957251, + "avg_response_length": 231.0875, + "avg_student_mask_ratio": 0.48591957957251, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8576000000000001, + "grad_norm": 0.2412109375, + "kd_loss": 0.17790169413831336, + "learning_rate": 3e-06, + "loss": 0.1902, + "masked_tokens": 116.7625, + "mean_t": 0.5203817339061061, + "step": 870, + "student_masked_tokens": 116.7625 + }, + { + "avg_mask_ratio": 0.44369487821822984, + "avg_response_length": 197.9125, + "avg_student_mask_ratio": 0.44369487821822984, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8789333333333333, + "grad_norm": 0.224609375, + "kd_loss": 0.15859377338159675, + "learning_rate": 3e-06, + "loss": 0.1765, + "masked_tokens": 91.4125, + "mean_t": 0.4875184997683391, + "step": 880, + "student_masked_tokens": 91.4125 + }, + { + "avg_mask_ratio": 0.44944015803339427, + "avg_response_length": 225.8375, + "avg_student_mask_ratio": 0.44944015803339427, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9002666666666665, + "grad_norm": 0.072265625, + "kd_loss": 0.15013304932544996, + "learning_rate": 3e-06, + "loss": 0.1349, + "masked_tokens": 103.2375, + "mean_t": 0.4766692223958671, + "step": 890, + "student_masked_tokens": 103.2375 + }, + { + "avg_mask_ratio": 0.45069065956631676, + "avg_response_length": 230.175, + "avg_student_mask_ratio": 0.45069065956631676, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9216, + "grad_norm": 0.1982421875, + "kd_loss": 0.17926409896495557, + "learning_rate": 3e-06, + "loss": 0.1615, + "masked_tokens": 104.325, + "mean_t": 0.487134758150205, + "step": 900, + "student_masked_tokens": 104.325 + }, + { + "avg_mask_ratio": 0.475881968671456, + "avg_response_length": 245.1625, + "avg_student_mask_ratio": 0.475881968671456, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9429333333333334, + "grad_norm": 0.1328125, + "kd_loss": 0.15231701551953164, + "learning_rate": 3e-06, + "loss": 0.1807, + "masked_tokens": 125.1625, + "mean_t": 0.5027793228859082, + "step": 910, + "student_masked_tokens": 125.1625 + }, + { + "avg_mask_ratio": 0.4633113604504615, + "avg_response_length": 226.2875, + "avg_student_mask_ratio": 0.4633113604504615, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9642666666666666, + "grad_norm": 0.1591796875, + "kd_loss": 0.16991043191227914, + "learning_rate": 3e-06, + "loss": 0.1889, + "masked_tokens": 109.5375, + "mean_t": 0.49417946098838, + "step": 920, + "student_masked_tokens": 109.5375 + }, + { + "avg_mask_ratio": 0.47329409609083084, + "avg_response_length": 244.875, + "avg_student_mask_ratio": 0.47329409609083084, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9856, + "grad_norm": 0.1708984375, + "kd_loss": 0.16787025193963528, + "learning_rate": 3e-06, + "loss": 0.1549, + "masked_tokens": 120.525, + "mean_t": 0.5045580042526125, + "step": 930, + "student_masked_tokens": 120.525 + }, + { + "avg_mask_ratio": 0.4973435569776311, + "avg_response_length": 224.79761904761904, + "avg_student_mask_ratio": 0.4973435569776311, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0085333333333333, + "grad_norm": 0.134765625, + "kd_loss": 0.18026482684338893, + "learning_rate": 3e-06, + "loss": 0.1888, + "masked_tokens": 120.63095238095238, + "mean_t": 0.5321138524893849, + "step": 940, + "student_masked_tokens": 120.63095238095238 + }, + { + "avg_mask_ratio": 0.4365456592233386, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4365456592233386, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0298666666666665, + "grad_norm": 0.154296875, + "kd_loss": 0.15423739737520278, + "learning_rate": 3e-06, + "loss": 0.1463, + "masked_tokens": 118.8125, + "mean_t": 0.4632946296595037, + "step": 950, + "student_masked_tokens": 118.8125 + }, + { + "avg_mask_ratio": 0.4914003949146718, + "avg_response_length": 275.3, + "avg_student_mask_ratio": 0.4914003949146718, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0512, + "grad_norm": 0.08544921875, + "kd_loss": 0.22274305121804333, + "learning_rate": 3e-06, + "loss": 0.1988, + "masked_tokens": 143.075, + "mean_t": 0.5198000721400604, + "step": 960, + "student_masked_tokens": 143.075 + }, + { + "avg_mask_ratio": 0.4381961932755075, + "avg_response_length": 236.2375, + "avg_student_mask_ratio": 0.4381961932755075, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0725333333333333, + "grad_norm": 0.318359375, + "kd_loss": 0.17543826163571338, + "learning_rate": 3e-06, + "loss": 0.1612, + "masked_tokens": 106.5125, + "mean_t": 0.4703940597362816, + "step": 970, + "student_masked_tokens": 106.5125 + }, + { + "avg_mask_ratio": 0.42702240714570505, + "avg_response_length": 230.8625, + "avg_student_mask_ratio": 0.42702240714570505, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0938666666666665, + "grad_norm": 0.13671875, + "kd_loss": 0.13624779113131352, + "learning_rate": 3e-06, + "loss": 0.1282, + "masked_tokens": 98.525, + "mean_t": 0.4511947895749472, + "step": 980, + "student_masked_tokens": 98.525 + }, + { + "avg_mask_ratio": 0.4583221158827655, + "avg_response_length": 262.0375, + "avg_student_mask_ratio": 0.4583221158827655, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1152, + "grad_norm": 6.8125, + "kd_loss": 0.14998470883065806, + "learning_rate": 3e-06, + "loss": 0.142, + "masked_tokens": 121.2875, + "mean_t": 0.4923786667350214, + "step": 990, + "student_masked_tokens": 121.2875 + }, + { + "avg_mask_ratio": 0.45086776099633424, + "avg_response_length": 214.925, + "avg_student_mask_ratio": 0.45086776099633424, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1365333333333334, + "grad_norm": 0.205078125, + "kd_loss": 0.1474926151762702, + "learning_rate": 3e-06, + "loss": 0.1624, + "masked_tokens": 100.4875, + "mean_t": 0.4773523230338469, + "step": 1000, + "student_masked_tokens": 100.4875 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/training_args.bin b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..909c70530eafaa4be935d43ab877dad53e48f376 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c7c0f34b9d70dea72cbe8ab046b4e4dbf9290e9a199291cca7df91b67e9e4a +size 8120 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/README.md b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/adapter_config.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6edd4f943cffd057b1c0513e71fc0baccaee758 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/adapter_model.safetensors b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..16773a67bd96f6c58086e4a60979ed214834c1f2 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d27307405e263f14ff96a338e820c415ada970b058503d305e0ba924ab1574c3 +size 2406624648 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/optimizer.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..555102533b46ae47f71f3ef140ce8d3af587f6b4 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2e609c8a1b3167f6545a82494a96c9e73abb5f157a7a35c3f1874604014078d +size 671304442 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/rng_state_0.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5a414893d506cea6e26edc9aee4315ab3b08e349 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:907e39dff0cf7ad1a1affaa1e7047653794ab16e25c6977ce7b5524769fdf799 +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/rng_state_1.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..fe78137f4dfdca7e8446f9258340eb924d1e97d4 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93b99032f4d9f420b20d654e1d4d712556184b2955893a2ffe42549865e9d1b4 +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/scheduler.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..125c51aef8c1558b284b7ffdb401f40b1199eb92 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25795e3b7374d0f6abdd7ab4b34fbf7ab0447ba73c04014500c2ab8b5acec5b4 +size 1064 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/trainer_state.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2dd4c6cae765a1394c9015eb1f25deb2f0e3fcd6 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/trainer_state.json @@ -0,0 +1,2673 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.3498666666666668, + "eval_steps": 500, + "global_step": 1100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4930951670394279, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4930951670394279, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1494140625, + "kd_loss": 0.25194341027386147, + "learning_rate": 3e-06, + "loss": 0.2396, + "masked_tokens": 110.925, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 110.925 + }, + { + "avg_mask_ratio": 0.4127206720062532, + "avg_response_length": 277.15, + "avg_student_mask_ratio": 0.4127206720062532, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.2138671875, + "kd_loss": 0.14083908485238297, + "learning_rate": 3e-06, + "loss": 0.1768, + "masked_tokens": 108.8625, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 108.8625 + }, + { + "avg_mask_ratio": 0.4616696212324314, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4616696212324314, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.177734375, + "kd_loss": 0.19280819811582842, + "learning_rate": 3e-06, + "loss": 0.1837, + "masked_tokens": 111.375, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 111.375 + }, + { + "avg_mask_ratio": 0.42360913623997476, + "avg_response_length": 224.6125, + "avg_student_mask_ratio": 0.42360913623997476, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.205078125, + "kd_loss": 0.15898024347496859, + "learning_rate": 3e-06, + "loss": 0.1597, + "masked_tokens": 98.3, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.3 + }, + { + "avg_mask_ratio": 0.4330951495358022, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.4330951495358022, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.10693359375, + "kd_loss": 0.15454275260567557, + "learning_rate": 3e-06, + "loss": 0.1595, + "masked_tokens": 85.075, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.075 + }, + { + "avg_mask_ratio": 0.4555395155097358, + "avg_response_length": 254.2125, + "avg_student_mask_ratio": 0.4555395155097358, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.09375, + "kd_loss": 0.19431558840633442, + "learning_rate": 3e-06, + "loss": 0.1967, + "masked_tokens": 119.1125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.1125 + }, + { + "avg_mask_ratio": 0.5148372989846394, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.5148372989846394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.2421875, + "kd_loss": 0.17586028346822785, + "learning_rate": 3e-06, + "loss": 0.2039, + "masked_tokens": 105.45, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 105.45 + }, + { + "avg_mask_ratio": 0.3827478863298893, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.3827478863298893, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.13183402672058264, + "learning_rate": 3e-06, + "loss": 0.1337, + "masked_tokens": 86.675, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 86.675 + }, + { + "avg_mask_ratio": 0.5017695252550766, + "avg_response_length": 234.25, + "avg_student_mask_ratio": 0.5017695252550766, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.287109375, + "kd_loss": 0.23756451243028592, + "learning_rate": 3e-06, + "loss": 0.2228, + "masked_tokens": 108.4125, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 108.4125 + }, + { + "avg_mask_ratio": 0.4637213449750561, + "avg_response_length": 210.175, + "avg_student_mask_ratio": 0.4637213449750561, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.2236328125, + "kd_loss": 0.17453956390508713, + "learning_rate": 3e-06, + "loss": 0.1847, + "masked_tokens": 107.375, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.48738867897773164, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.48738867897773164, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.259765625, + "kd_loss": 0.21283352600622152, + "learning_rate": 3e-06, + "loss": 0.1975, + "masked_tokens": 101.7875, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 101.7875 + }, + { + "avg_mask_ratio": 0.4451883323024958, + "avg_response_length": 232.3, + "avg_student_mask_ratio": 0.4451883323024958, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.1328125, + "kd_loss": 0.23480740101426817, + "learning_rate": 3e-06, + "loss": 0.2005, + "masked_tokens": 107.7, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.43939279407495635, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.43939279407495635, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.1453842066344862, + "learning_rate": 3e-06, + "loss": 0.1366, + "masked_tokens": 89.95, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.95 + }, + { + "avg_mask_ratio": 0.4922366282902658, + "avg_response_length": 264.5375, + "avg_student_mask_ratio": 0.4922366282902658, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.294921875, + "kd_loss": 0.1732477028232097, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 144.9, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 144.9 + }, + { + "avg_mask_ratio": 0.4724786171223968, + "avg_response_length": 258.1125, + "avg_student_mask_ratio": 0.4724786171223968, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.04443359375, + "kd_loss": 0.2384887565949157, + "learning_rate": 3e-06, + "loss": 0.2151, + "masked_tokens": 127.4125, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 127.4125 + }, + { + "avg_mask_ratio": 0.49717973986989816, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.49717973986989816, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2255859375, + "kd_loss": 0.2190230320150704, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 98.4875, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 98.4875 + }, + { + "avg_mask_ratio": 0.48284467663615943, + "avg_response_length": 188.65, + "avg_student_mask_ratio": 0.48284467663615943, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.181640625, + "kd_loss": 0.198812551523406, + "learning_rate": 3e-06, + "loss": 0.1911, + "masked_tokens": 89.3125, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.3125 + }, + { + "avg_mask_ratio": 0.44864035704231353, + "avg_response_length": 246.6875, + "avg_student_mask_ratio": 0.44864035704231353, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.09716796875, + "kd_loss": 0.17860529323728117, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 110.0125, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.0125 + }, + { + "avg_mask_ratio": 0.47850618849042803, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.47850618849042803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.337890625, + "kd_loss": 0.19919134742667666, + "learning_rate": 3e-06, + "loss": 0.1932, + "masked_tokens": 109.575, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.575 + }, + { + "avg_mask_ratio": 0.4662990250624716, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4662990250624716, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.1259765625, + "kd_loss": 0.11774875816399799, + "learning_rate": 3e-06, + "loss": 0.1286, + "masked_tokens": 97.5, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.5 + }, + { + "avg_mask_ratio": 0.451080821454525, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.451080821454525, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.134765625, + "kd_loss": 0.15073641720641717, + "learning_rate": 3e-06, + "loss": 0.1577, + "masked_tokens": 96.6375, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.6375 + }, + { + "avg_mask_ratio": 0.5438536155037582, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5438536155037582, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.298828125, + "kd_loss": 0.24193658930453238, + "learning_rate": 3e-06, + "loss": 0.248, + "masked_tokens": 126.4375, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 126.4375 + }, + { + "avg_mask_ratio": 0.43992503273766487, + "avg_response_length": 255.875, + "avg_student_mask_ratio": 0.43992503273766487, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.142578125, + "kd_loss": 0.14758750247131047, + "learning_rate": 3e-06, + "loss": 0.1703, + "masked_tokens": 107.3875, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.3875 + }, + { + "avg_mask_ratio": 0.46683448635449165, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.46683448635449165, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.35546875, + "kd_loss": 0.2131086487675077, + "learning_rate": 3e-06, + "loss": 0.196, + "masked_tokens": 110.2875, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 110.2875 + }, + { + "avg_mask_ratio": 0.4476269483449869, + "avg_response_length": 243.2375, + "avg_student_mask_ratio": 0.4476269483449869, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.17319737961729237, + "learning_rate": 3e-06, + "loss": 0.1469, + "masked_tokens": 112.6375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.6375 + }, + { + "avg_mask_ratio": 0.45657019784557634, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.45657019784557634, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.1728515625, + "kd_loss": 0.15818778217344515, + "learning_rate": 3e-06, + "loss": 0.1487, + "masked_tokens": 110.0375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.0375 + }, + { + "avg_mask_ratio": 0.5293830037582665, + "avg_response_length": 223.975, + "avg_student_mask_ratio": 0.5293830037582665, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.5, + "kd_loss": 0.24745769179717172, + "learning_rate": 3e-06, + "loss": 0.2709, + "masked_tokens": 119.6, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.4577330934116617, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.4577330934116617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.2216796875, + "kd_loss": 0.18448232172211476, + "learning_rate": 3e-06, + "loss": 0.1662, + "masked_tokens": 130.475, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 130.475 + }, + { + "avg_mask_ratio": 0.39295024327002465, + "avg_response_length": 246.6375, + "avg_student_mask_ratio": 0.39295024327002465, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.047119140625, + "kd_loss": 0.1050827642444176, + "learning_rate": 3e-06, + "loss": 0.1353, + "masked_tokens": 100.9, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 100.9 + }, + { + "avg_mask_ratio": 0.4409991275751963, + "avg_response_length": 217.9125, + "avg_student_mask_ratio": 0.4409991275751963, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.1513671875, + "kd_loss": 0.13134403475523868, + "learning_rate": 3e-06, + "loss": 0.1629, + "masked_tokens": 106.925, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 106.925 + }, + { + "avg_mask_ratio": 0.47207197032403203, + "avg_response_length": 188.9125, + "avg_student_mask_ratio": 0.47207197032403203, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.2314453125, + "kd_loss": 0.19167470987705998, + "learning_rate": 3e-06, + "loss": 0.2063, + "masked_tokens": 85.125, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 85.125 + }, + { + "avg_mask_ratio": 0.4926959708333015, + "avg_response_length": 248.4, + "avg_student_mask_ratio": 0.4926959708333015, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2633828842135699, + "learning_rate": 3e-06, + "loss": 0.2589, + "masked_tokens": 124.5625, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 124.5625 + }, + { + "avg_mask_ratio": 0.5075328870676458, + "avg_response_length": 235.075, + "avg_student_mask_ratio": 0.5075328870676458, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.2197265625, + "kd_loss": 0.21129831432894547, + "learning_rate": 3e-06, + "loss": 0.2103, + "masked_tokens": 127.9, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 127.9 + }, + { + "avg_mask_ratio": 0.44940012450679206, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44940012450679206, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.40625, + "kd_loss": 0.18290909784880824, + "learning_rate": 3e-06, + "loss": 0.1801, + "masked_tokens": 110.15, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 110.15 + }, + { + "avg_mask_ratio": 0.4945301389612723, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.4945301389612723, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.361328125, + "kd_loss": 0.2113740879778227, + "learning_rate": 3e-06, + "loss": 0.2186, + "masked_tokens": 125.175, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.175 + }, + { + "avg_mask_ratio": 0.4749954905593768, + "avg_response_length": 243.575, + "avg_student_mask_ratio": 0.4749954905593768, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.1298828125, + "kd_loss": 0.16429275130377619, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 115.3875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 115.3875 + }, + { + "avg_mask_ratio": 0.47621052770409733, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.47621052770409733, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.1083984375, + "kd_loss": 0.2089853325122931, + "learning_rate": 3e-06, + "loss": 0.192, + "masked_tokens": 126.85, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 126.85 + }, + { + "avg_mask_ratio": 0.4449806016869843, + "avg_response_length": 226.3625, + "avg_student_mask_ratio": 0.4449806016869843, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.09423828125, + "kd_loss": 0.13386085629390437, + "learning_rate": 3e-06, + "loss": 0.132, + "masked_tokens": 109.35, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 109.35 + }, + { + "avg_mask_ratio": 0.47845896739745514, + "avg_response_length": 218.1125, + "avg_student_mask_ratio": 0.47845896739745514, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.142578125, + "kd_loss": 0.1445786759162107, + "learning_rate": 3e-06, + "loss": 0.1766, + "masked_tokens": 111.85, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 111.85 + }, + { + "avg_mask_ratio": 0.4727763219270855, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.4727763219270855, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.380859375, + "kd_loss": 0.20748561576523344, + "learning_rate": 3e-06, + "loss": 0.1934, + "masked_tokens": 119.775, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 119.775 + }, + { + "avg_mask_ratio": 0.4756184325611684, + "avg_response_length": 239.5375, + "avg_student_mask_ratio": 0.4756184325611684, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.10791015625, + "kd_loss": 0.2029281118774257, + "learning_rate": 3e-06, + "loss": 0.2012, + "masked_tokens": 122.1875, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 122.1875 + }, + { + "avg_mask_ratio": 0.4428858984610997, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4428858984610997, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.1826171875, + "kd_loss": 0.14211951963759475, + "learning_rate": 3e-06, + "loss": 0.1365, + "masked_tokens": 86.0125, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 86.0125 + }, + { + "avg_mask_ratio": 0.4625907339621335, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4625907339621335, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.1474609375, + "kd_loss": 0.1504247854208188, + "learning_rate": 3e-06, + "loss": 0.1743, + "masked_tokens": 103.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 103.675 + }, + { + "avg_mask_ratio": 0.4465438393759541, + "avg_response_length": 241.9625, + "avg_student_mask_ratio": 0.4465438393759541, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.142578125, + "kd_loss": 0.18699200686958192, + "learning_rate": 3e-06, + "loss": 0.1806, + "masked_tokens": 114.9125, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 114.9125 + }, + { + "avg_mask_ratio": 0.42805201532319187, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42805201532319187, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.193359375, + "kd_loss": 0.15030699897054092, + "learning_rate": 3e-06, + "loss": 0.1582, + "masked_tokens": 103.875, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.875 + }, + { + "avg_mask_ratio": 0.4651826085988432, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.4651826085988432, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.15806215325555967, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 104.125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 104.125 + }, + { + "avg_mask_ratio": 0.47693050167124185, + "avg_response_length": 226.16666666666666, + "avg_student_mask_ratio": 0.47693050167124185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.2333984375, + "kd_loss": 0.19203724756923315, + "learning_rate": 3e-06, + "loss": 0.2197, + "masked_tokens": 109.10714285714286, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 109.10714285714286 + }, + { + "avg_mask_ratio": 0.47416816898621617, + "avg_response_length": 250.25, + "avg_student_mask_ratio": 0.47416816898621617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.248046875, + "kd_loss": 0.21296195546548802, + "learning_rate": 3e-06, + "loss": 0.229, + "masked_tokens": 117.9125, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 117.9125 + }, + { + "avg_mask_ratio": 0.45927587888436394, + "avg_response_length": 233.05, + "avg_student_mask_ratio": 0.45927587888436394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.16796875, + "kd_loss": 0.12627680183309167, + "learning_rate": 3e-06, + "loss": 0.1626, + "masked_tokens": 98.8375, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.8375 + }, + { + "avg_mask_ratio": 0.5047377114649862, + "avg_response_length": 260.225, + "avg_student_mask_ratio": 0.5047377114649862, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.193359375, + "kd_loss": 0.15308890046544832, + "learning_rate": 3e-06, + "loss": 0.1508, + "masked_tokens": 127.4, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 127.4 + }, + { + "avg_mask_ratio": 0.5005011082510464, + "avg_response_length": 252.05, + "avg_student_mask_ratio": 0.5005011082510464, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0896, + "grad_norm": 0.40234375, + "kd_loss": 0.20784167646600055, + "learning_rate": 3e-06, + "loss": 0.2048, + "masked_tokens": 133.5, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 133.5 + }, + { + "avg_mask_ratio": 0.4552151845302433, + "avg_response_length": 200.7625, + "avg_student_mask_ratio": 0.4552151845302433, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1109333333333333, + "grad_norm": 0.396484375, + "kd_loss": 0.14625247523838425, + "learning_rate": 3e-06, + "loss": 0.1641, + "masked_tokens": 86.475, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 86.475 + }, + { + "avg_mask_ratio": 0.46727682640776036, + "avg_response_length": 214.5375, + "avg_student_mask_ratio": 0.46727682640776036, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1322666666666668, + "grad_norm": 0.30859375, + "kd_loss": 0.19772737846966032, + "learning_rate": 3e-06, + "loss": 0.2219, + "masked_tokens": 99.8375, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 99.8375 + }, + { + "avg_mask_ratio": 0.48785575344227256, + "avg_response_length": 231.3125, + "avg_student_mask_ratio": 0.48785575344227256, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1536, + "grad_norm": 0.216796875, + "kd_loss": 0.200824987803162, + "learning_rate": 3e-06, + "loss": 0.2346, + "masked_tokens": 106.625, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 106.625 + }, + { + "avg_mask_ratio": 0.4477671392261982, + "avg_response_length": 213.525, + "avg_student_mask_ratio": 0.4477671392261982, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1749333333333334, + "grad_norm": 0.169921875, + "kd_loss": 0.16798589587615426, + "learning_rate": 3e-06, + "loss": 0.1849, + "masked_tokens": 97.7, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 97.7 + }, + { + "avg_mask_ratio": 0.3861591775319539, + "avg_response_length": 238.5375, + "avg_student_mask_ratio": 0.3861591775319539, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1962666666666666, + "grad_norm": 0.236328125, + "kd_loss": 0.19300692316061543, + "learning_rate": 3e-06, + "loss": 0.1797, + "masked_tokens": 99.625, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 99.625 + }, + { + "avg_mask_ratio": 0.44424078196752814, + "avg_response_length": 230.1625, + "avg_student_mask_ratio": 0.44424078196752814, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2176, + "grad_norm": 0.220703125, + "kd_loss": 0.16140609600133757, + "learning_rate": 3e-06, + "loss": 0.1755, + "masked_tokens": 108.0125, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 108.0125 + }, + { + "avg_mask_ratio": 0.4715048542013392, + "avg_response_length": 269.1375, + "avg_student_mask_ratio": 0.4715048542013392, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2389333333333332, + "grad_norm": 0.388671875, + "kd_loss": 0.2032364588788596, + "learning_rate": 3e-06, + "loss": 0.1897, + "masked_tokens": 129.4375, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 129.4375 + }, + { + "avg_mask_ratio": 0.520072100055404, + "avg_response_length": 228.2875, + "avg_student_mask_ratio": 0.520072100055404, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2602666666666666, + "grad_norm": 0.46875, + "kd_loss": 0.23483261663386656, + "learning_rate": 3e-06, + "loss": 0.2578, + "masked_tokens": 121.1625, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 121.1625 + }, + { + "avg_mask_ratio": 0.4505112706683576, + "avg_response_length": 237.1625, + "avg_student_mask_ratio": 0.4505112706683576, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2816, + "grad_norm": 0.2109375, + "kd_loss": 0.16831563824014067, + "learning_rate": 3e-06, + "loss": 0.1749, + "masked_tokens": 110.0875, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 110.0875 + }, + { + "avg_mask_ratio": 0.5064190638251602, + "avg_response_length": 244.3, + "avg_student_mask_ratio": 0.5064190638251602, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3029333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.1837224111601472, + "learning_rate": 3e-06, + "loss": 0.1742, + "masked_tokens": 119.825, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 119.825 + }, + { + "avg_mask_ratio": 0.4891548154759221, + "avg_response_length": 234.6625, + "avg_student_mask_ratio": 0.4891548154759221, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3242666666666667, + "grad_norm": 0.1337890625, + "kd_loss": 0.16418851822023725, + "learning_rate": 3e-06, + "loss": 0.1679, + "masked_tokens": 110.5625, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 110.5625 + }, + { + "avg_mask_ratio": 0.5235460251918994, + "avg_response_length": 258.0875, + "avg_student_mask_ratio": 0.5235460251918994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3456000000000001, + "grad_norm": 0.365234375, + "kd_loss": 0.21764025418508198, + "learning_rate": 3e-06, + "loss": 0.2414, + "masked_tokens": 130.725, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 130.725 + }, + { + "avg_mask_ratio": 0.4871393243782222, + "avg_response_length": 237.7125, + "avg_student_mask_ratio": 0.4871393243782222, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3669333333333333, + "grad_norm": 0.1396484375, + "kd_loss": 0.17638994189817367, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 121.4625, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 121.4625 + }, + { + "avg_mask_ratio": 0.5254402696969919, + "avg_response_length": 240.575, + "avg_student_mask_ratio": 0.5254402696969919, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3882666666666665, + "grad_norm": 0.123046875, + "kd_loss": 0.19458269486664026, + "learning_rate": 3e-06, + "loss": 0.1665, + "masked_tokens": 133.725, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 133.725 + }, + { + "avg_mask_ratio": 0.48242234602803363, + "avg_response_length": 247.775, + "avg_student_mask_ratio": 0.48242234602803363, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4096, + "grad_norm": 0.31640625, + "kd_loss": 0.18161650695474235, + "learning_rate": 3e-06, + "loss": 0.1935, + "masked_tokens": 128.4625, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 128.4625 + }, + { + "avg_mask_ratio": 0.4551548367831856, + "avg_response_length": 247.3, + "avg_student_mask_ratio": 0.4551548367831856, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4309333333333334, + "grad_norm": 0.359375, + "kd_loss": 0.18432183493453067, + "learning_rate": 3e-06, + "loss": 0.1761, + "masked_tokens": 127.125, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 127.125 + }, + { + "avg_mask_ratio": 0.4658544249658007, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4658544249658007, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4522666666666666, + "grad_norm": 0.28125, + "kd_loss": 0.2166073639286054, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 107.325, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.4795732157537714, + "avg_response_length": 200.975, + "avg_student_mask_ratio": 0.4795732157537714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4736, + "grad_norm": 0.0673828125, + "kd_loss": 0.18191290805701782, + "learning_rate": 3e-06, + "loss": 0.1771, + "masked_tokens": 110.9375, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 110.9375 + }, + { + "avg_mask_ratio": 0.5250519359949977, + "avg_response_length": 228.125, + "avg_student_mask_ratio": 0.5250519359949977, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4949333333333334, + "grad_norm": 0.166015625, + "kd_loss": 0.21970896905950213, + "learning_rate": 3e-06, + "loss": 0.206, + "masked_tokens": 132.55, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 132.55 + }, + { + "avg_mask_ratio": 0.4788092178525403, + "avg_response_length": 215.325, + "avg_student_mask_ratio": 0.4788092178525403, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5162666666666667, + "grad_norm": 0.1640625, + "kd_loss": 0.17339815042200826, + "learning_rate": 3e-06, + "loss": 0.1787, + "masked_tokens": 108.7125, + "mean_t": 0.5135623761918395, + "step": 710, + "student_masked_tokens": 108.7125 + }, + { + "avg_mask_ratio": 0.47950589570682495, + "avg_response_length": 227.075, + "avg_student_mask_ratio": 0.47950589570682495, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5375999999999999, + "grad_norm": 0.33203125, + "kd_loss": 0.21160616380475403, + "learning_rate": 3e-06, + "loss": 0.2144, + "masked_tokens": 110.8, + "mean_t": 0.5117021896177902, + "step": 720, + "student_masked_tokens": 110.8 + }, + { + "avg_mask_ratio": 0.4604924251558259, + "avg_response_length": 232.925, + "avg_student_mask_ratio": 0.4604924251558259, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5589333333333333, + "grad_norm": 0.1689453125, + "kd_loss": 0.17100098597317698, + "learning_rate": 3e-06, + "loss": 0.173, + "masked_tokens": 104.9625, + "mean_t": 0.49050743713742123, + "step": 730, + "student_masked_tokens": 104.9625 + }, + { + "avg_mask_ratio": 0.5185885130194947, + "avg_response_length": 183.325, + "avg_student_mask_ratio": 0.5185885130194947, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5802666666666667, + "grad_norm": 0.09326171875, + "kd_loss": 0.19763285817334691, + "learning_rate": 3e-06, + "loss": 0.2275, + "masked_tokens": 97.125, + "mean_t": 0.5505168779753149, + "step": 740, + "student_masked_tokens": 97.125 + }, + { + "avg_mask_ratio": 0.4191439319110941, + "avg_response_length": 223.65, + "avg_student_mask_ratio": 0.4191439319110941, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6016, + "grad_norm": 0.15625, + "kd_loss": 0.1989821564191459, + "learning_rate": 3e-06, + "loss": 0.1661, + "masked_tokens": 95.3125, + "mean_t": 0.4437690361432033, + "step": 750, + "student_masked_tokens": 95.3125 + }, + { + "avg_mask_ratio": 0.46666589792585, + "avg_response_length": 228.5125, + "avg_student_mask_ratio": 0.46666589792585, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6229333333333333, + "grad_norm": 0.146484375, + "kd_loss": 0.2019220097962581, + "learning_rate": 3e-06, + "loss": 0.1894, + "masked_tokens": 117.2625, + "mean_t": 0.49132869170280175, + "step": 760, + "student_masked_tokens": 117.2625 + }, + { + "avg_mask_ratio": 0.4440126782981679, + "avg_response_length": 259.675, + "avg_student_mask_ratio": 0.4440126782981679, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6442666666666668, + "grad_norm": 0.103515625, + "kd_loss": 0.14956598446678698, + "learning_rate": 3e-06, + "loss": 0.1431, + "masked_tokens": 117.8, + "mean_t": 0.4667695587326307, + "step": 770, + "student_masked_tokens": 117.8 + }, + { + "avg_mask_ratio": 0.42723098206624854, + "avg_response_length": 258.0125, + "avg_student_mask_ratio": 0.42723098206624854, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6656, + "grad_norm": 0.1298828125, + "kd_loss": 0.12485562007910005, + "learning_rate": 3e-06, + "loss": 0.1494, + "masked_tokens": 118.575, + "mean_t": 0.44942845597106496, + "step": 780, + "student_masked_tokens": 118.575 + }, + { + "avg_mask_ratio": 0.46588709874195045, + "avg_response_length": 220.7, + "avg_student_mask_ratio": 0.46588709874195045, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6869333333333332, + "grad_norm": 0.2353515625, + "kd_loss": 0.1650387186985956, + "learning_rate": 3e-06, + "loss": 0.151, + "masked_tokens": 102.7625, + "mean_t": 0.4993515375303105, + "step": 790, + "student_masked_tokens": 102.7625 + }, + { + "avg_mask_ratio": 0.46366424662992356, + "avg_response_length": 219.6875, + "avg_student_mask_ratio": 0.46366424662992356, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7082666666666668, + "grad_norm": 0.19140625, + "kd_loss": 0.13447051951316097, + "learning_rate": 3e-06, + "loss": 0.139, + "masked_tokens": 104.5, + "mean_t": 0.500370389316231, + "step": 800, + "student_masked_tokens": 104.5 + }, + { + "avg_mask_ratio": 0.5037169002695009, + "avg_response_length": 250.2875, + "avg_student_mask_ratio": 0.5037169002695009, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7296, + "grad_norm": 0.255859375, + "kd_loss": 0.18524283417677906, + "learning_rate": 3e-06, + "loss": 0.1925, + "masked_tokens": 119.575, + "mean_t": 0.5317009104182944, + "step": 810, + "student_masked_tokens": 119.575 + }, + { + "avg_mask_ratio": 0.5109186505898833, + "avg_response_length": 225.95, + "avg_student_mask_ratio": 0.5109186505898833, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7509333333333332, + "grad_norm": 0.255859375, + "kd_loss": 0.22792843131458085, + "learning_rate": 3e-06, + "loss": 0.2303, + "masked_tokens": 129.25, + "mean_t": 0.5392061032878701, + "step": 820, + "student_masked_tokens": 129.25 + }, + { + "avg_mask_ratio": 0.4988811274059117, + "avg_response_length": 263.7875, + "avg_student_mask_ratio": 0.4988811274059117, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7722666666666667, + "grad_norm": 0.08349609375, + "kd_loss": 0.19122445829223125, + "learning_rate": 3e-06, + "loss": 0.1808, + "masked_tokens": 137.0, + "mean_t": 0.5238314627087675, + "step": 830, + "student_masked_tokens": 137.0 + }, + { + "avg_mask_ratio": 0.4997270987310912, + "avg_response_length": 221.9, + "avg_student_mask_ratio": 0.4997270987310912, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7936, + "grad_norm": 0.158203125, + "kd_loss": 0.17390446548149613, + "learning_rate": 3e-06, + "loss": 0.1818, + "masked_tokens": 114.525, + "mean_t": 0.5301066277665086, + "step": 840, + "student_masked_tokens": 114.525 + }, + { + "avg_mask_ratio": 0.4988076956477016, + "avg_response_length": 225.5, + "avg_student_mask_ratio": 0.4988076956477016, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8149333333333333, + "grad_norm": 0.130859375, + "kd_loss": 0.17035312611951667, + "learning_rate": 3e-06, + "loss": 0.1629, + "masked_tokens": 116.9125, + "mean_t": 0.5343429344706238, + "step": 850, + "student_masked_tokens": 116.9125 + }, + { + "avg_mask_ratio": 0.4497753610135987, + "avg_response_length": 260.15, + "avg_student_mask_ratio": 0.4497753610135987, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8362666666666667, + "grad_norm": 0.06591796875, + "kd_loss": 0.2005822003855428, + "learning_rate": 3e-06, + "loss": 0.1598, + "masked_tokens": 121.7625, + "mean_t": 0.4791536889737472, + "step": 860, + "student_masked_tokens": 121.7625 + }, + { + "avg_mask_ratio": 0.48591957957251, + "avg_response_length": 231.0875, + "avg_student_mask_ratio": 0.48591957957251, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8576000000000001, + "grad_norm": 0.2412109375, + "kd_loss": 0.17790169413831336, + "learning_rate": 3e-06, + "loss": 0.1902, + "masked_tokens": 116.7625, + "mean_t": 0.5203817339061061, + "step": 870, + "student_masked_tokens": 116.7625 + }, + { + "avg_mask_ratio": 0.44369487821822984, + "avg_response_length": 197.9125, + "avg_student_mask_ratio": 0.44369487821822984, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8789333333333333, + "grad_norm": 0.224609375, + "kd_loss": 0.15859377338159675, + "learning_rate": 3e-06, + "loss": 0.1765, + "masked_tokens": 91.4125, + "mean_t": 0.4875184997683391, + "step": 880, + "student_masked_tokens": 91.4125 + }, + { + "avg_mask_ratio": 0.44944015803339427, + "avg_response_length": 225.8375, + "avg_student_mask_ratio": 0.44944015803339427, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9002666666666665, + "grad_norm": 0.072265625, + "kd_loss": 0.15013304932544996, + "learning_rate": 3e-06, + "loss": 0.1349, + "masked_tokens": 103.2375, + "mean_t": 0.4766692223958671, + "step": 890, + "student_masked_tokens": 103.2375 + }, + { + "avg_mask_ratio": 0.45069065956631676, + "avg_response_length": 230.175, + "avg_student_mask_ratio": 0.45069065956631676, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9216, + "grad_norm": 0.1982421875, + "kd_loss": 0.17926409896495557, + "learning_rate": 3e-06, + "loss": 0.1615, + "masked_tokens": 104.325, + "mean_t": 0.487134758150205, + "step": 900, + "student_masked_tokens": 104.325 + }, + { + "avg_mask_ratio": 0.475881968671456, + "avg_response_length": 245.1625, + "avg_student_mask_ratio": 0.475881968671456, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9429333333333334, + "grad_norm": 0.1328125, + "kd_loss": 0.15231701551953164, + "learning_rate": 3e-06, + "loss": 0.1807, + "masked_tokens": 125.1625, + "mean_t": 0.5027793228859082, + "step": 910, + "student_masked_tokens": 125.1625 + }, + { + "avg_mask_ratio": 0.4633113604504615, + "avg_response_length": 226.2875, + "avg_student_mask_ratio": 0.4633113604504615, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9642666666666666, + "grad_norm": 0.1591796875, + "kd_loss": 0.16991043191227914, + "learning_rate": 3e-06, + "loss": 0.1889, + "masked_tokens": 109.5375, + "mean_t": 0.49417946098838, + "step": 920, + "student_masked_tokens": 109.5375 + }, + { + "avg_mask_ratio": 0.47329409609083084, + "avg_response_length": 244.875, + "avg_student_mask_ratio": 0.47329409609083084, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9856, + "grad_norm": 0.1708984375, + "kd_loss": 0.16787025193963528, + "learning_rate": 3e-06, + "loss": 0.1549, + "masked_tokens": 120.525, + "mean_t": 0.5045580042526125, + "step": 930, + "student_masked_tokens": 120.525 + }, + { + "avg_mask_ratio": 0.4973435569776311, + "avg_response_length": 224.79761904761904, + "avg_student_mask_ratio": 0.4973435569776311, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0085333333333333, + "grad_norm": 0.134765625, + "kd_loss": 0.18026482684338893, + "learning_rate": 3e-06, + "loss": 0.1888, + "masked_tokens": 120.63095238095238, + "mean_t": 0.5321138524893849, + "step": 940, + "student_masked_tokens": 120.63095238095238 + }, + { + "avg_mask_ratio": 0.4365456592233386, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4365456592233386, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0298666666666665, + "grad_norm": 0.154296875, + "kd_loss": 0.15423739737520278, + "learning_rate": 3e-06, + "loss": 0.1463, + "masked_tokens": 118.8125, + "mean_t": 0.4632946296595037, + "step": 950, + "student_masked_tokens": 118.8125 + }, + { + "avg_mask_ratio": 0.4914003949146718, + "avg_response_length": 275.3, + "avg_student_mask_ratio": 0.4914003949146718, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0512, + "grad_norm": 0.08544921875, + "kd_loss": 0.22274305121804333, + "learning_rate": 3e-06, + "loss": 0.1988, + "masked_tokens": 143.075, + "mean_t": 0.5198000721400604, + "step": 960, + "student_masked_tokens": 143.075 + }, + { + "avg_mask_ratio": 0.4381961932755075, + "avg_response_length": 236.2375, + "avg_student_mask_ratio": 0.4381961932755075, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0725333333333333, + "grad_norm": 0.318359375, + "kd_loss": 0.17543826163571338, + "learning_rate": 3e-06, + "loss": 0.1612, + "masked_tokens": 106.5125, + "mean_t": 0.4703940597362816, + "step": 970, + "student_masked_tokens": 106.5125 + }, + { + "avg_mask_ratio": 0.42702240714570505, + "avg_response_length": 230.8625, + "avg_student_mask_ratio": 0.42702240714570505, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0938666666666665, + "grad_norm": 0.13671875, + "kd_loss": 0.13624779113131352, + "learning_rate": 3e-06, + "loss": 0.1282, + "masked_tokens": 98.525, + "mean_t": 0.4511947895749472, + "step": 980, + "student_masked_tokens": 98.525 + }, + { + "avg_mask_ratio": 0.4583221158827655, + "avg_response_length": 262.0375, + "avg_student_mask_ratio": 0.4583221158827655, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1152, + "grad_norm": 6.8125, + "kd_loss": 0.14998470883065806, + "learning_rate": 3e-06, + "loss": 0.142, + "masked_tokens": 121.2875, + "mean_t": 0.4923786667350214, + "step": 990, + "student_masked_tokens": 121.2875 + }, + { + "avg_mask_ratio": 0.45086776099633424, + "avg_response_length": 214.925, + "avg_student_mask_ratio": 0.45086776099633424, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1365333333333334, + "grad_norm": 0.205078125, + "kd_loss": 0.1474926151762702, + "learning_rate": 3e-06, + "loss": 0.1624, + "masked_tokens": 100.4875, + "mean_t": 0.4773523230338469, + "step": 1000, + "student_masked_tokens": 100.4875 + }, + { + "avg_mask_ratio": 0.4363243154773954, + "avg_response_length": 224.3, + "avg_student_mask_ratio": 0.4363243154773954, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1578666666666666, + "grad_norm": 0.055908203125, + "kd_loss": 0.14260265023719612, + "learning_rate": 3e-06, + "loss": 0.1441, + "masked_tokens": 89.5, + "mean_t": 0.4648138735938119, + "step": 1010, + "student_masked_tokens": 89.5 + }, + { + "avg_mask_ratio": 0.5063220548443497, + "avg_response_length": 206.9125, + "avg_student_mask_ratio": 0.5063220548443497, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1792, + "grad_norm": 0.07421875, + "kd_loss": 0.174221977752552, + "learning_rate": 3e-06, + "loss": 0.1927, + "masked_tokens": 110.25, + "mean_t": 0.5327763411332853, + "step": 1020, + "student_masked_tokens": 110.25 + }, + { + "avg_mask_ratio": 0.46985941788880153, + "avg_response_length": 220.05, + "avg_student_mask_ratio": 0.46985941788880153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2005333333333335, + "grad_norm": 0.1884765625, + "kd_loss": 0.1922343357020509, + "learning_rate": 3e-06, + "loss": 0.2123, + "masked_tokens": 104.9, + "mean_t": 0.5033508580760099, + "step": 1030, + "student_masked_tokens": 104.9 + }, + { + "avg_mask_ratio": 0.49566771630197765, + "avg_response_length": 213.7, + "avg_student_mask_ratio": 0.49566771630197765, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2218666666666667, + "grad_norm": 0.2158203125, + "kd_loss": 0.18418902513512875, + "learning_rate": 3e-06, + "loss": 0.2026, + "masked_tokens": 100.35, + "mean_t": 0.5349024560535327, + "step": 1040, + "student_masked_tokens": 100.35 + }, + { + "avg_mask_ratio": 0.5123685836791992, + "avg_response_length": 238.8, + "avg_student_mask_ratio": 0.5123685836791992, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2432, + "grad_norm": 0.154296875, + "kd_loss": 0.20520874955382168, + "learning_rate": 3e-06, + "loss": 0.1839, + "masked_tokens": 122.875, + "mean_t": 0.5457118917722255, + "step": 1050, + "student_masked_tokens": 122.875 + }, + { + "avg_mask_ratio": 0.46218636581033934, + "avg_response_length": 273.7875, + "avg_student_mask_ratio": 0.46218636581033934, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2645333333333335, + "grad_norm": 0.10986328125, + "kd_loss": 0.14630552722162093, + "learning_rate": 3e-06, + "loss": 0.137, + "masked_tokens": 122.125, + "mean_t": 0.48194136443780733, + "step": 1060, + "student_masked_tokens": 122.125 + }, + { + "avg_mask_ratio": 0.485661978519056, + "avg_response_length": 275.075, + "avg_student_mask_ratio": 0.485661978519056, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2858666666666667, + "grad_norm": 0.09228515625, + "kd_loss": 0.19156048394619346, + "learning_rate": 3e-06, + "loss": 0.1616, + "masked_tokens": 142.6375, + "mean_t": 0.5015889146190602, + "step": 1070, + "student_masked_tokens": 142.6375 + }, + { + "avg_mask_ratio": 0.4626998565625399, + "avg_response_length": 214.45, + "avg_student_mask_ratio": 0.4626998565625399, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.3072, + "grad_norm": 0.1689453125, + "kd_loss": 0.1588676300700172, + "learning_rate": 3e-06, + "loss": 0.1623, + "masked_tokens": 98.4, + "mean_t": 0.4983203248586506, + "step": 1080, + "student_masked_tokens": 98.4 + }, + { + "avg_mask_ratio": 0.44248262273031286, + "avg_response_length": 213.55, + "avg_student_mask_ratio": 0.44248262273031286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.3285333333333336, + "grad_norm": 0.038818359375, + "kd_loss": 0.12623338384320135, + "learning_rate": 3e-06, + "loss": 0.1553, + "masked_tokens": 91.9125, + "mean_t": 0.47094749807147307, + "step": 1090, + "student_masked_tokens": 91.9125 + }, + { + "avg_mask_ratio": 0.5204601250356063, + "avg_response_length": 246.1125, + "avg_student_mask_ratio": 0.5204601250356063, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.3498666666666668, + "grad_norm": 0.25390625, + "kd_loss": 0.2188640532628142, + "learning_rate": 3e-06, + "loss": 0.2183, + "masked_tokens": 133.1875, + "mean_t": 0.5531192034482956, + "step": 1100, + "student_masked_tokens": 133.1875 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/training_args.bin b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..909c70530eafaa4be935d43ab877dad53e48f376 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c7c0f34b9d70dea72cbe8ab046b4e4dbf9290e9a199291cca7df91b67e9e4a +size 8120 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/README.md b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/adapter_config.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6edd4f943cffd057b1c0513e71fc0baccaee758 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/adapter_model.safetensors b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3dfebd8f37938f5671fcfedea7f64451487053c1 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2aa0cc73c6a375f1eb61052365b76145fb46b0b928d71c04f52706f339290215 +size 2406624648 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/optimizer.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..879e93b9b7228de2a2818c6c08028e3bdaae744b --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348d0acaf4f5bb316d16269f9b29bf469c6224d617118d3b0de29b38ea38429f +size 671304442 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/rng_state_0.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..921a14eba8310d556048263a58727eadbc6dcc1b --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1394463a46489e6dce7c0369a296b9effad20c6a87b30dbb892b34b73b5d6365 +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/rng_state_1.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..368c5ae3686e126ebe564cc95ebfff43f4fb182e --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d76e37c34ff78ced346b4f015037d8f7548792061c1d4914c31c56700f59f65f +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/scheduler.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5b1aeae281b5040d3bcd2aa5b378a5c2504e2b5 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f8c95a6d9085dfcee1e6620c88ede526366d3a02c5018932b1bc04809c0e0c7 +size 1064 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/trainer_state.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..42d2a0104800c158f9e8ace83622c448432c0a04 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/trainer_state.json @@ -0,0 +1,2913 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.5632, + "eval_steps": 500, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4930951670394279, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4930951670394279, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1494140625, + "kd_loss": 0.25194341027386147, + "learning_rate": 3e-06, + "loss": 0.2396, + "masked_tokens": 110.925, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 110.925 + }, + { + "avg_mask_ratio": 0.4127206720062532, + "avg_response_length": 277.15, + "avg_student_mask_ratio": 0.4127206720062532, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.2138671875, + "kd_loss": 0.14083908485238297, + "learning_rate": 3e-06, + "loss": 0.1768, + "masked_tokens": 108.8625, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 108.8625 + }, + { + "avg_mask_ratio": 0.4616696212324314, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4616696212324314, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.177734375, + "kd_loss": 0.19280819811582842, + "learning_rate": 3e-06, + "loss": 0.1837, + "masked_tokens": 111.375, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 111.375 + }, + { + "avg_mask_ratio": 0.42360913623997476, + "avg_response_length": 224.6125, + "avg_student_mask_ratio": 0.42360913623997476, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.205078125, + "kd_loss": 0.15898024347496859, + "learning_rate": 3e-06, + "loss": 0.1597, + "masked_tokens": 98.3, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.3 + }, + { + "avg_mask_ratio": 0.4330951495358022, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.4330951495358022, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.10693359375, + "kd_loss": 0.15454275260567557, + "learning_rate": 3e-06, + "loss": 0.1595, + "masked_tokens": 85.075, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.075 + }, + { + "avg_mask_ratio": 0.4555395155097358, + "avg_response_length": 254.2125, + "avg_student_mask_ratio": 0.4555395155097358, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.09375, + "kd_loss": 0.19431558840633442, + "learning_rate": 3e-06, + "loss": 0.1967, + "masked_tokens": 119.1125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.1125 + }, + { + "avg_mask_ratio": 0.5148372989846394, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.5148372989846394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.2421875, + "kd_loss": 0.17586028346822785, + "learning_rate": 3e-06, + "loss": 0.2039, + "masked_tokens": 105.45, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 105.45 + }, + { + "avg_mask_ratio": 0.3827478863298893, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.3827478863298893, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.13183402672058264, + "learning_rate": 3e-06, + "loss": 0.1337, + "masked_tokens": 86.675, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 86.675 + }, + { + "avg_mask_ratio": 0.5017695252550766, + "avg_response_length": 234.25, + "avg_student_mask_ratio": 0.5017695252550766, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.287109375, + "kd_loss": 0.23756451243028592, + "learning_rate": 3e-06, + "loss": 0.2228, + "masked_tokens": 108.4125, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 108.4125 + }, + { + "avg_mask_ratio": 0.4637213449750561, + "avg_response_length": 210.175, + "avg_student_mask_ratio": 0.4637213449750561, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.2236328125, + "kd_loss": 0.17453956390508713, + "learning_rate": 3e-06, + "loss": 0.1847, + "masked_tokens": 107.375, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.48738867897773164, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.48738867897773164, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.259765625, + "kd_loss": 0.21283352600622152, + "learning_rate": 3e-06, + "loss": 0.1975, + "masked_tokens": 101.7875, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 101.7875 + }, + { + "avg_mask_ratio": 0.4451883323024958, + "avg_response_length": 232.3, + "avg_student_mask_ratio": 0.4451883323024958, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.1328125, + "kd_loss": 0.23480740101426817, + "learning_rate": 3e-06, + "loss": 0.2005, + "masked_tokens": 107.7, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.43939279407495635, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.43939279407495635, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.1453842066344862, + "learning_rate": 3e-06, + "loss": 0.1366, + "masked_tokens": 89.95, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.95 + }, + { + "avg_mask_ratio": 0.4922366282902658, + "avg_response_length": 264.5375, + "avg_student_mask_ratio": 0.4922366282902658, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.294921875, + "kd_loss": 0.1732477028232097, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 144.9, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 144.9 + }, + { + "avg_mask_ratio": 0.4724786171223968, + "avg_response_length": 258.1125, + "avg_student_mask_ratio": 0.4724786171223968, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.04443359375, + "kd_loss": 0.2384887565949157, + "learning_rate": 3e-06, + "loss": 0.2151, + "masked_tokens": 127.4125, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 127.4125 + }, + { + "avg_mask_ratio": 0.49717973986989816, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.49717973986989816, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2255859375, + "kd_loss": 0.2190230320150704, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 98.4875, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 98.4875 + }, + { + "avg_mask_ratio": 0.48284467663615943, + "avg_response_length": 188.65, + "avg_student_mask_ratio": 0.48284467663615943, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.181640625, + "kd_loss": 0.198812551523406, + "learning_rate": 3e-06, + "loss": 0.1911, + "masked_tokens": 89.3125, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.3125 + }, + { + "avg_mask_ratio": 0.44864035704231353, + "avg_response_length": 246.6875, + "avg_student_mask_ratio": 0.44864035704231353, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.09716796875, + "kd_loss": 0.17860529323728117, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 110.0125, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.0125 + }, + { + "avg_mask_ratio": 0.47850618849042803, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.47850618849042803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.337890625, + "kd_loss": 0.19919134742667666, + "learning_rate": 3e-06, + "loss": 0.1932, + "masked_tokens": 109.575, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.575 + }, + { + "avg_mask_ratio": 0.4662990250624716, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4662990250624716, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.1259765625, + "kd_loss": 0.11774875816399799, + "learning_rate": 3e-06, + "loss": 0.1286, + "masked_tokens": 97.5, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.5 + }, + { + "avg_mask_ratio": 0.451080821454525, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.451080821454525, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.134765625, + "kd_loss": 0.15073641720641717, + "learning_rate": 3e-06, + "loss": 0.1577, + "masked_tokens": 96.6375, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.6375 + }, + { + "avg_mask_ratio": 0.5438536155037582, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5438536155037582, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.298828125, + "kd_loss": 0.24193658930453238, + "learning_rate": 3e-06, + "loss": 0.248, + "masked_tokens": 126.4375, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 126.4375 + }, + { + "avg_mask_ratio": 0.43992503273766487, + "avg_response_length": 255.875, + "avg_student_mask_ratio": 0.43992503273766487, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.142578125, + "kd_loss": 0.14758750247131047, + "learning_rate": 3e-06, + "loss": 0.1703, + "masked_tokens": 107.3875, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.3875 + }, + { + "avg_mask_ratio": 0.46683448635449165, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.46683448635449165, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.35546875, + "kd_loss": 0.2131086487675077, + "learning_rate": 3e-06, + "loss": 0.196, + "masked_tokens": 110.2875, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 110.2875 + }, + { + "avg_mask_ratio": 0.4476269483449869, + "avg_response_length": 243.2375, + "avg_student_mask_ratio": 0.4476269483449869, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.17319737961729237, + "learning_rate": 3e-06, + "loss": 0.1469, + "masked_tokens": 112.6375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.6375 + }, + { + "avg_mask_ratio": 0.45657019784557634, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.45657019784557634, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.1728515625, + "kd_loss": 0.15818778217344515, + "learning_rate": 3e-06, + "loss": 0.1487, + "masked_tokens": 110.0375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.0375 + }, + { + "avg_mask_ratio": 0.5293830037582665, + "avg_response_length": 223.975, + "avg_student_mask_ratio": 0.5293830037582665, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.5, + "kd_loss": 0.24745769179717172, + "learning_rate": 3e-06, + "loss": 0.2709, + "masked_tokens": 119.6, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.4577330934116617, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.4577330934116617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.2216796875, + "kd_loss": 0.18448232172211476, + "learning_rate": 3e-06, + "loss": 0.1662, + "masked_tokens": 130.475, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 130.475 + }, + { + "avg_mask_ratio": 0.39295024327002465, + "avg_response_length": 246.6375, + "avg_student_mask_ratio": 0.39295024327002465, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.047119140625, + "kd_loss": 0.1050827642444176, + "learning_rate": 3e-06, + "loss": 0.1353, + "masked_tokens": 100.9, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 100.9 + }, + { + "avg_mask_ratio": 0.4409991275751963, + "avg_response_length": 217.9125, + "avg_student_mask_ratio": 0.4409991275751963, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.1513671875, + "kd_loss": 0.13134403475523868, + "learning_rate": 3e-06, + "loss": 0.1629, + "masked_tokens": 106.925, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 106.925 + }, + { + "avg_mask_ratio": 0.47207197032403203, + "avg_response_length": 188.9125, + "avg_student_mask_ratio": 0.47207197032403203, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.2314453125, + "kd_loss": 0.19167470987705998, + "learning_rate": 3e-06, + "loss": 0.2063, + "masked_tokens": 85.125, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 85.125 + }, + { + "avg_mask_ratio": 0.4926959708333015, + "avg_response_length": 248.4, + "avg_student_mask_ratio": 0.4926959708333015, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2633828842135699, + "learning_rate": 3e-06, + "loss": 0.2589, + "masked_tokens": 124.5625, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 124.5625 + }, + { + "avg_mask_ratio": 0.5075328870676458, + "avg_response_length": 235.075, + "avg_student_mask_ratio": 0.5075328870676458, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.2197265625, + "kd_loss": 0.21129831432894547, + "learning_rate": 3e-06, + "loss": 0.2103, + "masked_tokens": 127.9, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 127.9 + }, + { + "avg_mask_ratio": 0.44940012450679206, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44940012450679206, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.40625, + "kd_loss": 0.18290909784880824, + "learning_rate": 3e-06, + "loss": 0.1801, + "masked_tokens": 110.15, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 110.15 + }, + { + "avg_mask_ratio": 0.4945301389612723, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.4945301389612723, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.361328125, + "kd_loss": 0.2113740879778227, + "learning_rate": 3e-06, + "loss": 0.2186, + "masked_tokens": 125.175, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.175 + }, + { + "avg_mask_ratio": 0.4749954905593768, + "avg_response_length": 243.575, + "avg_student_mask_ratio": 0.4749954905593768, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.1298828125, + "kd_loss": 0.16429275130377619, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 115.3875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 115.3875 + }, + { + "avg_mask_ratio": 0.47621052770409733, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.47621052770409733, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.1083984375, + "kd_loss": 0.2089853325122931, + "learning_rate": 3e-06, + "loss": 0.192, + "masked_tokens": 126.85, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 126.85 + }, + { + "avg_mask_ratio": 0.4449806016869843, + "avg_response_length": 226.3625, + "avg_student_mask_ratio": 0.4449806016869843, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.09423828125, + "kd_loss": 0.13386085629390437, + "learning_rate": 3e-06, + "loss": 0.132, + "masked_tokens": 109.35, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 109.35 + }, + { + "avg_mask_ratio": 0.47845896739745514, + "avg_response_length": 218.1125, + "avg_student_mask_ratio": 0.47845896739745514, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.142578125, + "kd_loss": 0.1445786759162107, + "learning_rate": 3e-06, + "loss": 0.1766, + "masked_tokens": 111.85, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 111.85 + }, + { + "avg_mask_ratio": 0.4727763219270855, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.4727763219270855, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.380859375, + "kd_loss": 0.20748561576523344, + "learning_rate": 3e-06, + "loss": 0.1934, + "masked_tokens": 119.775, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 119.775 + }, + { + "avg_mask_ratio": 0.4756184325611684, + "avg_response_length": 239.5375, + "avg_student_mask_ratio": 0.4756184325611684, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.10791015625, + "kd_loss": 0.2029281118774257, + "learning_rate": 3e-06, + "loss": 0.2012, + "masked_tokens": 122.1875, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 122.1875 + }, + { + "avg_mask_ratio": 0.4428858984610997, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4428858984610997, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.1826171875, + "kd_loss": 0.14211951963759475, + "learning_rate": 3e-06, + "loss": 0.1365, + "masked_tokens": 86.0125, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 86.0125 + }, + { + "avg_mask_ratio": 0.4625907339621335, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4625907339621335, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.1474609375, + "kd_loss": 0.1504247854208188, + "learning_rate": 3e-06, + "loss": 0.1743, + "masked_tokens": 103.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 103.675 + }, + { + "avg_mask_ratio": 0.4465438393759541, + "avg_response_length": 241.9625, + "avg_student_mask_ratio": 0.4465438393759541, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.142578125, + "kd_loss": 0.18699200686958192, + "learning_rate": 3e-06, + "loss": 0.1806, + "masked_tokens": 114.9125, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 114.9125 + }, + { + "avg_mask_ratio": 0.42805201532319187, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42805201532319187, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.193359375, + "kd_loss": 0.15030699897054092, + "learning_rate": 3e-06, + "loss": 0.1582, + "masked_tokens": 103.875, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.875 + }, + { + "avg_mask_ratio": 0.4651826085988432, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.4651826085988432, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.15806215325555967, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 104.125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 104.125 + }, + { + "avg_mask_ratio": 0.47693050167124185, + "avg_response_length": 226.16666666666666, + "avg_student_mask_ratio": 0.47693050167124185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.2333984375, + "kd_loss": 0.19203724756923315, + "learning_rate": 3e-06, + "loss": 0.2197, + "masked_tokens": 109.10714285714286, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 109.10714285714286 + }, + { + "avg_mask_ratio": 0.47416816898621617, + "avg_response_length": 250.25, + "avg_student_mask_ratio": 0.47416816898621617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.248046875, + "kd_loss": 0.21296195546548802, + "learning_rate": 3e-06, + "loss": 0.229, + "masked_tokens": 117.9125, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 117.9125 + }, + { + "avg_mask_ratio": 0.45927587888436394, + "avg_response_length": 233.05, + "avg_student_mask_ratio": 0.45927587888436394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.16796875, + "kd_loss": 0.12627680183309167, + "learning_rate": 3e-06, + "loss": 0.1626, + "masked_tokens": 98.8375, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.8375 + }, + { + "avg_mask_ratio": 0.5047377114649862, + "avg_response_length": 260.225, + "avg_student_mask_ratio": 0.5047377114649862, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.193359375, + "kd_loss": 0.15308890046544832, + "learning_rate": 3e-06, + "loss": 0.1508, + "masked_tokens": 127.4, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 127.4 + }, + { + "avg_mask_ratio": 0.5005011082510464, + "avg_response_length": 252.05, + "avg_student_mask_ratio": 0.5005011082510464, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0896, + "grad_norm": 0.40234375, + "kd_loss": 0.20784167646600055, + "learning_rate": 3e-06, + "loss": 0.2048, + "masked_tokens": 133.5, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 133.5 + }, + { + "avg_mask_ratio": 0.4552151845302433, + "avg_response_length": 200.7625, + "avg_student_mask_ratio": 0.4552151845302433, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1109333333333333, + "grad_norm": 0.396484375, + "kd_loss": 0.14625247523838425, + "learning_rate": 3e-06, + "loss": 0.1641, + "masked_tokens": 86.475, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 86.475 + }, + { + "avg_mask_ratio": 0.46727682640776036, + "avg_response_length": 214.5375, + "avg_student_mask_ratio": 0.46727682640776036, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1322666666666668, + "grad_norm": 0.30859375, + "kd_loss": 0.19772737846966032, + "learning_rate": 3e-06, + "loss": 0.2219, + "masked_tokens": 99.8375, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 99.8375 + }, + { + "avg_mask_ratio": 0.48785575344227256, + "avg_response_length": 231.3125, + "avg_student_mask_ratio": 0.48785575344227256, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1536, + "grad_norm": 0.216796875, + "kd_loss": 0.200824987803162, + "learning_rate": 3e-06, + "loss": 0.2346, + "masked_tokens": 106.625, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 106.625 + }, + { + "avg_mask_ratio": 0.4477671392261982, + "avg_response_length": 213.525, + "avg_student_mask_ratio": 0.4477671392261982, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1749333333333334, + "grad_norm": 0.169921875, + "kd_loss": 0.16798589587615426, + "learning_rate": 3e-06, + "loss": 0.1849, + "masked_tokens": 97.7, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 97.7 + }, + { + "avg_mask_ratio": 0.3861591775319539, + "avg_response_length": 238.5375, + "avg_student_mask_ratio": 0.3861591775319539, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1962666666666666, + "grad_norm": 0.236328125, + "kd_loss": 0.19300692316061543, + "learning_rate": 3e-06, + "loss": 0.1797, + "masked_tokens": 99.625, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 99.625 + }, + { + "avg_mask_ratio": 0.44424078196752814, + "avg_response_length": 230.1625, + "avg_student_mask_ratio": 0.44424078196752814, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2176, + "grad_norm": 0.220703125, + "kd_loss": 0.16140609600133757, + "learning_rate": 3e-06, + "loss": 0.1755, + "masked_tokens": 108.0125, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 108.0125 + }, + { + "avg_mask_ratio": 0.4715048542013392, + "avg_response_length": 269.1375, + "avg_student_mask_ratio": 0.4715048542013392, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2389333333333332, + "grad_norm": 0.388671875, + "kd_loss": 0.2032364588788596, + "learning_rate": 3e-06, + "loss": 0.1897, + "masked_tokens": 129.4375, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 129.4375 + }, + { + "avg_mask_ratio": 0.520072100055404, + "avg_response_length": 228.2875, + "avg_student_mask_ratio": 0.520072100055404, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2602666666666666, + "grad_norm": 0.46875, + "kd_loss": 0.23483261663386656, + "learning_rate": 3e-06, + "loss": 0.2578, + "masked_tokens": 121.1625, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 121.1625 + }, + { + "avg_mask_ratio": 0.4505112706683576, + "avg_response_length": 237.1625, + "avg_student_mask_ratio": 0.4505112706683576, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2816, + "grad_norm": 0.2109375, + "kd_loss": 0.16831563824014067, + "learning_rate": 3e-06, + "loss": 0.1749, + "masked_tokens": 110.0875, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 110.0875 + }, + { + "avg_mask_ratio": 0.5064190638251602, + "avg_response_length": 244.3, + "avg_student_mask_ratio": 0.5064190638251602, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3029333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.1837224111601472, + "learning_rate": 3e-06, + "loss": 0.1742, + "masked_tokens": 119.825, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 119.825 + }, + { + "avg_mask_ratio": 0.4891548154759221, + "avg_response_length": 234.6625, + "avg_student_mask_ratio": 0.4891548154759221, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3242666666666667, + "grad_norm": 0.1337890625, + "kd_loss": 0.16418851822023725, + "learning_rate": 3e-06, + "loss": 0.1679, + "masked_tokens": 110.5625, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 110.5625 + }, + { + "avg_mask_ratio": 0.5235460251918994, + "avg_response_length": 258.0875, + "avg_student_mask_ratio": 0.5235460251918994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3456000000000001, + "grad_norm": 0.365234375, + "kd_loss": 0.21764025418508198, + "learning_rate": 3e-06, + "loss": 0.2414, + "masked_tokens": 130.725, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 130.725 + }, + { + "avg_mask_ratio": 0.4871393243782222, + "avg_response_length": 237.7125, + "avg_student_mask_ratio": 0.4871393243782222, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3669333333333333, + "grad_norm": 0.1396484375, + "kd_loss": 0.17638994189817367, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 121.4625, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 121.4625 + }, + { + "avg_mask_ratio": 0.5254402696969919, + "avg_response_length": 240.575, + "avg_student_mask_ratio": 0.5254402696969919, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3882666666666665, + "grad_norm": 0.123046875, + "kd_loss": 0.19458269486664026, + "learning_rate": 3e-06, + "loss": 0.1665, + "masked_tokens": 133.725, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 133.725 + }, + { + "avg_mask_ratio": 0.48242234602803363, + "avg_response_length": 247.775, + "avg_student_mask_ratio": 0.48242234602803363, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4096, + "grad_norm": 0.31640625, + "kd_loss": 0.18161650695474235, + "learning_rate": 3e-06, + "loss": 0.1935, + "masked_tokens": 128.4625, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 128.4625 + }, + { + "avg_mask_ratio": 0.4551548367831856, + "avg_response_length": 247.3, + "avg_student_mask_ratio": 0.4551548367831856, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4309333333333334, + "grad_norm": 0.359375, + "kd_loss": 0.18432183493453067, + "learning_rate": 3e-06, + "loss": 0.1761, + "masked_tokens": 127.125, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 127.125 + }, + { + "avg_mask_ratio": 0.4658544249658007, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4658544249658007, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4522666666666666, + "grad_norm": 0.28125, + "kd_loss": 0.2166073639286054, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 107.325, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.4795732157537714, + "avg_response_length": 200.975, + "avg_student_mask_ratio": 0.4795732157537714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4736, + "grad_norm": 0.0673828125, + "kd_loss": 0.18191290805701782, + "learning_rate": 3e-06, + "loss": 0.1771, + "masked_tokens": 110.9375, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 110.9375 + }, + { + "avg_mask_ratio": 0.5250519359949977, + "avg_response_length": 228.125, + "avg_student_mask_ratio": 0.5250519359949977, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4949333333333334, + "grad_norm": 0.166015625, + "kd_loss": 0.21970896905950213, + "learning_rate": 3e-06, + "loss": 0.206, + "masked_tokens": 132.55, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 132.55 + }, + { + "avg_mask_ratio": 0.4788092178525403, + "avg_response_length": 215.325, + "avg_student_mask_ratio": 0.4788092178525403, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5162666666666667, + "grad_norm": 0.1640625, + "kd_loss": 0.17339815042200826, + "learning_rate": 3e-06, + "loss": 0.1787, + "masked_tokens": 108.7125, + "mean_t": 0.5135623761918395, + "step": 710, + "student_masked_tokens": 108.7125 + }, + { + "avg_mask_ratio": 0.47950589570682495, + "avg_response_length": 227.075, + "avg_student_mask_ratio": 0.47950589570682495, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5375999999999999, + "grad_norm": 0.33203125, + "kd_loss": 0.21160616380475403, + "learning_rate": 3e-06, + "loss": 0.2144, + "masked_tokens": 110.8, + "mean_t": 0.5117021896177902, + "step": 720, + "student_masked_tokens": 110.8 + }, + { + "avg_mask_ratio": 0.4604924251558259, + "avg_response_length": 232.925, + "avg_student_mask_ratio": 0.4604924251558259, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5589333333333333, + "grad_norm": 0.1689453125, + "kd_loss": 0.17100098597317698, + "learning_rate": 3e-06, + "loss": 0.173, + "masked_tokens": 104.9625, + "mean_t": 0.49050743713742123, + "step": 730, + "student_masked_tokens": 104.9625 + }, + { + "avg_mask_ratio": 0.5185885130194947, + "avg_response_length": 183.325, + "avg_student_mask_ratio": 0.5185885130194947, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5802666666666667, + "grad_norm": 0.09326171875, + "kd_loss": 0.19763285817334691, + "learning_rate": 3e-06, + "loss": 0.2275, + "masked_tokens": 97.125, + "mean_t": 0.5505168779753149, + "step": 740, + "student_masked_tokens": 97.125 + }, + { + "avg_mask_ratio": 0.4191439319110941, + "avg_response_length": 223.65, + "avg_student_mask_ratio": 0.4191439319110941, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6016, + "grad_norm": 0.15625, + "kd_loss": 0.1989821564191459, + "learning_rate": 3e-06, + "loss": 0.1661, + "masked_tokens": 95.3125, + "mean_t": 0.4437690361432033, + "step": 750, + "student_masked_tokens": 95.3125 + }, + { + "avg_mask_ratio": 0.46666589792585, + "avg_response_length": 228.5125, + "avg_student_mask_ratio": 0.46666589792585, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6229333333333333, + "grad_norm": 0.146484375, + "kd_loss": 0.2019220097962581, + "learning_rate": 3e-06, + "loss": 0.1894, + "masked_tokens": 117.2625, + "mean_t": 0.49132869170280175, + "step": 760, + "student_masked_tokens": 117.2625 + }, + { + "avg_mask_ratio": 0.4440126782981679, + "avg_response_length": 259.675, + "avg_student_mask_ratio": 0.4440126782981679, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6442666666666668, + "grad_norm": 0.103515625, + "kd_loss": 0.14956598446678698, + "learning_rate": 3e-06, + "loss": 0.1431, + "masked_tokens": 117.8, + "mean_t": 0.4667695587326307, + "step": 770, + "student_masked_tokens": 117.8 + }, + { + "avg_mask_ratio": 0.42723098206624854, + "avg_response_length": 258.0125, + "avg_student_mask_ratio": 0.42723098206624854, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6656, + "grad_norm": 0.1298828125, + "kd_loss": 0.12485562007910005, + "learning_rate": 3e-06, + "loss": 0.1494, + "masked_tokens": 118.575, + "mean_t": 0.44942845597106496, + "step": 780, + "student_masked_tokens": 118.575 + }, + { + "avg_mask_ratio": 0.46588709874195045, + "avg_response_length": 220.7, + "avg_student_mask_ratio": 0.46588709874195045, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6869333333333332, + "grad_norm": 0.2353515625, + "kd_loss": 0.1650387186985956, + "learning_rate": 3e-06, + "loss": 0.151, + "masked_tokens": 102.7625, + "mean_t": 0.4993515375303105, + "step": 790, + "student_masked_tokens": 102.7625 + }, + { + "avg_mask_ratio": 0.46366424662992356, + "avg_response_length": 219.6875, + "avg_student_mask_ratio": 0.46366424662992356, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7082666666666668, + "grad_norm": 0.19140625, + "kd_loss": 0.13447051951316097, + "learning_rate": 3e-06, + "loss": 0.139, + "masked_tokens": 104.5, + "mean_t": 0.500370389316231, + "step": 800, + "student_masked_tokens": 104.5 + }, + { + "avg_mask_ratio": 0.5037169002695009, + "avg_response_length": 250.2875, + "avg_student_mask_ratio": 0.5037169002695009, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7296, + "grad_norm": 0.255859375, + "kd_loss": 0.18524283417677906, + "learning_rate": 3e-06, + "loss": 0.1925, + "masked_tokens": 119.575, + "mean_t": 0.5317009104182944, + "step": 810, + "student_masked_tokens": 119.575 + }, + { + "avg_mask_ratio": 0.5109186505898833, + "avg_response_length": 225.95, + "avg_student_mask_ratio": 0.5109186505898833, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7509333333333332, + "grad_norm": 0.255859375, + "kd_loss": 0.22792843131458085, + "learning_rate": 3e-06, + "loss": 0.2303, + "masked_tokens": 129.25, + "mean_t": 0.5392061032878701, + "step": 820, + "student_masked_tokens": 129.25 + }, + { + "avg_mask_ratio": 0.4988811274059117, + "avg_response_length": 263.7875, + "avg_student_mask_ratio": 0.4988811274059117, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7722666666666667, + "grad_norm": 0.08349609375, + "kd_loss": 0.19122445829223125, + "learning_rate": 3e-06, + "loss": 0.1808, + "masked_tokens": 137.0, + "mean_t": 0.5238314627087675, + "step": 830, + "student_masked_tokens": 137.0 + }, + { + "avg_mask_ratio": 0.4997270987310912, + "avg_response_length": 221.9, + "avg_student_mask_ratio": 0.4997270987310912, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7936, + "grad_norm": 0.158203125, + "kd_loss": 0.17390446548149613, + "learning_rate": 3e-06, + "loss": 0.1818, + "masked_tokens": 114.525, + "mean_t": 0.5301066277665086, + "step": 840, + "student_masked_tokens": 114.525 + }, + { + "avg_mask_ratio": 0.4988076956477016, + "avg_response_length": 225.5, + "avg_student_mask_ratio": 0.4988076956477016, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8149333333333333, + "grad_norm": 0.130859375, + "kd_loss": 0.17035312611951667, + "learning_rate": 3e-06, + "loss": 0.1629, + "masked_tokens": 116.9125, + "mean_t": 0.5343429344706238, + "step": 850, + "student_masked_tokens": 116.9125 + }, + { + "avg_mask_ratio": 0.4497753610135987, + "avg_response_length": 260.15, + "avg_student_mask_ratio": 0.4497753610135987, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8362666666666667, + "grad_norm": 0.06591796875, + "kd_loss": 0.2005822003855428, + "learning_rate": 3e-06, + "loss": 0.1598, + "masked_tokens": 121.7625, + "mean_t": 0.4791536889737472, + "step": 860, + "student_masked_tokens": 121.7625 + }, + { + "avg_mask_ratio": 0.48591957957251, + "avg_response_length": 231.0875, + "avg_student_mask_ratio": 0.48591957957251, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8576000000000001, + "grad_norm": 0.2412109375, + "kd_loss": 0.17790169413831336, + "learning_rate": 3e-06, + "loss": 0.1902, + "masked_tokens": 116.7625, + "mean_t": 0.5203817339061061, + "step": 870, + "student_masked_tokens": 116.7625 + }, + { + "avg_mask_ratio": 0.44369487821822984, + "avg_response_length": 197.9125, + "avg_student_mask_ratio": 0.44369487821822984, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8789333333333333, + "grad_norm": 0.224609375, + "kd_loss": 0.15859377338159675, + "learning_rate": 3e-06, + "loss": 0.1765, + "masked_tokens": 91.4125, + "mean_t": 0.4875184997683391, + "step": 880, + "student_masked_tokens": 91.4125 + }, + { + "avg_mask_ratio": 0.44944015803339427, + "avg_response_length": 225.8375, + "avg_student_mask_ratio": 0.44944015803339427, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9002666666666665, + "grad_norm": 0.072265625, + "kd_loss": 0.15013304932544996, + "learning_rate": 3e-06, + "loss": 0.1349, + "masked_tokens": 103.2375, + "mean_t": 0.4766692223958671, + "step": 890, + "student_masked_tokens": 103.2375 + }, + { + "avg_mask_ratio": 0.45069065956631676, + "avg_response_length": 230.175, + "avg_student_mask_ratio": 0.45069065956631676, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9216, + "grad_norm": 0.1982421875, + "kd_loss": 0.17926409896495557, + "learning_rate": 3e-06, + "loss": 0.1615, + "masked_tokens": 104.325, + "mean_t": 0.487134758150205, + "step": 900, + "student_masked_tokens": 104.325 + }, + { + "avg_mask_ratio": 0.475881968671456, + "avg_response_length": 245.1625, + "avg_student_mask_ratio": 0.475881968671456, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9429333333333334, + "grad_norm": 0.1328125, + "kd_loss": 0.15231701551953164, + "learning_rate": 3e-06, + "loss": 0.1807, + "masked_tokens": 125.1625, + "mean_t": 0.5027793228859082, + "step": 910, + "student_masked_tokens": 125.1625 + }, + { + "avg_mask_ratio": 0.4633113604504615, + "avg_response_length": 226.2875, + "avg_student_mask_ratio": 0.4633113604504615, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9642666666666666, + "grad_norm": 0.1591796875, + "kd_loss": 0.16991043191227914, + "learning_rate": 3e-06, + "loss": 0.1889, + "masked_tokens": 109.5375, + "mean_t": 0.49417946098838, + "step": 920, + "student_masked_tokens": 109.5375 + }, + { + "avg_mask_ratio": 0.47329409609083084, + "avg_response_length": 244.875, + "avg_student_mask_ratio": 0.47329409609083084, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9856, + "grad_norm": 0.1708984375, + "kd_loss": 0.16787025193963528, + "learning_rate": 3e-06, + "loss": 0.1549, + "masked_tokens": 120.525, + "mean_t": 0.5045580042526125, + "step": 930, + "student_masked_tokens": 120.525 + }, + { + "avg_mask_ratio": 0.4973435569776311, + "avg_response_length": 224.79761904761904, + "avg_student_mask_ratio": 0.4973435569776311, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0085333333333333, + "grad_norm": 0.134765625, + "kd_loss": 0.18026482684338893, + "learning_rate": 3e-06, + "loss": 0.1888, + "masked_tokens": 120.63095238095238, + "mean_t": 0.5321138524893849, + "step": 940, + "student_masked_tokens": 120.63095238095238 + }, + { + "avg_mask_ratio": 0.4365456592233386, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4365456592233386, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0298666666666665, + "grad_norm": 0.154296875, + "kd_loss": 0.15423739737520278, + "learning_rate": 3e-06, + "loss": 0.1463, + "masked_tokens": 118.8125, + "mean_t": 0.4632946296595037, + "step": 950, + "student_masked_tokens": 118.8125 + }, + { + "avg_mask_ratio": 0.4914003949146718, + "avg_response_length": 275.3, + "avg_student_mask_ratio": 0.4914003949146718, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0512, + "grad_norm": 0.08544921875, + "kd_loss": 0.22274305121804333, + "learning_rate": 3e-06, + "loss": 0.1988, + "masked_tokens": 143.075, + "mean_t": 0.5198000721400604, + "step": 960, + "student_masked_tokens": 143.075 + }, + { + "avg_mask_ratio": 0.4381961932755075, + "avg_response_length": 236.2375, + "avg_student_mask_ratio": 0.4381961932755075, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0725333333333333, + "grad_norm": 0.318359375, + "kd_loss": 0.17543826163571338, + "learning_rate": 3e-06, + "loss": 0.1612, + "masked_tokens": 106.5125, + "mean_t": 0.4703940597362816, + "step": 970, + "student_masked_tokens": 106.5125 + }, + { + "avg_mask_ratio": 0.42702240714570505, + "avg_response_length": 230.8625, + "avg_student_mask_ratio": 0.42702240714570505, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.0938666666666665, + "grad_norm": 0.13671875, + "kd_loss": 0.13624779113131352, + "learning_rate": 3e-06, + "loss": 0.1282, + "masked_tokens": 98.525, + "mean_t": 0.4511947895749472, + "step": 980, + "student_masked_tokens": 98.525 + }, + { + "avg_mask_ratio": 0.4583221158827655, + "avg_response_length": 262.0375, + "avg_student_mask_ratio": 0.4583221158827655, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1152, + "grad_norm": 6.8125, + "kd_loss": 0.14998470883065806, + "learning_rate": 3e-06, + "loss": 0.142, + "masked_tokens": 121.2875, + "mean_t": 0.4923786667350214, + "step": 990, + "student_masked_tokens": 121.2875 + }, + { + "avg_mask_ratio": 0.45086776099633424, + "avg_response_length": 214.925, + "avg_student_mask_ratio": 0.45086776099633424, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1365333333333334, + "grad_norm": 0.205078125, + "kd_loss": 0.1474926151762702, + "learning_rate": 3e-06, + "loss": 0.1624, + "masked_tokens": 100.4875, + "mean_t": 0.4773523230338469, + "step": 1000, + "student_masked_tokens": 100.4875 + }, + { + "avg_mask_ratio": 0.4363243154773954, + "avg_response_length": 224.3, + "avg_student_mask_ratio": 0.4363243154773954, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1578666666666666, + "grad_norm": 0.055908203125, + "kd_loss": 0.14260265023719612, + "learning_rate": 3e-06, + "loss": 0.1441, + "masked_tokens": 89.5, + "mean_t": 0.4648138735938119, + "step": 1010, + "student_masked_tokens": 89.5 + }, + { + "avg_mask_ratio": 0.5063220548443497, + "avg_response_length": 206.9125, + "avg_student_mask_ratio": 0.5063220548443497, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.1792, + "grad_norm": 0.07421875, + "kd_loss": 0.174221977752552, + "learning_rate": 3e-06, + "loss": 0.1927, + "masked_tokens": 110.25, + "mean_t": 0.5327763411332853, + "step": 1020, + "student_masked_tokens": 110.25 + }, + { + "avg_mask_ratio": 0.46985941788880153, + "avg_response_length": 220.05, + "avg_student_mask_ratio": 0.46985941788880153, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2005333333333335, + "grad_norm": 0.1884765625, + "kd_loss": 0.1922343357020509, + "learning_rate": 3e-06, + "loss": 0.2123, + "masked_tokens": 104.9, + "mean_t": 0.5033508580760099, + "step": 1030, + "student_masked_tokens": 104.9 + }, + { + "avg_mask_ratio": 0.49566771630197765, + "avg_response_length": 213.7, + "avg_student_mask_ratio": 0.49566771630197765, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2218666666666667, + "grad_norm": 0.2158203125, + "kd_loss": 0.18418902513512875, + "learning_rate": 3e-06, + "loss": 0.2026, + "masked_tokens": 100.35, + "mean_t": 0.5349024560535327, + "step": 1040, + "student_masked_tokens": 100.35 + }, + { + "avg_mask_ratio": 0.5123685836791992, + "avg_response_length": 238.8, + "avg_student_mask_ratio": 0.5123685836791992, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2432, + "grad_norm": 0.154296875, + "kd_loss": 0.20520874955382168, + "learning_rate": 3e-06, + "loss": 0.1839, + "masked_tokens": 122.875, + "mean_t": 0.5457118917722255, + "step": 1050, + "student_masked_tokens": 122.875 + }, + { + "avg_mask_ratio": 0.46218636581033934, + "avg_response_length": 273.7875, + "avg_student_mask_ratio": 0.46218636581033934, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2645333333333335, + "grad_norm": 0.10986328125, + "kd_loss": 0.14630552722162093, + "learning_rate": 3e-06, + "loss": 0.137, + "masked_tokens": 122.125, + "mean_t": 0.48194136443780733, + "step": 1060, + "student_masked_tokens": 122.125 + }, + { + "avg_mask_ratio": 0.485661978519056, + "avg_response_length": 275.075, + "avg_student_mask_ratio": 0.485661978519056, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.2858666666666667, + "grad_norm": 0.09228515625, + "kd_loss": 0.19156048394619346, + "learning_rate": 3e-06, + "loss": 0.1616, + "masked_tokens": 142.6375, + "mean_t": 0.5015889146190602, + "step": 1070, + "student_masked_tokens": 142.6375 + }, + { + "avg_mask_ratio": 0.4626998565625399, + "avg_response_length": 214.45, + "avg_student_mask_ratio": 0.4626998565625399, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.3072, + "grad_norm": 0.1689453125, + "kd_loss": 0.1588676300700172, + "learning_rate": 3e-06, + "loss": 0.1623, + "masked_tokens": 98.4, + "mean_t": 0.4983203248586506, + "step": 1080, + "student_masked_tokens": 98.4 + }, + { + "avg_mask_ratio": 0.44248262273031286, + "avg_response_length": 213.55, + "avg_student_mask_ratio": 0.44248262273031286, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.3285333333333336, + "grad_norm": 0.038818359375, + "kd_loss": 0.12623338384320135, + "learning_rate": 3e-06, + "loss": 0.1553, + "masked_tokens": 91.9125, + "mean_t": 0.47094749807147307, + "step": 1090, + "student_masked_tokens": 91.9125 + }, + { + "avg_mask_ratio": 0.5204601250356063, + "avg_response_length": 246.1125, + "avg_student_mask_ratio": 0.5204601250356063, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.3498666666666668, + "grad_norm": 0.25390625, + "kd_loss": 0.2188640532628142, + "learning_rate": 3e-06, + "loss": 0.2183, + "masked_tokens": 133.1875, + "mean_t": 0.5531192034482956, + "step": 1100, + "student_masked_tokens": 133.1875 + }, + { + "avg_mask_ratio": 0.447697223268915, + "avg_response_length": 226.6375, + "avg_student_mask_ratio": 0.447697223268915, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.3712, + "grad_norm": 0.1767578125, + "kd_loss": 0.13036263480095728, + "learning_rate": 3e-06, + "loss": 0.1385, + "masked_tokens": 103.475, + "mean_t": 0.4757364276825683, + "step": 1110, + "student_masked_tokens": 103.475 + }, + { + "avg_mask_ratio": 0.4671802403172478, + "avg_response_length": 250.8, + "avg_student_mask_ratio": 0.4671802403172478, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.392533333333333, + "grad_norm": 0.25390625, + "kd_loss": 0.13103478249716005, + "learning_rate": 3e-06, + "loss": 0.1497, + "masked_tokens": 117.775, + "mean_t": 0.5013068238971755, + "step": 1120, + "student_masked_tokens": 117.775 + }, + { + "avg_mask_ratio": 0.5009213570854627, + "avg_response_length": 266.3125, + "avg_student_mask_ratio": 0.5009213570854627, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.413866666666667, + "grad_norm": 0.279296875, + "kd_loss": 0.18911116276836992, + "learning_rate": 3e-06, + "loss": 0.1878, + "masked_tokens": 138.875, + "mean_t": 0.5303254407714121, + "step": 1130, + "student_masked_tokens": 138.875 + }, + { + "avg_mask_ratio": 0.44728723394218833, + "avg_response_length": 234.075, + "avg_student_mask_ratio": 0.44728723394218833, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.4352, + "grad_norm": 0.11669921875, + "kd_loss": 0.12535731884454435, + "learning_rate": 3e-06, + "loss": 0.1279, + "masked_tokens": 107.9125, + "mean_t": 0.4845335395424627, + "step": 1140, + "student_masked_tokens": 107.9125 + }, + { + "avg_mask_ratio": 0.5291026248247362, + "avg_response_length": 212.5875, + "avg_student_mask_ratio": 0.5291026248247362, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.4565333333333332, + "grad_norm": 0.2314453125, + "kd_loss": 0.2229388612362186, + "learning_rate": 3e-06, + "loss": 0.2245, + "masked_tokens": 110.025, + "mean_t": 0.5690932425903157, + "step": 1150, + "student_masked_tokens": 110.025 + }, + { + "avg_mask_ratio": 0.46978622819297017, + "avg_response_length": 225.15, + "avg_student_mask_ratio": 0.46978622819297017, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.4778666666666664, + "grad_norm": 0.130859375, + "kd_loss": 0.17511527160776835, + "learning_rate": 3e-06, + "loss": 0.1584, + "masked_tokens": 101.7875, + "mean_t": 0.5040684466948733, + "step": 1160, + "student_masked_tokens": 101.7875 + }, + { + "avg_mask_ratio": 0.480710746452678, + "avg_response_length": 247.7625, + "avg_student_mask_ratio": 0.480710746452678, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.4992, + "grad_norm": 0.1083984375, + "kd_loss": 0.18170990336875548, + "learning_rate": 3e-06, + "loss": 0.1688, + "masked_tokens": 123.9125, + "mean_t": 0.5114516971167177, + "step": 1170, + "student_masked_tokens": 123.9125 + }, + { + "avg_mask_ratio": 0.41955015211715363, + "avg_response_length": 213.775, + "avg_student_mask_ratio": 0.41955015211715363, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.5205333333333333, + "grad_norm": 0.142578125, + "kd_loss": 0.11144716480921488, + "learning_rate": 3e-06, + "loss": 0.1307, + "masked_tokens": 85.15, + "mean_t": 0.4491677140351385, + "step": 1180, + "student_masked_tokens": 85.15 + }, + { + "avg_mask_ratio": 0.522994744987227, + "avg_response_length": 220.3375, + "avg_student_mask_ratio": 0.522994744987227, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.5418666666666665, + "grad_norm": 0.314453125, + "kd_loss": 0.26799880625857214, + "learning_rate": 3e-06, + "loss": 0.2196, + "masked_tokens": 124.2875, + "mean_t": 0.5590635397238657, + "step": 1190, + "student_masked_tokens": 124.2875 + }, + { + "avg_mask_ratio": 0.4730891800048994, + "avg_response_length": 215.675, + "avg_student_mask_ratio": 0.4730891800048994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 2.5632, + "grad_norm": 0.1416015625, + "kd_loss": 0.1576860080831466, + "learning_rate": 3e-06, + "loss": 0.1651, + "masked_tokens": 98.775, + "mean_t": 0.506370971655997, + "step": 1200, + "student_masked_tokens": 98.775 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/training_args.bin b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..909c70530eafaa4be935d43ab877dad53e48f376 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c7c0f34b9d70dea72cbe8ab046b4e4dbf9290e9a199291cca7df91b67e9e4a +size 8120 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/README.md b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/adapter_config.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6edd4f943cffd057b1c0513e71fc0baccaee758 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/adapter_model.safetensors b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6c8d818bfb490d1d832848b76d81dbe9a0cb0569 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:468aacf438490d7df188cf17332cb1399be5b1de1ebf892bfcb5efe2b42166b4 +size 2406624648 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/optimizer.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d78a463e2d22abd4b5669b06663eaaff3ba0e65f --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c8bbbb8c6ac0d23cf65c315b9023673e49e0054d4afd04d7c9dca1856f32bbe +size 671304442 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/rng_state_0.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5ae04bfb030a7282f85df94d756efcace611d518 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2edc319016917c5210e70d2475b8561063cedded73fefdb1b6ccfcb8a207880e +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/rng_state_1.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..14a0a9f350721cb05144387ffeb3157287f34528 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e69b6548f92233a5d3cb22aa7c60d5d4d1f37d04ac969eb521f7a7c36271ae54 +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/scheduler.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..421781b8dda6971ad78c51f1dc130f1fff19ce51 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e3f0e18fd4ce38e61410a1f0e851c2762584e71a80ec7ce0bc5150325adcecc +size 1064 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/trainer_state.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..adad352d76db7bcc2d0cec5af113a7d1aa9f0db2 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/trainer_state.json @@ -0,0 +1,513 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4266666666666667, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4930951670394279, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4930951670394279, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1494140625, + "kd_loss": 0.25194341027386147, + "learning_rate": 3e-06, + "loss": 0.2396, + "masked_tokens": 110.925, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 110.925 + }, + { + "avg_mask_ratio": 0.4127206720062532, + "avg_response_length": 277.15, + "avg_student_mask_ratio": 0.4127206720062532, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.2138671875, + "kd_loss": 0.14083908485238297, + "learning_rate": 3e-06, + "loss": 0.1768, + "masked_tokens": 108.8625, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 108.8625 + }, + { + "avg_mask_ratio": 0.4616696212324314, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4616696212324314, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.177734375, + "kd_loss": 0.19280819811582842, + "learning_rate": 3e-06, + "loss": 0.1837, + "masked_tokens": 111.375, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 111.375 + }, + { + "avg_mask_ratio": 0.42360913623997476, + "avg_response_length": 224.6125, + "avg_student_mask_ratio": 0.42360913623997476, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.205078125, + "kd_loss": 0.15898024347496859, + "learning_rate": 3e-06, + "loss": 0.1597, + "masked_tokens": 98.3, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.3 + }, + { + "avg_mask_ratio": 0.4330951495358022, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.4330951495358022, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.10693359375, + "kd_loss": 0.15454275260567557, + "learning_rate": 3e-06, + "loss": 0.1595, + "masked_tokens": 85.075, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.075 + }, + { + "avg_mask_ratio": 0.4555395155097358, + "avg_response_length": 254.2125, + "avg_student_mask_ratio": 0.4555395155097358, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.09375, + "kd_loss": 0.19431558840633442, + "learning_rate": 3e-06, + "loss": 0.1967, + "masked_tokens": 119.1125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.1125 + }, + { + "avg_mask_ratio": 0.5148372989846394, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.5148372989846394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.2421875, + "kd_loss": 0.17586028346822785, + "learning_rate": 3e-06, + "loss": 0.2039, + "masked_tokens": 105.45, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 105.45 + }, + { + "avg_mask_ratio": 0.3827478863298893, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.3827478863298893, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.13183402672058264, + "learning_rate": 3e-06, + "loss": 0.1337, + "masked_tokens": 86.675, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 86.675 + }, + { + "avg_mask_ratio": 0.5017695252550766, + "avg_response_length": 234.25, + "avg_student_mask_ratio": 0.5017695252550766, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.287109375, + "kd_loss": 0.23756451243028592, + "learning_rate": 3e-06, + "loss": 0.2228, + "masked_tokens": 108.4125, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 108.4125 + }, + { + "avg_mask_ratio": 0.4637213449750561, + "avg_response_length": 210.175, + "avg_student_mask_ratio": 0.4637213449750561, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.2236328125, + "kd_loss": 0.17453956390508713, + "learning_rate": 3e-06, + "loss": 0.1847, + "masked_tokens": 107.375, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.48738867897773164, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.48738867897773164, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.259765625, + "kd_loss": 0.21283352600622152, + "learning_rate": 3e-06, + "loss": 0.1975, + "masked_tokens": 101.7875, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 101.7875 + }, + { + "avg_mask_ratio": 0.4451883323024958, + "avg_response_length": 232.3, + "avg_student_mask_ratio": 0.4451883323024958, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.1328125, + "kd_loss": 0.23480740101426817, + "learning_rate": 3e-06, + "loss": 0.2005, + "masked_tokens": 107.7, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.43939279407495635, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.43939279407495635, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.1453842066344862, + "learning_rate": 3e-06, + "loss": 0.1366, + "masked_tokens": 89.95, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.95 + }, + { + "avg_mask_ratio": 0.4922366282902658, + "avg_response_length": 264.5375, + "avg_student_mask_ratio": 0.4922366282902658, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.294921875, + "kd_loss": 0.1732477028232097, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 144.9, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 144.9 + }, + { + "avg_mask_ratio": 0.4724786171223968, + "avg_response_length": 258.1125, + "avg_student_mask_ratio": 0.4724786171223968, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.04443359375, + "kd_loss": 0.2384887565949157, + "learning_rate": 3e-06, + "loss": 0.2151, + "masked_tokens": 127.4125, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 127.4125 + }, + { + "avg_mask_ratio": 0.49717973986989816, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.49717973986989816, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2255859375, + "kd_loss": 0.2190230320150704, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 98.4875, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 98.4875 + }, + { + "avg_mask_ratio": 0.48284467663615943, + "avg_response_length": 188.65, + "avg_student_mask_ratio": 0.48284467663615943, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.181640625, + "kd_loss": 0.198812551523406, + "learning_rate": 3e-06, + "loss": 0.1911, + "masked_tokens": 89.3125, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.3125 + }, + { + "avg_mask_ratio": 0.44864035704231353, + "avg_response_length": 246.6875, + "avg_student_mask_ratio": 0.44864035704231353, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.09716796875, + "kd_loss": 0.17860529323728117, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 110.0125, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.0125 + }, + { + "avg_mask_ratio": 0.47850618849042803, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.47850618849042803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.337890625, + "kd_loss": 0.19919134742667666, + "learning_rate": 3e-06, + "loss": 0.1932, + "masked_tokens": 109.575, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.575 + }, + { + "avg_mask_ratio": 0.4662990250624716, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4662990250624716, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.1259765625, + "kd_loss": 0.11774875816399799, + "learning_rate": 3e-06, + "loss": 0.1286, + "masked_tokens": 97.5, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.5 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/training_args.bin b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..909c70530eafaa4be935d43ab877dad53e48f376 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c7c0f34b9d70dea72cbe8ab046b4e4dbf9290e9a199291cca7df91b67e9e4a +size 8120 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/README.md b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/adapter_config.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6edd4f943cffd057b1c0513e71fc0baccaee758 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/adapter_model.safetensors b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b5820eebcd5325d81c24f92df859849283f2b88f --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f7418e772f6df19ff17de05e4ce1a4470a9b857d97b3b9351c1aeb54352b0b2 +size 2406624648 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/optimizer.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..68a757f9ebe24b9b8a6a9f7a6e3040a1a0fd09ee --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:043a85bee04c3d34e775bffc82808fe478ff9b04b57c2c8dd0e586b7c68509fe +size 671304442 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/rng_state_0.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d930b968ae48890cda7585a9ee1dc5039d2b9467 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c70e04001cecc5fb2d48590bb05a319b932ec7b583d8beceb837742fcf7bf053 +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/rng_state_1.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a2632527e71b34b1dafcae3f765d24c7e33136eb --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:322352dcc9ffdd4f18feb68828a704abc847114b5146dc92787636382437797c +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/scheduler.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b63622572f2f8fd0f5991a8ee55768496dcd77b8 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc305f845008b8f20405e65b1f962cf273957c5abdd0858e9cccb461f9b6d925 +size 1064 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/trainer_state.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3f08e687517f7242f678db5022583f9bfefef601 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/trainer_state.json @@ -0,0 +1,753 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.64, + "eval_steps": 500, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4930951670394279, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4930951670394279, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1494140625, + "kd_loss": 0.25194341027386147, + "learning_rate": 3e-06, + "loss": 0.2396, + "masked_tokens": 110.925, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 110.925 + }, + { + "avg_mask_ratio": 0.4127206720062532, + "avg_response_length": 277.15, + "avg_student_mask_ratio": 0.4127206720062532, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.2138671875, + "kd_loss": 0.14083908485238297, + "learning_rate": 3e-06, + "loss": 0.1768, + "masked_tokens": 108.8625, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 108.8625 + }, + { + "avg_mask_ratio": 0.4616696212324314, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4616696212324314, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.177734375, + "kd_loss": 0.19280819811582842, + "learning_rate": 3e-06, + "loss": 0.1837, + "masked_tokens": 111.375, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 111.375 + }, + { + "avg_mask_ratio": 0.42360913623997476, + "avg_response_length": 224.6125, + "avg_student_mask_ratio": 0.42360913623997476, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.205078125, + "kd_loss": 0.15898024347496859, + "learning_rate": 3e-06, + "loss": 0.1597, + "masked_tokens": 98.3, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.3 + }, + { + "avg_mask_ratio": 0.4330951495358022, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.4330951495358022, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.10693359375, + "kd_loss": 0.15454275260567557, + "learning_rate": 3e-06, + "loss": 0.1595, + "masked_tokens": 85.075, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.075 + }, + { + "avg_mask_ratio": 0.4555395155097358, + "avg_response_length": 254.2125, + "avg_student_mask_ratio": 0.4555395155097358, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.09375, + "kd_loss": 0.19431558840633442, + "learning_rate": 3e-06, + "loss": 0.1967, + "masked_tokens": 119.1125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.1125 + }, + { + "avg_mask_ratio": 0.5148372989846394, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.5148372989846394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.2421875, + "kd_loss": 0.17586028346822785, + "learning_rate": 3e-06, + "loss": 0.2039, + "masked_tokens": 105.45, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 105.45 + }, + { + "avg_mask_ratio": 0.3827478863298893, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.3827478863298893, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.13183402672058264, + "learning_rate": 3e-06, + "loss": 0.1337, + "masked_tokens": 86.675, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 86.675 + }, + { + "avg_mask_ratio": 0.5017695252550766, + "avg_response_length": 234.25, + "avg_student_mask_ratio": 0.5017695252550766, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.287109375, + "kd_loss": 0.23756451243028592, + "learning_rate": 3e-06, + "loss": 0.2228, + "masked_tokens": 108.4125, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 108.4125 + }, + { + "avg_mask_ratio": 0.4637213449750561, + "avg_response_length": 210.175, + "avg_student_mask_ratio": 0.4637213449750561, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.2236328125, + "kd_loss": 0.17453956390508713, + "learning_rate": 3e-06, + "loss": 0.1847, + "masked_tokens": 107.375, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.48738867897773164, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.48738867897773164, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.259765625, + "kd_loss": 0.21283352600622152, + "learning_rate": 3e-06, + "loss": 0.1975, + "masked_tokens": 101.7875, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 101.7875 + }, + { + "avg_mask_ratio": 0.4451883323024958, + "avg_response_length": 232.3, + "avg_student_mask_ratio": 0.4451883323024958, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.1328125, + "kd_loss": 0.23480740101426817, + "learning_rate": 3e-06, + "loss": 0.2005, + "masked_tokens": 107.7, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.43939279407495635, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.43939279407495635, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.1453842066344862, + "learning_rate": 3e-06, + "loss": 0.1366, + "masked_tokens": 89.95, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.95 + }, + { + "avg_mask_ratio": 0.4922366282902658, + "avg_response_length": 264.5375, + "avg_student_mask_ratio": 0.4922366282902658, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.294921875, + "kd_loss": 0.1732477028232097, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 144.9, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 144.9 + }, + { + "avg_mask_ratio": 0.4724786171223968, + "avg_response_length": 258.1125, + "avg_student_mask_ratio": 0.4724786171223968, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.04443359375, + "kd_loss": 0.2384887565949157, + "learning_rate": 3e-06, + "loss": 0.2151, + "masked_tokens": 127.4125, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 127.4125 + }, + { + "avg_mask_ratio": 0.49717973986989816, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.49717973986989816, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2255859375, + "kd_loss": 0.2190230320150704, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 98.4875, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 98.4875 + }, + { + "avg_mask_ratio": 0.48284467663615943, + "avg_response_length": 188.65, + "avg_student_mask_ratio": 0.48284467663615943, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.181640625, + "kd_loss": 0.198812551523406, + "learning_rate": 3e-06, + "loss": 0.1911, + "masked_tokens": 89.3125, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.3125 + }, + { + "avg_mask_ratio": 0.44864035704231353, + "avg_response_length": 246.6875, + "avg_student_mask_ratio": 0.44864035704231353, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.09716796875, + "kd_loss": 0.17860529323728117, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 110.0125, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.0125 + }, + { + "avg_mask_ratio": 0.47850618849042803, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.47850618849042803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.337890625, + "kd_loss": 0.19919134742667666, + "learning_rate": 3e-06, + "loss": 0.1932, + "masked_tokens": 109.575, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.575 + }, + { + "avg_mask_ratio": 0.4662990250624716, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4662990250624716, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.1259765625, + "kd_loss": 0.11774875816399799, + "learning_rate": 3e-06, + "loss": 0.1286, + "masked_tokens": 97.5, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.5 + }, + { + "avg_mask_ratio": 0.451080821454525, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.451080821454525, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.134765625, + "kd_loss": 0.15073641720641717, + "learning_rate": 3e-06, + "loss": 0.1577, + "masked_tokens": 96.6375, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.6375 + }, + { + "avg_mask_ratio": 0.5438536155037582, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5438536155037582, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.298828125, + "kd_loss": 0.24193658930453238, + "learning_rate": 3e-06, + "loss": 0.248, + "masked_tokens": 126.4375, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 126.4375 + }, + { + "avg_mask_ratio": 0.43992503273766487, + "avg_response_length": 255.875, + "avg_student_mask_ratio": 0.43992503273766487, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.142578125, + "kd_loss": 0.14758750247131047, + "learning_rate": 3e-06, + "loss": 0.1703, + "masked_tokens": 107.3875, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.3875 + }, + { + "avg_mask_ratio": 0.46683448635449165, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.46683448635449165, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.35546875, + "kd_loss": 0.2131086487675077, + "learning_rate": 3e-06, + "loss": 0.196, + "masked_tokens": 110.2875, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 110.2875 + }, + { + "avg_mask_ratio": 0.4476269483449869, + "avg_response_length": 243.2375, + "avg_student_mask_ratio": 0.4476269483449869, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.17319737961729237, + "learning_rate": 3e-06, + "loss": 0.1469, + "masked_tokens": 112.6375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.6375 + }, + { + "avg_mask_ratio": 0.45657019784557634, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.45657019784557634, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.1728515625, + "kd_loss": 0.15818778217344515, + "learning_rate": 3e-06, + "loss": 0.1487, + "masked_tokens": 110.0375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.0375 + }, + { + "avg_mask_ratio": 0.5293830037582665, + "avg_response_length": 223.975, + "avg_student_mask_ratio": 0.5293830037582665, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.5, + "kd_loss": 0.24745769179717172, + "learning_rate": 3e-06, + "loss": 0.2709, + "masked_tokens": 119.6, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.4577330934116617, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.4577330934116617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.2216796875, + "kd_loss": 0.18448232172211476, + "learning_rate": 3e-06, + "loss": 0.1662, + "masked_tokens": 130.475, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 130.475 + }, + { + "avg_mask_ratio": 0.39295024327002465, + "avg_response_length": 246.6375, + "avg_student_mask_ratio": 0.39295024327002465, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.047119140625, + "kd_loss": 0.1050827642444176, + "learning_rate": 3e-06, + "loss": 0.1353, + "masked_tokens": 100.9, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 100.9 + }, + { + "avg_mask_ratio": 0.4409991275751963, + "avg_response_length": 217.9125, + "avg_student_mask_ratio": 0.4409991275751963, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.1513671875, + "kd_loss": 0.13134403475523868, + "learning_rate": 3e-06, + "loss": 0.1629, + "masked_tokens": 106.925, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 106.925 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/training_args.bin b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..909c70530eafaa4be935d43ab877dad53e48f376 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c7c0f34b9d70dea72cbe8ab046b4e4dbf9290e9a199291cca7df91b67e9e4a +size 8120 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/README.md b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/adapter_config.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6edd4f943cffd057b1c0513e71fc0baccaee758 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/adapter_model.safetensors b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f637dd8c1c3fffa2254b5ddce660227da2aa23f1 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2a1a31b72f794bbe77387d80bd17cb764ae0444e773bed4070c8313d52413c7 +size 2406624648 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/optimizer.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0737e71cae9d323f9d4b16e2a742ce20bd1ed28 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c5276bbd3f8f50421b8d0ce6da0d68d0f561d15f894edf287c5de173fc1f121 +size 671304442 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/rng_state_0.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c548c20836f8a33535de3d41d4d3652197dbc653 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52434163fd05e4e2013a934eb59dd2abc0d83b170663289db2ed3eaa4aaacc56 +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/rng_state_1.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5e4b820b177e63c0715bb5ee5cd1c7c518c08a98 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8448f21bbdc1eaf87ce64616ae89476260da88ec36334556bcafe73d8fbe837b +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/scheduler.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c306ef4bb7e6a16b41c020edcc557eaff2f11b3f --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9830a27ccf317f0eed7689e600baef1933674a8f45133ca57c902cf16747aad2 +size 1064 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/trainer_state.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..772c5dcfc3a98938f6838ae236a01078aef25e51 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/trainer_state.json @@ -0,0 +1,993 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8533333333333334, + "eval_steps": 500, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4930951670394279, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4930951670394279, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1494140625, + "kd_loss": 0.25194341027386147, + "learning_rate": 3e-06, + "loss": 0.2396, + "masked_tokens": 110.925, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 110.925 + }, + { + "avg_mask_ratio": 0.4127206720062532, + "avg_response_length": 277.15, + "avg_student_mask_ratio": 0.4127206720062532, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.2138671875, + "kd_loss": 0.14083908485238297, + "learning_rate": 3e-06, + "loss": 0.1768, + "masked_tokens": 108.8625, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 108.8625 + }, + { + "avg_mask_ratio": 0.4616696212324314, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4616696212324314, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.177734375, + "kd_loss": 0.19280819811582842, + "learning_rate": 3e-06, + "loss": 0.1837, + "masked_tokens": 111.375, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 111.375 + }, + { + "avg_mask_ratio": 0.42360913623997476, + "avg_response_length": 224.6125, + "avg_student_mask_ratio": 0.42360913623997476, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.205078125, + "kd_loss": 0.15898024347496859, + "learning_rate": 3e-06, + "loss": 0.1597, + "masked_tokens": 98.3, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.3 + }, + { + "avg_mask_ratio": 0.4330951495358022, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.4330951495358022, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.10693359375, + "kd_loss": 0.15454275260567557, + "learning_rate": 3e-06, + "loss": 0.1595, + "masked_tokens": 85.075, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.075 + }, + { + "avg_mask_ratio": 0.4555395155097358, + "avg_response_length": 254.2125, + "avg_student_mask_ratio": 0.4555395155097358, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.09375, + "kd_loss": 0.19431558840633442, + "learning_rate": 3e-06, + "loss": 0.1967, + "masked_tokens": 119.1125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.1125 + }, + { + "avg_mask_ratio": 0.5148372989846394, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.5148372989846394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.2421875, + "kd_loss": 0.17586028346822785, + "learning_rate": 3e-06, + "loss": 0.2039, + "masked_tokens": 105.45, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 105.45 + }, + { + "avg_mask_ratio": 0.3827478863298893, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.3827478863298893, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.13183402672058264, + "learning_rate": 3e-06, + "loss": 0.1337, + "masked_tokens": 86.675, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 86.675 + }, + { + "avg_mask_ratio": 0.5017695252550766, + "avg_response_length": 234.25, + "avg_student_mask_ratio": 0.5017695252550766, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.287109375, + "kd_loss": 0.23756451243028592, + "learning_rate": 3e-06, + "loss": 0.2228, + "masked_tokens": 108.4125, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 108.4125 + }, + { + "avg_mask_ratio": 0.4637213449750561, + "avg_response_length": 210.175, + "avg_student_mask_ratio": 0.4637213449750561, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.2236328125, + "kd_loss": 0.17453956390508713, + "learning_rate": 3e-06, + "loss": 0.1847, + "masked_tokens": 107.375, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.48738867897773164, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.48738867897773164, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.259765625, + "kd_loss": 0.21283352600622152, + "learning_rate": 3e-06, + "loss": 0.1975, + "masked_tokens": 101.7875, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 101.7875 + }, + { + "avg_mask_ratio": 0.4451883323024958, + "avg_response_length": 232.3, + "avg_student_mask_ratio": 0.4451883323024958, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.1328125, + "kd_loss": 0.23480740101426817, + "learning_rate": 3e-06, + "loss": 0.2005, + "masked_tokens": 107.7, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.43939279407495635, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.43939279407495635, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.1453842066344862, + "learning_rate": 3e-06, + "loss": 0.1366, + "masked_tokens": 89.95, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.95 + }, + { + "avg_mask_ratio": 0.4922366282902658, + "avg_response_length": 264.5375, + "avg_student_mask_ratio": 0.4922366282902658, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.294921875, + "kd_loss": 0.1732477028232097, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 144.9, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 144.9 + }, + { + "avg_mask_ratio": 0.4724786171223968, + "avg_response_length": 258.1125, + "avg_student_mask_ratio": 0.4724786171223968, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.04443359375, + "kd_loss": 0.2384887565949157, + "learning_rate": 3e-06, + "loss": 0.2151, + "masked_tokens": 127.4125, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 127.4125 + }, + { + "avg_mask_ratio": 0.49717973986989816, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.49717973986989816, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2255859375, + "kd_loss": 0.2190230320150704, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 98.4875, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 98.4875 + }, + { + "avg_mask_ratio": 0.48284467663615943, + "avg_response_length": 188.65, + "avg_student_mask_ratio": 0.48284467663615943, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.181640625, + "kd_loss": 0.198812551523406, + "learning_rate": 3e-06, + "loss": 0.1911, + "masked_tokens": 89.3125, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.3125 + }, + { + "avg_mask_ratio": 0.44864035704231353, + "avg_response_length": 246.6875, + "avg_student_mask_ratio": 0.44864035704231353, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.09716796875, + "kd_loss": 0.17860529323728117, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 110.0125, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.0125 + }, + { + "avg_mask_ratio": 0.47850618849042803, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.47850618849042803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.337890625, + "kd_loss": 0.19919134742667666, + "learning_rate": 3e-06, + "loss": 0.1932, + "masked_tokens": 109.575, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.575 + }, + { + "avg_mask_ratio": 0.4662990250624716, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4662990250624716, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.1259765625, + "kd_loss": 0.11774875816399799, + "learning_rate": 3e-06, + "loss": 0.1286, + "masked_tokens": 97.5, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.5 + }, + { + "avg_mask_ratio": 0.451080821454525, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.451080821454525, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.134765625, + "kd_loss": 0.15073641720641717, + "learning_rate": 3e-06, + "loss": 0.1577, + "masked_tokens": 96.6375, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.6375 + }, + { + "avg_mask_ratio": 0.5438536155037582, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5438536155037582, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.298828125, + "kd_loss": 0.24193658930453238, + "learning_rate": 3e-06, + "loss": 0.248, + "masked_tokens": 126.4375, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 126.4375 + }, + { + "avg_mask_ratio": 0.43992503273766487, + "avg_response_length": 255.875, + "avg_student_mask_ratio": 0.43992503273766487, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.142578125, + "kd_loss": 0.14758750247131047, + "learning_rate": 3e-06, + "loss": 0.1703, + "masked_tokens": 107.3875, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.3875 + }, + { + "avg_mask_ratio": 0.46683448635449165, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.46683448635449165, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.35546875, + "kd_loss": 0.2131086487675077, + "learning_rate": 3e-06, + "loss": 0.196, + "masked_tokens": 110.2875, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 110.2875 + }, + { + "avg_mask_ratio": 0.4476269483449869, + "avg_response_length": 243.2375, + "avg_student_mask_ratio": 0.4476269483449869, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.17319737961729237, + "learning_rate": 3e-06, + "loss": 0.1469, + "masked_tokens": 112.6375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.6375 + }, + { + "avg_mask_ratio": 0.45657019784557634, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.45657019784557634, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.1728515625, + "kd_loss": 0.15818778217344515, + "learning_rate": 3e-06, + "loss": 0.1487, + "masked_tokens": 110.0375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.0375 + }, + { + "avg_mask_ratio": 0.5293830037582665, + "avg_response_length": 223.975, + "avg_student_mask_ratio": 0.5293830037582665, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.5, + "kd_loss": 0.24745769179717172, + "learning_rate": 3e-06, + "loss": 0.2709, + "masked_tokens": 119.6, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.4577330934116617, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.4577330934116617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.2216796875, + "kd_loss": 0.18448232172211476, + "learning_rate": 3e-06, + "loss": 0.1662, + "masked_tokens": 130.475, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 130.475 + }, + { + "avg_mask_ratio": 0.39295024327002465, + "avg_response_length": 246.6375, + "avg_student_mask_ratio": 0.39295024327002465, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.047119140625, + "kd_loss": 0.1050827642444176, + "learning_rate": 3e-06, + "loss": 0.1353, + "masked_tokens": 100.9, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 100.9 + }, + { + "avg_mask_ratio": 0.4409991275751963, + "avg_response_length": 217.9125, + "avg_student_mask_ratio": 0.4409991275751963, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.1513671875, + "kd_loss": 0.13134403475523868, + "learning_rate": 3e-06, + "loss": 0.1629, + "masked_tokens": 106.925, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 106.925 + }, + { + "avg_mask_ratio": 0.47207197032403203, + "avg_response_length": 188.9125, + "avg_student_mask_ratio": 0.47207197032403203, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.2314453125, + "kd_loss": 0.19167470987705998, + "learning_rate": 3e-06, + "loss": 0.2063, + "masked_tokens": 85.125, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 85.125 + }, + { + "avg_mask_ratio": 0.4926959708333015, + "avg_response_length": 248.4, + "avg_student_mask_ratio": 0.4926959708333015, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2633828842135699, + "learning_rate": 3e-06, + "loss": 0.2589, + "masked_tokens": 124.5625, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 124.5625 + }, + { + "avg_mask_ratio": 0.5075328870676458, + "avg_response_length": 235.075, + "avg_student_mask_ratio": 0.5075328870676458, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.2197265625, + "kd_loss": 0.21129831432894547, + "learning_rate": 3e-06, + "loss": 0.2103, + "masked_tokens": 127.9, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 127.9 + }, + { + "avg_mask_ratio": 0.44940012450679206, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44940012450679206, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.40625, + "kd_loss": 0.18290909784880824, + "learning_rate": 3e-06, + "loss": 0.1801, + "masked_tokens": 110.15, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 110.15 + }, + { + "avg_mask_ratio": 0.4945301389612723, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.4945301389612723, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.361328125, + "kd_loss": 0.2113740879778227, + "learning_rate": 3e-06, + "loss": 0.2186, + "masked_tokens": 125.175, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.175 + }, + { + "avg_mask_ratio": 0.4749954905593768, + "avg_response_length": 243.575, + "avg_student_mask_ratio": 0.4749954905593768, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.1298828125, + "kd_loss": 0.16429275130377619, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 115.3875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 115.3875 + }, + { + "avg_mask_ratio": 0.47621052770409733, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.47621052770409733, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.1083984375, + "kd_loss": 0.2089853325122931, + "learning_rate": 3e-06, + "loss": 0.192, + "masked_tokens": 126.85, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 126.85 + }, + { + "avg_mask_ratio": 0.4449806016869843, + "avg_response_length": 226.3625, + "avg_student_mask_ratio": 0.4449806016869843, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.09423828125, + "kd_loss": 0.13386085629390437, + "learning_rate": 3e-06, + "loss": 0.132, + "masked_tokens": 109.35, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 109.35 + }, + { + "avg_mask_ratio": 0.47845896739745514, + "avg_response_length": 218.1125, + "avg_student_mask_ratio": 0.47845896739745514, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.142578125, + "kd_loss": 0.1445786759162107, + "learning_rate": 3e-06, + "loss": 0.1766, + "masked_tokens": 111.85, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 111.85 + }, + { + "avg_mask_ratio": 0.4727763219270855, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.4727763219270855, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.380859375, + "kd_loss": 0.20748561576523344, + "learning_rate": 3e-06, + "loss": 0.1934, + "masked_tokens": 119.775, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 119.775 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/training_args.bin b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..909c70530eafaa4be935d43ab877dad53e48f376 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c7c0f34b9d70dea72cbe8ab046b4e4dbf9290e9a199291cca7df91b67e9e4a +size 8120 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/README.md b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/adapter_config.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6edd4f943cffd057b1c0513e71fc0baccaee758 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/adapter_model.safetensors b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..300151db33a7a1bb613cb7e913100025df944e0a --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08f8d00c05388934cf943961c3cbb5031a645b066eeb5e48c2e5350c2479bbe9 +size 2406624648 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/optimizer.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d700aa1f48b3c6a6558a04d1da7ef01b0f9c5e3 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d3452817e8491564e9958b4acc9798c3cb767e48c1e759455b690b2b5b75761 +size 671304442 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/rng_state_0.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d7b56eccd88f97014ca3290f283d7efe46bc37c4 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94a3fc5a559cfc81659255377013c375340f106f5bdbffdd6199bdd5c4328bb2 +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/rng_state_1.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..97f2ac2e82ead44d1d6cd0908d025d65186be5ab --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c2d655c6c83c46f64e878fe7d2966f6d5aa2aa796f3997aed0076c9825f2f72 +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/scheduler.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..931e4be8f6a79592ab2ba42943c7a73e26c7bc07 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a517e97adb4dde873654de5d66064258ac3222271d3ace011285ec503f6a5b2 +size 1064 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/trainer_state.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..76db4dcb593699b0064475e8fbe4835af98afc3e --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/trainer_state.json @@ -0,0 +1,1233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0682666666666667, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4930951670394279, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4930951670394279, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1494140625, + "kd_loss": 0.25194341027386147, + "learning_rate": 3e-06, + "loss": 0.2396, + "masked_tokens": 110.925, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 110.925 + }, + { + "avg_mask_ratio": 0.4127206720062532, + "avg_response_length": 277.15, + "avg_student_mask_ratio": 0.4127206720062532, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.2138671875, + "kd_loss": 0.14083908485238297, + "learning_rate": 3e-06, + "loss": 0.1768, + "masked_tokens": 108.8625, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 108.8625 + }, + { + "avg_mask_ratio": 0.4616696212324314, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4616696212324314, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.177734375, + "kd_loss": 0.19280819811582842, + "learning_rate": 3e-06, + "loss": 0.1837, + "masked_tokens": 111.375, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 111.375 + }, + { + "avg_mask_ratio": 0.42360913623997476, + "avg_response_length": 224.6125, + "avg_student_mask_ratio": 0.42360913623997476, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.205078125, + "kd_loss": 0.15898024347496859, + "learning_rate": 3e-06, + "loss": 0.1597, + "masked_tokens": 98.3, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.3 + }, + { + "avg_mask_ratio": 0.4330951495358022, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.4330951495358022, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.10693359375, + "kd_loss": 0.15454275260567557, + "learning_rate": 3e-06, + "loss": 0.1595, + "masked_tokens": 85.075, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.075 + }, + { + "avg_mask_ratio": 0.4555395155097358, + "avg_response_length": 254.2125, + "avg_student_mask_ratio": 0.4555395155097358, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.09375, + "kd_loss": 0.19431558840633442, + "learning_rate": 3e-06, + "loss": 0.1967, + "masked_tokens": 119.1125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.1125 + }, + { + "avg_mask_ratio": 0.5148372989846394, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.5148372989846394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.2421875, + "kd_loss": 0.17586028346822785, + "learning_rate": 3e-06, + "loss": 0.2039, + "masked_tokens": 105.45, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 105.45 + }, + { + "avg_mask_ratio": 0.3827478863298893, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.3827478863298893, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.13183402672058264, + "learning_rate": 3e-06, + "loss": 0.1337, + "masked_tokens": 86.675, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 86.675 + }, + { + "avg_mask_ratio": 0.5017695252550766, + "avg_response_length": 234.25, + "avg_student_mask_ratio": 0.5017695252550766, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.287109375, + "kd_loss": 0.23756451243028592, + "learning_rate": 3e-06, + "loss": 0.2228, + "masked_tokens": 108.4125, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 108.4125 + }, + { + "avg_mask_ratio": 0.4637213449750561, + "avg_response_length": 210.175, + "avg_student_mask_ratio": 0.4637213449750561, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.2236328125, + "kd_loss": 0.17453956390508713, + "learning_rate": 3e-06, + "loss": 0.1847, + "masked_tokens": 107.375, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.48738867897773164, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.48738867897773164, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.259765625, + "kd_loss": 0.21283352600622152, + "learning_rate": 3e-06, + "loss": 0.1975, + "masked_tokens": 101.7875, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 101.7875 + }, + { + "avg_mask_ratio": 0.4451883323024958, + "avg_response_length": 232.3, + "avg_student_mask_ratio": 0.4451883323024958, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.1328125, + "kd_loss": 0.23480740101426817, + "learning_rate": 3e-06, + "loss": 0.2005, + "masked_tokens": 107.7, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.43939279407495635, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.43939279407495635, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.1453842066344862, + "learning_rate": 3e-06, + "loss": 0.1366, + "masked_tokens": 89.95, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.95 + }, + { + "avg_mask_ratio": 0.4922366282902658, + "avg_response_length": 264.5375, + "avg_student_mask_ratio": 0.4922366282902658, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.294921875, + "kd_loss": 0.1732477028232097, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 144.9, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 144.9 + }, + { + "avg_mask_ratio": 0.4724786171223968, + "avg_response_length": 258.1125, + "avg_student_mask_ratio": 0.4724786171223968, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.04443359375, + "kd_loss": 0.2384887565949157, + "learning_rate": 3e-06, + "loss": 0.2151, + "masked_tokens": 127.4125, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 127.4125 + }, + { + "avg_mask_ratio": 0.49717973986989816, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.49717973986989816, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2255859375, + "kd_loss": 0.2190230320150704, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 98.4875, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 98.4875 + }, + { + "avg_mask_ratio": 0.48284467663615943, + "avg_response_length": 188.65, + "avg_student_mask_ratio": 0.48284467663615943, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.181640625, + "kd_loss": 0.198812551523406, + "learning_rate": 3e-06, + "loss": 0.1911, + "masked_tokens": 89.3125, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.3125 + }, + { + "avg_mask_ratio": 0.44864035704231353, + "avg_response_length": 246.6875, + "avg_student_mask_ratio": 0.44864035704231353, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.09716796875, + "kd_loss": 0.17860529323728117, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 110.0125, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.0125 + }, + { + "avg_mask_ratio": 0.47850618849042803, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.47850618849042803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.337890625, + "kd_loss": 0.19919134742667666, + "learning_rate": 3e-06, + "loss": 0.1932, + "masked_tokens": 109.575, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.575 + }, + { + "avg_mask_ratio": 0.4662990250624716, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4662990250624716, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.1259765625, + "kd_loss": 0.11774875816399799, + "learning_rate": 3e-06, + "loss": 0.1286, + "masked_tokens": 97.5, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.5 + }, + { + "avg_mask_ratio": 0.451080821454525, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.451080821454525, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.134765625, + "kd_loss": 0.15073641720641717, + "learning_rate": 3e-06, + "loss": 0.1577, + "masked_tokens": 96.6375, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.6375 + }, + { + "avg_mask_ratio": 0.5438536155037582, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5438536155037582, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.298828125, + "kd_loss": 0.24193658930453238, + "learning_rate": 3e-06, + "loss": 0.248, + "masked_tokens": 126.4375, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 126.4375 + }, + { + "avg_mask_ratio": 0.43992503273766487, + "avg_response_length": 255.875, + "avg_student_mask_ratio": 0.43992503273766487, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.142578125, + "kd_loss": 0.14758750247131047, + "learning_rate": 3e-06, + "loss": 0.1703, + "masked_tokens": 107.3875, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.3875 + }, + { + "avg_mask_ratio": 0.46683448635449165, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.46683448635449165, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.35546875, + "kd_loss": 0.2131086487675077, + "learning_rate": 3e-06, + "loss": 0.196, + "masked_tokens": 110.2875, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 110.2875 + }, + { + "avg_mask_ratio": 0.4476269483449869, + "avg_response_length": 243.2375, + "avg_student_mask_ratio": 0.4476269483449869, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.17319737961729237, + "learning_rate": 3e-06, + "loss": 0.1469, + "masked_tokens": 112.6375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.6375 + }, + { + "avg_mask_ratio": 0.45657019784557634, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.45657019784557634, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.1728515625, + "kd_loss": 0.15818778217344515, + "learning_rate": 3e-06, + "loss": 0.1487, + "masked_tokens": 110.0375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.0375 + }, + { + "avg_mask_ratio": 0.5293830037582665, + "avg_response_length": 223.975, + "avg_student_mask_ratio": 0.5293830037582665, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.5, + "kd_loss": 0.24745769179717172, + "learning_rate": 3e-06, + "loss": 0.2709, + "masked_tokens": 119.6, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.4577330934116617, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.4577330934116617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.2216796875, + "kd_loss": 0.18448232172211476, + "learning_rate": 3e-06, + "loss": 0.1662, + "masked_tokens": 130.475, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 130.475 + }, + { + "avg_mask_ratio": 0.39295024327002465, + "avg_response_length": 246.6375, + "avg_student_mask_ratio": 0.39295024327002465, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.047119140625, + "kd_loss": 0.1050827642444176, + "learning_rate": 3e-06, + "loss": 0.1353, + "masked_tokens": 100.9, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 100.9 + }, + { + "avg_mask_ratio": 0.4409991275751963, + "avg_response_length": 217.9125, + "avg_student_mask_ratio": 0.4409991275751963, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.1513671875, + "kd_loss": 0.13134403475523868, + "learning_rate": 3e-06, + "loss": 0.1629, + "masked_tokens": 106.925, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 106.925 + }, + { + "avg_mask_ratio": 0.47207197032403203, + "avg_response_length": 188.9125, + "avg_student_mask_ratio": 0.47207197032403203, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.2314453125, + "kd_loss": 0.19167470987705998, + "learning_rate": 3e-06, + "loss": 0.2063, + "masked_tokens": 85.125, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 85.125 + }, + { + "avg_mask_ratio": 0.4926959708333015, + "avg_response_length": 248.4, + "avg_student_mask_ratio": 0.4926959708333015, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2633828842135699, + "learning_rate": 3e-06, + "loss": 0.2589, + "masked_tokens": 124.5625, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 124.5625 + }, + { + "avg_mask_ratio": 0.5075328870676458, + "avg_response_length": 235.075, + "avg_student_mask_ratio": 0.5075328870676458, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.2197265625, + "kd_loss": 0.21129831432894547, + "learning_rate": 3e-06, + "loss": 0.2103, + "masked_tokens": 127.9, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 127.9 + }, + { + "avg_mask_ratio": 0.44940012450679206, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44940012450679206, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.40625, + "kd_loss": 0.18290909784880824, + "learning_rate": 3e-06, + "loss": 0.1801, + "masked_tokens": 110.15, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 110.15 + }, + { + "avg_mask_ratio": 0.4945301389612723, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.4945301389612723, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.361328125, + "kd_loss": 0.2113740879778227, + "learning_rate": 3e-06, + "loss": 0.2186, + "masked_tokens": 125.175, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.175 + }, + { + "avg_mask_ratio": 0.4749954905593768, + "avg_response_length": 243.575, + "avg_student_mask_ratio": 0.4749954905593768, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.1298828125, + "kd_loss": 0.16429275130377619, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 115.3875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 115.3875 + }, + { + "avg_mask_ratio": 0.47621052770409733, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.47621052770409733, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.1083984375, + "kd_loss": 0.2089853325122931, + "learning_rate": 3e-06, + "loss": 0.192, + "masked_tokens": 126.85, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 126.85 + }, + { + "avg_mask_ratio": 0.4449806016869843, + "avg_response_length": 226.3625, + "avg_student_mask_ratio": 0.4449806016869843, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.09423828125, + "kd_loss": 0.13386085629390437, + "learning_rate": 3e-06, + "loss": 0.132, + "masked_tokens": 109.35, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 109.35 + }, + { + "avg_mask_ratio": 0.47845896739745514, + "avg_response_length": 218.1125, + "avg_student_mask_ratio": 0.47845896739745514, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.142578125, + "kd_loss": 0.1445786759162107, + "learning_rate": 3e-06, + "loss": 0.1766, + "masked_tokens": 111.85, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 111.85 + }, + { + "avg_mask_ratio": 0.4727763219270855, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.4727763219270855, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.380859375, + "kd_loss": 0.20748561576523344, + "learning_rate": 3e-06, + "loss": 0.1934, + "masked_tokens": 119.775, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 119.775 + }, + { + "avg_mask_ratio": 0.4756184325611684, + "avg_response_length": 239.5375, + "avg_student_mask_ratio": 0.4756184325611684, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.10791015625, + "kd_loss": 0.2029281118774257, + "learning_rate": 3e-06, + "loss": 0.2012, + "masked_tokens": 122.1875, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 122.1875 + }, + { + "avg_mask_ratio": 0.4428858984610997, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4428858984610997, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.1826171875, + "kd_loss": 0.14211951963759475, + "learning_rate": 3e-06, + "loss": 0.1365, + "masked_tokens": 86.0125, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 86.0125 + }, + { + "avg_mask_ratio": 0.4625907339621335, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4625907339621335, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.1474609375, + "kd_loss": 0.1504247854208188, + "learning_rate": 3e-06, + "loss": 0.1743, + "masked_tokens": 103.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 103.675 + }, + { + "avg_mask_ratio": 0.4465438393759541, + "avg_response_length": 241.9625, + "avg_student_mask_ratio": 0.4465438393759541, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.142578125, + "kd_loss": 0.18699200686958192, + "learning_rate": 3e-06, + "loss": 0.1806, + "masked_tokens": 114.9125, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 114.9125 + }, + { + "avg_mask_ratio": 0.42805201532319187, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42805201532319187, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.193359375, + "kd_loss": 0.15030699897054092, + "learning_rate": 3e-06, + "loss": 0.1582, + "masked_tokens": 103.875, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.875 + }, + { + "avg_mask_ratio": 0.4651826085988432, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.4651826085988432, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.15806215325555967, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 104.125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 104.125 + }, + { + "avg_mask_ratio": 0.47693050167124185, + "avg_response_length": 226.16666666666666, + "avg_student_mask_ratio": 0.47693050167124185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.2333984375, + "kd_loss": 0.19203724756923315, + "learning_rate": 3e-06, + "loss": 0.2197, + "masked_tokens": 109.10714285714286, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 109.10714285714286 + }, + { + "avg_mask_ratio": 0.47416816898621617, + "avg_response_length": 250.25, + "avg_student_mask_ratio": 0.47416816898621617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.248046875, + "kd_loss": 0.21296195546548802, + "learning_rate": 3e-06, + "loss": 0.229, + "masked_tokens": 117.9125, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 117.9125 + }, + { + "avg_mask_ratio": 0.45927587888436394, + "avg_response_length": 233.05, + "avg_student_mask_ratio": 0.45927587888436394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.16796875, + "kd_loss": 0.12627680183309167, + "learning_rate": 3e-06, + "loss": 0.1626, + "masked_tokens": 98.8375, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.8375 + }, + { + "avg_mask_ratio": 0.5047377114649862, + "avg_response_length": 260.225, + "avg_student_mask_ratio": 0.5047377114649862, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.193359375, + "kd_loss": 0.15308890046544832, + "learning_rate": 3e-06, + "loss": 0.1508, + "masked_tokens": 127.4, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 127.4 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/training_args.bin b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..909c70530eafaa4be935d43ab877dad53e48f376 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c7c0f34b9d70dea72cbe8ab046b4e4dbf9290e9a199291cca7df91b67e9e4a +size 8120 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/README.md b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/adapter_config.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6edd4f943cffd057b1c0513e71fc0baccaee758 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/adapter_model.safetensors b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..34f703210a2fa569b32ad71e86f4995aa00d7aa2 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84e548616c7c453060c9fd750d4b8cf655afed18527eb7f7df8f2644eefe9217 +size 2406624648 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/optimizer.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..56d8cfffc83cb8f912749d48c9732cdff0fa6ef0 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da6d5a3cab085182c47814b650b3fdd3d4c3d679854208a106a1d691c695972d +size 671304442 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/rng_state_0.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ab65ae132e72f77ce216a2787647b2bbd1e97dab --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a970fe99df6f0c8b67092e1f3bf38318b980159b8db0530227e6de94f9f6ef38 +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/rng_state_1.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d2773ba0e1a414ec3918f51164116da846dab5e9 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c78f09a4f826b35ab845d4d95aec0ff503a4965463e91c3ae54a3a3c8fa13fd +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/scheduler.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b530cce6692c8b72c51afea911741a4a11eef386 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f247b9a9f5a42bb05b5f94047806ee145b80e59e6134cfbac5720987816b080b +size 1064 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/trainer_state.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e74c41531d092f5281057abd7dc5c352ebbc1cca --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/trainer_state.json @@ -0,0 +1,1473 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2816, + "eval_steps": 500, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4930951670394279, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4930951670394279, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1494140625, + "kd_loss": 0.25194341027386147, + "learning_rate": 3e-06, + "loss": 0.2396, + "masked_tokens": 110.925, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 110.925 + }, + { + "avg_mask_ratio": 0.4127206720062532, + "avg_response_length": 277.15, + "avg_student_mask_ratio": 0.4127206720062532, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.2138671875, + "kd_loss": 0.14083908485238297, + "learning_rate": 3e-06, + "loss": 0.1768, + "masked_tokens": 108.8625, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 108.8625 + }, + { + "avg_mask_ratio": 0.4616696212324314, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4616696212324314, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.177734375, + "kd_loss": 0.19280819811582842, + "learning_rate": 3e-06, + "loss": 0.1837, + "masked_tokens": 111.375, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 111.375 + }, + { + "avg_mask_ratio": 0.42360913623997476, + "avg_response_length": 224.6125, + "avg_student_mask_ratio": 0.42360913623997476, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.205078125, + "kd_loss": 0.15898024347496859, + "learning_rate": 3e-06, + "loss": 0.1597, + "masked_tokens": 98.3, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.3 + }, + { + "avg_mask_ratio": 0.4330951495358022, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.4330951495358022, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.10693359375, + "kd_loss": 0.15454275260567557, + "learning_rate": 3e-06, + "loss": 0.1595, + "masked_tokens": 85.075, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.075 + }, + { + "avg_mask_ratio": 0.4555395155097358, + "avg_response_length": 254.2125, + "avg_student_mask_ratio": 0.4555395155097358, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.09375, + "kd_loss": 0.19431558840633442, + "learning_rate": 3e-06, + "loss": 0.1967, + "masked_tokens": 119.1125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.1125 + }, + { + "avg_mask_ratio": 0.5148372989846394, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.5148372989846394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.2421875, + "kd_loss": 0.17586028346822785, + "learning_rate": 3e-06, + "loss": 0.2039, + "masked_tokens": 105.45, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 105.45 + }, + { + "avg_mask_ratio": 0.3827478863298893, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.3827478863298893, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.13183402672058264, + "learning_rate": 3e-06, + "loss": 0.1337, + "masked_tokens": 86.675, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 86.675 + }, + { + "avg_mask_ratio": 0.5017695252550766, + "avg_response_length": 234.25, + "avg_student_mask_ratio": 0.5017695252550766, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.287109375, + "kd_loss": 0.23756451243028592, + "learning_rate": 3e-06, + "loss": 0.2228, + "masked_tokens": 108.4125, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 108.4125 + }, + { + "avg_mask_ratio": 0.4637213449750561, + "avg_response_length": 210.175, + "avg_student_mask_ratio": 0.4637213449750561, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.2236328125, + "kd_loss": 0.17453956390508713, + "learning_rate": 3e-06, + "loss": 0.1847, + "masked_tokens": 107.375, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.48738867897773164, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.48738867897773164, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.259765625, + "kd_loss": 0.21283352600622152, + "learning_rate": 3e-06, + "loss": 0.1975, + "masked_tokens": 101.7875, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 101.7875 + }, + { + "avg_mask_ratio": 0.4451883323024958, + "avg_response_length": 232.3, + "avg_student_mask_ratio": 0.4451883323024958, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.1328125, + "kd_loss": 0.23480740101426817, + "learning_rate": 3e-06, + "loss": 0.2005, + "masked_tokens": 107.7, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.43939279407495635, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.43939279407495635, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.1453842066344862, + "learning_rate": 3e-06, + "loss": 0.1366, + "masked_tokens": 89.95, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.95 + }, + { + "avg_mask_ratio": 0.4922366282902658, + "avg_response_length": 264.5375, + "avg_student_mask_ratio": 0.4922366282902658, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.294921875, + "kd_loss": 0.1732477028232097, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 144.9, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 144.9 + }, + { + "avg_mask_ratio": 0.4724786171223968, + "avg_response_length": 258.1125, + "avg_student_mask_ratio": 0.4724786171223968, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.04443359375, + "kd_loss": 0.2384887565949157, + "learning_rate": 3e-06, + "loss": 0.2151, + "masked_tokens": 127.4125, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 127.4125 + }, + { + "avg_mask_ratio": 0.49717973986989816, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.49717973986989816, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2255859375, + "kd_loss": 0.2190230320150704, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 98.4875, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 98.4875 + }, + { + "avg_mask_ratio": 0.48284467663615943, + "avg_response_length": 188.65, + "avg_student_mask_ratio": 0.48284467663615943, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.181640625, + "kd_loss": 0.198812551523406, + "learning_rate": 3e-06, + "loss": 0.1911, + "masked_tokens": 89.3125, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.3125 + }, + { + "avg_mask_ratio": 0.44864035704231353, + "avg_response_length": 246.6875, + "avg_student_mask_ratio": 0.44864035704231353, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.09716796875, + "kd_loss": 0.17860529323728117, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 110.0125, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.0125 + }, + { + "avg_mask_ratio": 0.47850618849042803, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.47850618849042803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.337890625, + "kd_loss": 0.19919134742667666, + "learning_rate": 3e-06, + "loss": 0.1932, + "masked_tokens": 109.575, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.575 + }, + { + "avg_mask_ratio": 0.4662990250624716, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4662990250624716, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.1259765625, + "kd_loss": 0.11774875816399799, + "learning_rate": 3e-06, + "loss": 0.1286, + "masked_tokens": 97.5, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.5 + }, + { + "avg_mask_ratio": 0.451080821454525, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.451080821454525, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.134765625, + "kd_loss": 0.15073641720641717, + "learning_rate": 3e-06, + "loss": 0.1577, + "masked_tokens": 96.6375, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.6375 + }, + { + "avg_mask_ratio": 0.5438536155037582, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5438536155037582, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.298828125, + "kd_loss": 0.24193658930453238, + "learning_rate": 3e-06, + "loss": 0.248, + "masked_tokens": 126.4375, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 126.4375 + }, + { + "avg_mask_ratio": 0.43992503273766487, + "avg_response_length": 255.875, + "avg_student_mask_ratio": 0.43992503273766487, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.142578125, + "kd_loss": 0.14758750247131047, + "learning_rate": 3e-06, + "loss": 0.1703, + "masked_tokens": 107.3875, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.3875 + }, + { + "avg_mask_ratio": 0.46683448635449165, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.46683448635449165, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.35546875, + "kd_loss": 0.2131086487675077, + "learning_rate": 3e-06, + "loss": 0.196, + "masked_tokens": 110.2875, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 110.2875 + }, + { + "avg_mask_ratio": 0.4476269483449869, + "avg_response_length": 243.2375, + "avg_student_mask_ratio": 0.4476269483449869, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.17319737961729237, + "learning_rate": 3e-06, + "loss": 0.1469, + "masked_tokens": 112.6375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.6375 + }, + { + "avg_mask_ratio": 0.45657019784557634, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.45657019784557634, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.1728515625, + "kd_loss": 0.15818778217344515, + "learning_rate": 3e-06, + "loss": 0.1487, + "masked_tokens": 110.0375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.0375 + }, + { + "avg_mask_ratio": 0.5293830037582665, + "avg_response_length": 223.975, + "avg_student_mask_ratio": 0.5293830037582665, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.5, + "kd_loss": 0.24745769179717172, + "learning_rate": 3e-06, + "loss": 0.2709, + "masked_tokens": 119.6, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.4577330934116617, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.4577330934116617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.2216796875, + "kd_loss": 0.18448232172211476, + "learning_rate": 3e-06, + "loss": 0.1662, + "masked_tokens": 130.475, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 130.475 + }, + { + "avg_mask_ratio": 0.39295024327002465, + "avg_response_length": 246.6375, + "avg_student_mask_ratio": 0.39295024327002465, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.047119140625, + "kd_loss": 0.1050827642444176, + "learning_rate": 3e-06, + "loss": 0.1353, + "masked_tokens": 100.9, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 100.9 + }, + { + "avg_mask_ratio": 0.4409991275751963, + "avg_response_length": 217.9125, + "avg_student_mask_ratio": 0.4409991275751963, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.1513671875, + "kd_loss": 0.13134403475523868, + "learning_rate": 3e-06, + "loss": 0.1629, + "masked_tokens": 106.925, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 106.925 + }, + { + "avg_mask_ratio": 0.47207197032403203, + "avg_response_length": 188.9125, + "avg_student_mask_ratio": 0.47207197032403203, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.2314453125, + "kd_loss": 0.19167470987705998, + "learning_rate": 3e-06, + "loss": 0.2063, + "masked_tokens": 85.125, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 85.125 + }, + { + "avg_mask_ratio": 0.4926959708333015, + "avg_response_length": 248.4, + "avg_student_mask_ratio": 0.4926959708333015, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2633828842135699, + "learning_rate": 3e-06, + "loss": 0.2589, + "masked_tokens": 124.5625, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 124.5625 + }, + { + "avg_mask_ratio": 0.5075328870676458, + "avg_response_length": 235.075, + "avg_student_mask_ratio": 0.5075328870676458, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.2197265625, + "kd_loss": 0.21129831432894547, + "learning_rate": 3e-06, + "loss": 0.2103, + "masked_tokens": 127.9, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 127.9 + }, + { + "avg_mask_ratio": 0.44940012450679206, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44940012450679206, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.40625, + "kd_loss": 0.18290909784880824, + "learning_rate": 3e-06, + "loss": 0.1801, + "masked_tokens": 110.15, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 110.15 + }, + { + "avg_mask_ratio": 0.4945301389612723, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.4945301389612723, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.361328125, + "kd_loss": 0.2113740879778227, + "learning_rate": 3e-06, + "loss": 0.2186, + "masked_tokens": 125.175, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.175 + }, + { + "avg_mask_ratio": 0.4749954905593768, + "avg_response_length": 243.575, + "avg_student_mask_ratio": 0.4749954905593768, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.1298828125, + "kd_loss": 0.16429275130377619, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 115.3875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 115.3875 + }, + { + "avg_mask_ratio": 0.47621052770409733, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.47621052770409733, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.1083984375, + "kd_loss": 0.2089853325122931, + "learning_rate": 3e-06, + "loss": 0.192, + "masked_tokens": 126.85, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 126.85 + }, + { + "avg_mask_ratio": 0.4449806016869843, + "avg_response_length": 226.3625, + "avg_student_mask_ratio": 0.4449806016869843, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.09423828125, + "kd_loss": 0.13386085629390437, + "learning_rate": 3e-06, + "loss": 0.132, + "masked_tokens": 109.35, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 109.35 + }, + { + "avg_mask_ratio": 0.47845896739745514, + "avg_response_length": 218.1125, + "avg_student_mask_ratio": 0.47845896739745514, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.142578125, + "kd_loss": 0.1445786759162107, + "learning_rate": 3e-06, + "loss": 0.1766, + "masked_tokens": 111.85, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 111.85 + }, + { + "avg_mask_ratio": 0.4727763219270855, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.4727763219270855, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.380859375, + "kd_loss": 0.20748561576523344, + "learning_rate": 3e-06, + "loss": 0.1934, + "masked_tokens": 119.775, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 119.775 + }, + { + "avg_mask_ratio": 0.4756184325611684, + "avg_response_length": 239.5375, + "avg_student_mask_ratio": 0.4756184325611684, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.10791015625, + "kd_loss": 0.2029281118774257, + "learning_rate": 3e-06, + "loss": 0.2012, + "masked_tokens": 122.1875, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 122.1875 + }, + { + "avg_mask_ratio": 0.4428858984610997, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4428858984610997, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.1826171875, + "kd_loss": 0.14211951963759475, + "learning_rate": 3e-06, + "loss": 0.1365, + "masked_tokens": 86.0125, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 86.0125 + }, + { + "avg_mask_ratio": 0.4625907339621335, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4625907339621335, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.1474609375, + "kd_loss": 0.1504247854208188, + "learning_rate": 3e-06, + "loss": 0.1743, + "masked_tokens": 103.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 103.675 + }, + { + "avg_mask_ratio": 0.4465438393759541, + "avg_response_length": 241.9625, + "avg_student_mask_ratio": 0.4465438393759541, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.142578125, + "kd_loss": 0.18699200686958192, + "learning_rate": 3e-06, + "loss": 0.1806, + "masked_tokens": 114.9125, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 114.9125 + }, + { + "avg_mask_ratio": 0.42805201532319187, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42805201532319187, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.193359375, + "kd_loss": 0.15030699897054092, + "learning_rate": 3e-06, + "loss": 0.1582, + "masked_tokens": 103.875, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.875 + }, + { + "avg_mask_ratio": 0.4651826085988432, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.4651826085988432, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.15806215325555967, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 104.125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 104.125 + }, + { + "avg_mask_ratio": 0.47693050167124185, + "avg_response_length": 226.16666666666666, + "avg_student_mask_ratio": 0.47693050167124185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.2333984375, + "kd_loss": 0.19203724756923315, + "learning_rate": 3e-06, + "loss": 0.2197, + "masked_tokens": 109.10714285714286, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 109.10714285714286 + }, + { + "avg_mask_ratio": 0.47416816898621617, + "avg_response_length": 250.25, + "avg_student_mask_ratio": 0.47416816898621617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.248046875, + "kd_loss": 0.21296195546548802, + "learning_rate": 3e-06, + "loss": 0.229, + "masked_tokens": 117.9125, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 117.9125 + }, + { + "avg_mask_ratio": 0.45927587888436394, + "avg_response_length": 233.05, + "avg_student_mask_ratio": 0.45927587888436394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.16796875, + "kd_loss": 0.12627680183309167, + "learning_rate": 3e-06, + "loss": 0.1626, + "masked_tokens": 98.8375, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.8375 + }, + { + "avg_mask_ratio": 0.5047377114649862, + "avg_response_length": 260.225, + "avg_student_mask_ratio": 0.5047377114649862, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.193359375, + "kd_loss": 0.15308890046544832, + "learning_rate": 3e-06, + "loss": 0.1508, + "masked_tokens": 127.4, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 127.4 + }, + { + "avg_mask_ratio": 0.5005011082510464, + "avg_response_length": 252.05, + "avg_student_mask_ratio": 0.5005011082510464, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0896, + "grad_norm": 0.40234375, + "kd_loss": 0.20784167646600055, + "learning_rate": 3e-06, + "loss": 0.2048, + "masked_tokens": 133.5, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 133.5 + }, + { + "avg_mask_ratio": 0.4552151845302433, + "avg_response_length": 200.7625, + "avg_student_mask_ratio": 0.4552151845302433, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1109333333333333, + "grad_norm": 0.396484375, + "kd_loss": 0.14625247523838425, + "learning_rate": 3e-06, + "loss": 0.1641, + "masked_tokens": 86.475, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 86.475 + }, + { + "avg_mask_ratio": 0.46727682640776036, + "avg_response_length": 214.5375, + "avg_student_mask_ratio": 0.46727682640776036, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1322666666666668, + "grad_norm": 0.30859375, + "kd_loss": 0.19772737846966032, + "learning_rate": 3e-06, + "loss": 0.2219, + "masked_tokens": 99.8375, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 99.8375 + }, + { + "avg_mask_ratio": 0.48785575344227256, + "avg_response_length": 231.3125, + "avg_student_mask_ratio": 0.48785575344227256, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1536, + "grad_norm": 0.216796875, + "kd_loss": 0.200824987803162, + "learning_rate": 3e-06, + "loss": 0.2346, + "masked_tokens": 106.625, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 106.625 + }, + { + "avg_mask_ratio": 0.4477671392261982, + "avg_response_length": 213.525, + "avg_student_mask_ratio": 0.4477671392261982, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1749333333333334, + "grad_norm": 0.169921875, + "kd_loss": 0.16798589587615426, + "learning_rate": 3e-06, + "loss": 0.1849, + "masked_tokens": 97.7, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 97.7 + }, + { + "avg_mask_ratio": 0.3861591775319539, + "avg_response_length": 238.5375, + "avg_student_mask_ratio": 0.3861591775319539, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1962666666666666, + "grad_norm": 0.236328125, + "kd_loss": 0.19300692316061543, + "learning_rate": 3e-06, + "loss": 0.1797, + "masked_tokens": 99.625, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 99.625 + }, + { + "avg_mask_ratio": 0.44424078196752814, + "avg_response_length": 230.1625, + "avg_student_mask_ratio": 0.44424078196752814, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2176, + "grad_norm": 0.220703125, + "kd_loss": 0.16140609600133757, + "learning_rate": 3e-06, + "loss": 0.1755, + "masked_tokens": 108.0125, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 108.0125 + }, + { + "avg_mask_ratio": 0.4715048542013392, + "avg_response_length": 269.1375, + "avg_student_mask_ratio": 0.4715048542013392, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2389333333333332, + "grad_norm": 0.388671875, + "kd_loss": 0.2032364588788596, + "learning_rate": 3e-06, + "loss": 0.1897, + "masked_tokens": 129.4375, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 129.4375 + }, + { + "avg_mask_ratio": 0.520072100055404, + "avg_response_length": 228.2875, + "avg_student_mask_ratio": 0.520072100055404, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2602666666666666, + "grad_norm": 0.46875, + "kd_loss": 0.23483261663386656, + "learning_rate": 3e-06, + "loss": 0.2578, + "masked_tokens": 121.1625, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 121.1625 + }, + { + "avg_mask_ratio": 0.4505112706683576, + "avg_response_length": 237.1625, + "avg_student_mask_ratio": 0.4505112706683576, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2816, + "grad_norm": 0.2109375, + "kd_loss": 0.16831563824014067, + "learning_rate": 3e-06, + "loss": 0.1749, + "masked_tokens": 110.0875, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 110.0875 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/training_args.bin b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..909c70530eafaa4be935d43ab877dad53e48f376 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c7c0f34b9d70dea72cbe8ab046b4e4dbf9290e9a199291cca7df91b67e9e4a +size 8120 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/README.md b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/adapter_config.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6edd4f943cffd057b1c0513e71fc0baccaee758 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/adapter_model.safetensors b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29a2201c3fb62e076561cc1a431db8e96c3b1201 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba634b8445b7eb62645ea8b1d852ed05cefd132824433e367b04b598fb71645d +size 2406624648 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/optimizer.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b9802bf36fa1c4096c3a59af22a9b072aa46851 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba1d1ed12f19ccd67a211e17e3358ee6c146cca3915d4d761cbcd325fc068524 +size 671304442 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/rng_state_0.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..500e5ec920565aa1de527248cebefade3800997c --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71255ed6179013d58ea121d2387431fd41bdf5b5e2ca8cd71dccc5054540d2bc +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/rng_state_1.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a6c2d9df5f45e8fafa984dc9d6b09e781f35e6ea --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95664891b7119399a6bb852b9b254c544d9837295f88e27a57e3913c4203dc7b +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/scheduler.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..45cb6d6e9c58698e39624654c64f68865acc1e8c --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d31fc166825c283cd6e21942858b480fed83fd7716de86c3ed00fd14e8e22122 +size 1064 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/trainer_state.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e11d4eb2c6b4dcf2ad459252b269026b110a37cc --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/trainer_state.json @@ -0,0 +1,1713 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4949333333333334, + "eval_steps": 500, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4930951670394279, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4930951670394279, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1494140625, + "kd_loss": 0.25194341027386147, + "learning_rate": 3e-06, + "loss": 0.2396, + "masked_tokens": 110.925, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 110.925 + }, + { + "avg_mask_ratio": 0.4127206720062532, + "avg_response_length": 277.15, + "avg_student_mask_ratio": 0.4127206720062532, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.2138671875, + "kd_loss": 0.14083908485238297, + "learning_rate": 3e-06, + "loss": 0.1768, + "masked_tokens": 108.8625, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 108.8625 + }, + { + "avg_mask_ratio": 0.4616696212324314, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4616696212324314, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.177734375, + "kd_loss": 0.19280819811582842, + "learning_rate": 3e-06, + "loss": 0.1837, + "masked_tokens": 111.375, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 111.375 + }, + { + "avg_mask_ratio": 0.42360913623997476, + "avg_response_length": 224.6125, + "avg_student_mask_ratio": 0.42360913623997476, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.205078125, + "kd_loss": 0.15898024347496859, + "learning_rate": 3e-06, + "loss": 0.1597, + "masked_tokens": 98.3, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.3 + }, + { + "avg_mask_ratio": 0.4330951495358022, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.4330951495358022, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.10693359375, + "kd_loss": 0.15454275260567557, + "learning_rate": 3e-06, + "loss": 0.1595, + "masked_tokens": 85.075, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.075 + }, + { + "avg_mask_ratio": 0.4555395155097358, + "avg_response_length": 254.2125, + "avg_student_mask_ratio": 0.4555395155097358, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.09375, + "kd_loss": 0.19431558840633442, + "learning_rate": 3e-06, + "loss": 0.1967, + "masked_tokens": 119.1125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.1125 + }, + { + "avg_mask_ratio": 0.5148372989846394, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.5148372989846394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.2421875, + "kd_loss": 0.17586028346822785, + "learning_rate": 3e-06, + "loss": 0.2039, + "masked_tokens": 105.45, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 105.45 + }, + { + "avg_mask_ratio": 0.3827478863298893, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.3827478863298893, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.13183402672058264, + "learning_rate": 3e-06, + "loss": 0.1337, + "masked_tokens": 86.675, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 86.675 + }, + { + "avg_mask_ratio": 0.5017695252550766, + "avg_response_length": 234.25, + "avg_student_mask_ratio": 0.5017695252550766, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.287109375, + "kd_loss": 0.23756451243028592, + "learning_rate": 3e-06, + "loss": 0.2228, + "masked_tokens": 108.4125, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 108.4125 + }, + { + "avg_mask_ratio": 0.4637213449750561, + "avg_response_length": 210.175, + "avg_student_mask_ratio": 0.4637213449750561, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.2236328125, + "kd_loss": 0.17453956390508713, + "learning_rate": 3e-06, + "loss": 0.1847, + "masked_tokens": 107.375, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.48738867897773164, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.48738867897773164, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.259765625, + "kd_loss": 0.21283352600622152, + "learning_rate": 3e-06, + "loss": 0.1975, + "masked_tokens": 101.7875, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 101.7875 + }, + { + "avg_mask_ratio": 0.4451883323024958, + "avg_response_length": 232.3, + "avg_student_mask_ratio": 0.4451883323024958, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.1328125, + "kd_loss": 0.23480740101426817, + "learning_rate": 3e-06, + "loss": 0.2005, + "masked_tokens": 107.7, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.43939279407495635, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.43939279407495635, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.1453842066344862, + "learning_rate": 3e-06, + "loss": 0.1366, + "masked_tokens": 89.95, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.95 + }, + { + "avg_mask_ratio": 0.4922366282902658, + "avg_response_length": 264.5375, + "avg_student_mask_ratio": 0.4922366282902658, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.294921875, + "kd_loss": 0.1732477028232097, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 144.9, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 144.9 + }, + { + "avg_mask_ratio": 0.4724786171223968, + "avg_response_length": 258.1125, + "avg_student_mask_ratio": 0.4724786171223968, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.04443359375, + "kd_loss": 0.2384887565949157, + "learning_rate": 3e-06, + "loss": 0.2151, + "masked_tokens": 127.4125, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 127.4125 + }, + { + "avg_mask_ratio": 0.49717973986989816, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.49717973986989816, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2255859375, + "kd_loss": 0.2190230320150704, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 98.4875, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 98.4875 + }, + { + "avg_mask_ratio": 0.48284467663615943, + "avg_response_length": 188.65, + "avg_student_mask_ratio": 0.48284467663615943, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.181640625, + "kd_loss": 0.198812551523406, + "learning_rate": 3e-06, + "loss": 0.1911, + "masked_tokens": 89.3125, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.3125 + }, + { + "avg_mask_ratio": 0.44864035704231353, + "avg_response_length": 246.6875, + "avg_student_mask_ratio": 0.44864035704231353, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.09716796875, + "kd_loss": 0.17860529323728117, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 110.0125, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.0125 + }, + { + "avg_mask_ratio": 0.47850618849042803, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.47850618849042803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.337890625, + "kd_loss": 0.19919134742667666, + "learning_rate": 3e-06, + "loss": 0.1932, + "masked_tokens": 109.575, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.575 + }, + { + "avg_mask_ratio": 0.4662990250624716, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4662990250624716, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.1259765625, + "kd_loss": 0.11774875816399799, + "learning_rate": 3e-06, + "loss": 0.1286, + "masked_tokens": 97.5, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.5 + }, + { + "avg_mask_ratio": 0.451080821454525, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.451080821454525, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.134765625, + "kd_loss": 0.15073641720641717, + "learning_rate": 3e-06, + "loss": 0.1577, + "masked_tokens": 96.6375, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.6375 + }, + { + "avg_mask_ratio": 0.5438536155037582, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5438536155037582, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.298828125, + "kd_loss": 0.24193658930453238, + "learning_rate": 3e-06, + "loss": 0.248, + "masked_tokens": 126.4375, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 126.4375 + }, + { + "avg_mask_ratio": 0.43992503273766487, + "avg_response_length": 255.875, + "avg_student_mask_ratio": 0.43992503273766487, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.142578125, + "kd_loss": 0.14758750247131047, + "learning_rate": 3e-06, + "loss": 0.1703, + "masked_tokens": 107.3875, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.3875 + }, + { + "avg_mask_ratio": 0.46683448635449165, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.46683448635449165, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.35546875, + "kd_loss": 0.2131086487675077, + "learning_rate": 3e-06, + "loss": 0.196, + "masked_tokens": 110.2875, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 110.2875 + }, + { + "avg_mask_ratio": 0.4476269483449869, + "avg_response_length": 243.2375, + "avg_student_mask_ratio": 0.4476269483449869, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.17319737961729237, + "learning_rate": 3e-06, + "loss": 0.1469, + "masked_tokens": 112.6375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.6375 + }, + { + "avg_mask_ratio": 0.45657019784557634, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.45657019784557634, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.1728515625, + "kd_loss": 0.15818778217344515, + "learning_rate": 3e-06, + "loss": 0.1487, + "masked_tokens": 110.0375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.0375 + }, + { + "avg_mask_ratio": 0.5293830037582665, + "avg_response_length": 223.975, + "avg_student_mask_ratio": 0.5293830037582665, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.5, + "kd_loss": 0.24745769179717172, + "learning_rate": 3e-06, + "loss": 0.2709, + "masked_tokens": 119.6, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.4577330934116617, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.4577330934116617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.2216796875, + "kd_loss": 0.18448232172211476, + "learning_rate": 3e-06, + "loss": 0.1662, + "masked_tokens": 130.475, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 130.475 + }, + { + "avg_mask_ratio": 0.39295024327002465, + "avg_response_length": 246.6375, + "avg_student_mask_ratio": 0.39295024327002465, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.047119140625, + "kd_loss": 0.1050827642444176, + "learning_rate": 3e-06, + "loss": 0.1353, + "masked_tokens": 100.9, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 100.9 + }, + { + "avg_mask_ratio": 0.4409991275751963, + "avg_response_length": 217.9125, + "avg_student_mask_ratio": 0.4409991275751963, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.1513671875, + "kd_loss": 0.13134403475523868, + "learning_rate": 3e-06, + "loss": 0.1629, + "masked_tokens": 106.925, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 106.925 + }, + { + "avg_mask_ratio": 0.47207197032403203, + "avg_response_length": 188.9125, + "avg_student_mask_ratio": 0.47207197032403203, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.2314453125, + "kd_loss": 0.19167470987705998, + "learning_rate": 3e-06, + "loss": 0.2063, + "masked_tokens": 85.125, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 85.125 + }, + { + "avg_mask_ratio": 0.4926959708333015, + "avg_response_length": 248.4, + "avg_student_mask_ratio": 0.4926959708333015, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2633828842135699, + "learning_rate": 3e-06, + "loss": 0.2589, + "masked_tokens": 124.5625, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 124.5625 + }, + { + "avg_mask_ratio": 0.5075328870676458, + "avg_response_length": 235.075, + "avg_student_mask_ratio": 0.5075328870676458, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.2197265625, + "kd_loss": 0.21129831432894547, + "learning_rate": 3e-06, + "loss": 0.2103, + "masked_tokens": 127.9, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 127.9 + }, + { + "avg_mask_ratio": 0.44940012450679206, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44940012450679206, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.40625, + "kd_loss": 0.18290909784880824, + "learning_rate": 3e-06, + "loss": 0.1801, + "masked_tokens": 110.15, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 110.15 + }, + { + "avg_mask_ratio": 0.4945301389612723, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.4945301389612723, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.361328125, + "kd_loss": 0.2113740879778227, + "learning_rate": 3e-06, + "loss": 0.2186, + "masked_tokens": 125.175, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.175 + }, + { + "avg_mask_ratio": 0.4749954905593768, + "avg_response_length": 243.575, + "avg_student_mask_ratio": 0.4749954905593768, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.1298828125, + "kd_loss": 0.16429275130377619, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 115.3875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 115.3875 + }, + { + "avg_mask_ratio": 0.47621052770409733, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.47621052770409733, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.1083984375, + "kd_loss": 0.2089853325122931, + "learning_rate": 3e-06, + "loss": 0.192, + "masked_tokens": 126.85, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 126.85 + }, + { + "avg_mask_ratio": 0.4449806016869843, + "avg_response_length": 226.3625, + "avg_student_mask_ratio": 0.4449806016869843, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.09423828125, + "kd_loss": 0.13386085629390437, + "learning_rate": 3e-06, + "loss": 0.132, + "masked_tokens": 109.35, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 109.35 + }, + { + "avg_mask_ratio": 0.47845896739745514, + "avg_response_length": 218.1125, + "avg_student_mask_ratio": 0.47845896739745514, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.142578125, + "kd_loss": 0.1445786759162107, + "learning_rate": 3e-06, + "loss": 0.1766, + "masked_tokens": 111.85, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 111.85 + }, + { + "avg_mask_ratio": 0.4727763219270855, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.4727763219270855, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.380859375, + "kd_loss": 0.20748561576523344, + "learning_rate": 3e-06, + "loss": 0.1934, + "masked_tokens": 119.775, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 119.775 + }, + { + "avg_mask_ratio": 0.4756184325611684, + "avg_response_length": 239.5375, + "avg_student_mask_ratio": 0.4756184325611684, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.10791015625, + "kd_loss": 0.2029281118774257, + "learning_rate": 3e-06, + "loss": 0.2012, + "masked_tokens": 122.1875, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 122.1875 + }, + { + "avg_mask_ratio": 0.4428858984610997, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4428858984610997, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.1826171875, + "kd_loss": 0.14211951963759475, + "learning_rate": 3e-06, + "loss": 0.1365, + "masked_tokens": 86.0125, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 86.0125 + }, + { + "avg_mask_ratio": 0.4625907339621335, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4625907339621335, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.1474609375, + "kd_loss": 0.1504247854208188, + "learning_rate": 3e-06, + "loss": 0.1743, + "masked_tokens": 103.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 103.675 + }, + { + "avg_mask_ratio": 0.4465438393759541, + "avg_response_length": 241.9625, + "avg_student_mask_ratio": 0.4465438393759541, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.142578125, + "kd_loss": 0.18699200686958192, + "learning_rate": 3e-06, + "loss": 0.1806, + "masked_tokens": 114.9125, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 114.9125 + }, + { + "avg_mask_ratio": 0.42805201532319187, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42805201532319187, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.193359375, + "kd_loss": 0.15030699897054092, + "learning_rate": 3e-06, + "loss": 0.1582, + "masked_tokens": 103.875, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.875 + }, + { + "avg_mask_ratio": 0.4651826085988432, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.4651826085988432, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.15806215325555967, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 104.125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 104.125 + }, + { + "avg_mask_ratio": 0.47693050167124185, + "avg_response_length": 226.16666666666666, + "avg_student_mask_ratio": 0.47693050167124185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.2333984375, + "kd_loss": 0.19203724756923315, + "learning_rate": 3e-06, + "loss": 0.2197, + "masked_tokens": 109.10714285714286, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 109.10714285714286 + }, + { + "avg_mask_ratio": 0.47416816898621617, + "avg_response_length": 250.25, + "avg_student_mask_ratio": 0.47416816898621617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.248046875, + "kd_loss": 0.21296195546548802, + "learning_rate": 3e-06, + "loss": 0.229, + "masked_tokens": 117.9125, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 117.9125 + }, + { + "avg_mask_ratio": 0.45927587888436394, + "avg_response_length": 233.05, + "avg_student_mask_ratio": 0.45927587888436394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.16796875, + "kd_loss": 0.12627680183309167, + "learning_rate": 3e-06, + "loss": 0.1626, + "masked_tokens": 98.8375, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.8375 + }, + { + "avg_mask_ratio": 0.5047377114649862, + "avg_response_length": 260.225, + "avg_student_mask_ratio": 0.5047377114649862, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.193359375, + "kd_loss": 0.15308890046544832, + "learning_rate": 3e-06, + "loss": 0.1508, + "masked_tokens": 127.4, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 127.4 + }, + { + "avg_mask_ratio": 0.5005011082510464, + "avg_response_length": 252.05, + "avg_student_mask_ratio": 0.5005011082510464, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0896, + "grad_norm": 0.40234375, + "kd_loss": 0.20784167646600055, + "learning_rate": 3e-06, + "loss": 0.2048, + "masked_tokens": 133.5, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 133.5 + }, + { + "avg_mask_ratio": 0.4552151845302433, + "avg_response_length": 200.7625, + "avg_student_mask_ratio": 0.4552151845302433, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1109333333333333, + "grad_norm": 0.396484375, + "kd_loss": 0.14625247523838425, + "learning_rate": 3e-06, + "loss": 0.1641, + "masked_tokens": 86.475, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 86.475 + }, + { + "avg_mask_ratio": 0.46727682640776036, + "avg_response_length": 214.5375, + "avg_student_mask_ratio": 0.46727682640776036, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1322666666666668, + "grad_norm": 0.30859375, + "kd_loss": 0.19772737846966032, + "learning_rate": 3e-06, + "loss": 0.2219, + "masked_tokens": 99.8375, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 99.8375 + }, + { + "avg_mask_ratio": 0.48785575344227256, + "avg_response_length": 231.3125, + "avg_student_mask_ratio": 0.48785575344227256, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1536, + "grad_norm": 0.216796875, + "kd_loss": 0.200824987803162, + "learning_rate": 3e-06, + "loss": 0.2346, + "masked_tokens": 106.625, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 106.625 + }, + { + "avg_mask_ratio": 0.4477671392261982, + "avg_response_length": 213.525, + "avg_student_mask_ratio": 0.4477671392261982, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1749333333333334, + "grad_norm": 0.169921875, + "kd_loss": 0.16798589587615426, + "learning_rate": 3e-06, + "loss": 0.1849, + "masked_tokens": 97.7, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 97.7 + }, + { + "avg_mask_ratio": 0.3861591775319539, + "avg_response_length": 238.5375, + "avg_student_mask_ratio": 0.3861591775319539, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1962666666666666, + "grad_norm": 0.236328125, + "kd_loss": 0.19300692316061543, + "learning_rate": 3e-06, + "loss": 0.1797, + "masked_tokens": 99.625, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 99.625 + }, + { + "avg_mask_ratio": 0.44424078196752814, + "avg_response_length": 230.1625, + "avg_student_mask_ratio": 0.44424078196752814, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2176, + "grad_norm": 0.220703125, + "kd_loss": 0.16140609600133757, + "learning_rate": 3e-06, + "loss": 0.1755, + "masked_tokens": 108.0125, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 108.0125 + }, + { + "avg_mask_ratio": 0.4715048542013392, + "avg_response_length": 269.1375, + "avg_student_mask_ratio": 0.4715048542013392, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2389333333333332, + "grad_norm": 0.388671875, + "kd_loss": 0.2032364588788596, + "learning_rate": 3e-06, + "loss": 0.1897, + "masked_tokens": 129.4375, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 129.4375 + }, + { + "avg_mask_ratio": 0.520072100055404, + "avg_response_length": 228.2875, + "avg_student_mask_ratio": 0.520072100055404, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2602666666666666, + "grad_norm": 0.46875, + "kd_loss": 0.23483261663386656, + "learning_rate": 3e-06, + "loss": 0.2578, + "masked_tokens": 121.1625, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 121.1625 + }, + { + "avg_mask_ratio": 0.4505112706683576, + "avg_response_length": 237.1625, + "avg_student_mask_ratio": 0.4505112706683576, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2816, + "grad_norm": 0.2109375, + "kd_loss": 0.16831563824014067, + "learning_rate": 3e-06, + "loss": 0.1749, + "masked_tokens": 110.0875, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 110.0875 + }, + { + "avg_mask_ratio": 0.5064190638251602, + "avg_response_length": 244.3, + "avg_student_mask_ratio": 0.5064190638251602, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3029333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.1837224111601472, + "learning_rate": 3e-06, + "loss": 0.1742, + "masked_tokens": 119.825, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 119.825 + }, + { + "avg_mask_ratio": 0.4891548154759221, + "avg_response_length": 234.6625, + "avg_student_mask_ratio": 0.4891548154759221, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3242666666666667, + "grad_norm": 0.1337890625, + "kd_loss": 0.16418851822023725, + "learning_rate": 3e-06, + "loss": 0.1679, + "masked_tokens": 110.5625, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 110.5625 + }, + { + "avg_mask_ratio": 0.5235460251918994, + "avg_response_length": 258.0875, + "avg_student_mask_ratio": 0.5235460251918994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3456000000000001, + "grad_norm": 0.365234375, + "kd_loss": 0.21764025418508198, + "learning_rate": 3e-06, + "loss": 0.2414, + "masked_tokens": 130.725, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 130.725 + }, + { + "avg_mask_ratio": 0.4871393243782222, + "avg_response_length": 237.7125, + "avg_student_mask_ratio": 0.4871393243782222, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3669333333333333, + "grad_norm": 0.1396484375, + "kd_loss": 0.17638994189817367, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 121.4625, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 121.4625 + }, + { + "avg_mask_ratio": 0.5254402696969919, + "avg_response_length": 240.575, + "avg_student_mask_ratio": 0.5254402696969919, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3882666666666665, + "grad_norm": 0.123046875, + "kd_loss": 0.19458269486664026, + "learning_rate": 3e-06, + "loss": 0.1665, + "masked_tokens": 133.725, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 133.725 + }, + { + "avg_mask_ratio": 0.48242234602803363, + "avg_response_length": 247.775, + "avg_student_mask_ratio": 0.48242234602803363, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4096, + "grad_norm": 0.31640625, + "kd_loss": 0.18161650695474235, + "learning_rate": 3e-06, + "loss": 0.1935, + "masked_tokens": 128.4625, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 128.4625 + }, + { + "avg_mask_ratio": 0.4551548367831856, + "avg_response_length": 247.3, + "avg_student_mask_ratio": 0.4551548367831856, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4309333333333334, + "grad_norm": 0.359375, + "kd_loss": 0.18432183493453067, + "learning_rate": 3e-06, + "loss": 0.1761, + "masked_tokens": 127.125, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 127.125 + }, + { + "avg_mask_ratio": 0.4658544249658007, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4658544249658007, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4522666666666666, + "grad_norm": 0.28125, + "kd_loss": 0.2166073639286054, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 107.325, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.4795732157537714, + "avg_response_length": 200.975, + "avg_student_mask_ratio": 0.4795732157537714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4736, + "grad_norm": 0.0673828125, + "kd_loss": 0.18191290805701782, + "learning_rate": 3e-06, + "loss": 0.1771, + "masked_tokens": 110.9375, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 110.9375 + }, + { + "avg_mask_ratio": 0.5250519359949977, + "avg_response_length": 228.125, + "avg_student_mask_ratio": 0.5250519359949977, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4949333333333334, + "grad_norm": 0.166015625, + "kd_loss": 0.21970896905950213, + "learning_rate": 3e-06, + "loss": 0.206, + "masked_tokens": 132.55, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 132.55 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/training_args.bin b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..909c70530eafaa4be935d43ab877dad53e48f376 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c7c0f34b9d70dea72cbe8ab046b4e4dbf9290e9a199291cca7df91b67e9e4a +size 8120 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/README.md b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/adapter_config.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6edd4f943cffd057b1c0513e71fc0baccaee758 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/adapter_model.safetensors b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fc99eb54cffd82b7a7c8c81d2f0b5e29e41f58b8 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57cff0e4b5bda94865cc2f179b71b038c0749df9637b1bfbee89589dc262c9cf +size 2406624648 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/optimizer.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..27cde7023707050184c2f766a0228deb071614f6 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f36a39086ef90fb34c4f623deaf108c11ee69f3a8a7aea171c8f17e746eb3272 +size 671304442 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/rng_state_0.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c590ef1e1d2681fa6c95a13325ca91bce0729f9a --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:610a9a4f78d7f85134c92f927b7d166865d8dd7ec8daa53878a554654e35dc7a +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/rng_state_1.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a3ed81be91d77b28a2f7ea5e9bcfd3b5d3818db3 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77382ae178efe4d5797cbedcdf333f858d0594c89a2b00b3ad4fa0cd6fe5befc +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/scheduler.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..de962ad5d0694c2759ebc84569f3ce66309888ee --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae0971d510e1111e0fef1ce3a2af63a62a1fc1c7d7b17a17e0c2de3f5ab7c9d0 +size 1064 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/trainer_state.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..11c0e04e1537d31d665e66e041e8333a392e4b2e --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/trainer_state.json @@ -0,0 +1,1953 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7082666666666668, + "eval_steps": 500, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4930951670394279, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4930951670394279, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1494140625, + "kd_loss": 0.25194341027386147, + "learning_rate": 3e-06, + "loss": 0.2396, + "masked_tokens": 110.925, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 110.925 + }, + { + "avg_mask_ratio": 0.4127206720062532, + "avg_response_length": 277.15, + "avg_student_mask_ratio": 0.4127206720062532, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.2138671875, + "kd_loss": 0.14083908485238297, + "learning_rate": 3e-06, + "loss": 0.1768, + "masked_tokens": 108.8625, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 108.8625 + }, + { + "avg_mask_ratio": 0.4616696212324314, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4616696212324314, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.177734375, + "kd_loss": 0.19280819811582842, + "learning_rate": 3e-06, + "loss": 0.1837, + "masked_tokens": 111.375, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 111.375 + }, + { + "avg_mask_ratio": 0.42360913623997476, + "avg_response_length": 224.6125, + "avg_student_mask_ratio": 0.42360913623997476, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.205078125, + "kd_loss": 0.15898024347496859, + "learning_rate": 3e-06, + "loss": 0.1597, + "masked_tokens": 98.3, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.3 + }, + { + "avg_mask_ratio": 0.4330951495358022, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.4330951495358022, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.10693359375, + "kd_loss": 0.15454275260567557, + "learning_rate": 3e-06, + "loss": 0.1595, + "masked_tokens": 85.075, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.075 + }, + { + "avg_mask_ratio": 0.4555395155097358, + "avg_response_length": 254.2125, + "avg_student_mask_ratio": 0.4555395155097358, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.09375, + "kd_loss": 0.19431558840633442, + "learning_rate": 3e-06, + "loss": 0.1967, + "masked_tokens": 119.1125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.1125 + }, + { + "avg_mask_ratio": 0.5148372989846394, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.5148372989846394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.2421875, + "kd_loss": 0.17586028346822785, + "learning_rate": 3e-06, + "loss": 0.2039, + "masked_tokens": 105.45, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 105.45 + }, + { + "avg_mask_ratio": 0.3827478863298893, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.3827478863298893, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.13183402672058264, + "learning_rate": 3e-06, + "loss": 0.1337, + "masked_tokens": 86.675, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 86.675 + }, + { + "avg_mask_ratio": 0.5017695252550766, + "avg_response_length": 234.25, + "avg_student_mask_ratio": 0.5017695252550766, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.287109375, + "kd_loss": 0.23756451243028592, + "learning_rate": 3e-06, + "loss": 0.2228, + "masked_tokens": 108.4125, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 108.4125 + }, + { + "avg_mask_ratio": 0.4637213449750561, + "avg_response_length": 210.175, + "avg_student_mask_ratio": 0.4637213449750561, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.2236328125, + "kd_loss": 0.17453956390508713, + "learning_rate": 3e-06, + "loss": 0.1847, + "masked_tokens": 107.375, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.48738867897773164, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.48738867897773164, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.259765625, + "kd_loss": 0.21283352600622152, + "learning_rate": 3e-06, + "loss": 0.1975, + "masked_tokens": 101.7875, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 101.7875 + }, + { + "avg_mask_ratio": 0.4451883323024958, + "avg_response_length": 232.3, + "avg_student_mask_ratio": 0.4451883323024958, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.1328125, + "kd_loss": 0.23480740101426817, + "learning_rate": 3e-06, + "loss": 0.2005, + "masked_tokens": 107.7, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.43939279407495635, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.43939279407495635, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.1453842066344862, + "learning_rate": 3e-06, + "loss": 0.1366, + "masked_tokens": 89.95, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.95 + }, + { + "avg_mask_ratio": 0.4922366282902658, + "avg_response_length": 264.5375, + "avg_student_mask_ratio": 0.4922366282902658, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.294921875, + "kd_loss": 0.1732477028232097, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 144.9, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 144.9 + }, + { + "avg_mask_ratio": 0.4724786171223968, + "avg_response_length": 258.1125, + "avg_student_mask_ratio": 0.4724786171223968, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.04443359375, + "kd_loss": 0.2384887565949157, + "learning_rate": 3e-06, + "loss": 0.2151, + "masked_tokens": 127.4125, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 127.4125 + }, + { + "avg_mask_ratio": 0.49717973986989816, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.49717973986989816, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2255859375, + "kd_loss": 0.2190230320150704, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 98.4875, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 98.4875 + }, + { + "avg_mask_ratio": 0.48284467663615943, + "avg_response_length": 188.65, + "avg_student_mask_ratio": 0.48284467663615943, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.181640625, + "kd_loss": 0.198812551523406, + "learning_rate": 3e-06, + "loss": 0.1911, + "masked_tokens": 89.3125, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.3125 + }, + { + "avg_mask_ratio": 0.44864035704231353, + "avg_response_length": 246.6875, + "avg_student_mask_ratio": 0.44864035704231353, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.09716796875, + "kd_loss": 0.17860529323728117, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 110.0125, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.0125 + }, + { + "avg_mask_ratio": 0.47850618849042803, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.47850618849042803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.337890625, + "kd_loss": 0.19919134742667666, + "learning_rate": 3e-06, + "loss": 0.1932, + "masked_tokens": 109.575, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.575 + }, + { + "avg_mask_ratio": 0.4662990250624716, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4662990250624716, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.1259765625, + "kd_loss": 0.11774875816399799, + "learning_rate": 3e-06, + "loss": 0.1286, + "masked_tokens": 97.5, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.5 + }, + { + "avg_mask_ratio": 0.451080821454525, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.451080821454525, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.134765625, + "kd_loss": 0.15073641720641717, + "learning_rate": 3e-06, + "loss": 0.1577, + "masked_tokens": 96.6375, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.6375 + }, + { + "avg_mask_ratio": 0.5438536155037582, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5438536155037582, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.298828125, + "kd_loss": 0.24193658930453238, + "learning_rate": 3e-06, + "loss": 0.248, + "masked_tokens": 126.4375, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 126.4375 + }, + { + "avg_mask_ratio": 0.43992503273766487, + "avg_response_length": 255.875, + "avg_student_mask_ratio": 0.43992503273766487, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.142578125, + "kd_loss": 0.14758750247131047, + "learning_rate": 3e-06, + "loss": 0.1703, + "masked_tokens": 107.3875, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.3875 + }, + { + "avg_mask_ratio": 0.46683448635449165, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.46683448635449165, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.35546875, + "kd_loss": 0.2131086487675077, + "learning_rate": 3e-06, + "loss": 0.196, + "masked_tokens": 110.2875, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 110.2875 + }, + { + "avg_mask_ratio": 0.4476269483449869, + "avg_response_length": 243.2375, + "avg_student_mask_ratio": 0.4476269483449869, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.17319737961729237, + "learning_rate": 3e-06, + "loss": 0.1469, + "masked_tokens": 112.6375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.6375 + }, + { + "avg_mask_ratio": 0.45657019784557634, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.45657019784557634, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.1728515625, + "kd_loss": 0.15818778217344515, + "learning_rate": 3e-06, + "loss": 0.1487, + "masked_tokens": 110.0375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.0375 + }, + { + "avg_mask_ratio": 0.5293830037582665, + "avg_response_length": 223.975, + "avg_student_mask_ratio": 0.5293830037582665, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.5, + "kd_loss": 0.24745769179717172, + "learning_rate": 3e-06, + "loss": 0.2709, + "masked_tokens": 119.6, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.4577330934116617, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.4577330934116617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.2216796875, + "kd_loss": 0.18448232172211476, + "learning_rate": 3e-06, + "loss": 0.1662, + "masked_tokens": 130.475, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 130.475 + }, + { + "avg_mask_ratio": 0.39295024327002465, + "avg_response_length": 246.6375, + "avg_student_mask_ratio": 0.39295024327002465, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.047119140625, + "kd_loss": 0.1050827642444176, + "learning_rate": 3e-06, + "loss": 0.1353, + "masked_tokens": 100.9, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 100.9 + }, + { + "avg_mask_ratio": 0.4409991275751963, + "avg_response_length": 217.9125, + "avg_student_mask_ratio": 0.4409991275751963, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.1513671875, + "kd_loss": 0.13134403475523868, + "learning_rate": 3e-06, + "loss": 0.1629, + "masked_tokens": 106.925, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 106.925 + }, + { + "avg_mask_ratio": 0.47207197032403203, + "avg_response_length": 188.9125, + "avg_student_mask_ratio": 0.47207197032403203, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.2314453125, + "kd_loss": 0.19167470987705998, + "learning_rate": 3e-06, + "loss": 0.2063, + "masked_tokens": 85.125, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 85.125 + }, + { + "avg_mask_ratio": 0.4926959708333015, + "avg_response_length": 248.4, + "avg_student_mask_ratio": 0.4926959708333015, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2633828842135699, + "learning_rate": 3e-06, + "loss": 0.2589, + "masked_tokens": 124.5625, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 124.5625 + }, + { + "avg_mask_ratio": 0.5075328870676458, + "avg_response_length": 235.075, + "avg_student_mask_ratio": 0.5075328870676458, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.2197265625, + "kd_loss": 0.21129831432894547, + "learning_rate": 3e-06, + "loss": 0.2103, + "masked_tokens": 127.9, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 127.9 + }, + { + "avg_mask_ratio": 0.44940012450679206, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44940012450679206, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.40625, + "kd_loss": 0.18290909784880824, + "learning_rate": 3e-06, + "loss": 0.1801, + "masked_tokens": 110.15, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 110.15 + }, + { + "avg_mask_ratio": 0.4945301389612723, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.4945301389612723, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.361328125, + "kd_loss": 0.2113740879778227, + "learning_rate": 3e-06, + "loss": 0.2186, + "masked_tokens": 125.175, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.175 + }, + { + "avg_mask_ratio": 0.4749954905593768, + "avg_response_length": 243.575, + "avg_student_mask_ratio": 0.4749954905593768, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.1298828125, + "kd_loss": 0.16429275130377619, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 115.3875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 115.3875 + }, + { + "avg_mask_ratio": 0.47621052770409733, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.47621052770409733, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.1083984375, + "kd_loss": 0.2089853325122931, + "learning_rate": 3e-06, + "loss": 0.192, + "masked_tokens": 126.85, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 126.85 + }, + { + "avg_mask_ratio": 0.4449806016869843, + "avg_response_length": 226.3625, + "avg_student_mask_ratio": 0.4449806016869843, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.09423828125, + "kd_loss": 0.13386085629390437, + "learning_rate": 3e-06, + "loss": 0.132, + "masked_tokens": 109.35, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 109.35 + }, + { + "avg_mask_ratio": 0.47845896739745514, + "avg_response_length": 218.1125, + "avg_student_mask_ratio": 0.47845896739745514, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.142578125, + "kd_loss": 0.1445786759162107, + "learning_rate": 3e-06, + "loss": 0.1766, + "masked_tokens": 111.85, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 111.85 + }, + { + "avg_mask_ratio": 0.4727763219270855, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.4727763219270855, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.380859375, + "kd_loss": 0.20748561576523344, + "learning_rate": 3e-06, + "loss": 0.1934, + "masked_tokens": 119.775, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 119.775 + }, + { + "avg_mask_ratio": 0.4756184325611684, + "avg_response_length": 239.5375, + "avg_student_mask_ratio": 0.4756184325611684, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.10791015625, + "kd_loss": 0.2029281118774257, + "learning_rate": 3e-06, + "loss": 0.2012, + "masked_tokens": 122.1875, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 122.1875 + }, + { + "avg_mask_ratio": 0.4428858984610997, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4428858984610997, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.1826171875, + "kd_loss": 0.14211951963759475, + "learning_rate": 3e-06, + "loss": 0.1365, + "masked_tokens": 86.0125, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 86.0125 + }, + { + "avg_mask_ratio": 0.4625907339621335, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4625907339621335, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.1474609375, + "kd_loss": 0.1504247854208188, + "learning_rate": 3e-06, + "loss": 0.1743, + "masked_tokens": 103.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 103.675 + }, + { + "avg_mask_ratio": 0.4465438393759541, + "avg_response_length": 241.9625, + "avg_student_mask_ratio": 0.4465438393759541, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.142578125, + "kd_loss": 0.18699200686958192, + "learning_rate": 3e-06, + "loss": 0.1806, + "masked_tokens": 114.9125, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 114.9125 + }, + { + "avg_mask_ratio": 0.42805201532319187, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42805201532319187, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.193359375, + "kd_loss": 0.15030699897054092, + "learning_rate": 3e-06, + "loss": 0.1582, + "masked_tokens": 103.875, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.875 + }, + { + "avg_mask_ratio": 0.4651826085988432, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.4651826085988432, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.15806215325555967, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 104.125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 104.125 + }, + { + "avg_mask_ratio": 0.47693050167124185, + "avg_response_length": 226.16666666666666, + "avg_student_mask_ratio": 0.47693050167124185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.2333984375, + "kd_loss": 0.19203724756923315, + "learning_rate": 3e-06, + "loss": 0.2197, + "masked_tokens": 109.10714285714286, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 109.10714285714286 + }, + { + "avg_mask_ratio": 0.47416816898621617, + "avg_response_length": 250.25, + "avg_student_mask_ratio": 0.47416816898621617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.248046875, + "kd_loss": 0.21296195546548802, + "learning_rate": 3e-06, + "loss": 0.229, + "masked_tokens": 117.9125, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 117.9125 + }, + { + "avg_mask_ratio": 0.45927587888436394, + "avg_response_length": 233.05, + "avg_student_mask_ratio": 0.45927587888436394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.16796875, + "kd_loss": 0.12627680183309167, + "learning_rate": 3e-06, + "loss": 0.1626, + "masked_tokens": 98.8375, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.8375 + }, + { + "avg_mask_ratio": 0.5047377114649862, + "avg_response_length": 260.225, + "avg_student_mask_ratio": 0.5047377114649862, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.193359375, + "kd_loss": 0.15308890046544832, + "learning_rate": 3e-06, + "loss": 0.1508, + "masked_tokens": 127.4, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 127.4 + }, + { + "avg_mask_ratio": 0.5005011082510464, + "avg_response_length": 252.05, + "avg_student_mask_ratio": 0.5005011082510464, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0896, + "grad_norm": 0.40234375, + "kd_loss": 0.20784167646600055, + "learning_rate": 3e-06, + "loss": 0.2048, + "masked_tokens": 133.5, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 133.5 + }, + { + "avg_mask_ratio": 0.4552151845302433, + "avg_response_length": 200.7625, + "avg_student_mask_ratio": 0.4552151845302433, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1109333333333333, + "grad_norm": 0.396484375, + "kd_loss": 0.14625247523838425, + "learning_rate": 3e-06, + "loss": 0.1641, + "masked_tokens": 86.475, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 86.475 + }, + { + "avg_mask_ratio": 0.46727682640776036, + "avg_response_length": 214.5375, + "avg_student_mask_ratio": 0.46727682640776036, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1322666666666668, + "grad_norm": 0.30859375, + "kd_loss": 0.19772737846966032, + "learning_rate": 3e-06, + "loss": 0.2219, + "masked_tokens": 99.8375, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 99.8375 + }, + { + "avg_mask_ratio": 0.48785575344227256, + "avg_response_length": 231.3125, + "avg_student_mask_ratio": 0.48785575344227256, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1536, + "grad_norm": 0.216796875, + "kd_loss": 0.200824987803162, + "learning_rate": 3e-06, + "loss": 0.2346, + "masked_tokens": 106.625, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 106.625 + }, + { + "avg_mask_ratio": 0.4477671392261982, + "avg_response_length": 213.525, + "avg_student_mask_ratio": 0.4477671392261982, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1749333333333334, + "grad_norm": 0.169921875, + "kd_loss": 0.16798589587615426, + "learning_rate": 3e-06, + "loss": 0.1849, + "masked_tokens": 97.7, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 97.7 + }, + { + "avg_mask_ratio": 0.3861591775319539, + "avg_response_length": 238.5375, + "avg_student_mask_ratio": 0.3861591775319539, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1962666666666666, + "grad_norm": 0.236328125, + "kd_loss": 0.19300692316061543, + "learning_rate": 3e-06, + "loss": 0.1797, + "masked_tokens": 99.625, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 99.625 + }, + { + "avg_mask_ratio": 0.44424078196752814, + "avg_response_length": 230.1625, + "avg_student_mask_ratio": 0.44424078196752814, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2176, + "grad_norm": 0.220703125, + "kd_loss": 0.16140609600133757, + "learning_rate": 3e-06, + "loss": 0.1755, + "masked_tokens": 108.0125, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 108.0125 + }, + { + "avg_mask_ratio": 0.4715048542013392, + "avg_response_length": 269.1375, + "avg_student_mask_ratio": 0.4715048542013392, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2389333333333332, + "grad_norm": 0.388671875, + "kd_loss": 0.2032364588788596, + "learning_rate": 3e-06, + "loss": 0.1897, + "masked_tokens": 129.4375, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 129.4375 + }, + { + "avg_mask_ratio": 0.520072100055404, + "avg_response_length": 228.2875, + "avg_student_mask_ratio": 0.520072100055404, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2602666666666666, + "grad_norm": 0.46875, + "kd_loss": 0.23483261663386656, + "learning_rate": 3e-06, + "loss": 0.2578, + "masked_tokens": 121.1625, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 121.1625 + }, + { + "avg_mask_ratio": 0.4505112706683576, + "avg_response_length": 237.1625, + "avg_student_mask_ratio": 0.4505112706683576, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2816, + "grad_norm": 0.2109375, + "kd_loss": 0.16831563824014067, + "learning_rate": 3e-06, + "loss": 0.1749, + "masked_tokens": 110.0875, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 110.0875 + }, + { + "avg_mask_ratio": 0.5064190638251602, + "avg_response_length": 244.3, + "avg_student_mask_ratio": 0.5064190638251602, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3029333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.1837224111601472, + "learning_rate": 3e-06, + "loss": 0.1742, + "masked_tokens": 119.825, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 119.825 + }, + { + "avg_mask_ratio": 0.4891548154759221, + "avg_response_length": 234.6625, + "avg_student_mask_ratio": 0.4891548154759221, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3242666666666667, + "grad_norm": 0.1337890625, + "kd_loss": 0.16418851822023725, + "learning_rate": 3e-06, + "loss": 0.1679, + "masked_tokens": 110.5625, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 110.5625 + }, + { + "avg_mask_ratio": 0.5235460251918994, + "avg_response_length": 258.0875, + "avg_student_mask_ratio": 0.5235460251918994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3456000000000001, + "grad_norm": 0.365234375, + "kd_loss": 0.21764025418508198, + "learning_rate": 3e-06, + "loss": 0.2414, + "masked_tokens": 130.725, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 130.725 + }, + { + "avg_mask_ratio": 0.4871393243782222, + "avg_response_length": 237.7125, + "avg_student_mask_ratio": 0.4871393243782222, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3669333333333333, + "grad_norm": 0.1396484375, + "kd_loss": 0.17638994189817367, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 121.4625, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 121.4625 + }, + { + "avg_mask_ratio": 0.5254402696969919, + "avg_response_length": 240.575, + "avg_student_mask_ratio": 0.5254402696969919, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3882666666666665, + "grad_norm": 0.123046875, + "kd_loss": 0.19458269486664026, + "learning_rate": 3e-06, + "loss": 0.1665, + "masked_tokens": 133.725, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 133.725 + }, + { + "avg_mask_ratio": 0.48242234602803363, + "avg_response_length": 247.775, + "avg_student_mask_ratio": 0.48242234602803363, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4096, + "grad_norm": 0.31640625, + "kd_loss": 0.18161650695474235, + "learning_rate": 3e-06, + "loss": 0.1935, + "masked_tokens": 128.4625, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 128.4625 + }, + { + "avg_mask_ratio": 0.4551548367831856, + "avg_response_length": 247.3, + "avg_student_mask_ratio": 0.4551548367831856, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4309333333333334, + "grad_norm": 0.359375, + "kd_loss": 0.18432183493453067, + "learning_rate": 3e-06, + "loss": 0.1761, + "masked_tokens": 127.125, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 127.125 + }, + { + "avg_mask_ratio": 0.4658544249658007, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4658544249658007, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4522666666666666, + "grad_norm": 0.28125, + "kd_loss": 0.2166073639286054, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 107.325, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.4795732157537714, + "avg_response_length": 200.975, + "avg_student_mask_ratio": 0.4795732157537714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4736, + "grad_norm": 0.0673828125, + "kd_loss": 0.18191290805701782, + "learning_rate": 3e-06, + "loss": 0.1771, + "masked_tokens": 110.9375, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 110.9375 + }, + { + "avg_mask_ratio": 0.5250519359949977, + "avg_response_length": 228.125, + "avg_student_mask_ratio": 0.5250519359949977, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4949333333333334, + "grad_norm": 0.166015625, + "kd_loss": 0.21970896905950213, + "learning_rate": 3e-06, + "loss": 0.206, + "masked_tokens": 132.55, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 132.55 + }, + { + "avg_mask_ratio": 0.4788092178525403, + "avg_response_length": 215.325, + "avg_student_mask_ratio": 0.4788092178525403, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5162666666666667, + "grad_norm": 0.1640625, + "kd_loss": 0.17339815042200826, + "learning_rate": 3e-06, + "loss": 0.1787, + "masked_tokens": 108.7125, + "mean_t": 0.5135623761918395, + "step": 710, + "student_masked_tokens": 108.7125 + }, + { + "avg_mask_ratio": 0.47950589570682495, + "avg_response_length": 227.075, + "avg_student_mask_ratio": 0.47950589570682495, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5375999999999999, + "grad_norm": 0.33203125, + "kd_loss": 0.21160616380475403, + "learning_rate": 3e-06, + "loss": 0.2144, + "masked_tokens": 110.8, + "mean_t": 0.5117021896177902, + "step": 720, + "student_masked_tokens": 110.8 + }, + { + "avg_mask_ratio": 0.4604924251558259, + "avg_response_length": 232.925, + "avg_student_mask_ratio": 0.4604924251558259, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5589333333333333, + "grad_norm": 0.1689453125, + "kd_loss": 0.17100098597317698, + "learning_rate": 3e-06, + "loss": 0.173, + "masked_tokens": 104.9625, + "mean_t": 0.49050743713742123, + "step": 730, + "student_masked_tokens": 104.9625 + }, + { + "avg_mask_ratio": 0.5185885130194947, + "avg_response_length": 183.325, + "avg_student_mask_ratio": 0.5185885130194947, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5802666666666667, + "grad_norm": 0.09326171875, + "kd_loss": 0.19763285817334691, + "learning_rate": 3e-06, + "loss": 0.2275, + "masked_tokens": 97.125, + "mean_t": 0.5505168779753149, + "step": 740, + "student_masked_tokens": 97.125 + }, + { + "avg_mask_ratio": 0.4191439319110941, + "avg_response_length": 223.65, + "avg_student_mask_ratio": 0.4191439319110941, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6016, + "grad_norm": 0.15625, + "kd_loss": 0.1989821564191459, + "learning_rate": 3e-06, + "loss": 0.1661, + "masked_tokens": 95.3125, + "mean_t": 0.4437690361432033, + "step": 750, + "student_masked_tokens": 95.3125 + }, + { + "avg_mask_ratio": 0.46666589792585, + "avg_response_length": 228.5125, + "avg_student_mask_ratio": 0.46666589792585, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6229333333333333, + "grad_norm": 0.146484375, + "kd_loss": 0.2019220097962581, + "learning_rate": 3e-06, + "loss": 0.1894, + "masked_tokens": 117.2625, + "mean_t": 0.49132869170280175, + "step": 760, + "student_masked_tokens": 117.2625 + }, + { + "avg_mask_ratio": 0.4440126782981679, + "avg_response_length": 259.675, + "avg_student_mask_ratio": 0.4440126782981679, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6442666666666668, + "grad_norm": 0.103515625, + "kd_loss": 0.14956598446678698, + "learning_rate": 3e-06, + "loss": 0.1431, + "masked_tokens": 117.8, + "mean_t": 0.4667695587326307, + "step": 770, + "student_masked_tokens": 117.8 + }, + { + "avg_mask_ratio": 0.42723098206624854, + "avg_response_length": 258.0125, + "avg_student_mask_ratio": 0.42723098206624854, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6656, + "grad_norm": 0.1298828125, + "kd_loss": 0.12485562007910005, + "learning_rate": 3e-06, + "loss": 0.1494, + "masked_tokens": 118.575, + "mean_t": 0.44942845597106496, + "step": 780, + "student_masked_tokens": 118.575 + }, + { + "avg_mask_ratio": 0.46588709874195045, + "avg_response_length": 220.7, + "avg_student_mask_ratio": 0.46588709874195045, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6869333333333332, + "grad_norm": 0.2353515625, + "kd_loss": 0.1650387186985956, + "learning_rate": 3e-06, + "loss": 0.151, + "masked_tokens": 102.7625, + "mean_t": 0.4993515375303105, + "step": 790, + "student_masked_tokens": 102.7625 + }, + { + "avg_mask_ratio": 0.46366424662992356, + "avg_response_length": 219.6875, + "avg_student_mask_ratio": 0.46366424662992356, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7082666666666668, + "grad_norm": 0.19140625, + "kd_loss": 0.13447051951316097, + "learning_rate": 3e-06, + "loss": 0.139, + "masked_tokens": 104.5, + "mean_t": 0.500370389316231, + "step": 800, + "student_masked_tokens": 104.5 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/training_args.bin b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..909c70530eafaa4be935d43ab877dad53e48f376 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c7c0f34b9d70dea72cbe8ab046b4e4dbf9290e9a199291cca7df91b67e9e4a +size 8120 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/README.md b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..853c8c794bf91f40f780bd5f844d32ed78b0c087 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/README.md @@ -0,0 +1,202 @@ +--- +base_model: GSAI-ML/LLaDA-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/adapter_config.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6edd4f943cffd057b1c0513e71fc0baccaee758 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/adapter_model.safetensors b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..303f806421493b58bdd8240337fc999e631adf79 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ba76be01e05b39977237b8de83471af8dd92e7526d39433004fec1beaaaa240 +size 2406624648 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/optimizer.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7112508a33a2861d98de686b14c3cd4bc5d5ae76 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00bba3dcee06c31aaafa53c82e03688d1680a058d0d40f2a5157029e4d06d840 +size 671304442 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/rng_state_0.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d16abe48b2cc7e7efc4edb829b6779a946ed51f1 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ce7c75ce13296e45210ccb1096dd25df931601abb0e0ecec721e873cdd71682 +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/rng_state_1.pth b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..8cf827a45dbf52f85474405cacf80cd8aed0a062 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7c7db325c11db3bd2e25ac9e720e3cd6777df562a0fabf30025afec1be8211a +size 14512 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/scheduler.pt b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..186dc458d73d11481b005defcffdd17b8b9b8a93 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf0430fbf8ed72ad90ba29a6f885082e3cf20a4095c07f619baeb5e62ae385d +size 1064 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/trainer_state.json b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..946302d003a8d64d7192d6d9c2fe863085a17ef7 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/trainer_state.json @@ -0,0 +1,2193 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9216, + "eval_steps": 500, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_mask_ratio": 0.4930951670394279, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4930951670394279, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.1494140625, + "kd_loss": 0.25194341027386147, + "learning_rate": 3e-06, + "loss": 0.2396, + "masked_tokens": 110.925, + "mean_t": 0.5145528071501758, + "step": 10, + "student_masked_tokens": 110.925 + }, + { + "avg_mask_ratio": 0.4127206720062532, + "avg_response_length": 277.15, + "avg_student_mask_ratio": 0.4127206720062532, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.2138671875, + "kd_loss": 0.14083908485238297, + "learning_rate": 3e-06, + "loss": 0.1768, + "masked_tokens": 108.8625, + "mean_t": 0.43874448732240123, + "step": 20, + "student_masked_tokens": 108.8625 + }, + { + "avg_mask_ratio": 0.4616696212324314, + "avg_response_length": 211.7625, + "avg_student_mask_ratio": 0.4616696212324314, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.064, + "grad_norm": 0.177734375, + "kd_loss": 0.19280819811582842, + "learning_rate": 3e-06, + "loss": 0.1837, + "masked_tokens": 111.375, + "mean_t": 0.4803953981841914, + "step": 30, + "student_masked_tokens": 111.375 + }, + { + "avg_mask_ratio": 0.42360913623997476, + "avg_response_length": 224.6125, + "avg_student_mask_ratio": 0.42360913623997476, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.205078125, + "kd_loss": 0.15898024347496859, + "learning_rate": 3e-06, + "loss": 0.1597, + "masked_tokens": 98.3, + "mean_t": 0.4569831106782658, + "step": 40, + "student_masked_tokens": 98.3 + }, + { + "avg_mask_ratio": 0.4330951495358022, + "avg_response_length": 207.125, + "avg_student_mask_ratio": 0.4330951495358022, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.10693359375, + "kd_loss": 0.15454275260567557, + "learning_rate": 3e-06, + "loss": 0.1595, + "masked_tokens": 85.075, + "mean_t": 0.4612453707959503, + "step": 50, + "student_masked_tokens": 85.075 + }, + { + "avg_mask_ratio": 0.4555395155097358, + "avg_response_length": 254.2125, + "avg_student_mask_ratio": 0.4555395155097358, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.128, + "grad_norm": 0.09375, + "kd_loss": 0.19431558840633442, + "learning_rate": 3e-06, + "loss": 0.1967, + "masked_tokens": 119.1125, + "mean_t": 0.48597636765334756, + "step": 60, + "student_masked_tokens": 119.1125 + }, + { + "avg_mask_ratio": 0.5148372989846394, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.5148372989846394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.2421875, + "kd_loss": 0.17586028346822785, + "learning_rate": 3e-06, + "loss": 0.2039, + "masked_tokens": 105.45, + "mean_t": 0.5459650319069624, + "step": 70, + "student_masked_tokens": 105.45 + }, + { + "avg_mask_ratio": 0.3827478863298893, + "avg_response_length": 225.85, + "avg_student_mask_ratio": 0.3827478863298893, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 0.201171875, + "kd_loss": 0.13183402672058264, + "learning_rate": 3e-06, + "loss": 0.1337, + "masked_tokens": 86.675, + "mean_t": 0.40758824030635876, + "step": 80, + "student_masked_tokens": 86.675 + }, + { + "avg_mask_ratio": 0.5017695252550766, + "avg_response_length": 234.25, + "avg_student_mask_ratio": 0.5017695252550766, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.192, + "grad_norm": 0.287109375, + "kd_loss": 0.23756451243028592, + "learning_rate": 3e-06, + "loss": 0.2228, + "masked_tokens": 108.4125, + "mean_t": 0.5297661645396147, + "step": 90, + "student_masked_tokens": 108.4125 + }, + { + "avg_mask_ratio": 0.4637213449750561, + "avg_response_length": 210.175, + "avg_student_mask_ratio": 0.4637213449750561, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 0.2236328125, + "kd_loss": 0.17453956390508713, + "learning_rate": 3e-06, + "loss": 0.1847, + "masked_tokens": 107.375, + "mean_t": 0.4886587227345444, + "step": 100, + "student_masked_tokens": 107.375 + }, + { + "avg_mask_ratio": 0.48738867897773164, + "avg_response_length": 213.0875, + "avg_student_mask_ratio": 0.48738867897773164, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 0.259765625, + "kd_loss": 0.21283352600622152, + "learning_rate": 3e-06, + "loss": 0.1975, + "masked_tokens": 101.7875, + "mean_t": 0.5246987929102034, + "step": 110, + "student_masked_tokens": 101.7875 + }, + { + "avg_mask_ratio": 0.4451883323024958, + "avg_response_length": 232.3, + "avg_student_mask_ratio": 0.4451883323024958, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.256, + "grad_norm": 0.1328125, + "kd_loss": 0.23480740101426817, + "learning_rate": 3e-06, + "loss": 0.2005, + "masked_tokens": 107.7, + "mean_t": 0.4805434140143916, + "step": 120, + "student_masked_tokens": 107.7 + }, + { + "avg_mask_ratio": 0.43939279407495635, + "avg_response_length": 202.7, + "avg_student_mask_ratio": 0.43939279407495635, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.1453842066344862, + "learning_rate": 3e-06, + "loss": 0.1366, + "masked_tokens": 89.95, + "mean_t": 0.47522516988683494, + "step": 130, + "student_masked_tokens": 89.95 + }, + { + "avg_mask_ratio": 0.4922366282902658, + "avg_response_length": 264.5375, + "avg_student_mask_ratio": 0.4922366282902658, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 0.294921875, + "kd_loss": 0.1732477028232097, + "learning_rate": 3e-06, + "loss": 0.1899, + "masked_tokens": 144.9, + "mean_t": 0.5204090005659964, + "step": 140, + "student_masked_tokens": 144.9 + }, + { + "avg_mask_ratio": 0.4724786171223968, + "avg_response_length": 258.1125, + "avg_student_mask_ratio": 0.4724786171223968, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.32, + "grad_norm": 0.04443359375, + "kd_loss": 0.2384887565949157, + "learning_rate": 3e-06, + "loss": 0.2151, + "masked_tokens": 127.4125, + "mean_t": 0.5012552456930279, + "step": 150, + "student_masked_tokens": 127.4125 + }, + { + "avg_mask_ratio": 0.49717973986989816, + "avg_response_length": 209.325, + "avg_student_mask_ratio": 0.49717973986989816, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.2255859375, + "kd_loss": 0.2190230320150704, + "learning_rate": 3e-06, + "loss": 0.2021, + "masked_tokens": 98.4875, + "mean_t": 0.5408745193795766, + "step": 160, + "student_masked_tokens": 98.4875 + }, + { + "avg_mask_ratio": 0.48284467663615943, + "avg_response_length": 188.65, + "avg_student_mask_ratio": 0.48284467663615943, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 0.181640625, + "kd_loss": 0.198812551523406, + "learning_rate": 3e-06, + "loss": 0.1911, + "masked_tokens": 89.3125, + "mean_t": 0.5134547733236104, + "step": 170, + "student_masked_tokens": 89.3125 + }, + { + "avg_mask_ratio": 0.44864035704231353, + "avg_response_length": 246.6875, + "avg_student_mask_ratio": 0.44864035704231353, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.384, + "grad_norm": 0.09716796875, + "kd_loss": 0.17860529323728117, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 110.0125, + "mean_t": 0.48226988823735156, + "step": 180, + "student_masked_tokens": 110.0125 + }, + { + "avg_mask_ratio": 0.47850618849042803, + "avg_response_length": 233.675, + "avg_student_mask_ratio": 0.47850618849042803, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 0.337890625, + "kd_loss": 0.19919134742667666, + "learning_rate": 3e-06, + "loss": 0.1932, + "masked_tokens": 109.575, + "mean_t": 0.5165087037021294, + "step": 190, + "student_masked_tokens": 109.575 + }, + { + "avg_mask_ratio": 0.4662990250624716, + "avg_response_length": 197.8, + "avg_student_mask_ratio": 0.4662990250624716, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 0.1259765625, + "kd_loss": 0.11774875816399799, + "learning_rate": 3e-06, + "loss": 0.1286, + "masked_tokens": 97.5, + "mean_t": 0.5073627714533359, + "step": 200, + "student_masked_tokens": 97.5 + }, + { + "avg_mask_ratio": 0.451080821454525, + "avg_response_length": 213.7875, + "avg_student_mask_ratio": 0.451080821454525, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.448, + "grad_norm": 0.134765625, + "kd_loss": 0.15073641720641717, + "learning_rate": 3e-06, + "loss": 0.1577, + "masked_tokens": 96.6375, + "mean_t": 0.48430291628465055, + "step": 210, + "student_masked_tokens": 96.6375 + }, + { + "avg_mask_ratio": 0.5438536155037582, + "avg_response_length": 236.5125, + "avg_student_mask_ratio": 0.5438536155037582, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 0.298828125, + "kd_loss": 0.24193658930453238, + "learning_rate": 3e-06, + "loss": 0.248, + "masked_tokens": 126.4375, + "mean_t": 0.570199209311977, + "step": 220, + "student_masked_tokens": 126.4375 + }, + { + "avg_mask_ratio": 0.43992503273766487, + "avg_response_length": 255.875, + "avg_student_mask_ratio": 0.43992503273766487, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 0.142578125, + "kd_loss": 0.14758750247131047, + "learning_rate": 3e-06, + "loss": 0.1703, + "masked_tokens": 107.3875, + "mean_t": 0.46891279935371133, + "step": 230, + "student_masked_tokens": 107.3875 + }, + { + "avg_mask_ratio": 0.46683448635449165, + "avg_response_length": 224.125, + "avg_student_mask_ratio": 0.46683448635449165, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.512, + "grad_norm": 0.35546875, + "kd_loss": 0.2131086487675077, + "learning_rate": 3e-06, + "loss": 0.196, + "masked_tokens": 110.2875, + "mean_t": 0.502228345896583, + "step": 240, + "student_masked_tokens": 110.2875 + }, + { + "avg_mask_ratio": 0.4476269483449869, + "avg_response_length": 243.2375, + "avg_student_mask_ratio": 0.4476269483449869, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.17319737961729237, + "learning_rate": 3e-06, + "loss": 0.1469, + "masked_tokens": 112.6375, + "mean_t": 0.4733429416548461, + "step": 250, + "student_masked_tokens": 112.6375 + }, + { + "avg_mask_ratio": 0.45657019784557634, + "avg_response_length": 245.55, + "avg_student_mask_ratio": 0.45657019784557634, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 0.1728515625, + "kd_loss": 0.15818778217344515, + "learning_rate": 3e-06, + "loss": 0.1487, + "masked_tokens": 110.0375, + "mean_t": 0.4843149524240289, + "step": 260, + "student_masked_tokens": 110.0375 + }, + { + "avg_mask_ratio": 0.5293830037582665, + "avg_response_length": 223.975, + "avg_student_mask_ratio": 0.5293830037582665, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.576, + "grad_norm": 0.5, + "kd_loss": 0.24745769179717172, + "learning_rate": 3e-06, + "loss": 0.2709, + "masked_tokens": 119.6, + "mean_t": 0.5643589949700981, + "step": 270, + "student_masked_tokens": 119.6 + }, + { + "avg_mask_ratio": 0.4577330934116617, + "avg_response_length": 257.2125, + "avg_student_mask_ratio": 0.4577330934116617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.2216796875, + "kd_loss": 0.18448232172211476, + "learning_rate": 3e-06, + "loss": 0.1662, + "masked_tokens": 130.475, + "mean_t": 0.47818811538163575, + "step": 280, + "student_masked_tokens": 130.475 + }, + { + "avg_mask_ratio": 0.39295024327002465, + "avg_response_length": 246.6375, + "avg_student_mask_ratio": 0.39295024327002465, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 0.047119140625, + "kd_loss": 0.1050827642444176, + "learning_rate": 3e-06, + "loss": 0.1353, + "masked_tokens": 100.9, + "mean_t": 0.4088635521940887, + "step": 290, + "student_masked_tokens": 100.9 + }, + { + "avg_mask_ratio": 0.4409991275751963, + "avg_response_length": 217.9125, + "avg_student_mask_ratio": 0.4409991275751963, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.64, + "grad_norm": 0.1513671875, + "kd_loss": 0.13134403475523868, + "learning_rate": 3e-06, + "loss": 0.1629, + "masked_tokens": 106.925, + "mean_t": 0.47984200695063917, + "step": 300, + "student_masked_tokens": 106.925 + }, + { + "avg_mask_ratio": 0.47207197032403203, + "avg_response_length": 188.9125, + "avg_student_mask_ratio": 0.47207197032403203, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 0.2314453125, + "kd_loss": 0.19167470987705998, + "learning_rate": 3e-06, + "loss": 0.2063, + "masked_tokens": 85.125, + "mean_t": 0.5026606284547597, + "step": 310, + "student_masked_tokens": 85.125 + }, + { + "avg_mask_ratio": 0.4926959708333015, + "avg_response_length": 248.4, + "avg_student_mask_ratio": 0.4926959708333015, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.15234375, + "kd_loss": 0.2633828842135699, + "learning_rate": 3e-06, + "loss": 0.2589, + "masked_tokens": 124.5625, + "mean_t": 0.5293499688967132, + "step": 320, + "student_masked_tokens": 124.5625 + }, + { + "avg_mask_ratio": 0.5075328870676458, + "avg_response_length": 235.075, + "avg_student_mask_ratio": 0.5075328870676458, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.704, + "grad_norm": 0.2197265625, + "kd_loss": 0.21129831432894547, + "learning_rate": 3e-06, + "loss": 0.2103, + "masked_tokens": 127.9, + "mean_t": 0.5426030711154454, + "step": 330, + "student_masked_tokens": 127.9 + }, + { + "avg_mask_ratio": 0.44940012450679206, + "avg_response_length": 241.4875, + "avg_student_mask_ratio": 0.44940012450679206, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.40625, + "kd_loss": 0.18290909784880824, + "learning_rate": 3e-06, + "loss": 0.1801, + "masked_tokens": 110.15, + "mean_t": 0.47635243807453664, + "step": 340, + "student_masked_tokens": 110.15 + }, + { + "avg_mask_ratio": 0.4945301389612723, + "avg_response_length": 235.6375, + "avg_student_mask_ratio": 0.4945301389612723, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.361328125, + "kd_loss": 0.2113740879778227, + "learning_rate": 3e-06, + "loss": 0.2186, + "masked_tokens": 125.175, + "mean_t": 0.5268881446914747, + "step": 350, + "student_masked_tokens": 125.175 + }, + { + "avg_mask_ratio": 0.4749954905593768, + "avg_response_length": 243.575, + "avg_student_mask_ratio": 0.4749954905593768, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.768, + "grad_norm": 0.1298828125, + "kd_loss": 0.16429275130377619, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 115.3875, + "mean_t": 0.5040419134311378, + "step": 360, + "student_masked_tokens": 115.3875 + }, + { + "avg_mask_ratio": 0.47621052770409733, + "avg_response_length": 257.575, + "avg_student_mask_ratio": 0.47621052770409733, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.1083984375, + "kd_loss": 0.2089853325122931, + "learning_rate": 3e-06, + "loss": 0.192, + "masked_tokens": 126.85, + "mean_t": 0.5127181728370488, + "step": 370, + "student_masked_tokens": 126.85 + }, + { + "avg_mask_ratio": 0.4449806016869843, + "avg_response_length": 226.3625, + "avg_student_mask_ratio": 0.4449806016869843, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.09423828125, + "kd_loss": 0.13386085629390437, + "learning_rate": 3e-06, + "loss": 0.132, + "masked_tokens": 109.35, + "mean_t": 0.4825185665744357, + "step": 380, + "student_masked_tokens": 109.35 + }, + { + "avg_mask_ratio": 0.47845896739745514, + "avg_response_length": 218.1125, + "avg_student_mask_ratio": 0.47845896739745514, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.832, + "grad_norm": 0.142578125, + "kd_loss": 0.1445786759162107, + "learning_rate": 3e-06, + "loss": 0.1766, + "masked_tokens": 111.85, + "mean_t": 0.5053101469413377, + "step": 390, + "student_masked_tokens": 111.85 + }, + { + "avg_mask_ratio": 0.4727763219270855, + "avg_response_length": 242.6125, + "avg_student_mask_ratio": 0.4727763219270855, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.380859375, + "kd_loss": 0.20748561576523344, + "learning_rate": 3e-06, + "loss": 0.1934, + "masked_tokens": 119.775, + "mean_t": 0.5052250675857067, + "step": 400, + "student_masked_tokens": 119.775 + }, + { + "avg_mask_ratio": 0.4756184325611684, + "avg_response_length": 239.5375, + "avg_student_mask_ratio": 0.4756184325611684, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.10791015625, + "kd_loss": 0.2029281118774257, + "learning_rate": 3e-06, + "loss": 0.2012, + "masked_tokens": 122.1875, + "mean_t": 0.5050956419203431, + "step": 410, + "student_masked_tokens": 122.1875 + }, + { + "avg_mask_ratio": 0.4428858984610997, + "avg_response_length": 220.225, + "avg_student_mask_ratio": 0.4428858984610997, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.896, + "grad_norm": 0.1826171875, + "kd_loss": 0.14211951963759475, + "learning_rate": 3e-06, + "loss": 0.1365, + "masked_tokens": 86.0125, + "mean_t": 0.4769687672611326, + "step": 420, + "student_masked_tokens": 86.0125 + }, + { + "avg_mask_ratio": 0.4625907339621335, + "avg_response_length": 249.2125, + "avg_student_mask_ratio": 0.4625907339621335, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.1474609375, + "kd_loss": 0.1504247854208188, + "learning_rate": 3e-06, + "loss": 0.1743, + "masked_tokens": 103.675, + "mean_t": 0.49262027950026094, + "step": 430, + "student_masked_tokens": 103.675 + }, + { + "avg_mask_ratio": 0.4465438393759541, + "avg_response_length": 241.9625, + "avg_student_mask_ratio": 0.4465438393759541, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.142578125, + "kd_loss": 0.18699200686958192, + "learning_rate": 3e-06, + "loss": 0.1806, + "masked_tokens": 114.9125, + "mean_t": 0.47046207524836064, + "step": 440, + "student_masked_tokens": 114.9125 + }, + { + "avg_mask_ratio": 0.42805201532319187, + "avg_response_length": 231.9875, + "avg_student_mask_ratio": 0.42805201532319187, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.96, + "grad_norm": 0.193359375, + "kd_loss": 0.15030699897054092, + "learning_rate": 3e-06, + "loss": 0.1582, + "masked_tokens": 103.875, + "mean_t": 0.45781184462830427, + "step": 450, + "student_masked_tokens": 103.875 + }, + { + "avg_mask_ratio": 0.4651826085988432, + "avg_response_length": 214.6125, + "avg_student_mask_ratio": 0.4651826085988432, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.2236328125, + "kd_loss": 0.15806215325555967, + "learning_rate": 3e-06, + "loss": 0.1756, + "masked_tokens": 104.125, + "mean_t": 0.5075790266972036, + "step": 460, + "student_masked_tokens": 104.125 + }, + { + "avg_mask_ratio": 0.47693050167124185, + "avg_response_length": 226.16666666666666, + "avg_student_mask_ratio": 0.47693050167124185, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0042666666666666, + "grad_norm": 0.2333984375, + "kd_loss": 0.19203724756923315, + "learning_rate": 3e-06, + "loss": 0.2197, + "masked_tokens": 109.10714285714286, + "mean_t": 0.5031429776822084, + "step": 470, + "student_masked_tokens": 109.10714285714286 + }, + { + "avg_mask_ratio": 0.47416816898621617, + "avg_response_length": 250.25, + "avg_student_mask_ratio": 0.47416816898621617, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0256, + "grad_norm": 0.248046875, + "kd_loss": 0.21296195546548802, + "learning_rate": 3e-06, + "loss": 0.229, + "masked_tokens": 117.9125, + "mean_t": 0.504472183593316, + "step": 480, + "student_masked_tokens": 117.9125 + }, + { + "avg_mask_ratio": 0.45927587888436394, + "avg_response_length": 233.05, + "avg_student_mask_ratio": 0.45927587888436394, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0469333333333333, + "grad_norm": 0.16796875, + "kd_loss": 0.12627680183309167, + "learning_rate": 3e-06, + "loss": 0.1626, + "masked_tokens": 98.8375, + "mean_t": 0.4888980514719151, + "step": 490, + "student_masked_tokens": 98.8375 + }, + { + "avg_mask_ratio": 0.5047377114649862, + "avg_response_length": 260.225, + "avg_student_mask_ratio": 0.5047377114649862, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0682666666666667, + "grad_norm": 0.193359375, + "kd_loss": 0.15308890046544832, + "learning_rate": 3e-06, + "loss": 0.1508, + "masked_tokens": 127.4, + "mean_t": 0.5469163245841628, + "step": 500, + "student_masked_tokens": 127.4 + }, + { + "avg_mask_ratio": 0.5005011082510464, + "avg_response_length": 252.05, + "avg_student_mask_ratio": 0.5005011082510464, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.0896, + "grad_norm": 0.40234375, + "kd_loss": 0.20784167646600055, + "learning_rate": 3e-06, + "loss": 0.2048, + "masked_tokens": 133.5, + "mean_t": 0.5275314710394013, + "step": 510, + "student_masked_tokens": 133.5 + }, + { + "avg_mask_ratio": 0.4552151845302433, + "avg_response_length": 200.7625, + "avg_student_mask_ratio": 0.4552151845302433, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1109333333333333, + "grad_norm": 0.396484375, + "kd_loss": 0.14625247523838425, + "learning_rate": 3e-06, + "loss": 0.1641, + "masked_tokens": 86.475, + "mean_t": 0.49193521235138177, + "step": 520, + "student_masked_tokens": 86.475 + }, + { + "avg_mask_ratio": 0.46727682640776036, + "avg_response_length": 214.5375, + "avg_student_mask_ratio": 0.46727682640776036, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1322666666666668, + "grad_norm": 0.30859375, + "kd_loss": 0.19772737846966032, + "learning_rate": 3e-06, + "loss": 0.2219, + "masked_tokens": 99.8375, + "mean_t": 0.49621942077938, + "step": 530, + "student_masked_tokens": 99.8375 + }, + { + "avg_mask_ratio": 0.48785575344227256, + "avg_response_length": 231.3125, + "avg_student_mask_ratio": 0.48785575344227256, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1536, + "grad_norm": 0.216796875, + "kd_loss": 0.200824987803162, + "learning_rate": 3e-06, + "loss": 0.2346, + "masked_tokens": 106.625, + "mean_t": 0.5208023569080978, + "step": 540, + "student_masked_tokens": 106.625 + }, + { + "avg_mask_ratio": 0.4477671392261982, + "avg_response_length": 213.525, + "avg_student_mask_ratio": 0.4477671392261982, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1749333333333334, + "grad_norm": 0.169921875, + "kd_loss": 0.16798589587615426, + "learning_rate": 3e-06, + "loss": 0.1849, + "masked_tokens": 97.7, + "mean_t": 0.4767197913257405, + "step": 550, + "student_masked_tokens": 97.7 + }, + { + "avg_mask_ratio": 0.3861591775319539, + "avg_response_length": 238.5375, + "avg_student_mask_ratio": 0.3861591775319539, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.1962666666666666, + "grad_norm": 0.236328125, + "kd_loss": 0.19300692316061543, + "learning_rate": 3e-06, + "loss": 0.1797, + "masked_tokens": 99.625, + "mean_t": 0.4111072298779618, + "step": 560, + "student_masked_tokens": 99.625 + }, + { + "avg_mask_ratio": 0.44424078196752814, + "avg_response_length": 230.1625, + "avg_student_mask_ratio": 0.44424078196752814, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2176, + "grad_norm": 0.220703125, + "kd_loss": 0.16140609600133757, + "learning_rate": 3e-06, + "loss": 0.1755, + "masked_tokens": 108.0125, + "mean_t": 0.48533305872697385, + "step": 570, + "student_masked_tokens": 108.0125 + }, + { + "avg_mask_ratio": 0.4715048542013392, + "avg_response_length": 269.1375, + "avg_student_mask_ratio": 0.4715048542013392, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2389333333333332, + "grad_norm": 0.388671875, + "kd_loss": 0.2032364588788596, + "learning_rate": 3e-06, + "loss": 0.1897, + "masked_tokens": 129.4375, + "mean_t": 0.49976949762785805, + "step": 580, + "student_masked_tokens": 129.4375 + }, + { + "avg_mask_ratio": 0.520072100055404, + "avg_response_length": 228.2875, + "avg_student_mask_ratio": 0.520072100055404, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2602666666666666, + "grad_norm": 0.46875, + "kd_loss": 0.23483261663386656, + "learning_rate": 3e-06, + "loss": 0.2578, + "masked_tokens": 121.1625, + "mean_t": 0.5521843038732186, + "step": 590, + "student_masked_tokens": 121.1625 + }, + { + "avg_mask_ratio": 0.4505112706683576, + "avg_response_length": 237.1625, + "avg_student_mask_ratio": 0.4505112706683576, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.2816, + "grad_norm": 0.2109375, + "kd_loss": 0.16831563824014067, + "learning_rate": 3e-06, + "loss": 0.1749, + "masked_tokens": 110.0875, + "mean_t": 0.4788527532829903, + "step": 600, + "student_masked_tokens": 110.0875 + }, + { + "avg_mask_ratio": 0.5064190638251602, + "avg_response_length": 244.3, + "avg_student_mask_ratio": 0.5064190638251602, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3029333333333333, + "grad_norm": 0.099609375, + "kd_loss": 0.1837224111601472, + "learning_rate": 3e-06, + "loss": 0.1742, + "masked_tokens": 119.825, + "mean_t": 0.5445419924799353, + "step": 610, + "student_masked_tokens": 119.825 + }, + { + "avg_mask_ratio": 0.4891548154759221, + "avg_response_length": 234.6625, + "avg_student_mask_ratio": 0.4891548154759221, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3242666666666667, + "grad_norm": 0.1337890625, + "kd_loss": 0.16418851822023725, + "learning_rate": 3e-06, + "loss": 0.1679, + "masked_tokens": 110.5625, + "mean_t": 0.52490478400141, + "step": 620, + "student_masked_tokens": 110.5625 + }, + { + "avg_mask_ratio": 0.5235460251918994, + "avg_response_length": 258.0875, + "avg_student_mask_ratio": 0.5235460251918994, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3456000000000001, + "grad_norm": 0.365234375, + "kd_loss": 0.21764025418508198, + "learning_rate": 3e-06, + "loss": 0.2414, + "masked_tokens": 130.725, + "mean_t": 0.5564947265549562, + "step": 630, + "student_masked_tokens": 130.725 + }, + { + "avg_mask_ratio": 0.4871393243782222, + "avg_response_length": 237.7125, + "avg_student_mask_ratio": 0.4871393243782222, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3669333333333333, + "grad_norm": 0.1396484375, + "kd_loss": 0.17638994189817367, + "learning_rate": 3e-06, + "loss": 0.1583, + "masked_tokens": 121.4625, + "mean_t": 0.5093393943971023, + "step": 640, + "student_masked_tokens": 121.4625 + }, + { + "avg_mask_ratio": 0.5254402696969919, + "avg_response_length": 240.575, + "avg_student_mask_ratio": 0.5254402696969919, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.3882666666666665, + "grad_norm": 0.123046875, + "kd_loss": 0.19458269486664026, + "learning_rate": 3e-06, + "loss": 0.1665, + "masked_tokens": 133.725, + "mean_t": 0.5558586571365595, + "step": 650, + "student_masked_tokens": 133.725 + }, + { + "avg_mask_ratio": 0.48242234602803363, + "avg_response_length": 247.775, + "avg_student_mask_ratio": 0.48242234602803363, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4096, + "grad_norm": 0.31640625, + "kd_loss": 0.18161650695474235, + "learning_rate": 3e-06, + "loss": 0.1935, + "masked_tokens": 128.4625, + "mean_t": 0.51307404555846, + "step": 660, + "student_masked_tokens": 128.4625 + }, + { + "avg_mask_ratio": 0.4551548367831856, + "avg_response_length": 247.3, + "avg_student_mask_ratio": 0.4551548367831856, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4309333333333334, + "grad_norm": 0.359375, + "kd_loss": 0.18432183493453067, + "learning_rate": 3e-06, + "loss": 0.1761, + "masked_tokens": 127.125, + "mean_t": 0.4907285622088239, + "step": 670, + "student_masked_tokens": 127.125 + }, + { + "avg_mask_ratio": 0.4658544249658007, + "avg_response_length": 224.7375, + "avg_student_mask_ratio": 0.4658544249658007, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4522666666666666, + "grad_norm": 0.28125, + "kd_loss": 0.2166073639286054, + "learning_rate": 3e-06, + "loss": 0.1872, + "masked_tokens": 107.325, + "mean_t": 0.4913603452499956, + "step": 680, + "student_masked_tokens": 107.325 + }, + { + "avg_mask_ratio": 0.4795732157537714, + "avg_response_length": 200.975, + "avg_student_mask_ratio": 0.4795732157537714, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4736, + "grad_norm": 0.0673828125, + "kd_loss": 0.18191290805701782, + "learning_rate": 3e-06, + "loss": 0.1771, + "masked_tokens": 110.9375, + "mean_t": 0.5133644798654131, + "step": 690, + "student_masked_tokens": 110.9375 + }, + { + "avg_mask_ratio": 0.5250519359949977, + "avg_response_length": 228.125, + "avg_student_mask_ratio": 0.5250519359949977, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.4949333333333334, + "grad_norm": 0.166015625, + "kd_loss": 0.21970896905950213, + "learning_rate": 3e-06, + "loss": 0.206, + "masked_tokens": 132.55, + "mean_t": 0.5625698395539075, + "step": 700, + "student_masked_tokens": 132.55 + }, + { + "avg_mask_ratio": 0.4788092178525403, + "avg_response_length": 215.325, + "avg_student_mask_ratio": 0.4788092178525403, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5162666666666667, + "grad_norm": 0.1640625, + "kd_loss": 0.17339815042200826, + "learning_rate": 3e-06, + "loss": 0.1787, + "masked_tokens": 108.7125, + "mean_t": 0.5135623761918395, + "step": 710, + "student_masked_tokens": 108.7125 + }, + { + "avg_mask_ratio": 0.47950589570682495, + "avg_response_length": 227.075, + "avg_student_mask_ratio": 0.47950589570682495, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5375999999999999, + "grad_norm": 0.33203125, + "kd_loss": 0.21160616380475403, + "learning_rate": 3e-06, + "loss": 0.2144, + "masked_tokens": 110.8, + "mean_t": 0.5117021896177902, + "step": 720, + "student_masked_tokens": 110.8 + }, + { + "avg_mask_ratio": 0.4604924251558259, + "avg_response_length": 232.925, + "avg_student_mask_ratio": 0.4604924251558259, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5589333333333333, + "grad_norm": 0.1689453125, + "kd_loss": 0.17100098597317698, + "learning_rate": 3e-06, + "loss": 0.173, + "masked_tokens": 104.9625, + "mean_t": 0.49050743713742123, + "step": 730, + "student_masked_tokens": 104.9625 + }, + { + "avg_mask_ratio": 0.5185885130194947, + "avg_response_length": 183.325, + "avg_student_mask_ratio": 0.5185885130194947, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.5802666666666667, + "grad_norm": 0.09326171875, + "kd_loss": 0.19763285817334691, + "learning_rate": 3e-06, + "loss": 0.2275, + "masked_tokens": 97.125, + "mean_t": 0.5505168779753149, + "step": 740, + "student_masked_tokens": 97.125 + }, + { + "avg_mask_ratio": 0.4191439319110941, + "avg_response_length": 223.65, + "avg_student_mask_ratio": 0.4191439319110941, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6016, + "grad_norm": 0.15625, + "kd_loss": 0.1989821564191459, + "learning_rate": 3e-06, + "loss": 0.1661, + "masked_tokens": 95.3125, + "mean_t": 0.4437690361432033, + "step": 750, + "student_masked_tokens": 95.3125 + }, + { + "avg_mask_ratio": 0.46666589792585, + "avg_response_length": 228.5125, + "avg_student_mask_ratio": 0.46666589792585, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6229333333333333, + "grad_norm": 0.146484375, + "kd_loss": 0.2019220097962581, + "learning_rate": 3e-06, + "loss": 0.1894, + "masked_tokens": 117.2625, + "mean_t": 0.49132869170280175, + "step": 760, + "student_masked_tokens": 117.2625 + }, + { + "avg_mask_ratio": 0.4440126782981679, + "avg_response_length": 259.675, + "avg_student_mask_ratio": 0.4440126782981679, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6442666666666668, + "grad_norm": 0.103515625, + "kd_loss": 0.14956598446678698, + "learning_rate": 3e-06, + "loss": 0.1431, + "masked_tokens": 117.8, + "mean_t": 0.4667695587326307, + "step": 770, + "student_masked_tokens": 117.8 + }, + { + "avg_mask_ratio": 0.42723098206624854, + "avg_response_length": 258.0125, + "avg_student_mask_ratio": 0.42723098206624854, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6656, + "grad_norm": 0.1298828125, + "kd_loss": 0.12485562007910005, + "learning_rate": 3e-06, + "loss": 0.1494, + "masked_tokens": 118.575, + "mean_t": 0.44942845597106496, + "step": 780, + "student_masked_tokens": 118.575 + }, + { + "avg_mask_ratio": 0.46588709874195045, + "avg_response_length": 220.7, + "avg_student_mask_ratio": 0.46588709874195045, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.6869333333333332, + "grad_norm": 0.2353515625, + "kd_loss": 0.1650387186985956, + "learning_rate": 3e-06, + "loss": 0.151, + "masked_tokens": 102.7625, + "mean_t": 0.4993515375303105, + "step": 790, + "student_masked_tokens": 102.7625 + }, + { + "avg_mask_ratio": 0.46366424662992356, + "avg_response_length": 219.6875, + "avg_student_mask_ratio": 0.46366424662992356, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7082666666666668, + "grad_norm": 0.19140625, + "kd_loss": 0.13447051951316097, + "learning_rate": 3e-06, + "loss": 0.139, + "masked_tokens": 104.5, + "mean_t": 0.500370389316231, + "step": 800, + "student_masked_tokens": 104.5 + }, + { + "avg_mask_ratio": 0.5037169002695009, + "avg_response_length": 250.2875, + "avg_student_mask_ratio": 0.5037169002695009, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7296, + "grad_norm": 0.255859375, + "kd_loss": 0.18524283417677906, + "learning_rate": 3e-06, + "loss": 0.1925, + "masked_tokens": 119.575, + "mean_t": 0.5317009104182944, + "step": 810, + "student_masked_tokens": 119.575 + }, + { + "avg_mask_ratio": 0.5109186505898833, + "avg_response_length": 225.95, + "avg_student_mask_ratio": 0.5109186505898833, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7509333333333332, + "grad_norm": 0.255859375, + "kd_loss": 0.22792843131458085, + "learning_rate": 3e-06, + "loss": 0.2303, + "masked_tokens": 129.25, + "mean_t": 0.5392061032878701, + "step": 820, + "student_masked_tokens": 129.25 + }, + { + "avg_mask_ratio": 0.4988811274059117, + "avg_response_length": 263.7875, + "avg_student_mask_ratio": 0.4988811274059117, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7722666666666667, + "grad_norm": 0.08349609375, + "kd_loss": 0.19122445829223125, + "learning_rate": 3e-06, + "loss": 0.1808, + "masked_tokens": 137.0, + "mean_t": 0.5238314627087675, + "step": 830, + "student_masked_tokens": 137.0 + }, + { + "avg_mask_ratio": 0.4997270987310912, + "avg_response_length": 221.9, + "avg_student_mask_ratio": 0.4997270987310912, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.7936, + "grad_norm": 0.158203125, + "kd_loss": 0.17390446548149613, + "learning_rate": 3e-06, + "loss": 0.1818, + "masked_tokens": 114.525, + "mean_t": 0.5301066277665086, + "step": 840, + "student_masked_tokens": 114.525 + }, + { + "avg_mask_ratio": 0.4988076956477016, + "avg_response_length": 225.5, + "avg_student_mask_ratio": 0.4988076956477016, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8149333333333333, + "grad_norm": 0.130859375, + "kd_loss": 0.17035312611951667, + "learning_rate": 3e-06, + "loss": 0.1629, + "masked_tokens": 116.9125, + "mean_t": 0.5343429344706238, + "step": 850, + "student_masked_tokens": 116.9125 + }, + { + "avg_mask_ratio": 0.4497753610135987, + "avg_response_length": 260.15, + "avg_student_mask_ratio": 0.4497753610135987, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8362666666666667, + "grad_norm": 0.06591796875, + "kd_loss": 0.2005822003855428, + "learning_rate": 3e-06, + "loss": 0.1598, + "masked_tokens": 121.7625, + "mean_t": 0.4791536889737472, + "step": 860, + "student_masked_tokens": 121.7625 + }, + { + "avg_mask_ratio": 0.48591957957251, + "avg_response_length": 231.0875, + "avg_student_mask_ratio": 0.48591957957251, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8576000000000001, + "grad_norm": 0.2412109375, + "kd_loss": 0.17790169413831336, + "learning_rate": 3e-06, + "loss": 0.1902, + "masked_tokens": 116.7625, + "mean_t": 0.5203817339061061, + "step": 870, + "student_masked_tokens": 116.7625 + }, + { + "avg_mask_ratio": 0.44369487821822984, + "avg_response_length": 197.9125, + "avg_student_mask_ratio": 0.44369487821822984, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.8789333333333333, + "grad_norm": 0.224609375, + "kd_loss": 0.15859377338159675, + "learning_rate": 3e-06, + "loss": 0.1765, + "masked_tokens": 91.4125, + "mean_t": 0.4875184997683391, + "step": 880, + "student_masked_tokens": 91.4125 + }, + { + "avg_mask_ratio": 0.44944015803339427, + "avg_response_length": 225.8375, + "avg_student_mask_ratio": 0.44944015803339427, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9002666666666665, + "grad_norm": 0.072265625, + "kd_loss": 0.15013304932544996, + "learning_rate": 3e-06, + "loss": 0.1349, + "masked_tokens": 103.2375, + "mean_t": 0.4766692223958671, + "step": 890, + "student_masked_tokens": 103.2375 + }, + { + "avg_mask_ratio": 0.45069065956631676, + "avg_response_length": 230.175, + "avg_student_mask_ratio": 0.45069065956631676, + "batch_ainp_frac": 0.0, + "batch_inp_frac": 0.0, + "batch_inp_oh_frac": 0.0, + "batch_inp_par_frac": 0.0, + "batch_inp_par_reverse_frac": 1.0, + "batch_rl_frac": 0.0, + "batch_sft_frac": 0.0, + "batch_soft_sft_frac": 0.0, + "batch_tf_frac": 0.0, + "ce_loss": 0.0, + "epoch": 1.9216, + "grad_norm": 0.1982421875, + "kd_loss": 0.17926409896495557, + "learning_rate": 3e-06, + "loss": 0.1615, + "masked_tokens": 104.325, + "mean_t": 0.487134758150205, + "step": 900, + "student_masked_tokens": 104.325 + } + ], + "logging_steps": 10, + "max_steps": 1404, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/training_args.bin b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..909c70530eafaa4be935d43ab877dad53e48f376 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/checkpoint-900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c7c0f34b9d70dea72cbe8ab046b4e4dbf9290e9a199291cca7df91b67e9e4a +size 8120 diff --git a/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/debug_training_examples.jsonl b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/debug_training_examples.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5154914c93e109baa58ce77db8c49829a4b7ff52 --- /dev/null +++ b/math/INP-PAR-REVERSE/unmask_tags_gold1_target1_ce0.0/debug_training_examples.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9328486aa9498a294642cf059651978876abd215c045295283bb03a935fc11ef +size 1327692609